From 4ec49da5f6a6a8e77cdb66b8f814718bf11fffef Mon Sep 17 00:00:00 2001 From: Jacob Young Date: Fri, 28 Apr 2023 20:39:38 -0400 Subject: x86_64: implement a bunch of floating point stuff --- src/arch/x86_64/Encoding.zig | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'src/arch/x86_64/Encoding.zig') diff --git a/src/arch/x86_64/Encoding.zig b/src/arch/x86_64/Encoding.zig index a977af7842..5cb7f7a2d9 100644 --- a/src/arch/x86_64/Encoding.zig +++ b/src/arch/x86_64/Encoding.zig @@ -58,7 +58,7 @@ pub fn findByMnemonic( next: for (mnemonic_to_encodings_map[@enumToInt(mnemonic)]) |data| { switch (data.mode) { .rex => if (!rex_required) continue, - .long, .sse2_long => {}, + .long, .sse_long, .sse2_long => {}, else => if (rex_required) continue, } for (input_ops, data.ops) |input_op, data_op| @@ -90,7 +90,7 @@ pub fn findByOpcode(opc: []const u8, prefixes: struct { if (prefixes.rex.w) { switch (data.mode) { .short, .fpu, .sse, .sse2, .sse4_1, .none => continue, - .long, .sse2_long, .rex => {}, + .long, .sse_long, .sse2_long, .rex => {}, } } else if (prefixes.rex.present and !prefixes.rex.isSet()) { switch (data.mode) { @@ -138,7 +138,7 @@ pub fn modRmExt(encoding: Encoding) u3 { pub fn operandBitSize(encoding: Encoding) u64 { switch (encoding.data.mode) { .short => return 16, - .long, .sse2_long => return 64, + .long, .sse_long, .sse2_long => return 64, else => {}, } const bit_size: u64 = switch (encoding.data.op_en) { @@ -163,7 +163,7 @@ pub fn format( _ = options; _ = fmt; switch (encoding.data.mode) { - .long, .sse2_long => try writer.writeAll("REX.W + "), + .long, .sse_long, .sse2_long => try writer.writeAll("REX.W + "), else => {}, } @@ -269,21 +269,25 @@ pub const Mnemonic = enum { // SSE addss, cmpss, + cvtsi2ss, divss, maxss, minss, movss, mulss, subss, ucomiss, + xorps, // SSE2 addsd, //cmpsd, + cvtsd2ss, cvtsi2sd, cvtss2sd, divsd, maxsd, minsd, movq, //movd, movsd, mulsd, subsd, ucomisd, + xorpd, // SSE4.1 roundss, roundsd, @@ -318,7 +322,7 @@ pub const Op = enum { m, moffs, sreg, - xmm, xmm_m32, xmm_m64, + xmm, xmm_m32, xmm_m64, xmm_m128, // zig fmt: on pub fn fromOperand(operand: Instruction.Operand) Op { @@ -400,7 +404,7 @@ pub const Op = enum { .imm32, .imm32s, .eax, .r32, .m32, .rm32, .rel32, .xmm_m32 => 32, .imm64, .rax, .r64, .m64, .rm64, .xmm_m64 => 64, .m80 => 80, - .m128, .xmm => 128, + .m128, .xmm, .xmm_m128 => 128, }; } @@ -423,8 +427,8 @@ pub const Op = enum { .al, .ax, .eax, .rax, .r8, .r16, .r32, .r64, .rm8, .rm16, .rm32, .rm64, - .xmm, .xmm_m32, .xmm_m64, - => true, + .xmm, .xmm_m32, .xmm_m64, .xmm_m128, + => true, else => false, }; // zig fmt: on @@ -449,7 +453,7 @@ pub const Op = enum { .rm8, .rm16, .rm32, .rm64, .m8, .m16, .m32, .m64, .m80, .m128, .m, - .xmm_m32, .xmm_m64, + .xmm_m32, .xmm_m64, .xmm_m128, => true, else => false, }; @@ -470,13 +474,13 @@ pub const Op = enum { .r8, .r16, .r32, .r64 => .general_purpose, .rm8, .rm16, .rm32, .rm64 => .general_purpose, .sreg => .segment, - .xmm, .xmm_m32, .xmm_m64 => .floating_point, + .xmm, .xmm_m32, .xmm_m64, .xmm_m128 => .floating_point, }; } pub fn isFloatingPointRegister(op: Op) bool { return switch (op) { - .xmm, .xmm_m32, .xmm_m64 => true, + .xmm, .xmm_m32, .xmm_m64, .xmm_m128 => true, else => false, }; } @@ -535,6 +539,7 @@ pub const Mode = enum { rex, long, sse, + sse_long, sse2, sse2_long, sse4_1, -- cgit v1.2.3 From 9ccdbca635a3b5a26b65ab8e52533d3acc8f2f5e Mon Sep 17 00:00:00 2001 From: Jacob Young Date: Tue, 2 May 2023 03:24:04 -0400 Subject: x86_64: implement fabs 
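The lowering works on the IEEE-754 bit pattern directly: negation flips the
sign bit (xorps/xorpd) and fabs clears it (here via andnps/andnpd with a
sign-bit operand). A minimal sketch of the same idea in plain Zig,
illustration only rather than backend code, using the two-argument builtin
cast syntax this series already uses:

    const std = @import("std");

    fn fabs32(x: f32) f32 {
        // Clear the sign bit: |x| keeps the same exponent and mantissa.
        return @bitCast(f32, @bitCast(u32, x) & 0x7fff_ffff);
    }

    fn neg32(x: f32) f32 {
        // Flip the sign bit, matching the existing xorps path below.
        return @bitCast(f32, @bitCast(u32, x) ^ 0x8000_0000);
    }

    test "sign-bit float ops" {
        try std.testing.expectEqual(@as(f32, 1.5), fabs32(-1.5));
        try std.testing.expectEqual(@as(f32, -2.0), neg32(2.0));
    }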
--- src/arch/x86_64/CodeGen.zig | 20 ++++++++++++++------ src/arch/x86_64/Encoding.zig | 6 ++++++ src/arch/x86_64/Lower.zig | 6 ++++++ src/arch/x86_64/Mir.zig | 12 ++++++++++++ src/arch/x86_64/encodings.zig | 12 ++++++++++++ test/behavior/floatop.zig | 3 ++- 6 files changed, 52 insertions(+), 7 deletions(-) (limited to 'src/arch/x86_64/Encoding.zig') diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig index e2a1076ce9..5685357108 100644 --- a/src/arch/x86_64/CodeGen.zig +++ b/src/arch/x86_64/CodeGen.zig @@ -1458,14 +1458,13 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void { .log, .log2, .log10, - .fabs, .floor, .ceil, .round, .trunc_float, => try self.airUnaryMath(inst), - .neg => try self.airNeg(inst), + .neg, .fabs => try self.airFloatSign(inst), .add_with_overflow => try self.airAddSubWithOverflow(inst), .sub_with_overflow => try self.airAddSubWithOverflow(inst), @@ -4185,7 +4184,7 @@ fn airBitReverse(self: *Self, inst: Air.Inst.Index) !void { return self.finishAir(inst, dst_mcv, .{ ty_op.operand, .none, .none }); } -fn airNeg(self: *Self, inst: Air.Inst.Index) !void { +fn airFloatSign(self: *Self, inst: Air.Inst.Index) !void { const un_op = self.air.instructions.items(.data)[inst].un_op; const ty = self.air.typeOf(un_op); const ty_bits = ty.floatBits(self.target.*); @@ -4228,10 +4227,19 @@ fn airNeg(self: *Self, inst: Air.Inst.Index) !void { const dst_lock = self.register_manager.lockReg(dst_mcv.register); defer if (dst_lock) |lock| self.register_manager.unlockReg(lock); + const tag = self.air.instructions.items(.tag)[inst]; try self.genBinOpMir(switch (ty_bits) { - 32 => .xorps, - 64 => .xorpd, - else => return self.fail("TODO implement airNeg for {}", .{ + 32 => switch (tag) { + .neg => .xorps, + .fabs => .andnps, + else => unreachable, + }, + 64 => switch (tag) { + .neg => .xorpd, + .fabs => .andnpd, + else => unreachable, + }, + else => return self.fail("TODO implement airFloatSign for {}", .{ ty.fmt(self.bin_file.options.module.?), }), }, vec_ty, dst_mcv, sign_mcv); diff --git a/src/arch/x86_64/Encoding.zig b/src/arch/x86_64/Encoding.zig index 5cb7f7a2d9..bb1757c91c 100644 --- a/src/arch/x86_64/Encoding.zig +++ b/src/arch/x86_64/Encoding.zig @@ -268,23 +268,29 @@ pub const Mnemonic = enum { movd, // SSE addss, + andps, + andnps, cmpss, cvtsi2ss, divss, maxss, minss, movss, mulss, + orps, subss, ucomiss, xorps, // SSE2 addsd, + andpd, + andnpd, //cmpsd, cvtsd2ss, cvtsi2sd, cvtss2sd, divsd, maxsd, minsd, movq, //movd, movsd, mulsd, + orpd, subsd, ucomisd, xorpd, diff --git a/src/arch/x86_64/Lower.zig b/src/arch/x86_64/Lower.zig index a961100687..03e395b171 100644 --- a/src/arch/x86_64/Lower.zig +++ b/src/arch/x86_64/Lower.zig @@ -94,6 +94,8 @@ pub fn lowerMir(lower: *Lower, inst: Mir.Inst) Error![]const Instruction { .xor, .addss, + .andnps, + .andps, .cmpss, .cvtsi2ss, .divss, @@ -101,11 +103,14 @@ pub fn lowerMir(lower: *Lower, inst: Mir.Inst) Error![]const Instruction { .minss, .movss, .mulss, + .orps, .roundss, .subss, .ucomiss, .xorps, .addsd, + .andnpd, + .andpd, .cmpsd, .cvtsd2ss, .cvtsi2sd, @@ -115,6 +120,7 @@ pub fn lowerMir(lower: *Lower, inst: Mir.Inst) Error![]const Instruction { .minsd, .movsd, .mulsd, + .orpd, .roundsd, .subsd, .ucomisd, diff --git a/src/arch/x86_64/Mir.zig b/src/arch/x86_64/Mir.zig index c14338b13d..f3d7a5a66f 100644 --- a/src/arch/x86_64/Mir.zig +++ b/src/arch/x86_64/Mir.zig @@ -168,6 +168,10 @@ pub const Inst = struct { /// Add single precision floating point values addss, + /// Bitwise logical and of 
packed single precision floating-point values + andps, + /// Bitwise logical and not of packed single precision floating-point values + andnps, /// Compare scalar single-precision floating-point values cmpss, /// Convert doubleword integer to scalar single-precision floating-point value @@ -182,6 +186,8 @@ pub const Inst = struct { movss, /// Multiply scalar single-precision floating-point values mulss, + /// Bitwise logical or of packed single precision floating-point values + orps, /// Round scalar single-precision floating-point values roundss, /// Subtract scalar single-precision floating-point values @@ -192,6 +198,10 @@ pub const Inst = struct { xorps, /// Add double precision floating point values addsd, + /// Bitwise logical and not of packed double precision floating-point values + andnpd, + /// Bitwise logical and of packed double precision floating-point values + andpd, /// Compare scalar double-precision floating-point values cmpsd, /// Convert scalar double-precision floating-point value to scalar single-precision floating-point value @@ -210,6 +220,8 @@ pub const Inst = struct { movsd, /// Multiply scalar double-precision floating-point values mulsd, + /// Bitwise logical or of packed double precision floating-point values + orpd, /// Round scalar double-precision floating-point values roundsd, /// Subtract scalar double-precision floating-point values diff --git a/src/arch/x86_64/encodings.zig b/src/arch/x86_64/encodings.zig index ac427c3633..35b2f13fe7 100644 --- a/src/arch/x86_64/encodings.zig +++ b/src/arch/x86_64/encodings.zig @@ -832,6 +832,10 @@ pub const table = [_]Entry{ // SSE .{ .addss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x58 }, 0, .sse }, + .{ .andnps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x55 }, 0, .sse }, + + .{ .andps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x54 }, 0, .sse }, + .{ .cmpss, .rmi, &.{ .xmm, .xmm_m32, .imm8 }, &.{ 0xf3, 0x0f, 0xc2 }, 0, .sse }, .{ .cvtsi2ss, .rm, &.{ .xmm, .rm32 }, &.{ 0xf3, 0x0f, 0x2a }, 0, .sse }, @@ -848,6 +852,8 @@ pub const table = [_]Entry{ .{ .mulss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x59 }, 0, .sse }, + .{ .orps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x56 }, 0, .sse }, + .{ .subss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5c }, 0, .sse }, .{ .ucomiss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0x0f, 0x2e }, 0, .sse }, @@ -857,6 +863,10 @@ pub const table = [_]Entry{ // SSE2 .{ .addsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x58 }, 0, .sse2 }, + .{ .andnpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x55 }, 0, .sse2 }, + + .{ .andpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x54 }, 0, .sse2 }, + .{ .cmpsd, .rmi, &.{ .xmm, .xmm_m64, .imm8 }, &.{ 0xf2, 0x0f, 0xc2 }, 0, .sse2 }, .{ .cvtsd2ss, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5a }, 0, .sse2 }, @@ -883,6 +893,8 @@ pub const table = [_]Entry{ .{ .mulsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x59 }, 0, .sse2 }, + .{ .orpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x56 }, 0, .sse2 }, + .{ .subsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5c }, 0, .sse2 }, .{ .movsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x10 }, 0, .sse2 }, diff --git a/test/behavior/floatop.zig b/test/behavior/floatop.zig index f713cd035c..ecf1473d14 100644 --- a/test/behavior/floatop.zig +++ b/test/behavior/floatop.zig @@ -96,7 +96,8 @@ test "negative f128 floatToInt at compile-time" { } test "@sqrt" { - if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO + if (builtin.zig_backend == .stage2_x86_64 and + comptime 
!std.Target.x86.featureSetHasAll(builtin.cpu.features, .{ .sse, .sse2, .sse4_1 })) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO -- cgit v1.2.3 From 6893f90887836584f9377793cca7235d8947a326 Mon Sep 17 00:00:00 2001 From: Jacob Young Date: Wed, 3 May 2023 02:37:48 -0400 Subject: x86_64: implement sqrt --- src/arch/x86_64/CodeGen.zig | 28 +++++++++++++++++++++++++++- src/arch/x86_64/Encoding.zig | 4 ++++ src/arch/x86_64/Lower.zig | 4 ++++ src/arch/x86_64/Mir.zig | 8 ++++++++ src/arch/x86_64/encodings.zig | 6 ++++++ 5 files changed, 49 insertions(+), 1 deletion(-) (limited to 'src/arch/x86_64/Encoding.zig') diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig index 9e50e6b5b4..ee621ffd87 100644 --- a/src/arch/x86_64/CodeGen.zig +++ b/src/arch/x86_64/CodeGen.zig @@ -229,6 +229,7 @@ pub const MCValue = union(enum) { fn isRegister(mcv: MCValue) bool { return switch (mcv) { .register => true, + .register_offset => |reg_off| return reg_off.off == 0, else => false, }; } @@ -1449,7 +1450,6 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void { .shl_sat => try self.airShlSat(inst), .slice => try self.airSlice(inst), - .sqrt, .sin, .cos, .tan, @@ -1464,6 +1464,7 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void { .trunc_float, => try self.airUnaryMath(inst), + .sqrt => try self.airSqrt(inst), .neg, .fabs => try self.airFloatSign(inst), .add_with_overflow => try self.airAddSubWithOverflow(inst), @@ -4242,6 +4243,31 @@ fn airFloatSign(self: *Self, inst: Air.Inst.Index) !void { return self.finishAir(inst, dst_mcv, .{ un_op, .none, .none }); } +fn airSqrt(self: *Self, inst: Air.Inst.Index) !void { + const un_op = self.air.instructions.items(.data)[inst].un_op; + const ty = self.air.typeOf(un_op); + + const src_mcv = try self.resolveInst(un_op); + const dst_mcv = if (src_mcv.isRegister() and self.reuseOperand(inst, un_op, 0, src_mcv)) + src_mcv + else + try self.copyToRegisterWithInstTracking(inst, ty, src_mcv); + + try self.genBinOpMir(switch (ty.zigTypeTag()) { + .Float => switch (ty.floatBits(self.target.*)) { + 32 => .sqrtss, + 64 => .sqrtsd, + else => return self.fail("TODO implement airSqrt for {}", .{ + ty.fmt(self.bin_file.options.module.?), + }), + }, + else => return self.fail("TODO implement airSqrt for {}", .{ + ty.fmt(self.bin_file.options.module.?), + }), + }, ty, dst_mcv, src_mcv); + return self.finishAir(inst, dst_mcv, .{ un_op, .none, .none }); +} + fn airUnaryMath(self: *Self, inst: Air.Inst.Index) !void { const un_op = self.air.instructions.items(.data)[inst].un_op; _ = un_op; diff --git a/src/arch/x86_64/Encoding.zig b/src/arch/x86_64/Encoding.zig index bb1757c91c..13d7b1776d 100644 --- a/src/arch/x86_64/Encoding.zig +++ b/src/arch/x86_64/Encoding.zig @@ -277,6 +277,8 @@ pub const Mnemonic = enum { movss, mulss, orps, + sqrtps, + sqrtss, subss, ucomiss, xorps, @@ -291,6 +293,8 @@ pub const Mnemonic = enum { movq, //movd, movsd, mulsd, orpd, + sqrtpd, + sqrtsd, subsd, ucomisd, xorpd, diff --git a/src/arch/x86_64/Lower.zig b/src/arch/x86_64/Lower.zig index 03e395b171..b369ba2a6b 100644 --- a/src/arch/x86_64/Lower.zig +++ b/src/arch/x86_64/Lower.zig @@ -105,6 +105,8 @@ pub fn lowerMir(lower: *Lower, inst: Mir.Inst) Error![]const Instruction { .mulss, .orps, .roundss, + .sqrtps, + .sqrtss, .subss, .ucomiss, .xorps, 
@@ -122,6 +124,8 @@ pub fn lowerMir(lower: *Lower, inst: Mir.Inst) Error![]const Instruction {
         .mulsd,
         .orpd,
         .roundsd,
+        .sqrtpd,
+        .sqrtsd,
         .subsd,
         .ucomisd,
         .xorpd,
diff --git a/src/arch/x86_64/Mir.zig b/src/arch/x86_64/Mir.zig
index f3d7a5a66f..95efc0a96c 100644
--- a/src/arch/x86_64/Mir.zig
+++ b/src/arch/x86_64/Mir.zig
@@ -190,7 +190,11 @@ pub const Inst = struct {
         orps,
         /// Round scalar single-precision floating-point values
         roundss,
+        /// Square root of packed single-precision floating-point values
+        sqrtps,
+        /// Square root of scalar single-precision floating-point value
+        sqrtss,
         /// Subtract scalar single-precision floating-point values
         subss,
         /// Unordered compare scalar single-precision floating-point values
         ucomiss,
@@ -224,6 +228,10 @@ pub const Inst = struct {
         orpd,
         /// Round scalar double-precision floating-point values
         roundsd,
+        /// Square root of packed double-precision floating-point values
+        sqrtpd,
+        /// Square root of scalar double-precision floating-point value
+        sqrtsd,
         /// Subtract scalar double-precision floating-point values
         subsd,
         /// Unordered compare scalar double-precision floating-point values
diff --git a/src/arch/x86_64/encodings.zig b/src/arch/x86_64/encodings.zig
index 35b2f13fe7..5c443157c5 100644
--- a/src/arch/x86_64/encodings.zig
+++ b/src/arch/x86_64/encodings.zig
@@ -856,6 +856,9 @@ pub const table = [_]Entry{

     .{ .subss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5c }, 0, .sse },

+    .{ .sqrtps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x51 }, 0, .sse },
+    .{ .sqrtss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x51 }, 0, .sse },
+
     .{ .ucomiss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0x0f, 0x2e }, 0, .sse },

     .{ .xorps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x57 }, 0, .sse },

@@ -895,6 +898,9 @@ pub const table = [_]Entry{

     .{ .orpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x56 }, 0, .sse2 },

+    .{ .sqrtpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x51 }, 0, .sse2 },
+    .{ .sqrtsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x51 }, 0, .sse2 },
+
     .{ .subsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5c }, 0, .sse2 },

     .{ .movsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x10 }, 0, .sse2 },
-- 
cgit v1.2.3


From a19faa2481e84e065a8762cb7c7cbf35426929fd Mon Sep 17 00:00:00 2001
From: Jacob Young
Date: Wed, 3 May 2023 04:21:40 -0400
Subject: x86_64: implement movement of more types

 * f16
 * f128
 * vector
---
 src/arch/x86_64/CodeGen.zig   | 221 ++++++++++++++++++++++++++++--------------
 src/arch/x86_64/Encoding.zig  |   6 +-
 src/arch/x86_64/Lower.zig     |  13 +++
 src/arch/x86_64/Mir.zig       |  14 +++
 src/arch/x86_64/encodings.zig |  18 ++++
 test/behavior/vector.zig      |   6 --
 6 files changed, 198 insertions(+), 80 deletions(-)

(limited to 'src/arch/x86_64/Encoding.zig')

diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig
index ee621ffd87..97e672b71f 100644
--- a/src/arch/x86_64/CodeGen.zig
+++ b/src/arch/x86_64/CodeGen.zig
@@ -1210,6 +1210,28 @@ fn asmRegisterMemory(self: *Self, tag: Mir.Inst.Tag, reg: Register, m: Memory) !void {
}); } +fn asmRegisterMemoryImmediate( + self: *Self, + tag: Mir.Inst.Tag, + reg: Register, + m: Memory, + imm: Immediate, +) !void { + _ = try self.addInst(.{ + .tag = tag, + .ops = switch (m) { + .sib => .rmi_sib, + .rip => .rmi_rip, + else => unreachable, + }, + .data = .{ .rix = .{ .r = reg, .i = @intCast(u8, imm.unsigned), .payload = switch (m) { + .sib => try self.addExtra(Mir.MemorySib.encode(m)), + .rip => try self.addExtra(Mir.MemoryRip.encode(m)), + else => unreachable, + } } }, + }); +} + fn asmMemoryRegister(self: *Self, tag: Mir.Inst.Tag, m: Memory, reg: Register) !void { _ = try self.addInst(.{ .tag = tag, @@ -1951,7 +1973,7 @@ fn allocRegOrMemAdvanced(self: *Self, elem_ty: Type, inst: ?Air.Inst.Index, reg_ const ptr_bits = self.target.cpu.arch.ptrBitWidth(); const ptr_bytes: u64 = @divExact(ptr_bits, 8); if (abi_size <= ptr_bytes) { - if (self.register_manager.tryAllocReg(inst, try self.regClassForType(elem_ty))) |reg| { + if (self.register_manager.tryAllocReg(inst, regClassForType(elem_ty))) |reg| { return MCValue{ .register = registerAlias(reg, abi_size) }; } } @@ -1961,14 +1983,9 @@ fn allocRegOrMemAdvanced(self: *Self, elem_ty: Type, inst: ?Air.Inst.Index, reg_ return .{ .load_frame = .{ .index = frame_index } }; } -fn regClassForType(self: *Self, ty: Type) !RegisterManager.RegisterBitSet { +fn regClassForType(ty: Type) RegisterManager.RegisterBitSet { return switch (ty.zigTypeTag()) { - .Vector => self.fail("TODO regClassForType for {}", .{ty.fmt(self.bin_file.options.module.?)}), - .Float => switch (ty.floatBits(self.target.*)) { - 32 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse)) sse else gp, - 64 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse2)) sse else gp, - else => gp, - }, + .Float, .Vector => sse, else => gp, }; } @@ -2111,7 +2128,7 @@ pub fn spillRegisters(self: *Self, registers: []const Register) !void { /// allocated. A second call to `copyToTmpRegister` may return the same register. /// This can have a side effect of spilling instructions to the stack to free up a register. 
fn copyToTmpRegister(self: *Self, ty: Type, mcv: MCValue) !Register { - const reg = try self.register_manager.allocReg(null, try self.regClassForType(ty)); + const reg = try self.register_manager.allocReg(null, regClassForType(ty)); try self.genSetReg(reg, ty, mcv); return reg; } @@ -2126,7 +2143,7 @@ fn copyToRegisterWithInstTracking( ty: Type, mcv: MCValue, ) !MCValue { - const reg: Register = try self.register_manager.allocReg(reg_owner, try self.regClassForType(ty)); + const reg: Register = try self.register_manager.allocReg(reg_owner, regClassForType(ty)); try self.genSetReg(reg, ty, mcv); return MCValue{ .register = reg }; } @@ -2159,8 +2176,7 @@ fn airFptrunc(self: *Self, inst: Air.Inst.Index) !void { if (dst_ty.floatBits(self.target.*) != 32 or src_ty.floatBits(self.target.*) != 64 or !Target.x86.featureSetHas(self.target.cpu.features, .sse2)) return self.fail("TODO implement airFptrunc from {} to {}", .{ - src_ty.fmt(self.bin_file.options.module.?), - dst_ty.fmt(self.bin_file.options.module.?), + src_ty.fmt(self.bin_file.options.module.?), dst_ty.fmt(self.bin_file.options.module.?), }); const src_mcv = try self.resolveInst(ty_op.operand); @@ -2182,8 +2198,7 @@ fn airFpext(self: *Self, inst: Air.Inst.Index) !void { if (dst_ty.floatBits(self.target.*) != 64 or src_ty.floatBits(self.target.*) != 32 or !Target.x86.featureSetHas(self.target.cpu.features, .sse2)) return self.fail("TODO implement airFpext from {} to {}", .{ - src_ty.fmt(self.bin_file.options.module.?), - dst_ty.fmt(self.bin_file.options.module.?), + src_ty.fmt(self.bin_file.options.module.?), dst_ty.fmt(self.bin_file.options.module.?), }); const src_mcv = try self.resolveInst(ty_op.operand); @@ -4436,8 +4451,8 @@ fn airLoad(self: *Self, inst: Air.Inst.Index) !void { const ptr_ty = self.air.typeOf(ty_op.operand); const elem_size = elem_ty.abiSize(self.target.*); - const elem_rc = try self.regClassForType(elem_ty); - const ptr_rc = try self.regClassForType(ptr_ty); + const elem_rc = regClassForType(elem_ty); + const ptr_rc = regClassForType(ptr_ty); const ptr_mcv = try self.resolveInst(ty_op.operand); const dst_mcv = if (elem_size <= 8 and elem_rc.supersetOf(ptr_rc) and @@ -5257,8 +5272,7 @@ fn genMulDivBinOp( .mul, .mulwrap => dst_abi_size != src_abi_size and dst_abi_size != src_abi_size * 2, .div_trunc, .div_floor, .div_exact, .rem, .mod => dst_abi_size != src_abi_size, } or src_abi_size > 8) return self.fail("TODO implement genMulDivBinOp from {} to {}", .{ - src_ty.fmt(self.bin_file.options.module.?), - dst_ty.fmt(self.bin_file.options.module.?), + src_ty.fmt(self.bin_file.options.module.?), dst_ty.fmt(self.bin_file.options.module.?), }); const ty = if (dst_abi_size <= 8) dst_ty else src_ty; const abi_size = if (dst_abi_size <= 8) dst_abi_size else src_abi_size; @@ -5558,7 +5572,9 @@ fn genBinOp( }, lhs_ty, dst_mcv, src_mcv), .mul => try self.genBinOpMir(switch (lhs_ty.zigTypeTag()) { - else => return self.fail("TODO implement genBinOp for {s} {}", .{ @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?) }), + else => return self.fail("TODO implement genBinOp for {s} {}", .{ + @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?), + }), .Float => switch (lhs_ty.floatBits(self.target.*)) { 32 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse)) .mulss @@ -5761,9 +5777,13 @@ fn genBinOp( .max => .maxsd, else => unreachable, }, - else => return self.fail("TODO implement genBinOp for {s} {}", .{ @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?) 
}), + else => return self.fail("TODO implement genBinOp for {s} {}", .{ + @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?), + }), }, lhs_ty, dst_mcv, src_mcv), - else => return self.fail("TODO implement genBinOp for {s} {}", .{ @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?) }), + else => return self.fail("TODO implement genBinOp for {s} {}", .{ + @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?), + }), }, else => unreachable, @@ -5802,8 +5822,7 @@ fn genBinOpMir(self: *Self, mir_tag: Mir.Inst.Tag, ty: Type, dst_mcv: MCValue, s .Float => { if (!Target.x86.featureSetHas(self.target.cpu.features, .sse)) return self.fail("TODO genBinOpMir for {s} {} without sse", .{ - @tagName(mir_tag), - ty.fmt(self.bin_file.options.module.?), + @tagName(mir_tag), ty.fmt(self.bin_file.options.module.?), }); return self.asmRegisterRegister(mir_tag, dst_reg.to128(), src_reg.to128()); }, @@ -7588,10 +7607,11 @@ fn movMirTag(self: *Self, ty: Type) !Mir.Inst.Tag { return switch (ty.zigTypeTag()) { else => .mov, .Float => switch (ty.floatBits(self.target.*)) { - 16 => .mov, - 32 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse)) .movss else .mov, - 64 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse2)) .movsd else .mov, - else => return self.fail("TODO movMirTag for {}", .{ + 16 => unreachable, // needs special handling + 32 => .movss, + 64 => .movsd, + 128 => .movaps, + else => return self.fail("TODO movMirTag from {}", .{ ty.fmt(self.bin_file.options.module.?), }), }, @@ -7700,8 +7720,17 @@ fn genSetReg(self: *Self, dst_reg: Register, ty: Type, src_mcv: MCValue) InnerEr }, .register => |src_reg| if (dst_reg.id() != src_reg.id()) try self.asmRegisterRegister( if ((dst_reg.class() == .floating_point) == (src_reg.class() == .floating_point)) - try self.movMirTag(ty) + switch (ty.zigTypeTag()) { + else => .mov, + .Float, .Vector => .movaps, + } else switch (abi_size) { + 2 => return try self.asmRegisterRegisterImmediate( + if (dst_reg.class() == .floating_point) .pinsrw else .pextrw, + registerAlias(dst_reg, abi_size), + registerAlias(src_reg, abi_size), + Immediate.u(0), + ), 4 => .movd, 8 => .movq, else => return self.fail( @@ -7712,18 +7741,12 @@ fn genSetReg(self: *Self, dst_reg: Register, ty: Type, src_mcv: MCValue) InnerEr registerAlias(dst_reg, abi_size), registerAlias(src_reg, abi_size), ), - .register_offset, .indirect, .load_frame, .lea_frame => try self.asmRegisterMemory( - switch (src_mcv) { - .register_offset => |reg_off| switch (reg_off.off) { - 0 => return self.genSetReg(dst_reg, ty, .{ .register = reg_off.reg }), - else => .lea, - }, - .indirect, .load_frame => try self.movMirTag(ty), - .lea_frame => .lea, - else => unreachable, - }, - registerAlias(dst_reg, abi_size), - Memory.sib(Memory.PtrSize.fromSize(abi_size), switch (src_mcv) { + .register_offset, + .indirect, + .load_frame, + .lea_frame, + => { + const src_mem = Memory.sib(Memory.PtrSize.fromSize(abi_size), switch (src_mcv) { .register_offset, .indirect => |reg_off| .{ .base = .{ .reg = reg_off.reg }, .disp = reg_off.off, @@ -7733,20 +7756,51 @@ fn genSetReg(self: *Self, dst_reg: Register, ty: Type, src_mcv: MCValue) InnerEr .disp = frame_addr.off, }, else => unreachable, - }), - ), + }); + if (ty.isRuntimeFloat() and ty.floatBits(self.target.*) == 16) + try self.asmRegisterMemoryImmediate( + .pinsrw, + registerAlias(dst_reg, abi_size), + src_mem, + Immediate.u(0), + ) + else + try self.asmRegisterMemory( + switch (src_mcv) { + .register_offset => |reg_off| switch (reg_off.off) { + 0 => 
return self.genSetReg(dst_reg, ty, .{ .register = reg_off.reg }), + else => .lea, + }, + .indirect, .load_frame => try self.movMirTag(ty), + .lea_frame => .lea, + else => unreachable, + }, + registerAlias(dst_reg, abi_size), + src_mem, + ); + }, .memory, .load_direct, .load_got, .load_tlv => { switch (src_mcv) { - .memory => |addr| if (math.cast(i32, @bitCast(i64, addr))) |small_addr| - return self.asmRegisterMemory( - try self.movMirTag(ty), - registerAlias(dst_reg, abi_size), - Memory.sib(Memory.PtrSize.fromSize(abi_size), .{ - .base = .{ .reg = .ds }, - .disp = small_addr, - }), - ), - .load_direct => |sym_index| if (try self.movMirTag(ty) == .mov) { + .memory => |addr| if (math.cast(i32, @bitCast(i64, addr))) |small_addr| { + const src_mem = Memory.sib(Memory.PtrSize.fromSize(abi_size), .{ + .base = .{ .reg = .ds }, + .disp = small_addr, + }); + return if (ty.isRuntimeFloat() and ty.floatBits(self.target.*) == 16) + self.asmRegisterMemoryImmediate( + .pinsrw, + registerAlias(dst_reg, abi_size), + src_mem, + Immediate.u(0), + ) + else + self.asmRegisterMemory( + try self.movMirTag(ty), + registerAlias(dst_reg, abi_size), + src_mem, + ); + }, + .load_direct => |sym_index| if (!ty.isRuntimeFloat()) { const atom_index = try self.owner.getSymbolIndex(self); _ = try self.addInst(.{ .tag = .mov_linker, @@ -7767,11 +7821,22 @@ fn genSetReg(self: *Self, dst_reg: Register, ty: Type, src_mcv: MCValue) InnerEr const addr_lock = self.register_manager.lockRegAssumeUnused(addr_reg); defer self.register_manager.unlockReg(addr_lock); - try self.asmRegisterMemory( - try self.movMirTag(ty), - registerAlias(dst_reg, abi_size), - Memory.sib(Memory.PtrSize.fromSize(abi_size), .{ .base = .{ .reg = addr_reg } }), - ); + const src_mem = Memory.sib(Memory.PtrSize.fromSize(abi_size), .{ + .base = .{ .reg = addr_reg }, + }); + if (ty.isRuntimeFloat() and ty.floatBits(self.target.*) == 16) + try self.asmRegisterMemoryImmediate( + .pinsrw, + registerAlias(dst_reg, abi_size), + src_mem, + Immediate.u(0), + ) + else + try self.asmRegisterMemory( + try self.movMirTag(ty), + registerAlias(dst_reg, abi_size), + src_mem, + ); }, .lea_direct, .lea_got => |sym_index| { const atom_index = try self.owner.getSymbolIndex(self); @@ -7864,11 +7929,25 @@ fn genSetMem(self: *Self, base: Memory.Base, disp: i32, ty: Type, src_mcv: MCVal }, }, .eflags => |cc| try self.asmSetccMemory(Memory.sib(.byte, .{ .base = base, .disp = disp }), cc), - .register => |reg| try self.asmMemoryRegister( - try self.movMirTag(ty), - Memory.sib(Memory.PtrSize.fromSize(abi_size), .{ .base = base, .disp = disp }), - registerAlias(reg, abi_size), - ), + .register => |src_reg| { + const dst_mem = Memory.sib( + Memory.PtrSize.fromSize(abi_size), + .{ .base = base, .disp = disp }, + ); + if (ty.isRuntimeFloat() and ty.floatBits(self.target.*) == 16) + try self.asmMemoryRegisterImmediate( + .pextrw, + dst_mem, + registerAlias(src_reg, abi_size), + Immediate.u(0), + ) + else + try self.asmMemoryRegister( + try self.movMirTag(ty), + dst_mem, + registerAlias(src_reg, abi_size), + ); + }, .register_overflow => |ro| { try self.genSetMem( base, @@ -8071,8 +8150,8 @@ fn airBitCast(self: *Self, inst: Air.Inst.Index) !void { const src_ty = self.air.typeOf(ty_op.operand); const result = result: { - const dst_rc = try self.regClassForType(dst_ty); - const src_rc = try self.regClassForType(src_ty); + const dst_rc = regClassForType(dst_ty); + const src_rc = regClassForType(src_ty); const operand = try self.resolveInst(ty_op.operand); if (dst_rc.supersetOf(src_rc) and 
self.reuseOperand(inst, ty_op.operand, 0, operand)) break :result operand; @@ -8127,8 +8206,7 @@ fn airIntToFloat(self: *Self, inst: Air.Inst.Index) !void { .unsigned => src_bits + 1, }, 32), 8) catch unreachable; if (src_size > 8) return self.fail("TODO implement airIntToFloat from {} to {}", .{ - src_ty.fmt(self.bin_file.options.module.?), - dst_ty.fmt(self.bin_file.options.module.?), + src_ty.fmt(self.bin_file.options.module.?), dst_ty.fmt(self.bin_file.options.module.?), }); const src_mcv = try self.resolveInst(ty_op.operand); @@ -8141,7 +8219,7 @@ fn airIntToFloat(self: *Self, inst: Air.Inst.Index) !void { if (src_bits < src_size * 8) try self.truncateRegister(src_ty, src_reg); - const dst_reg = try self.register_manager.allocReg(inst, try self.regClassForType(dst_ty)); + const dst_reg = try self.register_manager.allocReg(inst, regClassForType(dst_ty)); const dst_mcv = MCValue{ .register = dst_reg }; const dst_lock = self.register_manager.lockRegAssumeUnused(dst_reg); defer self.register_manager.unlockReg(dst_lock); @@ -8151,19 +8229,16 @@ fn airIntToFloat(self: *Self, inst: Air.Inst.Index) !void { .cvtsi2ss else return self.fail("TODO implement airIntToFloat from {} to {} without sse", .{ - src_ty.fmt(self.bin_file.options.module.?), - dst_ty.fmt(self.bin_file.options.module.?), + src_ty.fmt(self.bin_file.options.module.?), dst_ty.fmt(self.bin_file.options.module.?), }), 64 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse2)) .cvtsi2sd else return self.fail("TODO implement airIntToFloat from {} to {} without sse2", .{ - src_ty.fmt(self.bin_file.options.module.?), - dst_ty.fmt(self.bin_file.options.module.?), + src_ty.fmt(self.bin_file.options.module.?), dst_ty.fmt(self.bin_file.options.module.?), }), else => return self.fail("TODO implement airIntToFloat from {} to {}", .{ - src_ty.fmt(self.bin_file.options.module.?), - dst_ty.fmt(self.bin_file.options.module.?), + src_ty.fmt(self.bin_file.options.module.?), dst_ty.fmt(self.bin_file.options.module.?), }), }, dst_reg.to128(), registerAlias(src_reg, src_size)); diff --git a/src/arch/x86_64/Encoding.zig b/src/arch/x86_64/Encoding.zig index 13d7b1776d..944fe85458 100644 --- a/src/arch/x86_64/Encoding.zig +++ b/src/arch/x86_64/Encoding.zig @@ -274,9 +274,11 @@ pub const Mnemonic = enum { cvtsi2ss, divss, maxss, minss, - movss, + movaps, movss, movups, mulss, orps, + pextrw, + pinsrw, sqrtps, sqrtss, subss, @@ -290,7 +292,9 @@ pub const Mnemonic = enum { cvtsd2ss, cvtsi2sd, cvtss2sd, divsd, maxsd, minsd, + movapd, movq, //movd, movsd, + movupd, mulsd, orpd, sqrtpd, diff --git a/src/arch/x86_64/Lower.zig b/src/arch/x86_64/Lower.zig index b369ba2a6b..4289cfaf2a 100644 --- a/src/arch/x86_64/Lower.zig +++ b/src/arch/x86_64/Lower.zig @@ -101,9 +101,13 @@ pub fn lowerMir(lower: *Lower, inst: Mir.Inst) Error![]const Instruction { .divss, .maxss, .minss, + .movaps, .movss, + .movups, .mulss, .orps, + .pextrw, + .pinsrw, .roundss, .sqrtps, .sqrtss, @@ -198,6 +202,8 @@ fn imm(lower: Lower, ops: Mir.Inst.Ops, i: u32) Immediate { .mi_rip_u, .lock_mi_sib_u, .lock_mi_rip_u, + .rmi_sib, + .rmi_rip, .mri_sib, .mri_rip, => Immediate.u(i), @@ -212,6 +218,7 @@ fn mem(lower: Lower, ops: Mir.Inst.Ops, payload: u32) Memory { return lower.mir.resolveFrameLoc(switch (ops) { .rm_sib, .rm_sib_cc, + .rmi_sib, .m_sib, .m_sib_cc, .mi_sib_u, @@ -227,6 +234,7 @@ fn mem(lower: Lower, ops: Mir.Inst.Ops, payload: u32) Memory { .rm_rip, .rm_rip_cc, + .rmi_rip, .m_rip, .m_rip_cc, .mi_rip_u, @@ -321,6 +329,11 @@ fn mirGeneric(lower: *Lower, inst: Mir.Inst) 
Error!void { .{ .reg = inst.data.rx.r }, .{ .mem = lower.mem(inst.ops, inst.data.rx.payload) }, }, + .rmi_sib, .rmi_rip => &.{ + .{ .reg = inst.data.rix.r }, + .{ .mem = lower.mem(inst.ops, inst.data.rix.payload) }, + .{ .imm = lower.imm(inst.ops, inst.data.rix.i) }, + }, .mr_sib, .lock_mr_sib, .mr_rip, .lock_mr_rip => &.{ .{ .mem = lower.mem(inst.ops, inst.data.rx.payload) }, .{ .reg = inst.data.rx.r }, diff --git a/src/arch/x86_64/Mir.zig b/src/arch/x86_64/Mir.zig index 95efc0a96c..6b2db1b696 100644 --- a/src/arch/x86_64/Mir.zig +++ b/src/arch/x86_64/Mir.zig @@ -182,12 +182,20 @@ pub const Inst = struct { maxss, /// Return minimum single-precision floating-point value minss, + /// Move aligned packed single-precision floating-point values + movaps, /// Move scalar single-precision floating-point value movss, + /// Move unaligned packed single-precision floating-point values + movups, /// Multiply scalar single-precision floating-point values mulss, /// Bitwise logical or of packed single precision floating-point values orps, + /// Extract word + pextrw, + /// Insert word + pinsrw, /// Round scalar single-precision floating-point values roundss, /// Square root of scalar single precision floating-point value @@ -346,6 +354,12 @@ pub const Inst = struct { /// Register, memory (RIP) operands with condition code (CC). /// Uses `rx_cc` payload. rm_rip_cc, + /// Register, memory (SIB), immediate (byte) operands. + /// Uses `rix` payload with extra data of type `MemorySib`. + rmi_sib, + /// Register, memory (RIP), immediate (byte) operands. + /// Uses `rix` payload with extra data of type `MemoryRip`. + rmi_rip, /// Single memory (SIB) operand. /// Uses `payload` with extra data of type `MemorySib`. m_sib, diff --git a/src/arch/x86_64/encodings.zig b/src/arch/x86_64/encodings.zig index 5c443157c5..f87a110e99 100644 --- a/src/arch/x86_64/encodings.zig +++ b/src/arch/x86_64/encodings.zig @@ -847,9 +847,15 @@ pub const table = [_]Entry{ .{ .minss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5d }, 0, .sse }, + .{ .movaps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x28 }, 0, .sse }, + .{ .movaps, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x0f, 0x29 }, 0, .sse }, + .{ .movss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x10 }, 0, .sse }, .{ .movss, .mr, &.{ .xmm_m32, .xmm }, &.{ 0xf3, 0x0f, 0x11 }, 0, .sse }, + .{ .movups, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x10 }, 0, .sse }, + .{ .movups, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x0f, 0x11 }, 0, .sse }, + .{ .mulss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x59 }, 0, .sse }, .{ .orps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x56 }, 0, .sse }, @@ -885,6 +891,9 @@ pub const table = [_]Entry{ .{ .minsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5d }, 0, .sse2 }, + .{ .movapd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x28 }, 0, .sse2 }, + .{ .movapd, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x66, 0x0f, 0x29 }, 0, .sse2 }, + .{ .movd, .rm, &.{ .xmm, .rm32 }, &.{ 0x66, 0x0f, 0x6e }, 0, .sse2 }, .{ .movd, .mr, &.{ .rm32, .xmm }, &.{ 0x66, 0x0f, 0x7e }, 0, .sse2 }, @@ -894,10 +903,17 @@ pub const table = [_]Entry{ .{ .movq, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf3, 0x0f, 0x7e }, 0, .sse2 }, .{ .movq, .mr, &.{ .xmm_m64, .xmm }, &.{ 0x66, 0x0f, 0xd6 }, 0, .sse2 }, + .{ .movupd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x10 }, 0, .sse2 }, + .{ .movupd, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x66, 0x0f, 0x11 }, 0, .sse2 }, + .{ .mulsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x59 }, 0, .sse2 }, .{ .orpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x56 }, 0, .sse2 }, + .{ .pextrw, .mri, 
&.{ .r16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0xc5 }, 0, .sse2 }, + + .{ .pinsrw, .rmi, &.{ .xmm, .rm16, .imm8 }, &.{ 0x66, 0x0f, 0xc4 }, 0, .sse2 }, + .{ .sqrtpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x51 }, 0, .sse2 }, .{ .sqrtsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x51 }, 0, .sse2 }, @@ -911,6 +927,8 @@ pub const table = [_]Entry{ .{ .xorpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x57 }, 0, .sse2 }, // SSE4.1 + .{ .pextrw, .mri, &.{ .rm16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .sse4_1 }, + .{ .roundss, .rmi, &.{ .xmm, .xmm_m32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0a }, 0, .sse4_1 }, .{ .roundsd, .rmi, &.{ .xmm, .xmm_m64, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0b }, 0, .sse4_1 }, }; diff --git a/test/behavior/vector.zig b/test/behavior/vector.zig index 01c76310d7..2c55af5f85 100644 --- a/test/behavior/vector.zig +++ b/test/behavior/vector.zig @@ -133,7 +133,6 @@ test "vector bit operators" { } test "implicit cast vector to array" { - if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO @@ -151,7 +150,6 @@ test "implicit cast vector to array" { } test "array to vector" { - if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO @@ -321,7 +319,6 @@ test "load vector elements via comptime index" { if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO - if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO const S = struct { fn doTheTest() !void { @@ -343,7 +340,6 @@ test "store vector elements via comptime index" { if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO - if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO const S = struct { fn doTheTest() !void { @@ -371,7 +367,6 @@ test "load vector elements via runtime index" { if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO - if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO const S = struct { fn doTheTest() !void { @@ -393,7 +388,6 @@ test "store vector elements via runtime index" { if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO - if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO const S = struct { fn doTheTest() !void { -- cgit v1.2.3 From 32ab930f1d39c374265ae14f1de9d837dcd7f650 Mon Sep 17 00:00:00 2001 From: Jacob Young Date: Fri, 5 May 2023 01:32:39 -0400 Subject: x86_64: implement f16 conversions when supported --- 
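Note: the new f16 paths are gated on the F16C CPU feature through the new
hasFeature helper; without it, airFptrunc/airFpext still bail with a TODO
as before. Semantically the emitted vcvtph2ps/vcvtps2ph pair implements
ordinary float widening and @floatCast narrowing; a behavioral sketch in
plain Zig, illustration only:

    const std = @import("std");

    test "f16 <-> f32 conversion semantics" {
        const h: f16 = 1.5;
        const s: f32 = h; // fpext: vcvtph2ps when F16C is available
        const t = @floatCast(f16, s * 2.0); // fptrunc: vcvtps2ph
        try std.testing.expectEqual(@as(f16, 3.0), t);
    }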
src/arch/x86_64/CodeGen.zig | 66 +- src/arch/x86_64/Encoding.zig | 95 ++- src/arch/x86_64/Lower.zig | 3 + src/arch/x86_64/Mir.zig | 5 + src/arch/x86_64/encoder.zig | 160 +++- src/arch/x86_64/encodings.zig | 1746 +++++++++++++++++++++-------------------- test/behavior/vector.zig | 3 +- 7 files changed, 1151 insertions(+), 927 deletions(-) (limited to 'src/arch/x86_64/Encoding.zig') diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig index 55b18985da..b7fd81db68 100644 --- a/src/arch/x86_64/CodeGen.zig +++ b/src/arch/x86_64/CodeGen.zig @@ -2172,12 +2172,9 @@ fn airRetPtr(self: *Self, inst: Air.Inst.Index) !void { fn airFptrunc(self: *Self, inst: Air.Inst.Index) !void { const ty_op = self.air.instructions.items(.data)[inst].ty_op; const dst_ty = self.air.typeOfIndex(inst); + const dst_bits = dst_ty.floatBits(self.target.*); const src_ty = self.air.typeOf(ty_op.operand); - if (dst_ty.floatBits(self.target.*) != 32 or src_ty.floatBits(self.target.*) != 64 or - !Target.x86.featureSetHas(self.target.cpu.features, .sse2)) - return self.fail("TODO implement airFptrunc from {} to {}", .{ - src_ty.fmt(self.bin_file.options.module.?), dst_ty.fmt(self.bin_file.options.module.?), - }); + const src_bits = src_ty.floatBits(self.target.*); const src_mcv = try self.resolveInst(ty_op.operand); const dst_mcv = if (src_mcv.isRegister() and self.reuseOperand(inst, ty_op.operand, 0, src_mcv)) @@ -2187,19 +2184,32 @@ fn airFptrunc(self: *Self, inst: Air.Inst.Index) !void { const dst_lock = self.register_manager.lockReg(dst_mcv.register); defer if (dst_lock) |lock| self.register_manager.unlockReg(lock); - try self.genBinOpMir(.cvtsd2ss, src_ty, dst_mcv, src_mcv); + if (src_bits == 32 and dst_bits == 16 and self.hasFeature(.f16c)) + try self.asmRegisterRegisterImmediate( + .vcvtps2ph, + dst_mcv.register, + if (src_mcv.isRegister()) src_mcv.getReg().? 
else src_reg: { + const src_reg = dst_mcv.register; + try self.genSetReg(src_reg, src_ty, src_mcv); + break :src_reg src_reg; + }, + Immediate.u(0b1_00), + ) + else if (src_bits == 64 and dst_bits == 32) + try self.genBinOpMir(.cvtsd2ss, src_ty, dst_mcv, src_mcv) + else + return self.fail("TODO implement airFptrunc from {} to {}", .{ + src_ty.fmt(self.bin_file.options.module.?), dst_ty.fmt(self.bin_file.options.module.?), + }); return self.finishAir(inst, dst_mcv, .{ ty_op.operand, .none, .none }); } fn airFpext(self: *Self, inst: Air.Inst.Index) !void { const ty_op = self.air.instructions.items(.data)[inst].ty_op; const dst_ty = self.air.typeOfIndex(inst); + const dst_bits = dst_ty.floatBits(self.target.*); const src_ty = self.air.typeOf(ty_op.operand); - if (dst_ty.floatBits(self.target.*) != 64 or src_ty.floatBits(self.target.*) != 32 or - !Target.x86.featureSetHas(self.target.cpu.features, .sse2)) - return self.fail("TODO implement airFpext from {} to {}", .{ - src_ty.fmt(self.bin_file.options.module.?), dst_ty.fmt(self.bin_file.options.module.?), - }); + const src_bits = src_ty.floatBits(self.target.*); const src_mcv = try self.resolveInst(ty_op.operand); const dst_mcv = if (src_mcv.isRegister() and self.reuseOperand(inst, ty_op.operand, 0, src_mcv)) @@ -2209,7 +2219,19 @@ fn airFpext(self: *Self, inst: Air.Inst.Index) !void { const dst_lock = self.register_manager.lockReg(dst_mcv.register); defer if (dst_lock) |lock| self.register_manager.unlockReg(lock); - try self.genBinOpMir(.cvtss2sd, src_ty, dst_mcv, src_mcv); + try self.genBinOpMir( + if (src_bits == 16 and dst_bits == 32 and self.hasFeature(.f16c)) + .vcvtph2ps + else if (src_bits == 32 and dst_bits == 64) + .cvtss2sd + else + return self.fail("TODO implement airFpext from {} to {}", .{ + src_ty.fmt(self.bin_file.options.module.?), dst_ty.fmt(self.bin_file.options.module.?), + }), + src_ty, + dst_mcv, + src_mcv, + ); return self.finishAir(inst, dst_mcv, .{ ty_op.operand, .none, .none }); } @@ -3802,7 +3824,7 @@ fn airClz(self: *Self, inst: Air.Inst.Index) !void { defer self.register_manager.unlockReg(dst_lock); const src_bits = src_ty.bitSize(self.target.*); - if (Target.x86.featureSetHas(self.target.cpu.features, .lzcnt)) { + if (self.hasFeature(.lzcnt)) { if (src_bits <= 64) { try self.genBinOpMir(.lzcnt, src_ty, dst_mcv, mat_src_mcv); @@ -3888,7 +3910,7 @@ fn airCtz(self: *Self, inst: Air.Inst.Index) !void { const dst_lock = self.register_manager.lockReg(dst_reg); defer if (dst_lock) |lock| self.register_manager.unlockReg(lock); - if (Target.x86.featureSetHas(self.target.cpu.features, .bmi)) { + if (self.hasFeature(.bmi)) { if (src_bits <= 64) { const extra_bits = self.regExtraBits(src_ty); const masked_mcv = if (extra_bits > 0) masked: { @@ -3956,7 +3978,7 @@ fn airPopcount(self: *Self, inst: Air.Inst.Index) !void { const src_abi_size = @intCast(u32, src_ty.abiSize(self.target.*)); const src_mcv = try self.resolveInst(ty_op.operand); - if (Target.x86.featureSetHas(self.target.cpu.features, .popcnt)) { + if (self.hasFeature(.popcnt)) { const mat_src_mcv = switch (src_mcv) { .immediate => MCValue{ .register = try self.copyToTmpRegister(src_ty, src_mcv) }, else => src_mcv, @@ -4309,7 +4331,7 @@ fn airRound(self: *Self, inst: Air.Inst.Index, mode: Immediate) !void { const un_op = self.air.instructions.items(.data)[inst].un_op; const ty = self.air.typeOf(un_op); - if (!Target.x86.featureSetHas(self.target.cpu.features, .sse4_1)) + if (!self.hasFeature(.sse4_1)) return self.fail("TODO implement airRound without sse4_1 feature", 
.{}); const src_mcv = try self.resolveInst(un_op); @@ -5712,7 +5734,7 @@ fn genBinOp( => {}, .div_trunc, .div_floor, - => if (Target.x86.featureSetHas(self.target.cpu.features, .sse4_1)) { + => if (self.hasFeature(.sse4_1)) { const abi_size = @intCast(u32, lhs_ty.abiSize(self.target.*)); const dst_alias = registerAlias(dst_mcv.register, abi_size); try self.asmRegisterRegisterImmediate(switch (lhs_ty.floatBits(self.target.*)) { @@ -9593,3 +9615,13 @@ fn regBitSize(self: *Self, ty: Type) u64 { fn regExtraBits(self: *Self, ty: Type) u64 { return self.regBitSize(ty) - ty.bitSize(self.target.*); } + +fn hasFeature(self: *Self, feature: Target.x86.Feature) bool { + return Target.x86.featureSetHas(self.target.cpu.features, feature); +} +fn hasAnyFeatures(self: *Self, features: anytype) bool { + return Target.x86.featureSetHasAny(self.target.cpu.features, features); +} +fn hasAllFeatures(self: *Self, features: anytype) bool { + return Target.x86.featureSetHasAll(self.target.cpu.features, features); +} diff --git a/src/arch/x86_64/Encoding.zig b/src/arch/x86_64/Encoding.zig index 944fe85458..05c48ecddf 100644 --- a/src/arch/x86_64/Encoding.zig +++ b/src/arch/x86_64/Encoding.zig @@ -23,6 +23,7 @@ const Data = struct { opc: [7]u8, modrm_ext: u3, mode: Mode, + feature: Feature, }; pub fn findByMnemonic( @@ -58,7 +59,7 @@ pub fn findByMnemonic( next: for (mnemonic_to_encodings_map[@enumToInt(mnemonic)]) |data| { switch (data.mode) { .rex => if (!rex_required) continue, - .long, .sse_long, .sse2_long => {}, + .long => {}, else => if (rex_required) continue, } for (input_ops, data.ops) |input_op, data_op| @@ -136,22 +137,20 @@ pub fn modRmExt(encoding: Encoding) u3 { } pub fn operandBitSize(encoding: Encoding) u64 { - switch (encoding.data.mode) { - .short => return 16, - .long, .sse_long, .sse2_long => return 64, - else => {}, - } - const bit_size: u64 = switch (encoding.data.op_en) { - .np => switch (encoding.data.ops[0]) { - .o16 => 16, - .o32 => 32, - .o64 => 64, - else => 32, + return switch (encoding.data.mode) { + .short => 16, + .long => 64, + else => switch (encoding.data.op_en) { + .np => switch (encoding.data.ops[0]) { + .o16 => 16, + .o32 => 32, + .o64 => 64, + else => 32, + }, + .td => encoding.data.ops[1].bitSize(), + else => encoding.data.ops[0].bitSize(), }, - .td => encoding.data.ops[1].bitSize(), - else => encoding.data.ops[0].bitSize(), }; - return bit_size; } pub fn format( @@ -162,12 +161,50 @@ pub fn format( ) !void { _ = options; _ = fmt; + + var opc = encoding.opcode(); switch (encoding.data.mode) { - .long, .sse_long, .sse2_long => try writer.writeAll("REX.W + "), else => {}, + .long => try writer.writeAll("REX.W + "), + .vex_128, .vex_128_long, .vex_256, .vex_256_long => { + try writer.writeAll("VEX."); + + switch (encoding.data.mode) { + .vex_128, .vex_128_long => try writer.writeAll("128"), + .vex_256, .vex_256_long => try writer.writeAll("256"), + else => unreachable, + } + + switch (opc[0]) { + else => {}, + 0x66, 0xf3, 0xf2 => { + try writer.print(".{X:0>2}", .{opc[0]}); + opc = opc[1..]; + }, + } + + try writer.print(".{X:0>2}", .{opc[0]}); + opc = opc[1..]; + + switch (opc[0]) { + else => {}, + 0x38, 0x3A => { + try writer.print("{X:0>2}", .{opc[0]}); + opc = opc[1..]; + }, + } + + try writer.writeByte('.'); + try writer.writeAll(switch (encoding.data.mode) { + .vex_128, .vex_256 => "W0", + .vex_128_long, .vex_256_long => "W1", + else => unreachable, + }); + try writer.writeByte(' '); + }, } - for (encoding.opcode()) |byte| { + for (opc) |byte| { try writer.print("{x:0>2} 
", .{byte}); } @@ -184,15 +221,16 @@ pub fn format( try writer.print("+{s} ", .{tag}); }, .m, .mi, .m1, .mc => try writer.print("/{d} ", .{encoding.modRmExt()}), - .mr, .rm, .rmi, .mri, .mrc => try writer.writeAll("/r "), + .mr, .rm, .rmi, .mri, .mrc, .rrm, .rrmi => try writer.writeAll("/r "), } switch (encoding.data.op_en) { - .i, .d, .zi, .oi, .mi, .rmi, .mri => { + .i, .d, .zi, .oi, .mi, .rmi, .mri, .rrmi => { const op = switch (encoding.data.op_en) { .i, .d => encoding.data.ops[0], .zi, .oi, .mi => encoding.data.ops[1], .rmi, .mri => encoding.data.ops[2], + .rrmi => encoding.data.ops[3], else => unreachable, }; const tag = switch (op) { @@ -207,7 +245,7 @@ pub fn format( }; try writer.print("{s} ", .{tag}); }, - .np, .fd, .td, .o, .m, .m1, .mc, .mr, .rm, .mrc => {}, + .np, .fd, .td, .o, .m, .m1, .mc, .mr, .rm, .mrc, .rrm => {}, } try writer.print("{s} ", .{@tagName(encoding.mnemonic)}); @@ -305,6 +343,8 @@ pub const Mnemonic = enum { // SSE4.1 roundss, roundsd, + // F16C + vcvtph2ps, vcvtps2ph, // zig fmt: on }; @@ -317,6 +357,7 @@ pub const OpEn = enum { fd, td, m1, mc, mi, mr, rm, rmi, mri, mrc, + rrm, rrmi, // zig fmt: on }; @@ -549,14 +590,21 @@ pub const Op = enum { pub const Mode = enum { none, short, - fpu, rex, long, + vex_128, + vex_128_long, + vex_256, + vex_256_long, +}; + +pub const Feature = enum { + none, + f16c, sse, - sse_long, sse2, - sse2_long, sse4_1, + x87, }; fn estimateInstructionLength(prefix: Prefix, encoding: Encoding, ops: []const Operand) usize { @@ -593,6 +641,7 @@ const mnemonic_to_encodings_map = init: { .opc = undefined, .modrm_ext = entry[4], .mode = entry[5], + .feature = entry[6], }; // TODO: use `@memcpy` for these. When I did that, I got a false positive // compile error for this copy happening at compile time. 
diff --git a/src/arch/x86_64/Lower.zig b/src/arch/x86_64/Lower.zig index 4289cfaf2a..9571f50e7c 100644 --- a/src/arch/x86_64/Lower.zig +++ b/src/arch/x86_64/Lower.zig @@ -133,6 +133,9 @@ pub fn lowerMir(lower: *Lower, inst: Mir.Inst) Error![]const Instruction { .subsd, .ucomisd, .xorpd, + + .vcvtph2ps, + .vcvtps2ph, => try lower.mirGeneric(inst), .cmps, diff --git a/src/arch/x86_64/Mir.zig b/src/arch/x86_64/Mir.zig index 6b2db1b696..c4e19fdc0e 100644 --- a/src/arch/x86_64/Mir.zig +++ b/src/arch/x86_64/Mir.zig @@ -247,6 +247,11 @@ pub const Inst = struct { /// Bitwise logical xor of packed double precision floating-point values xorpd, + /// Convert 16-bit floating-point values to single-precision floating-point values + vcvtph2ps, + /// Convert single-precision floating-point values to 16-bit floating-point values + vcvtps2ph, + /// Compare string operands cmps, /// Load string diff --git a/src/arch/x86_64/encoder.zig b/src/arch/x86_64/encoder.zig index 4c900697f5..94f4eb56d5 100644 --- a/src/arch/x86_64/encoder.zig +++ b/src/arch/x86_64/encoder.zig @@ -209,10 +209,19 @@ pub const Instruction = struct { const enc = inst.encoding; const data = enc.data; - try inst.encodeLegacyPrefixes(encoder); - try inst.encodeMandatoryPrefix(encoder); - try inst.encodeRexPrefix(encoder); - try inst.encodeOpcode(encoder); + switch (data.mode) { + .none, .short, .rex, .long => { + try inst.encodeLegacyPrefixes(encoder); + try inst.encodeMandatoryPrefix(encoder); + try inst.encodeRexPrefix(encoder); + try inst.encodeOpcode(encoder); + }, + .vex_128, .vex_128_long, .vex_256, .vex_256_long => { + try inst.encodeVexPrefix(encoder); + const opc = inst.encoding.opcode(); + try encoder.opcode_1byte(opc[opc.len - 1]); + }, + } switch (data.op_en) { .np, .o => {}, @@ -309,6 +318,7 @@ pub const Instruction = struct { } else null, + .rrm, .rrmi => unreachable, }; if (segment_override) |seg| { legacy.setSegmentOverride(seg); @@ -322,10 +332,7 @@ pub const Instruction = struct { var rex = Rex{}; rex.present = inst.encoding.data.mode == .rex; - switch (inst.encoding.data.mode) { - .long, .sse_long, .sse2_long => rex.w = true, - else => {}, - } + rex.w = inst.encoding.data.mode == .long; switch (op_en) { .np, .i, .zi, .fd, .td, .d => {}, @@ -346,11 +353,76 @@ pub const Instruction = struct { rex.b = b_x_op.isBaseExtended(); rex.x = b_x_op.isIndexExtended(); }, + .rrm, .rrmi => unreachable, } try encoder.rex(rex); } + fn encodeVexPrefix(inst: Instruction, encoder: anytype) !void { + const op_en = inst.encoding.data.op_en; + const opc = inst.encoding.opcode(); + const mand_pre = inst.encoding.mandatoryPrefix(); + + var vex = Vex{}; + + vex.w = switch (inst.encoding.data.mode) { + .vex_128, .vex_256 => false, + .vex_128_long, .vex_256_long => true, + else => unreachable, + }; + + switch (op_en) { + .np, .i, .zi, .fd, .td, .d => {}, + .o, .oi => vex.b = inst.ops[0].reg.isExtended(), + .m, .mi, .m1, .mc, .mr, .rm, .rmi, .mri, .mrc, .rrm, .rrmi => { + const r_op = switch (op_en) { + .rm, .rmi, .rrm, .rrmi => inst.ops[0], + .mr, .mri, .mrc => inst.ops[1], + else => .none, + }; + vex.r = r_op.isBaseExtended(); + + const b_x_op = switch (op_en) { + .rm, .rmi => inst.ops[1], + .m, .mi, .m1, .mc, .mr, .mri, .mrc => inst.ops[0], + .rrm, .rrmi => inst.ops[2], + else => unreachable, + }; + vex.b = b_x_op.isBaseExtended(); + vex.x = b_x_op.isIndexExtended(); + }, + } + + vex.l = switch (inst.encoding.data.mode) { + .vex_128, .vex_128_long => false, + .vex_256, .vex_256_long => true, + else => unreachable, + }; + + vex.p = if (mand_pre) 
|mand| switch (mand) { + 0x66 => .@"66", + 0xf2 => .f2, + 0xf3 => .f3, + else => unreachable, + } else .none; + + const leading: usize = if (mand_pre) |_| 1 else 0; + assert(opc[leading] == 0x0f); + vex.m = switch (opc[leading + 1]) { + else => .@"0f", + 0x38 => .@"0f38", + 0x3a => .@"0f3a", + }; + + switch (op_en) { + else => {}, + .rrm, .rrmi => vex.v = inst.ops[1].reg, + } + + try encoder.vex(vex); + } + fn encodeMandatoryPrefix(inst: Instruction, encoder: anytype) !void { const prefix = inst.encoding.mandatoryPrefix() orelse return; try encoder.opcode_1byte(prefix); @@ -562,17 +634,48 @@ fn Encoder(comptime T: type, comptime opts: Options) type { /// or one of reg, index, r/m, base, or opcode-reg might be extended. /// /// See struct `Rex` for a description of each field. - pub fn rex(self: Self, byte: Rex) !void { - if (!byte.present and !byte.isSet()) return; + pub fn rex(self: Self, fields: Rex) !void { + if (!fields.present and !fields.isSet()) return; + + var byte: u8 = 0b0100_0000; - var value: u8 = 0b0100_0000; + if (fields.w) byte |= 0b1000; + if (fields.r) byte |= 0b0100; + if (fields.x) byte |= 0b0010; + if (fields.b) byte |= 0b0001; - if (byte.w) value |= 0b1000; - if (byte.r) value |= 0b0100; - if (byte.x) value |= 0b0010; - if (byte.b) value |= 0b0001; + try self.writer.writeByte(byte); + } - try self.writer.writeByte(value); + /// Encodes a VEX prefix given all the fields + /// + /// See struct `Vex` for a description of each field. + pub fn vex(self: Self, fields: Vex) !void { + if (fields.is3Byte()) { + try self.writer.writeByte(0b1100_0100); + + try self.writer.writeByte( + @as(u8, ~@boolToInt(fields.r)) << 7 | + @as(u8, ~@boolToInt(fields.x)) << 6 | + @as(u8, ~@boolToInt(fields.b)) << 5 | + @as(u8, @enumToInt(fields.m)) << 0, + ); + + try self.writer.writeByte( + @as(u8, @boolToInt(fields.w)) << 7 | + @as(u8, ~fields.v.enc()) << 3 | + @as(u8, @boolToInt(fields.l)) << 2 | + @as(u8, @enumToInt(fields.p)) << 0, + ); + } else { + try self.writer.writeByte(0b1100_0101); + try self.writer.writeByte( + @as(u8, ~@boolToInt(fields.r)) << 7 | + @as(u8, ~fields.v.enc()) << 3 | + @as(u8, @boolToInt(fields.l)) << 2 | + @as(u8, @enumToInt(fields.p)) << 0, + ); + } } // ------ @@ -848,6 +951,31 @@ pub const Rex = struct { } }; +pub const Vex = struct { + w: bool = false, + r: bool = false, + x: bool = false, + b: bool = false, + l: bool = false, + p: enum(u2) { + none = 0b00, + @"66" = 0b01, + f3 = 0b10, + f2 = 0b11, + } = .none, + m: enum(u5) { + @"0f" = 0b0_0001, + @"0f38" = 0b0_0010, + @"0f3a" = 0b0_0011, + _, + } = .@"0f", + v: Register = .ymm0, + + pub fn is3Byte(vex: Vex) bool { + return vex.w or vex.x or vex.b or vex.m != .@"0f"; + } +}; + // Tests fn expectEqualHexStrings(expected: []const u8, given: []const u8, assembly: []const u8) !void { assert(expected.len > 0); diff --git a/src/arch/x86_64/encodings.zig b/src/arch/x86_64/encodings.zig index f87a110e99..52b8cc29d6 100644 --- a/src/arch/x86_64/encodings.zig +++ b/src/arch/x86_64/encodings.zig @@ -3,933 +3,939 @@ const Mnemonic = Encoding.Mnemonic; const OpEn = Encoding.OpEn; const Op = Encoding.Op; const Mode = Encoding.Mode; +const Feature = Encoding.Feature; const modrm_ext = u3; -pub const Entry = struct { Mnemonic, OpEn, []const Op, []const u8, modrm_ext, Mode }; +pub const Entry = struct { Mnemonic, OpEn, []const Op, []const u8, modrm_ext, Mode, Feature }; // TODO move this into a .zon file when Zig is capable of importing .zon files // zig fmt: off pub const table = [_]Entry{ // General-purpose - .{ .adc, .zi, &.{ 
.al, .imm8 }, &.{ 0x14 }, 0, .none }, - .{ .adc, .zi, &.{ .ax, .imm16 }, &.{ 0x15 }, 0, .none }, - .{ .adc, .zi, &.{ .eax, .imm32 }, &.{ 0x15 }, 0, .none }, - .{ .adc, .zi, &.{ .rax, .imm32s }, &.{ 0x15 }, 0, .long }, - .{ .adc, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 2, .none }, - .{ .adc, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 2, .rex }, - .{ .adc, .mi, &.{ .rm16, .imm16 }, &.{ 0x81 }, 2, .none }, - .{ .adc, .mi, &.{ .rm32, .imm32 }, &.{ 0x81 }, 2, .none }, - .{ .adc, .mi, &.{ .rm64, .imm32s }, &.{ 0x81 }, 2, .long }, - .{ .adc, .mi, &.{ .rm16, .imm8s }, &.{ 0x83 }, 2, .none }, - .{ .adc, .mi, &.{ .rm32, .imm8s }, &.{ 0x83 }, 2, .none }, - .{ .adc, .mi, &.{ .rm64, .imm8s }, &.{ 0x83 }, 2, .long }, - .{ .adc, .mr, &.{ .rm8, .r8 }, &.{ 0x10 }, 0, .none }, - .{ .adc, .mr, &.{ .rm8, .r8 }, &.{ 0x10 }, 0, .rex }, - .{ .adc, .mr, &.{ .rm16, .r16 }, &.{ 0x11 }, 0, .none }, - .{ .adc, .mr, &.{ .rm32, .r32 }, &.{ 0x11 }, 0, .none }, - .{ .adc, .mr, &.{ .rm64, .r64 }, &.{ 0x11 }, 0, .long }, - .{ .adc, .rm, &.{ .r8, .rm8 }, &.{ 0x12 }, 0, .none }, - .{ .adc, .rm, &.{ .r8, .rm8 }, &.{ 0x12 }, 0, .rex }, - .{ .adc, .rm, &.{ .r16, .rm16 }, &.{ 0x13 }, 0, .none }, - .{ .adc, .rm, &.{ .r32, .rm32 }, &.{ 0x13 }, 0, .none }, - .{ .adc, .rm, &.{ .r64, .rm64 }, &.{ 0x13 }, 0, .long }, - - .{ .add, .zi, &.{ .al, .imm8 }, &.{ 0x04 }, 0, .none }, - .{ .add, .zi, &.{ .ax, .imm16 }, &.{ 0x05 }, 0, .none }, - .{ .add, .zi, &.{ .eax, .imm32 }, &.{ 0x05 }, 0, .none }, - .{ .add, .zi, &.{ .rax, .imm32s }, &.{ 0x05 }, 0, .long }, - .{ .add, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 0, .none }, - .{ .add, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 0, .rex }, - .{ .add, .mi, &.{ .rm16, .imm16 }, &.{ 0x81 }, 0, .none }, - .{ .add, .mi, &.{ .rm32, .imm32 }, &.{ 0x81 }, 0, .none }, - .{ .add, .mi, &.{ .rm64, .imm32s }, &.{ 0x81 }, 0, .long }, - .{ .add, .mi, &.{ .rm16, .imm8s }, &.{ 0x83 }, 0, .none }, - .{ .add, .mi, &.{ .rm32, .imm8s }, &.{ 0x83 }, 0, .none }, - .{ .add, .mi, &.{ .rm64, .imm8s }, &.{ 0x83 }, 0, .long }, - .{ .add, .mr, &.{ .rm8, .r8 }, &.{ 0x00 }, 0, .none }, - .{ .add, .mr, &.{ .rm8, .r8 }, &.{ 0x00 }, 0, .rex }, - .{ .add, .mr, &.{ .rm16, .r16 }, &.{ 0x01 }, 0, .none }, - .{ .add, .mr, &.{ .rm32, .r32 }, &.{ 0x01 }, 0, .none }, - .{ .add, .mr, &.{ .rm64, .r64 }, &.{ 0x01 }, 0, .long }, - .{ .add, .rm, &.{ .r8, .rm8 }, &.{ 0x02 }, 0, .none }, - .{ .add, .rm, &.{ .r8, .rm8 }, &.{ 0x02 }, 0, .rex }, - .{ .add, .rm, &.{ .r16, .rm16 }, &.{ 0x03 }, 0, .none }, - .{ .add, .rm, &.{ .r32, .rm32 }, &.{ 0x03 }, 0, .none }, - .{ .add, .rm, &.{ .r64, .rm64 }, &.{ 0x03 }, 0, .long }, - - .{ .@"and", .zi, &.{ .al, .imm8 }, &.{ 0x24 }, 0, .none }, - .{ .@"and", .zi, &.{ .ax, .imm16 }, &.{ 0x25 }, 0, .none }, - .{ .@"and", .zi, &.{ .eax, .imm32 }, &.{ 0x25 }, 0, .none }, - .{ .@"and", .zi, &.{ .rax, .imm32s }, &.{ 0x25 }, 0, .long }, - .{ .@"and", .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 4, .none }, - .{ .@"and", .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 4, .rex }, - .{ .@"and", .mi, &.{ .rm16, .imm16 }, &.{ 0x81 }, 4, .none }, - .{ .@"and", .mi, &.{ .rm32, .imm32 }, &.{ 0x81 }, 4, .none }, - .{ .@"and", .mi, &.{ .rm64, .imm32s }, &.{ 0x81 }, 4, .long }, - .{ .@"and", .mi, &.{ .rm16, .imm8s }, &.{ 0x83 }, 4, .none }, - .{ .@"and", .mi, &.{ .rm32, .imm8s }, &.{ 0x83 }, 4, .none }, - .{ .@"and", .mi, &.{ .rm64, .imm8s }, &.{ 0x83 }, 4, .long }, - .{ .@"and", .mr, &.{ .rm8, .r8 }, &.{ 0x20 }, 0, .none }, - .{ .@"and", .mr, &.{ .rm8, .r8 }, &.{ 0x20 }, 0, .rex }, - .{ .@"and", .mr, &.{ .rm16, .r16 }, &.{ 0x21 }, 0, .none }, - .{ .@"and", .mr, &.{ 
.rm32, .r32 }, &.{ 0x21 }, 0, .none }, - .{ .@"and", .mr, &.{ .rm64, .r64 }, &.{ 0x21 }, 0, .long }, - .{ .@"and", .rm, &.{ .r8, .rm8 }, &.{ 0x22 }, 0, .none }, - .{ .@"and", .rm, &.{ .r8, .rm8 }, &.{ 0x22 }, 0, .rex }, - .{ .@"and", .rm, &.{ .r16, .rm16 }, &.{ 0x23 }, 0, .none }, - .{ .@"and", .rm, &.{ .r32, .rm32 }, &.{ 0x23 }, 0, .none }, - .{ .@"and", .rm, &.{ .r64, .rm64 }, &.{ 0x23 }, 0, .long }, - - .{ .bsf, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0xbc }, 0, .none }, - .{ .bsf, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0xbc }, 0, .none }, - .{ .bsf, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0xbc }, 0, .long }, - - .{ .bsr, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0xbd }, 0, .none }, - .{ .bsr, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0xbd }, 0, .none }, - .{ .bsr, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0xbd }, 0, .long }, - - .{ .bswap, .o, &.{ .r32 }, &.{ 0x0f, 0xc8 }, 0, .none }, - .{ .bswap, .o, &.{ .r64 }, &.{ 0x0f, 0xc8 }, 0, .long }, - - .{ .bt, .mr, &.{ .rm16, .r16 }, &.{ 0x0f, 0xa3 }, 0, .none }, - .{ .bt, .mr, &.{ .rm32, .r32 }, &.{ 0x0f, 0xa3 }, 0, .none }, - .{ .bt, .mr, &.{ .rm64, .r64 }, &.{ 0x0f, 0xa3 }, 0, .long }, - .{ .bt, .mi, &.{ .rm16, .imm8 }, &.{ 0x0f, 0xba }, 4, .none }, - .{ .bt, .mi, &.{ .rm32, .imm8 }, &.{ 0x0f, 0xba }, 4, .none }, - .{ .bt, .mi, &.{ .rm64, .imm8 }, &.{ 0x0f, 0xba }, 4, .long }, - - .{ .btc, .mr, &.{ .rm16, .r16 }, &.{ 0x0f, 0xbb }, 0, .none }, - .{ .btc, .mr, &.{ .rm32, .r32 }, &.{ 0x0f, 0xbb }, 0, .none }, - .{ .btc, .mr, &.{ .rm64, .r64 }, &.{ 0x0f, 0xbb }, 0, .long }, - .{ .btc, .mi, &.{ .rm16, .imm8 }, &.{ 0x0f, 0xba }, 7, .none }, - .{ .btc, .mi, &.{ .rm32, .imm8 }, &.{ 0x0f, 0xba }, 7, .none }, - .{ .btc, .mi, &.{ .rm64, .imm8 }, &.{ 0x0f, 0xba }, 7, .long }, - - .{ .btr, .mr, &.{ .rm16, .r16 }, &.{ 0x0f, 0xb3 }, 0, .none }, - .{ .btr, .mr, &.{ .rm32, .r32 }, &.{ 0x0f, 0xb3 }, 0, .none }, - .{ .btr, .mr, &.{ .rm64, .r64 }, &.{ 0x0f, 0xb3 }, 0, .long }, - .{ .btr, .mi, &.{ .rm16, .imm8 }, &.{ 0x0f, 0xba }, 6, .none }, - .{ .btr, .mi, &.{ .rm32, .imm8 }, &.{ 0x0f, 0xba }, 6, .none }, - .{ .btr, .mi, &.{ .rm64, .imm8 }, &.{ 0x0f, 0xba }, 6, .long }, - - .{ .bts, .mr, &.{ .rm16, .r16 }, &.{ 0x0f, 0xab }, 0, .none }, - .{ .bts, .mr, &.{ .rm32, .r32 }, &.{ 0x0f, 0xab }, 0, .none }, - .{ .bts, .mr, &.{ .rm64, .r64 }, &.{ 0x0f, 0xab }, 0, .long }, - .{ .bts, .mi, &.{ .rm16, .imm8 }, &.{ 0x0f, 0xba }, 5, .none }, - .{ .bts, .mi, &.{ .rm32, .imm8 }, &.{ 0x0f, 0xba }, 5, .none }, - .{ .bts, .mi, &.{ .rm64, .imm8 }, &.{ 0x0f, 0xba }, 5, .long }, + .{ .adc, .zi, &.{ .al, .imm8 }, &.{ 0x14 }, 0, .none, .none }, + .{ .adc, .zi, &.{ .ax, .imm16 }, &.{ 0x15 }, 0, .none, .none }, + .{ .adc, .zi, &.{ .eax, .imm32 }, &.{ 0x15 }, 0, .none, .none }, + .{ .adc, .zi, &.{ .rax, .imm32s }, &.{ 0x15 }, 0, .long, .none }, + .{ .adc, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 2, .none, .none }, + .{ .adc, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 2, .rex, .none }, + .{ .adc, .mi, &.{ .rm16, .imm16 }, &.{ 0x81 }, 2, .none, .none }, + .{ .adc, .mi, &.{ .rm32, .imm32 }, &.{ 0x81 }, 2, .none, .none }, + .{ .adc, .mi, &.{ .rm64, .imm32s }, &.{ 0x81 }, 2, .long, .none }, + .{ .adc, .mi, &.{ .rm16, .imm8s }, &.{ 0x83 }, 2, .none, .none }, + .{ .adc, .mi, &.{ .rm32, .imm8s }, &.{ 0x83 }, 2, .none, .none }, + .{ .adc, .mi, &.{ .rm64, .imm8s }, &.{ 0x83 }, 2, .long, .none }, + .{ .adc, .mr, &.{ .rm8, .r8 }, &.{ 0x10 }, 0, .none, .none }, + .{ .adc, .mr, &.{ .rm8, .r8 }, &.{ 0x10 }, 0, .rex, .none }, + .{ .adc, .mr, &.{ .rm16, .r16 }, &.{ 0x11 }, 0, .none, .none }, + .{ .adc, .mr, &.{ .rm32, .r32 }, &.{ 0x11 }, 0, .none, 
.none }, + .{ .adc, .mr, &.{ .rm64, .r64 }, &.{ 0x11 }, 0, .long, .none }, + .{ .adc, .rm, &.{ .r8, .rm8 }, &.{ 0x12 }, 0, .none, .none }, + .{ .adc, .rm, &.{ .r8, .rm8 }, &.{ 0x12 }, 0, .rex, .none }, + .{ .adc, .rm, &.{ .r16, .rm16 }, &.{ 0x13 }, 0, .none, .none }, + .{ .adc, .rm, &.{ .r32, .rm32 }, &.{ 0x13 }, 0, .none, .none }, + .{ .adc, .rm, &.{ .r64, .rm64 }, &.{ 0x13 }, 0, .long, .none }, + + .{ .add, .zi, &.{ .al, .imm8 }, &.{ 0x04 }, 0, .none, .none }, + .{ .add, .zi, &.{ .ax, .imm16 }, &.{ 0x05 }, 0, .none, .none }, + .{ .add, .zi, &.{ .eax, .imm32 }, &.{ 0x05 }, 0, .none, .none }, + .{ .add, .zi, &.{ .rax, .imm32s }, &.{ 0x05 }, 0, .long, .none }, + .{ .add, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 0, .none, .none }, + .{ .add, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 0, .rex, .none }, + .{ .add, .mi, &.{ .rm16, .imm16 }, &.{ 0x81 }, 0, .none, .none }, + .{ .add, .mi, &.{ .rm32, .imm32 }, &.{ 0x81 }, 0, .none, .none }, + .{ .add, .mi, &.{ .rm64, .imm32s }, &.{ 0x81 }, 0, .long, .none }, + .{ .add, .mi, &.{ .rm16, .imm8s }, &.{ 0x83 }, 0, .none, .none }, + .{ .add, .mi, &.{ .rm32, .imm8s }, &.{ 0x83 }, 0, .none, .none }, + .{ .add, .mi, &.{ .rm64, .imm8s }, &.{ 0x83 }, 0, .long, .none }, + .{ .add, .mr, &.{ .rm8, .r8 }, &.{ 0x00 }, 0, .none, .none }, + .{ .add, .mr, &.{ .rm8, .r8 }, &.{ 0x00 }, 0, .rex, .none }, + .{ .add, .mr, &.{ .rm16, .r16 }, &.{ 0x01 }, 0, .none, .none }, + .{ .add, .mr, &.{ .rm32, .r32 }, &.{ 0x01 }, 0, .none, .none }, + .{ .add, .mr, &.{ .rm64, .r64 }, &.{ 0x01 }, 0, .long, .none }, + .{ .add, .rm, &.{ .r8, .rm8 }, &.{ 0x02 }, 0, .none, .none }, + .{ .add, .rm, &.{ .r8, .rm8 }, &.{ 0x02 }, 0, .rex, .none }, + .{ .add, .rm, &.{ .r16, .rm16 }, &.{ 0x03 }, 0, .none, .none }, + .{ .add, .rm, &.{ .r32, .rm32 }, &.{ 0x03 }, 0, .none, .none }, + .{ .add, .rm, &.{ .r64, .rm64 }, &.{ 0x03 }, 0, .long, .none }, + + .{ .@"and", .zi, &.{ .al, .imm8 }, &.{ 0x24 }, 0, .none, .none }, + .{ .@"and", .zi, &.{ .ax, .imm16 }, &.{ 0x25 }, 0, .none, .none }, + .{ .@"and", .zi, &.{ .eax, .imm32 }, &.{ 0x25 }, 0, .none, .none }, + .{ .@"and", .zi, &.{ .rax, .imm32s }, &.{ 0x25 }, 0, .long, .none }, + .{ .@"and", .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 4, .none, .none }, + .{ .@"and", .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 4, .rex, .none }, + .{ .@"and", .mi, &.{ .rm16, .imm16 }, &.{ 0x81 }, 4, .none, .none }, + .{ .@"and", .mi, &.{ .rm32, .imm32 }, &.{ 0x81 }, 4, .none, .none }, + .{ .@"and", .mi, &.{ .rm64, .imm32s }, &.{ 0x81 }, 4, .long, .none }, + .{ .@"and", .mi, &.{ .rm16, .imm8s }, &.{ 0x83 }, 4, .none, .none }, + .{ .@"and", .mi, &.{ .rm32, .imm8s }, &.{ 0x83 }, 4, .none, .none }, + .{ .@"and", .mi, &.{ .rm64, .imm8s }, &.{ 0x83 }, 4, .long, .none }, + .{ .@"and", .mr, &.{ .rm8, .r8 }, &.{ 0x20 }, 0, .none, .none }, + .{ .@"and", .mr, &.{ .rm8, .r8 }, &.{ 0x20 }, 0, .rex, .none }, + .{ .@"and", .mr, &.{ .rm16, .r16 }, &.{ 0x21 }, 0, .none, .none }, + .{ .@"and", .mr, &.{ .rm32, .r32 }, &.{ 0x21 }, 0, .none, .none }, + .{ .@"and", .mr, &.{ .rm64, .r64 }, &.{ 0x21 }, 0, .long, .none }, + .{ .@"and", .rm, &.{ .r8, .rm8 }, &.{ 0x22 }, 0, .none, .none }, + .{ .@"and", .rm, &.{ .r8, .rm8 }, &.{ 0x22 }, 0, .rex, .none }, + .{ .@"and", .rm, &.{ .r16, .rm16 }, &.{ 0x23 }, 0, .none, .none }, + .{ .@"and", .rm, &.{ .r32, .rm32 }, &.{ 0x23 }, 0, .none, .none }, + .{ .@"and", .rm, &.{ .r64, .rm64 }, &.{ 0x23 }, 0, .long, .none }, + + .{ .bsf, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0xbc }, 0, .none, .none }, + .{ .bsf, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0xbc }, 0, .none, .none }, + .{ .bsf, .rm, &.{ .r64, 
.rm64 }, &.{ 0x0f, 0xbc }, 0, .long, .none }, + + .{ .bsr, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0xbd }, 0, .none, .none }, + .{ .bsr, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0xbd }, 0, .none, .none }, + .{ .bsr, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0xbd }, 0, .long, .none }, + + .{ .bswap, .o, &.{ .r32 }, &.{ 0x0f, 0xc8 }, 0, .none, .none }, + .{ .bswap, .o, &.{ .r64 }, &.{ 0x0f, 0xc8 }, 0, .long, .none }, + + .{ .bt, .mr, &.{ .rm16, .r16 }, &.{ 0x0f, 0xa3 }, 0, .none, .none }, + .{ .bt, .mr, &.{ .rm32, .r32 }, &.{ 0x0f, 0xa3 }, 0, .none, .none }, + .{ .bt, .mr, &.{ .rm64, .r64 }, &.{ 0x0f, 0xa3 }, 0, .long, .none }, + .{ .bt, .mi, &.{ .rm16, .imm8 }, &.{ 0x0f, 0xba }, 4, .none, .none }, + .{ .bt, .mi, &.{ .rm32, .imm8 }, &.{ 0x0f, 0xba }, 4, .none, .none }, + .{ .bt, .mi, &.{ .rm64, .imm8 }, &.{ 0x0f, 0xba }, 4, .long, .none }, + + .{ .btc, .mr, &.{ .rm16, .r16 }, &.{ 0x0f, 0xbb }, 0, .none, .none }, + .{ .btc, .mr, &.{ .rm32, .r32 }, &.{ 0x0f, 0xbb }, 0, .none, .none }, + .{ .btc, .mr, &.{ .rm64, .r64 }, &.{ 0x0f, 0xbb }, 0, .long, .none }, + .{ .btc, .mi, &.{ .rm16, .imm8 }, &.{ 0x0f, 0xba }, 7, .none, .none }, + .{ .btc, .mi, &.{ .rm32, .imm8 }, &.{ 0x0f, 0xba }, 7, .none, .none }, + .{ .btc, .mi, &.{ .rm64, .imm8 }, &.{ 0x0f, 0xba }, 7, .long, .none }, + + .{ .btr, .mr, &.{ .rm16, .r16 }, &.{ 0x0f, 0xb3 }, 0, .none, .none }, + .{ .btr, .mr, &.{ .rm32, .r32 }, &.{ 0x0f, 0xb3 }, 0, .none, .none }, + .{ .btr, .mr, &.{ .rm64, .r64 }, &.{ 0x0f, 0xb3 }, 0, .long, .none }, + .{ .btr, .mi, &.{ .rm16, .imm8 }, &.{ 0x0f, 0xba }, 6, .none, .none }, + .{ .btr, .mi, &.{ .rm32, .imm8 }, &.{ 0x0f, 0xba }, 6, .none, .none }, + .{ .btr, .mi, &.{ .rm64, .imm8 }, &.{ 0x0f, 0xba }, 6, .long, .none }, + + .{ .bts, .mr, &.{ .rm16, .r16 }, &.{ 0x0f, 0xab }, 0, .none, .none }, + .{ .bts, .mr, &.{ .rm32, .r32 }, &.{ 0x0f, 0xab }, 0, .none, .none }, + .{ .bts, .mr, &.{ .rm64, .r64 }, &.{ 0x0f, 0xab }, 0, .long, .none }, + .{ .bts, .mi, &.{ .rm16, .imm8 }, &.{ 0x0f, 0xba }, 5, .none, .none }, + .{ .bts, .mi, &.{ .rm32, .imm8 }, &.{ 0x0f, 0xba }, 5, .none, .none }, + .{ .bts, .mi, &.{ .rm64, .imm8 }, &.{ 0x0f, 0xba }, 5, .long, .none }, // This is M encoding according to Intel, but D makes more sense here. 
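    // (Editor's note, not part of the patch: CALL rel32 (0xe8) has no ModRM
    // byte; its operand is a signed 32-bit displacement emitted directly
    // after the opcode, which is exactly what the D encoding describes. For
    // example, `call .+5` assembles to e8 00 00 00 00, the displacement
    // being relative to the end of the instruction.)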
- .{ .call, .d, &.{ .rel32 }, &.{ 0xe8 }, 0, .none }, - .{ .call, .m, &.{ .rm64 }, &.{ 0xff }, 2, .none }, - - .{ .cbw, .np, &.{ .o16 }, &.{ 0x98 }, 0, .none }, - .{ .cwde, .np, &.{ .o32 }, &.{ 0x98 }, 0, .none }, - .{ .cdqe, .np, &.{ .o64 }, &.{ 0x98 }, 0, .long }, - - .{ .cwd, .np, &.{ .o16 }, &.{ 0x99 }, 0, .none }, - .{ .cdq, .np, &.{ .o32 }, &.{ 0x99 }, 0, .none }, - .{ .cqo, .np, &.{ .o64 }, &.{ 0x99 }, 0, .long }, - - .{ .cmova, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x47 }, 0, .none }, - .{ .cmova, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x47 }, 0, .none }, - .{ .cmova, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x47 }, 0, .long }, - .{ .cmovae, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x43 }, 0, .none }, - .{ .cmovae, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x43 }, 0, .none }, - .{ .cmovae, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x43 }, 0, .long }, - .{ .cmovb, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x42 }, 0, .none }, - .{ .cmovb, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x42 }, 0, .none }, - .{ .cmovb, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x42 }, 0, .long }, - .{ .cmovbe, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x46 }, 0, .none }, - .{ .cmovbe, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x46 }, 0, .none }, - .{ .cmovbe, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x46 }, 0, .long }, - .{ .cmovc, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x42 }, 0, .none }, - .{ .cmovc, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x42 }, 0, .none }, - .{ .cmovc, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x42 }, 0, .long }, - .{ .cmove, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x44 }, 0, .none }, - .{ .cmove, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x44 }, 0, .none }, - .{ .cmove, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x44 }, 0, .long }, - .{ .cmovg, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4f }, 0, .none }, - .{ .cmovg, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4f }, 0, .none }, - .{ .cmovg, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4f }, 0, .long }, - .{ .cmovge, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4d }, 0, .none }, - .{ .cmovge, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4d }, 0, .none }, - .{ .cmovge, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4d }, 0, .long }, - .{ .cmovl, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4c }, 0, .none }, - .{ .cmovl, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4c }, 0, .none }, - .{ .cmovl, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4c }, 0, .long }, - .{ .cmovle, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4e }, 0, .none }, - .{ .cmovle, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4e }, 0, .none }, - .{ .cmovle, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4e }, 0, .long }, - .{ .cmovna, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x46 }, 0, .none }, - .{ .cmovna, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x46 }, 0, .none }, - .{ .cmovna, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x46 }, 0, .long }, - .{ .cmovnae, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x42 }, 0, .none }, - .{ .cmovnae, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x42 }, 0, .none }, - .{ .cmovnae, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x42 }, 0, .long }, - .{ .cmovnb, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x43 }, 0, .none }, - .{ .cmovnb, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x43 }, 0, .none }, - .{ .cmovnb, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x43 }, 0, .long }, - .{ .cmovnbe, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x47 }, 0, .none }, - .{ .cmovnbe, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x47 }, 0, .none }, - .{ .cmovnbe, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x47 }, 0, .long }, - .{ .cmovnc, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x43 }, 0, .none }, - .{ .cmovnc, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x43 }, 0, .none }, - .{ .cmovnc, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x43 }, 0, .long }, - .{ .cmovne, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x45 }, 0, .none }, - .{ 
.cmovne, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x45 }, 0, .none }, - .{ .cmovne, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x45 }, 0, .long }, - .{ .cmovng, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4e }, 0, .none }, - .{ .cmovng, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4e }, 0, .none }, - .{ .cmovng, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4e }, 0, .long }, - .{ .cmovnge, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4c }, 0, .none }, - .{ .cmovnge, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4c }, 0, .none }, - .{ .cmovnge, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4c }, 0, .long }, - .{ .cmovnl, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4d }, 0, .none }, - .{ .cmovnl, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4d }, 0, .none }, - .{ .cmovnl, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4d }, 0, .long }, - .{ .cmovnle, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4f }, 0, .none }, - .{ .cmovnle, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4f }, 0, .none }, - .{ .cmovnle, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4f }, 0, .long }, - .{ .cmovno, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x41 }, 0, .none }, - .{ .cmovno, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x41 }, 0, .none }, - .{ .cmovno, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x41 }, 0, .long }, - .{ .cmovnp, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4b }, 0, .none }, - .{ .cmovnp, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4b }, 0, .none }, - .{ .cmovnp, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4b }, 0, .long }, - .{ .cmovns, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x49 }, 0, .none }, - .{ .cmovns, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x49 }, 0, .none }, - .{ .cmovns, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x49 }, 0, .long }, - .{ .cmovnz, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x45 }, 0, .none }, - .{ .cmovnz, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x45 }, 0, .none }, - .{ .cmovnz, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x45 }, 0, .long }, - .{ .cmovo, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x40 }, 0, .none }, - .{ .cmovo, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x40 }, 0, .none }, - .{ .cmovo, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x40 }, 0, .long }, - .{ .cmovp, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4a }, 0, .none }, - .{ .cmovp, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4a }, 0, .none }, - .{ .cmovp, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4a }, 0, .long }, - .{ .cmovpe, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4a }, 0, .none }, - .{ .cmovpe, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4a }, 0, .none }, - .{ .cmovpe, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4a }, 0, .long }, - .{ .cmovpo, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4b }, 0, .none }, - .{ .cmovpo, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4b }, 0, .none }, - .{ .cmovpo, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4b }, 0, .long }, - .{ .cmovs, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x48 }, 0, .none }, - .{ .cmovs, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x48 }, 0, .none }, - .{ .cmovs, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x48 }, 0, .long }, - .{ .cmovz, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x44 }, 0, .none }, - .{ .cmovz, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x44 }, 0, .none }, - .{ .cmovz, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x44 }, 0, .long }, - - .{ .cmp, .zi, &.{ .al, .imm8 }, &.{ 0x3c }, 0, .none }, - .{ .cmp, .zi, &.{ .ax, .imm16 }, &.{ 0x3d }, 0, .none }, - .{ .cmp, .zi, &.{ .eax, .imm32 }, &.{ 0x3d }, 0, .none }, - .{ .cmp, .zi, &.{ .rax, .imm32s }, &.{ 0x3d }, 0, .long }, - .{ .cmp, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 7, .none }, - .{ .cmp, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 7, .rex }, - .{ .cmp, .mi, &.{ .rm16, .imm16 }, &.{ 0x81 }, 7, .none }, - .{ .cmp, .mi, &.{ .rm32, .imm32 }, &.{ 0x81 }, 7, .none }, - .{ .cmp, .mi, &.{ .rm64, .imm32s }, &.{ 0x81 }, 7, .long }, - .{ .cmp, .mi, &.{ .rm16, 
.imm8s }, &.{ 0x83 }, 7, .none }, - .{ .cmp, .mi, &.{ .rm32, .imm8s }, &.{ 0x83 }, 7, .none }, - .{ .cmp, .mi, &.{ .rm64, .imm8s }, &.{ 0x83 }, 7, .long }, - .{ .cmp, .mr, &.{ .rm8, .r8 }, &.{ 0x38 }, 0, .none }, - .{ .cmp, .mr, &.{ .rm8, .r8 }, &.{ 0x38 }, 0, .rex }, - .{ .cmp, .mr, &.{ .rm16, .r16 }, &.{ 0x39 }, 0, .none }, - .{ .cmp, .mr, &.{ .rm32, .r32 }, &.{ 0x39 }, 0, .none }, - .{ .cmp, .mr, &.{ .rm64, .r64 }, &.{ 0x39 }, 0, .long }, - .{ .cmp, .rm, &.{ .r8, .rm8 }, &.{ 0x3a }, 0, .none }, - .{ .cmp, .rm, &.{ .r8, .rm8 }, &.{ 0x3a }, 0, .rex }, - .{ .cmp, .rm, &.{ .r16, .rm16 }, &.{ 0x3b }, 0, .none }, - .{ .cmp, .rm, &.{ .r32, .rm32 }, &.{ 0x3b }, 0, .none }, - .{ .cmp, .rm, &.{ .r64, .rm64 }, &.{ 0x3b }, 0, .long }, - - .{ .cmps, .np, &.{ .m8, .m8 }, &.{ 0xa6 }, 0, .none }, - .{ .cmps, .np, &.{ .m16, .m16 }, &.{ 0xa7 }, 0, .none }, - .{ .cmps, .np, &.{ .m32, .m32 }, &.{ 0xa7 }, 0, .none }, - .{ .cmps, .np, &.{ .m64, .m64 }, &.{ 0xa7 }, 0, .long }, - - .{ .cmpsb, .np, &.{}, &.{ 0xa6 }, 0, .none }, - .{ .cmpsw, .np, &.{}, &.{ 0xa7 }, 0, .short }, - .{ .cmpsd, .np, &.{}, &.{ 0xa7 }, 0, .none }, - .{ .cmpsq, .np, &.{}, &.{ 0xa7 }, 0, .long }, - - .{ .cmpxchg, .mr, &.{ .rm8, .r8 }, &.{ 0x0f, 0xb0 }, 0, .none }, - .{ .cmpxchg, .mr, &.{ .rm8, .r8 }, &.{ 0x0f, 0xb0 }, 0, .rex }, - .{ .cmpxchg, .mr, &.{ .rm16, .r16 }, &.{ 0x0f, 0xb1 }, 0, .none }, - .{ .cmpxchg, .mr, &.{ .rm32, .r32 }, &.{ 0x0f, 0xb1 }, 0, .none }, - .{ .cmpxchg, .mr, &.{ .rm64, .r64 }, &.{ 0x0f, 0xb1 }, 0, .long }, - - .{ .cmpxchg8b , .m, &.{ .m64 }, &.{ 0x0f, 0xc7 }, 1, .none }, - .{ .cmpxchg16b, .m, &.{ .m128 }, &.{ 0x0f, 0xc7 }, 1, .long }, - - .{ .div, .m, &.{ .rm8 }, &.{ 0xf6 }, 6, .none }, - .{ .div, .m, &.{ .rm8 }, &.{ 0xf6 }, 6, .rex }, - .{ .div, .m, &.{ .rm16 }, &.{ 0xf7 }, 6, .none }, - .{ .div, .m, &.{ .rm32 }, &.{ 0xf7 }, 6, .none }, - .{ .div, .m, &.{ .rm64 }, &.{ 0xf7 }, 6, .long }, - - .{ .fisttp, .m, &.{ .m16 }, &.{ 0xdf }, 1, .fpu }, - .{ .fisttp, .m, &.{ .m32 }, &.{ 0xdb }, 1, .fpu }, - .{ .fisttp, .m, &.{ .m64 }, &.{ 0xdd }, 1, .fpu }, - - .{ .fld, .m, &.{ .m32 }, &.{ 0xd9 }, 0, .fpu }, - .{ .fld, .m, &.{ .m64 }, &.{ 0xdd }, 0, .fpu }, - .{ .fld, .m, &.{ .m80 }, &.{ 0xdb }, 5, .fpu }, - - .{ .idiv, .m, &.{ .rm8 }, &.{ 0xf6 }, 7, .none }, - .{ .idiv, .m, &.{ .rm8 }, &.{ 0xf6 }, 7, .rex }, - .{ .idiv, .m, &.{ .rm16 }, &.{ 0xf7 }, 7, .none }, - .{ .idiv, .m, &.{ .rm32 }, &.{ 0xf7 }, 7, .none }, - .{ .idiv, .m, &.{ .rm64 }, &.{ 0xf7 }, 7, .long }, - - .{ .imul, .m, &.{ .rm8 }, &.{ 0xf6 }, 5, .none }, - .{ .imul, .m, &.{ .rm8 }, &.{ 0xf6 }, 5, .rex }, - .{ .imul, .m, &.{ .rm16, }, &.{ 0xf7 }, 5, .none }, - .{ .imul, .m, &.{ .rm32, }, &.{ 0xf7 }, 5, .none }, - .{ .imul, .m, &.{ .rm64, }, &.{ 0xf7 }, 5, .long }, - .{ .imul, .rm, &.{ .r16, .rm16, }, &.{ 0x0f, 0xaf }, 0, .none }, - .{ .imul, .rm, &.{ .r32, .rm32, }, &.{ 0x0f, 0xaf }, 0, .none }, - .{ .imul, .rm, &.{ .r64, .rm64, }, &.{ 0x0f, 0xaf }, 0, .long }, - .{ .imul, .rmi, &.{ .r16, .rm16, .imm8s }, &.{ 0x6b }, 0, .none }, - .{ .imul, .rmi, &.{ .r32, .rm32, .imm8s }, &.{ 0x6b }, 0, .none }, - .{ .imul, .rmi, &.{ .r64, .rm64, .imm8s }, &.{ 0x6b }, 0, .long }, - .{ .imul, .rmi, &.{ .r16, .rm16, .imm16 }, &.{ 0x69 }, 0, .none }, - .{ .imul, .rmi, &.{ .r32, .rm32, .imm32 }, &.{ 0x69 }, 0, .none }, - .{ .imul, .rmi, &.{ .r64, .rm64, .imm32 }, &.{ 0x69 }, 0, .long }, - - .{ .int3, .np, &.{}, &.{ 0xcc }, 0, .none }, - - .{ .ja, .d, &.{ .rel32 }, &.{ 0x0f, 0x87 }, 0, .none }, - .{ .jae, .d, &.{ .rel32 }, &.{ 0x0f, 0x83 }, 0, .none }, - .{ .jb, .d, &.{ .rel32 }, 
&.{ 0x0f, 0x82 }, 0, .none }, - .{ .jbe, .d, &.{ .rel32 }, &.{ 0x0f, 0x86 }, 0, .none }, - .{ .jc, .d, &.{ .rel32 }, &.{ 0x0f, 0x82 }, 0, .none }, - .{ .jrcxz, .d, &.{ .rel32 }, &.{ 0xe3 }, 0, .none }, - .{ .je, .d, &.{ .rel32 }, &.{ 0x0f, 0x84 }, 0, .none }, - .{ .jg, .d, &.{ .rel32 }, &.{ 0x0f, 0x8f }, 0, .none }, - .{ .jge, .d, &.{ .rel32 }, &.{ 0x0f, 0x8d }, 0, .none }, - .{ .jl, .d, &.{ .rel32 }, &.{ 0x0f, 0x8c }, 0, .none }, - .{ .jle, .d, &.{ .rel32 }, &.{ 0x0f, 0x8e }, 0, .none }, - .{ .jna, .d, &.{ .rel32 }, &.{ 0x0f, 0x86 }, 0, .none }, - .{ .jnae, .d, &.{ .rel32 }, &.{ 0x0f, 0x82 }, 0, .none }, - .{ .jnb, .d, &.{ .rel32 }, &.{ 0x0f, 0x83 }, 0, .none }, - .{ .jnbe, .d, &.{ .rel32 }, &.{ 0x0f, 0x87 }, 0, .none }, - .{ .jnc, .d, &.{ .rel32 }, &.{ 0x0f, 0x83 }, 0, .none }, - .{ .jne, .d, &.{ .rel32 }, &.{ 0x0f, 0x85 }, 0, .none }, - .{ .jng, .d, &.{ .rel32 }, &.{ 0x0f, 0x8e }, 0, .none }, - .{ .jnge, .d, &.{ .rel32 }, &.{ 0x0f, 0x8c }, 0, .none }, - .{ .jnl, .d, &.{ .rel32 }, &.{ 0x0f, 0x8d }, 0, .none }, - .{ .jnle, .d, &.{ .rel32 }, &.{ 0x0f, 0x8f }, 0, .none }, - .{ .jno, .d, &.{ .rel32 }, &.{ 0x0f, 0x81 }, 0, .none }, - .{ .jnp, .d, &.{ .rel32 }, &.{ 0x0f, 0x8b }, 0, .none }, - .{ .jns, .d, &.{ .rel32 }, &.{ 0x0f, 0x89 }, 0, .none }, - .{ .jnz, .d, &.{ .rel32 }, &.{ 0x0f, 0x85 }, 0, .none }, - .{ .jo, .d, &.{ .rel32 }, &.{ 0x0f, 0x80 }, 0, .none }, - .{ .jp, .d, &.{ .rel32 }, &.{ 0x0f, 0x8a }, 0, .none }, - .{ .jpe, .d, &.{ .rel32 }, &.{ 0x0f, 0x8a }, 0, .none }, - .{ .jpo, .d, &.{ .rel32 }, &.{ 0x0f, 0x8b }, 0, .none }, - .{ .js, .d, &.{ .rel32 }, &.{ 0x0f, 0x88 }, 0, .none }, - .{ .jz, .d, &.{ .rel32 }, &.{ 0x0f, 0x84 }, 0, .none }, - - .{ .jmp, .d, &.{ .rel32 }, &.{ 0xe9 }, 0, .none }, - .{ .jmp, .m, &.{ .rm64 }, &.{ 0xff }, 4, .none }, - - .{ .lea, .rm, &.{ .r16, .m }, &.{ 0x8d }, 0, .none }, - .{ .lea, .rm, &.{ .r32, .m }, &.{ 0x8d }, 0, .none }, - .{ .lea, .rm, &.{ .r64, .m }, &.{ 0x8d }, 0, .long }, - - .{ .lfence, .np, &.{}, &.{ 0x0f, 0xae, 0xe8 }, 0, .none }, - - .{ .lods, .np, &.{ .m8 }, &.{ 0xac }, 0, .none }, - .{ .lods, .np, &.{ .m16 }, &.{ 0xad }, 0, .none }, - .{ .lods, .np, &.{ .m32 }, &.{ 0xad }, 0, .none }, - .{ .lods, .np, &.{ .m64 }, &.{ 0xad }, 0, .long }, - - .{ .lodsb, .np, &.{}, &.{ 0xac }, 0, .none }, - .{ .lodsw, .np, &.{}, &.{ 0xad }, 0, .short }, - .{ .lodsd, .np, &.{}, &.{ 0xad }, 0, .none }, - .{ .lodsq, .np, &.{}, &.{ 0xad }, 0, .long }, - - .{ .lzcnt, .rm, &.{ .r16, .rm16 }, &.{ 0xf3, 0x0f, 0xbd }, 0, .none }, - .{ .lzcnt, .rm, &.{ .r32, .rm32 }, &.{ 0xf3, 0x0f, 0xbd }, 0, .none }, - .{ .lzcnt, .rm, &.{ .r64, .rm64 }, &.{ 0xf3, 0x0f, 0xbd }, 0, .long }, - - .{ .mfence, .np, &.{}, &.{ 0x0f, 0xae, 0xf0 }, 0, .none }, - - .{ .mov, .mr, &.{ .rm8, .r8 }, &.{ 0x88 }, 0, .none }, - .{ .mov, .mr, &.{ .rm8, .r8 }, &.{ 0x88 }, 0, .rex }, - .{ .mov, .mr, &.{ .rm16, .r16 }, &.{ 0x89 }, 0, .none }, - .{ .mov, .mr, &.{ .rm32, .r32 }, &.{ 0x89 }, 0, .none }, - .{ .mov, .mr, &.{ .rm64, .r64 }, &.{ 0x89 }, 0, .long }, - .{ .mov, .rm, &.{ .r8, .rm8 }, &.{ 0x8a }, 0, .none }, - .{ .mov, .rm, &.{ .r8, .rm8 }, &.{ 0x8a }, 0, .rex }, - .{ .mov, .rm, &.{ .r16, .rm16 }, &.{ 0x8b }, 0, .none }, - .{ .mov, .rm, &.{ .r32, .rm32 }, &.{ 0x8b }, 0, .none }, - .{ .mov, .rm, &.{ .r64, .rm64 }, &.{ 0x8b }, 0, .long }, - .{ .mov, .mr, &.{ .rm16, .sreg }, &.{ 0x8c }, 0, .none }, - .{ .mov, .mr, &.{ .rm64, .sreg }, &.{ 0x8c }, 0, .long }, - .{ .mov, .rm, &.{ .sreg, .rm16 }, &.{ 0x8e }, 0, .none }, - .{ .mov, .rm, &.{ .sreg, .rm64 }, &.{ 0x8e }, 0, .long }, - .{ .mov, .fd, &.{ .al, 
.moffs }, &.{ 0xa0 }, 0, .none }, - .{ .mov, .fd, &.{ .ax, .moffs }, &.{ 0xa1 }, 0, .none }, - .{ .mov, .fd, &.{ .eax, .moffs }, &.{ 0xa1 }, 0, .none }, - .{ .mov, .fd, &.{ .rax, .moffs }, &.{ 0xa1 }, 0, .long }, - .{ .mov, .td, &.{ .moffs, .al }, &.{ 0xa2 }, 0, .none }, - .{ .mov, .td, &.{ .moffs, .ax }, &.{ 0xa3 }, 0, .none }, - .{ .mov, .td, &.{ .moffs, .eax }, &.{ 0xa3 }, 0, .none }, - .{ .mov, .td, &.{ .moffs, .rax }, &.{ 0xa3 }, 0, .long }, - .{ .mov, .oi, &.{ .r8, .imm8 }, &.{ 0xb0 }, 0, .none }, - .{ .mov, .oi, &.{ .r8, .imm8 }, &.{ 0xb0 }, 0, .rex }, - .{ .mov, .oi, &.{ .r16, .imm16 }, &.{ 0xb8 }, 0, .none }, - .{ .mov, .oi, &.{ .r32, .imm32 }, &.{ 0xb8 }, 0, .none }, - .{ .mov, .oi, &.{ .r64, .imm64 }, &.{ 0xb8 }, 0, .long }, - .{ .mov, .mi, &.{ .rm8, .imm8 }, &.{ 0xc6 }, 0, .none }, - .{ .mov, .mi, &.{ .rm8, .imm8 }, &.{ 0xc6 }, 0, .rex }, - .{ .mov, .mi, &.{ .rm16, .imm16 }, &.{ 0xc7 }, 0, .none }, - .{ .mov, .mi, &.{ .rm32, .imm32 }, &.{ 0xc7 }, 0, .none }, - .{ .mov, .mi, &.{ .rm64, .imm32s }, &.{ 0xc7 }, 0, .long }, - - .{ .movbe, .rm, &.{ .r16, .m16 }, &.{ 0x0f, 0x38, 0xf0 }, 0, .none }, - .{ .movbe, .rm, &.{ .r32, .m32 }, &.{ 0x0f, 0x38, 0xf0 }, 0, .none }, - .{ .movbe, .rm, &.{ .r64, .m64 }, &.{ 0x0f, 0x38, 0xf0 }, 0, .long }, - .{ .movbe, .mr, &.{ .m16, .r16 }, &.{ 0x0f, 0x38, 0xf1 }, 0, .none }, - .{ .movbe, .mr, &.{ .m32, .r32 }, &.{ 0x0f, 0x38, 0xf1 }, 0, .none }, - .{ .movbe, .mr, &.{ .m64, .r64 }, &.{ 0x0f, 0x38, 0xf1 }, 0, .long }, - - .{ .movs, .np, &.{ .m8, .m8 }, &.{ 0xa4 }, 0, .none }, - .{ .movs, .np, &.{ .m16, .m16 }, &.{ 0xa5 }, 0, .none }, - .{ .movs, .np, &.{ .m32, .m32 }, &.{ 0xa5 }, 0, .none }, - .{ .movs, .np, &.{ .m64, .m64 }, &.{ 0xa5 }, 0, .long }, - - .{ .movsb, .np, &.{}, &.{ 0xa4 }, 0, .none }, - .{ .movsw, .np, &.{}, &.{ 0xa5 }, 0, .short }, - .{ .movsd, .np, &.{}, &.{ 0xa5 }, 0, .none }, - .{ .movsq, .np, &.{}, &.{ 0xa5 }, 0, .long }, - - .{ .movsx, .rm, &.{ .r16, .rm8 }, &.{ 0x0f, 0xbe }, 0, .none }, - .{ .movsx, .rm, &.{ .r16, .rm8 }, &.{ 0x0f, 0xbe }, 0, .rex }, - .{ .movsx, .rm, &.{ .r32, .rm8 }, &.{ 0x0f, 0xbe }, 0, .none }, - .{ .movsx, .rm, &.{ .r32, .rm8 }, &.{ 0x0f, 0xbe }, 0, .rex }, - .{ .movsx, .rm, &.{ .r64, .rm8 }, &.{ 0x0f, 0xbe }, 0, .long }, - .{ .movsx, .rm, &.{ .r32, .rm16 }, &.{ 0x0f, 0xbf }, 0, .none }, - .{ .movsx, .rm, &.{ .r64, .rm16 }, &.{ 0x0f, 0xbf }, 0, .long }, + .{ .call, .d, &.{ .rel32 }, &.{ 0xe8 }, 0, .none, .none }, + .{ .call, .m, &.{ .rm64 }, &.{ 0xff }, 2, .none, .none }, + + .{ .cbw, .np, &.{ .o16 }, &.{ 0x98 }, 0, .none, .none }, + .{ .cwde, .np, &.{ .o32 }, &.{ 0x98 }, 0, .none, .none }, + .{ .cdqe, .np, &.{ .o64 }, &.{ 0x98 }, 0, .long, .none }, + + .{ .cwd, .np, &.{ .o16 }, &.{ 0x99 }, 0, .none, .none }, + .{ .cdq, .np, &.{ .o32 }, &.{ 0x99 }, 0, .none, .none }, + .{ .cqo, .np, &.{ .o64 }, &.{ 0x99 }, 0, .long, .none }, + + .{ .cmova, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x47 }, 0, .none, .none }, + .{ .cmova, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x47 }, 0, .none, .none }, + .{ .cmova, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x47 }, 0, .long, .none }, + .{ .cmovae, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x43 }, 0, .none, .none }, + .{ .cmovae, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x43 }, 0, .none, .none }, + .{ .cmovae, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x43 }, 0, .long, .none }, + .{ .cmovb, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x42 }, 0, .none, .none }, + .{ .cmovb, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x42 }, 0, .none, .none }, + .{ .cmovb, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x42 }, 0, .long, .none }, + .{ .cmovbe, .rm, &.{ 
.r16, .rm16 }, &.{ 0x0f, 0x46 }, 0, .none, .none }, + .{ .cmovbe, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x46 }, 0, .none, .none }, + .{ .cmovbe, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x46 }, 0, .long, .none }, + .{ .cmovc, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x42 }, 0, .none, .none }, + .{ .cmovc, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x42 }, 0, .none, .none }, + .{ .cmovc, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x42 }, 0, .long, .none }, + .{ .cmove, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x44 }, 0, .none, .none }, + .{ .cmove, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x44 }, 0, .none, .none }, + .{ .cmove, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x44 }, 0, .long, .none }, + .{ .cmovg, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4f }, 0, .none, .none }, + .{ .cmovg, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4f }, 0, .none, .none }, + .{ .cmovg, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4f }, 0, .long, .none }, + .{ .cmovge, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4d }, 0, .none, .none }, + .{ .cmovge, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4d }, 0, .none, .none }, + .{ .cmovge, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4d }, 0, .long, .none }, + .{ .cmovl, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4c }, 0, .none, .none }, + .{ .cmovl, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4c }, 0, .none, .none }, + .{ .cmovl, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4c }, 0, .long, .none }, + .{ .cmovle, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4e }, 0, .none, .none }, + .{ .cmovle, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4e }, 0, .none, .none }, + .{ .cmovle, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4e }, 0, .long, .none }, + .{ .cmovna, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x46 }, 0, .none, .none }, + .{ .cmovna, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x46 }, 0, .none, .none }, + .{ .cmovna, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x46 }, 0, .long, .none }, + .{ .cmovnae, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x42 }, 0, .none, .none }, + .{ .cmovnae, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x42 }, 0, .none, .none }, + .{ .cmovnae, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x42 }, 0, .long, .none }, + .{ .cmovnb, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x43 }, 0, .none, .none }, + .{ .cmovnb, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x43 }, 0, .none, .none }, + .{ .cmovnb, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x43 }, 0, .long, .none }, + .{ .cmovnbe, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x47 }, 0, .none, .none }, + .{ .cmovnbe, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x47 }, 0, .none, .none }, + .{ .cmovnbe, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x47 }, 0, .long, .none }, + .{ .cmovnc, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x43 }, 0, .none, .none }, + .{ .cmovnc, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x43 }, 0, .none, .none }, + .{ .cmovnc, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x43 }, 0, .long, .none }, + .{ .cmovne, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x45 }, 0, .none, .none }, + .{ .cmovne, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x45 }, 0, .none, .none }, + .{ .cmovne, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x45 }, 0, .long, .none }, + .{ .cmovng, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4e }, 0, .none, .none }, + .{ .cmovng, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4e }, 0, .none, .none }, + .{ .cmovng, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4e }, 0, .long, .none }, + .{ .cmovnge, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4c }, 0, .none, .none }, + .{ .cmovnge, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4c }, 0, .none, .none }, + .{ .cmovnge, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4c }, 0, .long, .none }, + .{ .cmovnl, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4d }, 0, .none, .none }, + .{ .cmovnl, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4d }, 0, .none, .none }, + .{ .cmovnl, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4d }, 
0, .long, .none }, + .{ .cmovnle, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4f }, 0, .none, .none }, + .{ .cmovnle, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4f }, 0, .none, .none }, + .{ .cmovnle, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4f }, 0, .long, .none }, + .{ .cmovno, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x41 }, 0, .none, .none }, + .{ .cmovno, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x41 }, 0, .none, .none }, + .{ .cmovno, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x41 }, 0, .long, .none }, + .{ .cmovnp, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4b }, 0, .none, .none }, + .{ .cmovnp, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4b }, 0, .none, .none }, + .{ .cmovnp, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4b }, 0, .long, .none }, + .{ .cmovns, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x49 }, 0, .none, .none }, + .{ .cmovns, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x49 }, 0, .none, .none }, + .{ .cmovns, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x49 }, 0, .long, .none }, + .{ .cmovnz, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x45 }, 0, .none, .none }, + .{ .cmovnz, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x45 }, 0, .none, .none }, + .{ .cmovnz, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x45 }, 0, .long, .none }, + .{ .cmovo, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x40 }, 0, .none, .none }, + .{ .cmovo, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x40 }, 0, .none, .none }, + .{ .cmovo, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x40 }, 0, .long, .none }, + .{ .cmovp, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4a }, 0, .none, .none }, + .{ .cmovp, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4a }, 0, .none, .none }, + .{ .cmovp, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4a }, 0, .long, .none }, + .{ .cmovpe, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4a }, 0, .none, .none }, + .{ .cmovpe, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4a }, 0, .none, .none }, + .{ .cmovpe, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4a }, 0, .long, .none }, + .{ .cmovpo, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4b }, 0, .none, .none }, + .{ .cmovpo, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4b }, 0, .none, .none }, + .{ .cmovpo, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4b }, 0, .long, .none }, + .{ .cmovs, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x48 }, 0, .none, .none }, + .{ .cmovs, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x48 }, 0, .none, .none }, + .{ .cmovs, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x48 }, 0, .long, .none }, + .{ .cmovz, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x44 }, 0, .none, .none }, + .{ .cmovz, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x44 }, 0, .none, .none }, + .{ .cmovz, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x44 }, 0, .long, .none }, + + .{ .cmp, .zi, &.{ .al, .imm8 }, &.{ 0x3c }, 0, .none, .none }, + .{ .cmp, .zi, &.{ .ax, .imm16 }, &.{ 0x3d }, 0, .none, .none }, + .{ .cmp, .zi, &.{ .eax, .imm32 }, &.{ 0x3d }, 0, .none, .none }, + .{ .cmp, .zi, &.{ .rax, .imm32s }, &.{ 0x3d }, 0, .long, .none }, + .{ .cmp, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 7, .none, .none }, + .{ .cmp, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 7, .rex, .none }, + .{ .cmp, .mi, &.{ .rm16, .imm16 }, &.{ 0x81 }, 7, .none, .none }, + .{ .cmp, .mi, &.{ .rm32, .imm32 }, &.{ 0x81 }, 7, .none, .none }, + .{ .cmp, .mi, &.{ .rm64, .imm32s }, &.{ 0x81 }, 7, .long, .none }, + .{ .cmp, .mi, &.{ .rm16, .imm8s }, &.{ 0x83 }, 7, .none, .none }, + .{ .cmp, .mi, &.{ .rm32, .imm8s }, &.{ 0x83 }, 7, .none, .none }, + .{ .cmp, .mi, &.{ .rm64, .imm8s }, &.{ 0x83 }, 7, .long, .none }, + .{ .cmp, .mr, &.{ .rm8, .r8 }, &.{ 0x38 }, 0, .none, .none }, + .{ .cmp, .mr, &.{ .rm8, .r8 }, &.{ 0x38 }, 0, .rex, .none }, + .{ .cmp, .mr, &.{ .rm16, .r16 }, &.{ 0x39 }, 0, .none, .none }, + .{ .cmp, .mr, &.{ .rm32, .r32 }, &.{ 0x39 }, 0, .none, .none }, + 
.{ .cmp, .mr, &.{ .rm64, .r64 }, &.{ 0x39 }, 0, .long, .none }, + .{ .cmp, .rm, &.{ .r8, .rm8 }, &.{ 0x3a }, 0, .none, .none }, + .{ .cmp, .rm, &.{ .r8, .rm8 }, &.{ 0x3a }, 0, .rex, .none }, + .{ .cmp, .rm, &.{ .r16, .rm16 }, &.{ 0x3b }, 0, .none, .none }, + .{ .cmp, .rm, &.{ .r32, .rm32 }, &.{ 0x3b }, 0, .none, .none }, + .{ .cmp, .rm, &.{ .r64, .rm64 }, &.{ 0x3b }, 0, .long, .none }, + + .{ .cmps, .np, &.{ .m8, .m8 }, &.{ 0xa6 }, 0, .none, .none }, + .{ .cmps, .np, &.{ .m16, .m16 }, &.{ 0xa7 }, 0, .none, .none }, + .{ .cmps, .np, &.{ .m32, .m32 }, &.{ 0xa7 }, 0, .none, .none }, + .{ .cmps, .np, &.{ .m64, .m64 }, &.{ 0xa7 }, 0, .long, .none }, + + .{ .cmpsb, .np, &.{}, &.{ 0xa6 }, 0, .none, .none }, + .{ .cmpsw, .np, &.{}, &.{ 0xa7 }, 0, .short, .none }, + .{ .cmpsd, .np, &.{}, &.{ 0xa7 }, 0, .none, .none }, + .{ .cmpsq, .np, &.{}, &.{ 0xa7 }, 0, .long, .none }, + + .{ .cmpxchg, .mr, &.{ .rm8, .r8 }, &.{ 0x0f, 0xb0 }, 0, .none, .none }, + .{ .cmpxchg, .mr, &.{ .rm8, .r8 }, &.{ 0x0f, 0xb0 }, 0, .rex, .none }, + .{ .cmpxchg, .mr, &.{ .rm16, .r16 }, &.{ 0x0f, 0xb1 }, 0, .none, .none }, + .{ .cmpxchg, .mr, &.{ .rm32, .r32 }, &.{ 0x0f, 0xb1 }, 0, .none, .none }, + .{ .cmpxchg, .mr, &.{ .rm64, .r64 }, &.{ 0x0f, 0xb1 }, 0, .long, .none }, + + .{ .cmpxchg8b, .m, &.{ .m64 }, &.{ 0x0f, 0xc7 }, 1, .none, .none }, + .{ .cmpxchg16b, .m, &.{ .m128 }, &.{ 0x0f, 0xc7 }, 1, .long, .none }, + + .{ .div, .m, &.{ .rm8 }, &.{ 0xf6 }, 6, .none, .none }, + .{ .div, .m, &.{ .rm8 }, &.{ 0xf6 }, 6, .rex, .none }, + .{ .div, .m, &.{ .rm16 }, &.{ 0xf7 }, 6, .none, .none }, + .{ .div, .m, &.{ .rm32 }, &.{ 0xf7 }, 6, .none, .none }, + .{ .div, .m, &.{ .rm64 }, &.{ 0xf7 }, 6, .long, .none }, + + .{ .fisttp, .m, &.{ .m16 }, &.{ 0xdf }, 1, .none, .x87 }, + .{ .fisttp, .m, &.{ .m32 }, &.{ 0xdb }, 1, .none, .x87 }, + .{ .fisttp, .m, &.{ .m64 }, &.{ 0xdd }, 1, .none, .x87 }, + + .{ .fld, .m, &.{ .m32 }, &.{ 0xd9 }, 0, .none, .x87 }, + .{ .fld, .m, &.{ .m64 }, &.{ 0xdd }, 0, .none, .x87 }, + .{ .fld, .m, &.{ .m80 }, &.{ 0xdb }, 5, .none, .x87 }, + + .{ .idiv, .m, &.{ .rm8 }, &.{ 0xf6 }, 7, .none, .none }, + .{ .idiv, .m, &.{ .rm8 }, &.{ 0xf6 }, 7, .rex, .none }, + .{ .idiv, .m, &.{ .rm16 }, &.{ 0xf7 }, 7, .none, .none }, + .{ .idiv, .m, &.{ .rm32 }, &.{ 0xf7 }, 7, .none, .none }, + .{ .idiv, .m, &.{ .rm64 }, &.{ 0xf7 }, 7, .long, .none }, + + .{ .imul, .m, &.{ .rm8 }, &.{ 0xf6 }, 5, .none, .none }, + .{ .imul, .m, &.{ .rm8 }, &.{ 0xf6 }, 5, .rex, .none }, + .{ .imul, .m, &.{ .rm16, }, &.{ 0xf7 }, 5, .none, .none }, + .{ .imul, .m, &.{ .rm32, }, &.{ 0xf7 }, 5, .none, .none }, + .{ .imul, .m, &.{ .rm64, }, &.{ 0xf7 }, 5, .long, .none }, + .{ .imul, .rm, &.{ .r16, .rm16, }, &.{ 0x0f, 0xaf }, 0, .none, .none }, + .{ .imul, .rm, &.{ .r32, .rm32, }, &.{ 0x0f, 0xaf }, 0, .none, .none }, + .{ .imul, .rm, &.{ .r64, .rm64, }, &.{ 0x0f, 0xaf }, 0, .long, .none }, + .{ .imul, .rmi, &.{ .r16, .rm16, .imm8s }, &.{ 0x6b }, 0, .none, .none }, + .{ .imul, .rmi, &.{ .r32, .rm32, .imm8s }, &.{ 0x6b }, 0, .none, .none }, + .{ .imul, .rmi, &.{ .r64, .rm64, .imm8s }, &.{ 0x6b }, 0, .long, .none }, + .{ .imul, .rmi, &.{ .r16, .rm16, .imm16 }, &.{ 0x69 }, 0, .none, .none }, + .{ .imul, .rmi, &.{ .r32, .rm32, .imm32 }, &.{ 0x69 }, 0, .none, .none }, + .{ .imul, .rmi, &.{ .r64, .rm64, .imm32 }, &.{ 0x69 }, 0, .long, .none }, + + .{ .int3, .np, &.{}, &.{ 0xcc }, 0, .none, .none }, + + .{ .ja, .d, &.{ .rel32 }, &.{ 0x0f, 0x87 }, 0, .none, .none }, + .{ .jae, .d, &.{ .rel32 }, &.{ 0x0f, 0x83 }, 0, .none, .none }, + .{ .jb, .d, &.{ .rel32 }, &.{ 0x0f, 
0x82 }, 0, .none, .none }, + .{ .jbe, .d, &.{ .rel32 }, &.{ 0x0f, 0x86 }, 0, .none, .none }, + .{ .jc, .d, &.{ .rel32 }, &.{ 0x0f, 0x82 }, 0, .none, .none }, + .{ .jrcxz, .d, &.{ .rel32 }, &.{ 0xe3 }, 0, .none, .none }, + .{ .je, .d, &.{ .rel32 }, &.{ 0x0f, 0x84 }, 0, .none, .none }, + .{ .jg, .d, &.{ .rel32 }, &.{ 0x0f, 0x8f }, 0, .none, .none }, + .{ .jge, .d, &.{ .rel32 }, &.{ 0x0f, 0x8d }, 0, .none, .none }, + .{ .jl, .d, &.{ .rel32 }, &.{ 0x0f, 0x8c }, 0, .none, .none }, + .{ .jle, .d, &.{ .rel32 }, &.{ 0x0f, 0x8e }, 0, .none, .none }, + .{ .jna, .d, &.{ .rel32 }, &.{ 0x0f, 0x86 }, 0, .none, .none }, + .{ .jnae, .d, &.{ .rel32 }, &.{ 0x0f, 0x82 }, 0, .none, .none }, + .{ .jnb, .d, &.{ .rel32 }, &.{ 0x0f, 0x83 }, 0, .none, .none }, + .{ .jnbe, .d, &.{ .rel32 }, &.{ 0x0f, 0x87 }, 0, .none, .none }, + .{ .jnc, .d, &.{ .rel32 }, &.{ 0x0f, 0x83 }, 0, .none, .none }, + .{ .jne, .d, &.{ .rel32 }, &.{ 0x0f, 0x85 }, 0, .none, .none }, + .{ .jng, .d, &.{ .rel32 }, &.{ 0x0f, 0x8e }, 0, .none, .none }, + .{ .jnge, .d, &.{ .rel32 }, &.{ 0x0f, 0x8c }, 0, .none, .none }, + .{ .jnl, .d, &.{ .rel32 }, &.{ 0x0f, 0x8d }, 0, .none, .none }, + .{ .jnle, .d, &.{ .rel32 }, &.{ 0x0f, 0x8f }, 0, .none, .none }, + .{ .jno, .d, &.{ .rel32 }, &.{ 0x0f, 0x81 }, 0, .none, .none }, + .{ .jnp, .d, &.{ .rel32 }, &.{ 0x0f, 0x8b }, 0, .none, .none }, + .{ .jns, .d, &.{ .rel32 }, &.{ 0x0f, 0x89 }, 0, .none, .none }, + .{ .jnz, .d, &.{ .rel32 }, &.{ 0x0f, 0x85 }, 0, .none, .none }, + .{ .jo, .d, &.{ .rel32 }, &.{ 0x0f, 0x80 }, 0, .none, .none }, + .{ .jp, .d, &.{ .rel32 }, &.{ 0x0f, 0x8a }, 0, .none, .none }, + .{ .jpe, .d, &.{ .rel32 }, &.{ 0x0f, 0x8a }, 0, .none, .none }, + .{ .jpo, .d, &.{ .rel32 }, &.{ 0x0f, 0x8b }, 0, .none, .none }, + .{ .js, .d, &.{ .rel32 }, &.{ 0x0f, 0x88 }, 0, .none, .none }, + .{ .jz, .d, &.{ .rel32 }, &.{ 0x0f, 0x84 }, 0, .none, .none }, + + .{ .jmp, .d, &.{ .rel32 }, &.{ 0xe9 }, 0, .none, .none }, + .{ .jmp, .m, &.{ .rm64 }, &.{ 0xff }, 4, .none, .none }, + + .{ .lea, .rm, &.{ .r16, .m }, &.{ 0x8d }, 0, .none, .none }, + .{ .lea, .rm, &.{ .r32, .m }, &.{ 0x8d }, 0, .none, .none }, + .{ .lea, .rm, &.{ .r64, .m }, &.{ 0x8d }, 0, .long, .none }, + + .{ .lfence, .np, &.{}, &.{ 0x0f, 0xae, 0xe8 }, 0, .none, .none }, + + .{ .lods, .np, &.{ .m8 }, &.{ 0xac }, 0, .none, .none }, + .{ .lods, .np, &.{ .m16 }, &.{ 0xad }, 0, .none, .none }, + .{ .lods, .np, &.{ .m32 }, &.{ 0xad }, 0, .none, .none }, + .{ .lods, .np, &.{ .m64 }, &.{ 0xad }, 0, .long, .none }, + + .{ .lodsb, .np, &.{}, &.{ 0xac }, 0, .none, .none }, + .{ .lodsw, .np, &.{}, &.{ 0xad }, 0, .short, .none }, + .{ .lodsd, .np, &.{}, &.{ 0xad }, 0, .none, .none }, + .{ .lodsq, .np, &.{}, &.{ 0xad }, 0, .long, .none }, + + .{ .lzcnt, .rm, &.{ .r16, .rm16 }, &.{ 0xf3, 0x0f, 0xbd }, 0, .none, .none }, + .{ .lzcnt, .rm, &.{ .r32, .rm32 }, &.{ 0xf3, 0x0f, 0xbd }, 0, .none, .none }, + .{ .lzcnt, .rm, &.{ .r64, .rm64 }, &.{ 0xf3, 0x0f, 0xbd }, 0, .long, .none }, + + .{ .mfence, .np, &.{}, &.{ 0x0f, 0xae, 0xf0 }, 0, .none, .none }, + + .{ .mov, .mr, &.{ .rm8, .r8 }, &.{ 0x88 }, 0, .none, .none }, + .{ .mov, .mr, &.{ .rm8, .r8 }, &.{ 0x88 }, 0, .rex, .none }, + .{ .mov, .mr, &.{ .rm16, .r16 }, &.{ 0x89 }, 0, .none, .none }, + .{ .mov, .mr, &.{ .rm32, .r32 }, &.{ 0x89 }, 0, .none, .none }, + .{ .mov, .mr, &.{ .rm64, .r64 }, &.{ 0x89 }, 0, .long, .none }, + .{ .mov, .rm, &.{ .r8, .rm8 }, &.{ 0x8a }, 0, .none, .none }, + .{ .mov, .rm, &.{ .r8, .rm8 }, &.{ 0x8a }, 0, .rex, .none }, + .{ .mov, .rm, &.{ .r16, .rm16 }, &.{ 0x8b }, 0, .none, .none }, + .{ 
.mov, .rm, &.{ .r32, .rm32 }, &.{ 0x8b }, 0, .none, .none }, + .{ .mov, .rm, &.{ .r64, .rm64 }, &.{ 0x8b }, 0, .long, .none }, + .{ .mov, .mr, &.{ .rm16, .sreg }, &.{ 0x8c }, 0, .none, .none }, + .{ .mov, .mr, &.{ .rm64, .sreg }, &.{ 0x8c }, 0, .long, .none }, + .{ .mov, .rm, &.{ .sreg, .rm16 }, &.{ 0x8e }, 0, .none, .none }, + .{ .mov, .rm, &.{ .sreg, .rm64 }, &.{ 0x8e }, 0, .long, .none }, + .{ .mov, .fd, &.{ .al, .moffs }, &.{ 0xa0 }, 0, .none, .none }, + .{ .mov, .fd, &.{ .ax, .moffs }, &.{ 0xa1 }, 0, .none, .none }, + .{ .mov, .fd, &.{ .eax, .moffs }, &.{ 0xa1 }, 0, .none, .none }, + .{ .mov, .fd, &.{ .rax, .moffs }, &.{ 0xa1 }, 0, .long, .none }, + .{ .mov, .td, &.{ .moffs, .al }, &.{ 0xa2 }, 0, .none, .none }, + .{ .mov, .td, &.{ .moffs, .ax }, &.{ 0xa3 }, 0, .none, .none }, + .{ .mov, .td, &.{ .moffs, .eax }, &.{ 0xa3 }, 0, .none, .none }, + .{ .mov, .td, &.{ .moffs, .rax }, &.{ 0xa3 }, 0, .long, .none }, + .{ .mov, .oi, &.{ .r8, .imm8 }, &.{ 0xb0 }, 0, .none, .none }, + .{ .mov, .oi, &.{ .r8, .imm8 }, &.{ 0xb0 }, 0, .rex, .none }, + .{ .mov, .oi, &.{ .r16, .imm16 }, &.{ 0xb8 }, 0, .none, .none }, + .{ .mov, .oi, &.{ .r32, .imm32 }, &.{ 0xb8 }, 0, .none, .none }, + .{ .mov, .oi, &.{ .r64, .imm64 }, &.{ 0xb8 }, 0, .long, .none }, + .{ .mov, .mi, &.{ .rm8, .imm8 }, &.{ 0xc6 }, 0, .none, .none }, + .{ .mov, .mi, &.{ .rm8, .imm8 }, &.{ 0xc6 }, 0, .rex, .none }, + .{ .mov, .mi, &.{ .rm16, .imm16 }, &.{ 0xc7 }, 0, .none, .none }, + .{ .mov, .mi, &.{ .rm32, .imm32 }, &.{ 0xc7 }, 0, .none, .none }, + .{ .mov, .mi, &.{ .rm64, .imm32s }, &.{ 0xc7 }, 0, .long, .none }, + + .{ .movbe, .rm, &.{ .r16, .m16 }, &.{ 0x0f, 0x38, 0xf0 }, 0, .none, .none }, + .{ .movbe, .rm, &.{ .r32, .m32 }, &.{ 0x0f, 0x38, 0xf0 }, 0, .none, .none }, + .{ .movbe, .rm, &.{ .r64, .m64 }, &.{ 0x0f, 0x38, 0xf0 }, 0, .long, .none }, + .{ .movbe, .mr, &.{ .m16, .r16 }, &.{ 0x0f, 0x38, 0xf1 }, 0, .none, .none }, + .{ .movbe, .mr, &.{ .m32, .r32 }, &.{ 0x0f, 0x38, 0xf1 }, 0, .none, .none }, + .{ .movbe, .mr, &.{ .m64, .r64 }, &.{ 0x0f, 0x38, 0xf1 }, 0, .long, .none }, + + .{ .movs, .np, &.{ .m8, .m8 }, &.{ 0xa4 }, 0, .none, .none }, + .{ .movs, .np, &.{ .m16, .m16 }, &.{ 0xa5 }, 0, .none, .none }, + .{ .movs, .np, &.{ .m32, .m32 }, &.{ 0xa5 }, 0, .none, .none }, + .{ .movs, .np, &.{ .m64, .m64 }, &.{ 0xa5 }, 0, .long, .none }, + + .{ .movsb, .np, &.{}, &.{ 0xa4 }, 0, .none, .none }, + .{ .movsw, .np, &.{}, &.{ 0xa5 }, 0, .short, .none }, + .{ .movsd, .np, &.{}, &.{ 0xa5 }, 0, .none, .none }, + .{ .movsq, .np, &.{}, &.{ 0xa5 }, 0, .long, .none }, + + .{ .movsx, .rm, &.{ .r16, .rm8 }, &.{ 0x0f, 0xbe }, 0, .none, .none }, + .{ .movsx, .rm, &.{ .r16, .rm8 }, &.{ 0x0f, 0xbe }, 0, .rex, .none }, + .{ .movsx, .rm, &.{ .r32, .rm8 }, &.{ 0x0f, 0xbe }, 0, .none, .none }, + .{ .movsx, .rm, &.{ .r32, .rm8 }, &.{ 0x0f, 0xbe }, 0, .rex, .none }, + .{ .movsx, .rm, &.{ .r64, .rm8 }, &.{ 0x0f, 0xbe }, 0, .long, .none }, + .{ .movsx, .rm, &.{ .r32, .rm16 }, &.{ 0x0f, 0xbf }, 0, .none, .none }, + .{ .movsx, .rm, &.{ .r64, .rm16 }, &.{ 0x0f, 0xbf }, 0, .long, .none }, // This instruction is discouraged. 
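    // (Editor's note, not part of the patch: MOVSXD with a 32-bit
    // destination performs no sign extension and behaves like a plain
    // MOV r32, r/m32, which is why the Intel SDM discourages it and
    // recommends MOV instead; only the REX.W form, MOVSXD r64, r/m32,
    // performs a useful sign extension.)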
- .{ .movsxd, .rm, &.{ .r32, .rm32 }, &.{ 0x63 }, 0, .none }, - .{ .movsxd, .rm, &.{ .r64, .rm32 }, &.{ 0x63 }, 0, .long }, - - .{ .movzx, .rm, &.{ .r16, .rm8 }, &.{ 0x0f, 0xb6 }, 0, .none }, - .{ .movzx, .rm, &.{ .r32, .rm8 }, &.{ 0x0f, 0xb6 }, 0, .none }, - .{ .movzx, .rm, &.{ .r64, .rm8 }, &.{ 0x0f, 0xb6 }, 0, .long }, - .{ .movzx, .rm, &.{ .r32, .rm16 }, &.{ 0x0f, 0xb7 }, 0, .none }, - .{ .movzx, .rm, &.{ .r64, .rm16 }, &.{ 0x0f, 0xb7 }, 0, .long }, - - .{ .mul, .m, &.{ .rm8 }, &.{ 0xf6 }, 4, .none }, - .{ .mul, .m, &.{ .rm8 }, &.{ 0xf6 }, 4, .rex }, - .{ .mul, .m, &.{ .rm16 }, &.{ 0xf7 }, 4, .none }, - .{ .mul, .m, &.{ .rm32 }, &.{ 0xf7 }, 4, .none }, - .{ .mul, .m, &.{ .rm64 }, &.{ 0xf7 }, 4, .long }, - - .{ .neg, .m, &.{ .rm8 }, &.{ 0xf6 }, 3, .none }, - .{ .neg, .m, &.{ .rm8 }, &.{ 0xf6 }, 3, .rex }, - .{ .neg, .m, &.{ .rm16 }, &.{ 0xf7 }, 3, .none }, - .{ .neg, .m, &.{ .rm32 }, &.{ 0xf7 }, 3, .none }, - .{ .neg, .m, &.{ .rm64 }, &.{ 0xf7 }, 3, .long }, - - .{ .nop, .np, &.{}, &.{ 0x90 }, 0, .none }, - - .{ .not, .m, &.{ .rm8 }, &.{ 0xf6 }, 2, .none }, - .{ .not, .m, &.{ .rm8 }, &.{ 0xf6 }, 2, .rex }, - .{ .not, .m, &.{ .rm16 }, &.{ 0xf7 }, 2, .none }, - .{ .not, .m, &.{ .rm32 }, &.{ 0xf7 }, 2, .none }, - .{ .not, .m, &.{ .rm64 }, &.{ 0xf7 }, 2, .long }, - - .{ .@"or", .zi, &.{ .al, .imm8 }, &.{ 0x0c }, 0, .none }, - .{ .@"or", .zi, &.{ .ax, .imm16 }, &.{ 0x0d }, 0, .none }, - .{ .@"or", .zi, &.{ .eax, .imm32 }, &.{ 0x0d }, 0, .none }, - .{ .@"or", .zi, &.{ .rax, .imm32s }, &.{ 0x0d }, 0, .long }, - .{ .@"or", .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 1, .none }, - .{ .@"or", .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 1, .rex }, - .{ .@"or", .mi, &.{ .rm16, .imm16 }, &.{ 0x81 }, 1, .none }, - .{ .@"or", .mi, &.{ .rm32, .imm32 }, &.{ 0x81 }, 1, .none }, - .{ .@"or", .mi, &.{ .rm64, .imm32s }, &.{ 0x81 }, 1, .long }, - .{ .@"or", .mi, &.{ .rm16, .imm8s }, &.{ 0x83 }, 1, .none }, - .{ .@"or", .mi, &.{ .rm32, .imm8s }, &.{ 0x83 }, 1, .none }, - .{ .@"or", .mi, &.{ .rm64, .imm8s }, &.{ 0x83 }, 1, .long }, - .{ .@"or", .mr, &.{ .rm8, .r8 }, &.{ 0x08 }, 0, .none }, - .{ .@"or", .mr, &.{ .rm8, .r8 }, &.{ 0x08 }, 0, .rex }, - .{ .@"or", .mr, &.{ .rm16, .r16 }, &.{ 0x09 }, 0, .none }, - .{ .@"or", .mr, &.{ .rm32, .r32 }, &.{ 0x09 }, 0, .none }, - .{ .@"or", .mr, &.{ .rm64, .r64 }, &.{ 0x09 }, 0, .long }, - .{ .@"or", .rm, &.{ .r8, .rm8 }, &.{ 0x0a }, 0, .none }, - .{ .@"or", .rm, &.{ .r8, .rm8 }, &.{ 0x0a }, 0, .rex }, - .{ .@"or", .rm, &.{ .r16, .rm16 }, &.{ 0x0b }, 0, .none }, - .{ .@"or", .rm, &.{ .r32, .rm32 }, &.{ 0x0b }, 0, .none }, - .{ .@"or", .rm, &.{ .r64, .rm64 }, &.{ 0x0b }, 0, .long }, - - .{ .pop, .o, &.{ .r16 }, &.{ 0x58 }, 0, .none }, - .{ .pop, .o, &.{ .r64 }, &.{ 0x58 }, 0, .none }, - .{ .pop, .m, &.{ .rm16 }, &.{ 0x8f }, 0, .none }, - .{ .pop, .m, &.{ .rm64 }, &.{ 0x8f }, 0, .none }, - - .{ .popcnt, .rm, &.{ .r16, .rm16 }, &.{ 0xf3, 0x0f, 0xb8 }, 0, .none }, - .{ .popcnt, .rm, &.{ .r32, .rm32 }, &.{ 0xf3, 0x0f, 0xb8 }, 0, .none }, - .{ .popcnt, .rm, &.{ .r64, .rm64 }, &.{ 0xf3, 0x0f, 0xb8 }, 0, .long }, - - .{ .push, .o, &.{ .r16 }, &.{ 0x50 }, 0, .none }, - .{ .push, .o, &.{ .r64 }, &.{ 0x50 }, 0, .none }, - .{ .push, .m, &.{ .rm16 }, &.{ 0xff }, 6, .none }, - .{ .push, .m, &.{ .rm64 }, &.{ 0xff }, 6, .none }, - .{ .push, .i, &.{ .imm8 }, &.{ 0x6a }, 0, .none }, - .{ .push, .i, &.{ .imm16 }, &.{ 0x68 }, 0, .none }, - .{ .push, .i, &.{ .imm32 }, &.{ 0x68 }, 0, .none }, - - .{ .ret, .np, &.{}, &.{ 0xc3 }, 0, .none }, - - .{ .rcl, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 2, .none }, - .{ 
.rcl, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 2, .rex }, - .{ .rcl, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 2, .none }, - .{ .rcl, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 2, .rex }, - .{ .rcl, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 2, .none }, - .{ .rcl, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 2, .rex }, - .{ .rcl, .m1, &.{ .rm16, .unity }, &.{ 0xd1 }, 2, .none }, - .{ .rcl, .mc, &.{ .rm16, .cl }, &.{ 0xd3 }, 2, .none }, - .{ .rcl, .mi, &.{ .rm16, .imm8 }, &.{ 0xc1 }, 2, .none }, - .{ .rcl, .m1, &.{ .rm32, .unity }, &.{ 0xd1 }, 2, .none }, - .{ .rcl, .m1, &.{ .rm64, .unity }, &.{ 0xd1 }, 2, .long }, - .{ .rcl, .mc, &.{ .rm32, .cl }, &.{ 0xd3 }, 2, .none }, - .{ .rcl, .mc, &.{ .rm64, .cl }, &.{ 0xd3 }, 2, .long }, - .{ .rcl, .mi, &.{ .rm32, .imm8 }, &.{ 0xc1 }, 2, .none }, - .{ .rcl, .mi, &.{ .rm64, .imm8 }, &.{ 0xc1 }, 2, .long }, - - .{ .rcr, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 3, .none }, - .{ .rcr, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 3, .rex }, - .{ .rcr, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 3, .none }, - .{ .rcr, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 3, .rex }, - .{ .rcr, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 3, .none }, - .{ .rcr, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 3, .rex }, - .{ .rcr, .m1, &.{ .rm16, .unity }, &.{ 0xd1 }, 3, .none }, - .{ .rcr, .mc, &.{ .rm16, .cl }, &.{ 0xd3 }, 3, .none }, - .{ .rcr, .mi, &.{ .rm16, .imm8 }, &.{ 0xc1 }, 3, .none }, - .{ .rcr, .m1, &.{ .rm32, .unity }, &.{ 0xd1 }, 3, .none }, - .{ .rcr, .m1, &.{ .rm64, .unity }, &.{ 0xd1 }, 3, .long }, - .{ .rcr, .mc, &.{ .rm32, .cl }, &.{ 0xd3 }, 3, .none }, - .{ .rcr, .mc, &.{ .rm64, .cl }, &.{ 0xd3 }, 3, .long }, - .{ .rcr, .mi, &.{ .rm32, .imm8 }, &.{ 0xc1 }, 3, .none }, - .{ .rcr, .mi, &.{ .rm64, .imm8 }, &.{ 0xc1 }, 3, .long }, - - .{ .rol, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 0, .none }, - .{ .rol, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 0, .rex }, - .{ .rol, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 0, .none }, - .{ .rol, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 0, .rex }, - .{ .rol, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 0, .none }, - .{ .rol, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 0, .rex }, - .{ .rol, .m1, &.{ .rm16, .unity }, &.{ 0xd1 }, 0, .none }, - .{ .rol, .mc, &.{ .rm16, .cl }, &.{ 0xd3 }, 0, .none }, - .{ .rol, .mi, &.{ .rm16, .imm8 }, &.{ 0xc1 }, 0, .none }, - .{ .rol, .m1, &.{ .rm32, .unity }, &.{ 0xd1 }, 0, .none }, - .{ .rol, .m1, &.{ .rm64, .unity }, &.{ 0xd1 }, 0, .long }, - .{ .rol, .mc, &.{ .rm32, .cl }, &.{ 0xd3 }, 0, .none }, - .{ .rol, .mc, &.{ .rm64, .cl }, &.{ 0xd3 }, 0, .long }, - .{ .rol, .mi, &.{ .rm32, .imm8 }, &.{ 0xc1 }, 0, .none }, - .{ .rol, .mi, &.{ .rm64, .imm8 }, &.{ 0xc1 }, 0, .long }, - - .{ .ror, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 1, .none }, - .{ .ror, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 1, .rex }, - .{ .ror, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 1, .none }, - .{ .ror, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 1, .rex }, - .{ .ror, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 1, .none }, - .{ .ror, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 1, .rex }, - .{ .ror, .m1, &.{ .rm16, .unity }, &.{ 0xd1 }, 1, .none }, - .{ .ror, .mc, &.{ .rm16, .cl }, &.{ 0xd3 }, 1, .none }, - .{ .ror, .mi, &.{ .rm16, .imm8 }, &.{ 0xc1 }, 1, .none }, - .{ .ror, .m1, &.{ .rm32, .unity }, &.{ 0xd1 }, 1, .none }, - .{ .ror, .m1, &.{ .rm64, .unity }, &.{ 0xd1 }, 1, .long }, - .{ .ror, .mc, &.{ .rm32, .cl }, &.{ 0xd3 }, 1, .none }, - .{ .ror, .mc, &.{ .rm64, .cl }, &.{ 0xd3 }, 1, .long }, - .{ .ror, .mi, &.{ .rm32, .imm8 }, &.{ 0xc1 }, 1, .none }, - .{ .ror, .mi, &.{ .rm64, .imm8 }, &.{ 0xc1 }, 1, .long }, - - .{ .sal, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 4, .none }, - .{ 
.sal, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 4, .rex }, - .{ .sal, .m1, &.{ .rm16, .unity }, &.{ 0xd1 }, 4, .none }, - .{ .sal, .m1, &.{ .rm32, .unity }, &.{ 0xd1 }, 4, .none }, - .{ .sal, .m1, &.{ .rm64, .unity }, &.{ 0xd1 }, 4, .long }, - .{ .sal, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 4, .none }, - .{ .sal, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 4, .rex }, - .{ .sal, .mc, &.{ .rm16, .cl }, &.{ 0xd3 }, 4, .none }, - .{ .sal, .mc, &.{ .rm32, .cl }, &.{ 0xd3 }, 4, .none }, - .{ .sal, .mc, &.{ .rm64, .cl }, &.{ 0xd3 }, 4, .long }, - .{ .sal, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 4, .none }, - .{ .sal, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 4, .rex }, - .{ .sal, .mi, &.{ .rm16, .imm8 }, &.{ 0xc1 }, 4, .none }, - .{ .sal, .mi, &.{ .rm32, .imm8 }, &.{ 0xc1 }, 4, .none }, - .{ .sal, .mi, &.{ .rm64, .imm8 }, &.{ 0xc1 }, 4, .long }, - - .{ .sar, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 7, .none }, - .{ .sar, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 7, .rex }, - .{ .sar, .m1, &.{ .rm16, .unity }, &.{ 0xd1 }, 7, .none }, - .{ .sar, .m1, &.{ .rm32, .unity }, &.{ 0xd1 }, 7, .none }, - .{ .sar, .m1, &.{ .rm64, .unity }, &.{ 0xd1 }, 7, .long }, - .{ .sar, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 7, .none }, - .{ .sar, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 7, .rex }, - .{ .sar, .mc, &.{ .rm16, .cl }, &.{ 0xd3 }, 7, .none }, - .{ .sar, .mc, &.{ .rm32, .cl }, &.{ 0xd3 }, 7, .none }, - .{ .sar, .mc, &.{ .rm64, .cl }, &.{ 0xd3 }, 7, .long }, - .{ .sar, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 7, .none }, - .{ .sar, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 7, .rex }, - .{ .sar, .mi, &.{ .rm16, .imm8 }, &.{ 0xc1 }, 7, .none }, - .{ .sar, .mi, &.{ .rm32, .imm8 }, &.{ 0xc1 }, 7, .none }, - .{ .sar, .mi, &.{ .rm64, .imm8 }, &.{ 0xc1 }, 7, .long }, - - .{ .sbb, .zi, &.{ .al, .imm8 }, &.{ 0x1c }, 0, .none }, - .{ .sbb, .zi, &.{ .ax, .imm16 }, &.{ 0x1d }, 0, .none }, - .{ .sbb, .zi, &.{ .eax, .imm32 }, &.{ 0x1d }, 0, .none }, - .{ .sbb, .zi, &.{ .rax, .imm32s }, &.{ 0x1d }, 0, .long }, - .{ .sbb, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 3, .none }, - .{ .sbb, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 3, .rex }, - .{ .sbb, .mi, &.{ .rm16, .imm16 }, &.{ 0x81 }, 3, .none }, - .{ .sbb, .mi, &.{ .rm32, .imm32 }, &.{ 0x81 }, 3, .none }, - .{ .sbb, .mi, &.{ .rm64, .imm32s }, &.{ 0x81 }, 3, .long }, - .{ .sbb, .mi, &.{ .rm16, .imm8s }, &.{ 0x83 }, 3, .none }, - .{ .sbb, .mi, &.{ .rm32, .imm8s }, &.{ 0x83 }, 3, .none }, - .{ .sbb, .mi, &.{ .rm64, .imm8s }, &.{ 0x83 }, 3, .long }, - .{ .sbb, .mr, &.{ .rm8, .r8 }, &.{ 0x18 }, 0, .none }, - .{ .sbb, .mr, &.{ .rm8, .r8 }, &.{ 0x18 }, 0, .rex }, - .{ .sbb, .mr, &.{ .rm16, .r16 }, &.{ 0x19 }, 0, .none }, - .{ .sbb, .mr, &.{ .rm32, .r32 }, &.{ 0x19 }, 0, .none }, - .{ .sbb, .mr, &.{ .rm64, .r64 }, &.{ 0x19 }, 0, .long }, - .{ .sbb, .rm, &.{ .r8, .rm8 }, &.{ 0x1a }, 0, .none }, - .{ .sbb, .rm, &.{ .r8, .rm8 }, &.{ 0x1a }, 0, .rex }, - .{ .sbb, .rm, &.{ .r16, .rm16 }, &.{ 0x1b }, 0, .none }, - .{ .sbb, .rm, &.{ .r32, .rm32 }, &.{ 0x1b }, 0, .none }, - .{ .sbb, .rm, &.{ .r64, .rm64 }, &.{ 0x1b }, 0, .long }, - - .{ .scas, .np, &.{ .m8 }, &.{ 0xae }, 0, .none }, - .{ .scas, .np, &.{ .m16 }, &.{ 0xaf }, 0, .none }, - .{ .scas, .np, &.{ .m32 }, &.{ 0xaf }, 0, .none }, - .{ .scas, .np, &.{ .m64 }, &.{ 0xaf }, 0, .long }, - - .{ .scasb, .np, &.{}, &.{ 0xae }, 0, .none }, - .{ .scasw, .np, &.{}, &.{ 0xaf }, 0, .short }, - .{ .scasd, .np, &.{}, &.{ 0xaf }, 0, .none }, - .{ .scasq, .np, &.{}, &.{ 0xaf }, 0, .long }, - - .{ .seta, .m, &.{ .rm8 }, &.{ 0x0f, 0x97 }, 0, .none }, - .{ .seta, .m, &.{ .rm8 }, &.{ 0x0f, 0x97 }, 0, .rex }, - .{ .setae, .m, 
&.{ .rm8 }, &.{ 0x0f, 0x93 }, 0, .none }, - .{ .setae, .m, &.{ .rm8 }, &.{ 0x0f, 0x93 }, 0, .rex }, - .{ .setb, .m, &.{ .rm8 }, &.{ 0x0f, 0x92 }, 0, .none }, - .{ .setb, .m, &.{ .rm8 }, &.{ 0x0f, 0x92 }, 0, .rex }, - .{ .setbe, .m, &.{ .rm8 }, &.{ 0x0f, 0x96 }, 0, .none }, - .{ .setbe, .m, &.{ .rm8 }, &.{ 0x0f, 0x96 }, 0, .rex }, - .{ .setc, .m, &.{ .rm8 }, &.{ 0x0f, 0x92 }, 0, .none }, - .{ .setc, .m, &.{ .rm8 }, &.{ 0x0f, 0x92 }, 0, .rex }, - .{ .sete, .m, &.{ .rm8 }, &.{ 0x0f, 0x94 }, 0, .none }, - .{ .sete, .m, &.{ .rm8 }, &.{ 0x0f, 0x94 }, 0, .rex }, - .{ .setg, .m, &.{ .rm8 }, &.{ 0x0f, 0x9f }, 0, .none }, - .{ .setg, .m, &.{ .rm8 }, &.{ 0x0f, 0x9f }, 0, .rex }, - .{ .setge, .m, &.{ .rm8 }, &.{ 0x0f, 0x9d }, 0, .none }, - .{ .setge, .m, &.{ .rm8 }, &.{ 0x0f, 0x9d }, 0, .rex }, - .{ .setl, .m, &.{ .rm8 }, &.{ 0x0f, 0x9c }, 0, .none }, - .{ .setl, .m, &.{ .rm8 }, &.{ 0x0f, 0x9c }, 0, .rex }, - .{ .setle, .m, &.{ .rm8 }, &.{ 0x0f, 0x9e }, 0, .none }, - .{ .setle, .m, &.{ .rm8 }, &.{ 0x0f, 0x9e }, 0, .rex }, - .{ .setna, .m, &.{ .rm8 }, &.{ 0x0f, 0x96 }, 0, .none }, - .{ .setna, .m, &.{ .rm8 }, &.{ 0x0f, 0x96 }, 0, .rex }, - .{ .setnae, .m, &.{ .rm8 }, &.{ 0x0f, 0x92 }, 0, .none }, - .{ .setnae, .m, &.{ .rm8 }, &.{ 0x0f, 0x92 }, 0, .rex }, - .{ .setnb, .m, &.{ .rm8 }, &.{ 0x0f, 0x93 }, 0, .none }, - .{ .setnb, .m, &.{ .rm8 }, &.{ 0x0f, 0x93 }, 0, .rex }, - .{ .setnbe, .m, &.{ .rm8 }, &.{ 0x0f, 0x97 }, 0, .none }, - .{ .setnbe, .m, &.{ .rm8 }, &.{ 0x0f, 0x97 }, 0, .rex }, - .{ .setnc, .m, &.{ .rm8 }, &.{ 0x0f, 0x93 }, 0, .none }, - .{ .setnc, .m, &.{ .rm8 }, &.{ 0x0f, 0x93 }, 0, .rex }, - .{ .setne, .m, &.{ .rm8 }, &.{ 0x0f, 0x95 }, 0, .none }, - .{ .setne, .m, &.{ .rm8 }, &.{ 0x0f, 0x95 }, 0, .rex }, - .{ .setng, .m, &.{ .rm8 }, &.{ 0x0f, 0x9e }, 0, .none }, - .{ .setng, .m, &.{ .rm8 }, &.{ 0x0f, 0x9e }, 0, .rex }, - .{ .setnge, .m, &.{ .rm8 }, &.{ 0x0f, 0x9c }, 0, .none }, - .{ .setnge, .m, &.{ .rm8 }, &.{ 0x0f, 0x9c }, 0, .rex }, - .{ .setnl, .m, &.{ .rm8 }, &.{ 0x0f, 0x9d }, 0, .none }, - .{ .setnl, .m, &.{ .rm8 }, &.{ 0x0f, 0x9d }, 0, .rex }, - .{ .setnle, .m, &.{ .rm8 }, &.{ 0x0f, 0x9f }, 0, .none }, - .{ .setnle, .m, &.{ .rm8 }, &.{ 0x0f, 0x9f }, 0, .rex }, - .{ .setno, .m, &.{ .rm8 }, &.{ 0x0f, 0x91 }, 0, .none }, - .{ .setno, .m, &.{ .rm8 }, &.{ 0x0f, 0x91 }, 0, .rex }, - .{ .setnp, .m, &.{ .rm8 }, &.{ 0x0f, 0x9b }, 0, .none }, - .{ .setnp, .m, &.{ .rm8 }, &.{ 0x0f, 0x9b }, 0, .rex }, - .{ .setns, .m, &.{ .rm8 }, &.{ 0x0f, 0x99 }, 0, .none }, - .{ .setns, .m, &.{ .rm8 }, &.{ 0x0f, 0x99 }, 0, .rex }, - .{ .setnz, .m, &.{ .rm8 }, &.{ 0x0f, 0x95 }, 0, .none }, - .{ .setnz, .m, &.{ .rm8 }, &.{ 0x0f, 0x95 }, 0, .rex }, - .{ .seto, .m, &.{ .rm8 }, &.{ 0x0f, 0x90 }, 0, .none }, - .{ .seto, .m, &.{ .rm8 }, &.{ 0x0f, 0x90 }, 0, .rex }, - .{ .setp, .m, &.{ .rm8 }, &.{ 0x0f, 0x9a }, 0, .none }, - .{ .setp, .m, &.{ .rm8 }, &.{ 0x0f, 0x9a }, 0, .rex }, - .{ .setpe, .m, &.{ .rm8 }, &.{ 0x0f, 0x9a }, 0, .none }, - .{ .setpe, .m, &.{ .rm8 }, &.{ 0x0f, 0x9a }, 0, .rex }, - .{ .setpo, .m, &.{ .rm8 }, &.{ 0x0f, 0x9b }, 0, .none }, - .{ .setpo, .m, &.{ .rm8 }, &.{ 0x0f, 0x9b }, 0, .rex }, - .{ .sets, .m, &.{ .rm8 }, &.{ 0x0f, 0x98 }, 0, .none }, - .{ .sets, .m, &.{ .rm8 }, &.{ 0x0f, 0x98 }, 0, .rex }, - .{ .setz, .m, &.{ .rm8 }, &.{ 0x0f, 0x94 }, 0, .none }, - .{ .setz, .m, &.{ .rm8 }, &.{ 0x0f, 0x94 }, 0, .rex }, - - .{ .sfence, .np, &.{}, &.{ 0x0f, 0xae, 0xf8 }, 0, .none }, - - .{ .shl, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 4, .none }, - .{ .shl, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 4, .rex }, - .{ 
.shl, .m1, &.{ .rm16, .unity }, &.{ 0xd1 }, 4, .none }, - .{ .shl, .m1, &.{ .rm32, .unity }, &.{ 0xd1 }, 4, .none }, - .{ .shl, .m1, &.{ .rm64, .unity }, &.{ 0xd1 }, 4, .long }, - .{ .shl, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 4, .none }, - .{ .shl, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 4, .rex }, - .{ .shl, .mc, &.{ .rm16, .cl }, &.{ 0xd3 }, 4, .none }, - .{ .shl, .mc, &.{ .rm32, .cl }, &.{ 0xd3 }, 4, .none }, - .{ .shl, .mc, &.{ .rm64, .cl }, &.{ 0xd3 }, 4, .long }, - .{ .shl, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 4, .none }, - .{ .shl, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 4, .rex }, - .{ .shl, .mi, &.{ .rm16, .imm8 }, &.{ 0xc1 }, 4, .none }, - .{ .shl, .mi, &.{ .rm32, .imm8 }, &.{ 0xc1 }, 4, .none }, - .{ .shl, .mi, &.{ .rm64, .imm8 }, &.{ 0xc1 }, 4, .long }, - - .{ .shld, .mri, &.{ .rm16, .r16, .imm8 }, &.{ 0x0f, 0xa4 }, 0, .none }, - .{ .shld, .mrc, &.{ .rm16, .r16, .cl }, &.{ 0x0f, 0xa5 }, 0, .none }, - .{ .shld, .mri, &.{ .rm32, .r32, .imm8 }, &.{ 0x0f, 0xa4 }, 0, .none }, - .{ .shld, .mri, &.{ .rm64, .r64, .imm8 }, &.{ 0x0f, 0xa4 }, 0, .long }, - .{ .shld, .mrc, &.{ .rm32, .r32, .cl }, &.{ 0x0f, 0xa5 }, 0, .none }, - .{ .shld, .mrc, &.{ .rm64, .r64, .cl }, &.{ 0x0f, 0xa5 }, 0, .long }, - - .{ .shr, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 5, .none }, - .{ .shr, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 5, .rex }, - .{ .shr, .m1, &.{ .rm16, .unity }, &.{ 0xd1 }, 5, .none }, - .{ .shr, .m1, &.{ .rm32, .unity }, &.{ 0xd1 }, 5, .none }, - .{ .shr, .m1, &.{ .rm64, .unity }, &.{ 0xd1 }, 5, .long }, - .{ .shr, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 5, .none }, - .{ .shr, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 5, .rex }, - .{ .shr, .mc, &.{ .rm16, .cl }, &.{ 0xd3 }, 5, .none }, - .{ .shr, .mc, &.{ .rm32, .cl }, &.{ 0xd3 }, 5, .none }, - .{ .shr, .mc, &.{ .rm64, .cl }, &.{ 0xd3 }, 5, .long }, - .{ .shr, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 5, .none }, - .{ .shr, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 5, .rex }, - .{ .shr, .mi, &.{ .rm16, .imm8 }, &.{ 0xc1 }, 5, .none }, - .{ .shr, .mi, &.{ .rm32, .imm8 }, &.{ 0xc1 }, 5, .none }, - .{ .shr, .mi, &.{ .rm64, .imm8 }, &.{ 0xc1 }, 5, .long }, - - .{ .shrd, .mri, &.{ .rm16, .r16, .imm8 }, &.{ 0x0f, 0xac }, 0, .none }, - .{ .shrd, .mrc, &.{ .rm16, .r16, .cl }, &.{ 0x0f, 0xad }, 0, .none }, - .{ .shrd, .mri, &.{ .rm32, .r32, .imm8 }, &.{ 0x0f, 0xac }, 0, .none }, - .{ .shrd, .mri, &.{ .rm64, .r64, .imm8 }, &.{ 0x0f, 0xac }, 0, .long }, - .{ .shrd, .mrc, &.{ .rm32, .r32, .cl }, &.{ 0x0f, 0xad }, 0, .none }, - .{ .shrd, .mrc, &.{ .rm64, .r64, .cl }, &.{ 0x0f, 0xad }, 0, .long }, - - .{ .stos, .np, &.{ .m8 }, &.{ 0xaa }, 0, .none }, - .{ .stos, .np, &.{ .m16 }, &.{ 0xab }, 0, .none }, - .{ .stos, .np, &.{ .m32 }, &.{ 0xab }, 0, .none }, - .{ .stos, .np, &.{ .m64 }, &.{ 0xab }, 0, .long }, - - .{ .stosb, .np, &.{}, &.{ 0xaa }, 0, .none }, - .{ .stosw, .np, &.{}, &.{ 0xab }, 0, .short }, - .{ .stosd, .np, &.{}, &.{ 0xab }, 0, .none }, - .{ .stosq, .np, &.{}, &.{ 0xab }, 0, .long }, - - .{ .sub, .zi, &.{ .al, .imm8 }, &.{ 0x2c }, 0, .none }, - .{ .sub, .zi, &.{ .ax, .imm16 }, &.{ 0x2d }, 0, .none }, - .{ .sub, .zi, &.{ .eax, .imm32 }, &.{ 0x2d }, 0, .none }, - .{ .sub, .zi, &.{ .rax, .imm32s }, &.{ 0x2d }, 0, .long }, - .{ .sub, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 5, .none }, - .{ .sub, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 5, .rex }, - .{ .sub, .mi, &.{ .rm16, .imm16 }, &.{ 0x81 }, 5, .none }, - .{ .sub, .mi, &.{ .rm32, .imm32 }, &.{ 0x81 }, 5, .none }, - .{ .sub, .mi, &.{ .rm64, .imm32s }, &.{ 0x81 }, 5, .long }, - .{ .sub, .mi, &.{ .rm16, .imm8s }, &.{ 0x83 }, 5, .none }, - .{ .sub, 
.mi, &.{ .rm32, .imm8s }, &.{ 0x83 }, 5, .none }, - .{ .sub, .mi, &.{ .rm64, .imm8s }, &.{ 0x83 }, 5, .long }, - .{ .sub, .mr, &.{ .rm8, .r8 }, &.{ 0x28 }, 0, .none }, - .{ .sub, .mr, &.{ .rm8, .r8 }, &.{ 0x28 }, 0, .rex }, - .{ .sub, .mr, &.{ .rm16, .r16 }, &.{ 0x29 }, 0, .none }, - .{ .sub, .mr, &.{ .rm32, .r32 }, &.{ 0x29 }, 0, .none }, - .{ .sub, .mr, &.{ .rm64, .r64 }, &.{ 0x29 }, 0, .long }, - .{ .sub, .rm, &.{ .r8, .rm8 }, &.{ 0x2a }, 0, .none }, - .{ .sub, .rm, &.{ .r8, .rm8 }, &.{ 0x2a }, 0, .rex }, - .{ .sub, .rm, &.{ .r16, .rm16 }, &.{ 0x2b }, 0, .none }, - .{ .sub, .rm, &.{ .r32, .rm32 }, &.{ 0x2b }, 0, .none }, - .{ .sub, .rm, &.{ .r64, .rm64 }, &.{ 0x2b }, 0, .long }, - - .{ .syscall, .np, &.{}, &.{ 0x0f, 0x05 }, 0, .none }, - - .{ .@"test", .zi, &.{ .al, .imm8 }, &.{ 0xa8 }, 0, .none }, - .{ .@"test", .zi, &.{ .ax, .imm16 }, &.{ 0xa9 }, 0, .none }, - .{ .@"test", .zi, &.{ .eax, .imm32 }, &.{ 0xa9 }, 0, .none }, - .{ .@"test", .zi, &.{ .rax, .imm32s }, &.{ 0xa9 }, 0, .long }, - .{ .@"test", .mi, &.{ .rm8, .imm8 }, &.{ 0xf6 }, 0, .none }, - .{ .@"test", .mi, &.{ .rm8, .imm8 }, &.{ 0xf6 }, 0, .rex }, - .{ .@"test", .mi, &.{ .rm16, .imm16 }, &.{ 0xf7 }, 0, .none }, - .{ .@"test", .mi, &.{ .rm32, .imm32 }, &.{ 0xf7 }, 0, .none }, - .{ .@"test", .mi, &.{ .rm64, .imm32s }, &.{ 0xf7 }, 0, .long }, - .{ .@"test", .mr, &.{ .rm8, .r8 }, &.{ 0x84 }, 0, .none }, - .{ .@"test", .mr, &.{ .rm8, .r8 }, &.{ 0x84 }, 0, .rex }, - .{ .@"test", .mr, &.{ .rm16, .r16 }, &.{ 0x85 }, 0, .none }, - .{ .@"test", .mr, &.{ .rm32, .r32 }, &.{ 0x85 }, 0, .none }, - .{ .@"test", .mr, &.{ .rm64, .r64 }, &.{ 0x85 }, 0, .long }, - - .{ .tzcnt, .rm, &.{ .r16, .rm16 }, &.{ 0xf3, 0x0f, 0xbc }, 0, .none }, - .{ .tzcnt, .rm, &.{ .r32, .rm32 }, &.{ 0xf3, 0x0f, 0xbc }, 0, .none }, - .{ .tzcnt, .rm, &.{ .r64, .rm64 }, &.{ 0xf3, 0x0f, 0xbc }, 0, .long }, - - .{ .ud2, .np, &.{}, &.{ 0x0f, 0x0b }, 0, .none }, - - .{ .xadd, .mr, &.{ .rm8, .r8 }, &.{ 0x0f, 0xc0 }, 0, .none }, - .{ .xadd, .mr, &.{ .rm8, .r8 }, &.{ 0x0f, 0xc0 }, 0, .rex }, - .{ .xadd, .mr, &.{ .rm16, .r16 }, &.{ 0x0f, 0xc1 }, 0, .none }, - .{ .xadd, .mr, &.{ .rm32, .r32 }, &.{ 0x0f, 0xc1 }, 0, .none }, - .{ .xadd, .mr, &.{ .rm64, .r64 }, &.{ 0x0f, 0xc1 }, 0, .long }, - - .{ .xchg, .o, &.{ .ax, .r16 }, &.{ 0x90 }, 0, .none }, - .{ .xchg, .o, &.{ .r16, .ax }, &.{ 0x90 }, 0, .none }, - .{ .xchg, .o, &.{ .eax, .r32 }, &.{ 0x90 }, 0, .none }, - .{ .xchg, .o, &.{ .rax, .r64 }, &.{ 0x90 }, 0, .long }, - .{ .xchg, .o, &.{ .r32, .eax }, &.{ 0x90 }, 0, .none }, - .{ .xchg, .o, &.{ .r64, .rax }, &.{ 0x90 }, 0, .long }, - .{ .xchg, .mr, &.{ .rm8, .r8 }, &.{ 0x86 }, 0, .none }, - .{ .xchg, .mr, &.{ .rm8, .r8 }, &.{ 0x86 }, 0, .rex }, - .{ .xchg, .rm, &.{ .r8, .rm8 }, &.{ 0x86 }, 0, .none }, - .{ .xchg, .rm, &.{ .r8, .rm8 }, &.{ 0x86 }, 0, .rex }, - .{ .xchg, .mr, &.{ .rm16, .r16 }, &.{ 0x87 }, 0, .none }, - .{ .xchg, .rm, &.{ .r16, .rm16 }, &.{ 0x87 }, 0, .none }, - .{ .xchg, .mr, &.{ .rm32, .r32 }, &.{ 0x87 }, 0, .none }, - .{ .xchg, .mr, &.{ .rm64, .r64 }, &.{ 0x87 }, 0, .long }, - .{ .xchg, .rm, &.{ .r32, .rm32 }, &.{ 0x87 }, 0, .none }, - .{ .xchg, .rm, &.{ .r64, .rm64 }, &.{ 0x87 }, 0, .long }, - - .{ .xor, .zi, &.{ .al, .imm8 }, &.{ 0x34 }, 0, .none }, - .{ .xor, .zi, &.{ .ax, .imm16 }, &.{ 0x35 }, 0, .none }, - .{ .xor, .zi, &.{ .eax, .imm32 }, &.{ 0x35 }, 0, .none }, - .{ .xor, .zi, &.{ .rax, .imm32s }, &.{ 0x35 }, 0, .long }, - .{ .xor, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 6, .none }, - .{ .xor, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 6, .rex }, - .{ .xor, .mi, &.{ 
.rm16, .imm16 }, &.{ 0x81 }, 6, .none }, - .{ .xor, .mi, &.{ .rm32, .imm32 }, &.{ 0x81 }, 6, .none }, - .{ .xor, .mi, &.{ .rm64, .imm32s }, &.{ 0x81 }, 6, .long }, - .{ .xor, .mi, &.{ .rm16, .imm8s }, &.{ 0x83 }, 6, .none }, - .{ .xor, .mi, &.{ .rm32, .imm8s }, &.{ 0x83 }, 6, .none }, - .{ .xor, .mi, &.{ .rm64, .imm8s }, &.{ 0x83 }, 6, .long }, - .{ .xor, .mr, &.{ .rm8, .r8 }, &.{ 0x30 }, 0, .none }, - .{ .xor, .mr, &.{ .rm8, .r8 }, &.{ 0x30 }, 0, .rex }, - .{ .xor, .mr, &.{ .rm16, .r16 }, &.{ 0x31 }, 0, .none }, - .{ .xor, .mr, &.{ .rm32, .r32 }, &.{ 0x31 }, 0, .none }, - .{ .xor, .mr, &.{ .rm64, .r64 }, &.{ 0x31 }, 0, .long }, - .{ .xor, .rm, &.{ .r8, .rm8 }, &.{ 0x32 }, 0, .none }, - .{ .xor, .rm, &.{ .r8, .rm8 }, &.{ 0x32 }, 0, .rex }, - .{ .xor, .rm, &.{ .r16, .rm16 }, &.{ 0x33 }, 0, .none }, - .{ .xor, .rm, &.{ .r32, .rm32 }, &.{ 0x33 }, 0, .none }, - .{ .xor, .rm, &.{ .r64, .rm64 }, &.{ 0x33 }, 0, .long }, + .{ .movsxd, .rm, &.{ .r32, .rm32 }, &.{ 0x63 }, 0, .none, .none }, + .{ .movsxd, .rm, &.{ .r64, .rm32 }, &.{ 0x63 }, 0, .long, .none }, + + .{ .movzx, .rm, &.{ .r16, .rm8 }, &.{ 0x0f, 0xb6 }, 0, .none, .none }, + .{ .movzx, .rm, &.{ .r32, .rm8 }, &.{ 0x0f, 0xb6 }, 0, .none, .none }, + .{ .movzx, .rm, &.{ .r64, .rm8 }, &.{ 0x0f, 0xb6 }, 0, .long, .none }, + .{ .movzx, .rm, &.{ .r32, .rm16 }, &.{ 0x0f, 0xb7 }, 0, .none, .none }, + .{ .movzx, .rm, &.{ .r64, .rm16 }, &.{ 0x0f, 0xb7 }, 0, .long, .none }, + + .{ .mul, .m, &.{ .rm8 }, &.{ 0xf6 }, 4, .none, .none }, + .{ .mul, .m, &.{ .rm8 }, &.{ 0xf6 }, 4, .rex, .none }, + .{ .mul, .m, &.{ .rm16 }, &.{ 0xf7 }, 4, .none, .none }, + .{ .mul, .m, &.{ .rm32 }, &.{ 0xf7 }, 4, .none, .none }, + .{ .mul, .m, &.{ .rm64 }, &.{ 0xf7 }, 4, .long, .none }, + + .{ .neg, .m, &.{ .rm8 }, &.{ 0xf6 }, 3, .none, .none }, + .{ .neg, .m, &.{ .rm8 }, &.{ 0xf6 }, 3, .rex, .none }, + .{ .neg, .m, &.{ .rm16 }, &.{ 0xf7 }, 3, .none, .none }, + .{ .neg, .m, &.{ .rm32 }, &.{ 0xf7 }, 3, .none, .none }, + .{ .neg, .m, &.{ .rm64 }, &.{ 0xf7 }, 3, .long, .none }, + + .{ .nop, .np, &.{}, &.{ 0x90 }, 0, .none, .none }, + + .{ .not, .m, &.{ .rm8 }, &.{ 0xf6 }, 2, .none, .none }, + .{ .not, .m, &.{ .rm8 }, &.{ 0xf6 }, 2, .rex, .none }, + .{ .not, .m, &.{ .rm16 }, &.{ 0xf7 }, 2, .none, .none }, + .{ .not, .m, &.{ .rm32 }, &.{ 0xf7 }, 2, .none, .none }, + .{ .not, .m, &.{ .rm64 }, &.{ 0xf7 }, 2, .long, .none }, + + .{ .@"or", .zi, &.{ .al, .imm8 }, &.{ 0x0c }, 0, .none, .none }, + .{ .@"or", .zi, &.{ .ax, .imm16 }, &.{ 0x0d }, 0, .none, .none }, + .{ .@"or", .zi, &.{ .eax, .imm32 }, &.{ 0x0d }, 0, .none, .none }, + .{ .@"or", .zi, &.{ .rax, .imm32s }, &.{ 0x0d }, 0, .long, .none }, + .{ .@"or", .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 1, .none, .none }, + .{ .@"or", .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 1, .rex, .none }, + .{ .@"or", .mi, &.{ .rm16, .imm16 }, &.{ 0x81 }, 1, .none, .none }, + .{ .@"or", .mi, &.{ .rm32, .imm32 }, &.{ 0x81 }, 1, .none, .none }, + .{ .@"or", .mi, &.{ .rm64, .imm32s }, &.{ 0x81 }, 1, .long, .none }, + .{ .@"or", .mi, &.{ .rm16, .imm8s }, &.{ 0x83 }, 1, .none, .none }, + .{ .@"or", .mi, &.{ .rm32, .imm8s }, &.{ 0x83 }, 1, .none, .none }, + .{ .@"or", .mi, &.{ .rm64, .imm8s }, &.{ 0x83 }, 1, .long, .none }, + .{ .@"or", .mr, &.{ .rm8, .r8 }, &.{ 0x08 }, 0, .none, .none }, + .{ .@"or", .mr, &.{ .rm8, .r8 }, &.{ 0x08 }, 0, .rex, .none }, + .{ .@"or", .mr, &.{ .rm16, .r16 }, &.{ 0x09 }, 0, .none, .none }, + .{ .@"or", .mr, &.{ .rm32, .r32 }, &.{ 0x09 }, 0, .none, .none }, + .{ .@"or", .mr, &.{ .rm64, .r64 }, &.{ 0x09 }, 0, .long, .none }, + .{ .@"or", 
.rm, &.{ .r8, .rm8 }, &.{ 0x0a }, 0, .none, .none }, + .{ .@"or", .rm, &.{ .r8, .rm8 }, &.{ 0x0a }, 0, .rex, .none }, + .{ .@"or", .rm, &.{ .r16, .rm16 }, &.{ 0x0b }, 0, .none, .none }, + .{ .@"or", .rm, &.{ .r32, .rm32 }, &.{ 0x0b }, 0, .none, .none }, + .{ .@"or", .rm, &.{ .r64, .rm64 }, &.{ 0x0b }, 0, .long, .none }, + + .{ .pop, .o, &.{ .r16 }, &.{ 0x58 }, 0, .none, .none }, + .{ .pop, .o, &.{ .r64 }, &.{ 0x58 }, 0, .none, .none }, + .{ .pop, .m, &.{ .rm16 }, &.{ 0x8f }, 0, .none, .none }, + .{ .pop, .m, &.{ .rm64 }, &.{ 0x8f }, 0, .none, .none }, + + .{ .popcnt, .rm, &.{ .r16, .rm16 }, &.{ 0xf3, 0x0f, 0xb8 }, 0, .none, .none }, + .{ .popcnt, .rm, &.{ .r32, .rm32 }, &.{ 0xf3, 0x0f, 0xb8 }, 0, .none, .none }, + .{ .popcnt, .rm, &.{ .r64, .rm64 }, &.{ 0xf3, 0x0f, 0xb8 }, 0, .long, .none }, + + .{ .push, .o, &.{ .r16 }, &.{ 0x50 }, 0, .none, .none }, + .{ .push, .o, &.{ .r64 }, &.{ 0x50 }, 0, .none, .none }, + .{ .push, .m, &.{ .rm16 }, &.{ 0xff }, 6, .none, .none }, + .{ .push, .m, &.{ .rm64 }, &.{ 0xff }, 6, .none, .none }, + .{ .push, .i, &.{ .imm8 }, &.{ 0x6a }, 0, .none, .none }, + .{ .push, .i, &.{ .imm16 }, &.{ 0x68 }, 0, .none, .none }, + .{ .push, .i, &.{ .imm32 }, &.{ 0x68 }, 0, .none, .none }, + + .{ .ret, .np, &.{}, &.{ 0xc3 }, 0, .none, .none }, + + .{ .rcl, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 2, .none, .none }, + .{ .rcl, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 2, .rex, .none }, + .{ .rcl, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 2, .none, .none }, + .{ .rcl, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 2, .rex, .none }, + .{ .rcl, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 2, .none, .none }, + .{ .rcl, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 2, .rex, .none }, + .{ .rcl, .m1, &.{ .rm16, .unity }, &.{ 0xd1 }, 2, .none, .none }, + .{ .rcl, .mc, &.{ .rm16, .cl }, &.{ 0xd3 }, 2, .none, .none }, + .{ .rcl, .mi, &.{ .rm16, .imm8 }, &.{ 0xc1 }, 2, .none, .none }, + .{ .rcl, .m1, &.{ .rm32, .unity }, &.{ 0xd1 }, 2, .none, .none }, + .{ .rcl, .m1, &.{ .rm64, .unity }, &.{ 0xd1 }, 2, .long, .none }, + .{ .rcl, .mc, &.{ .rm32, .cl }, &.{ 0xd3 }, 2, .none, .none }, + .{ .rcl, .mc, &.{ .rm64, .cl }, &.{ 0xd3 }, 2, .long, .none }, + .{ .rcl, .mi, &.{ .rm32, .imm8 }, &.{ 0xc1 }, 2, .none, .none }, + .{ .rcl, .mi, &.{ .rm64, .imm8 }, &.{ 0xc1 }, 2, .long, .none }, + + .{ .rcr, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 3, .none, .none }, + .{ .rcr, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 3, .rex, .none }, + .{ .rcr, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 3, .none, .none }, + .{ .rcr, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 3, .rex, .none }, + .{ .rcr, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 3, .none, .none }, + .{ .rcr, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 3, .rex, .none }, + .{ .rcr, .m1, &.{ .rm16, .unity }, &.{ 0xd1 }, 3, .none, .none }, + .{ .rcr, .mc, &.{ .rm16, .cl }, &.{ 0xd3 }, 3, .none, .none }, + .{ .rcr, .mi, &.{ .rm16, .imm8 }, &.{ 0xc1 }, 3, .none, .none }, + .{ .rcr, .m1, &.{ .rm32, .unity }, &.{ 0xd1 }, 3, .none, .none }, + .{ .rcr, .m1, &.{ .rm64, .unity }, &.{ 0xd1 }, 3, .long, .none }, + .{ .rcr, .mc, &.{ .rm32, .cl }, &.{ 0xd3 }, 3, .none, .none }, + .{ .rcr, .mc, &.{ .rm64, .cl }, &.{ 0xd3 }, 3, .long, .none }, + .{ .rcr, .mi, &.{ .rm32, .imm8 }, &.{ 0xc1 }, 3, .none, .none }, + .{ .rcr, .mi, &.{ .rm64, .imm8 }, &.{ 0xc1 }, 3, .long, .none }, + + .{ .rol, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 0, .none, .none }, + .{ .rol, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 0, .rex, .none }, + .{ .rol, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 0, .none, .none }, + .{ .rol, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 0, .rex, .none }, + .{ .rol, .mi, &.{ .rm8, 
.imm8 }, &.{ 0xc0 }, 0, .none, .none }, + .{ .rol, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 0, .rex, .none }, + .{ .rol, .m1, &.{ .rm16, .unity }, &.{ 0xd1 }, 0, .none, .none }, + .{ .rol, .mc, &.{ .rm16, .cl }, &.{ 0xd3 }, 0, .none, .none }, + .{ .rol, .mi, &.{ .rm16, .imm8 }, &.{ 0xc1 }, 0, .none, .none }, + .{ .rol, .m1, &.{ .rm32, .unity }, &.{ 0xd1 }, 0, .none, .none }, + .{ .rol, .m1, &.{ .rm64, .unity }, &.{ 0xd1 }, 0, .long, .none }, + .{ .rol, .mc, &.{ .rm32, .cl }, &.{ 0xd3 }, 0, .none, .none }, + .{ .rol, .mc, &.{ .rm64, .cl }, &.{ 0xd3 }, 0, .long, .none }, + .{ .rol, .mi, &.{ .rm32, .imm8 }, &.{ 0xc1 }, 0, .none, .none }, + .{ .rol, .mi, &.{ .rm64, .imm8 }, &.{ 0xc1 }, 0, .long, .none }, + + .{ .ror, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 1, .none, .none }, + .{ .ror, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 1, .rex, .none }, + .{ .ror, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 1, .none, .none }, + .{ .ror, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 1, .rex, .none }, + .{ .ror, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 1, .none, .none }, + .{ .ror, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 1, .rex, .none }, + .{ .ror, .m1, &.{ .rm16, .unity }, &.{ 0xd1 }, 1, .none, .none }, + .{ .ror, .mc, &.{ .rm16, .cl }, &.{ 0xd3 }, 1, .none, .none }, + .{ .ror, .mi, &.{ .rm16, .imm8 }, &.{ 0xc1 }, 1, .none, .none }, + .{ .ror, .m1, &.{ .rm32, .unity }, &.{ 0xd1 }, 1, .none, .none }, + .{ .ror, .m1, &.{ .rm64, .unity }, &.{ 0xd1 }, 1, .long, .none }, + .{ .ror, .mc, &.{ .rm32, .cl }, &.{ 0xd3 }, 1, .none, .none }, + .{ .ror, .mc, &.{ .rm64, .cl }, &.{ 0xd3 }, 1, .long, .none }, + .{ .ror, .mi, &.{ .rm32, .imm8 }, &.{ 0xc1 }, 1, .none, .none }, + .{ .ror, .mi, &.{ .rm64, .imm8 }, &.{ 0xc1 }, 1, .long, .none }, + + .{ .sal, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 4, .none, .none }, + .{ .sal, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 4, .rex, .none }, + .{ .sal, .m1, &.{ .rm16, .unity }, &.{ 0xd1 }, 4, .none, .none }, + .{ .sal, .m1, &.{ .rm32, .unity }, &.{ 0xd1 }, 4, .none, .none }, + .{ .sal, .m1, &.{ .rm64, .unity }, &.{ 0xd1 }, 4, .long, .none }, + .{ .sal, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 4, .none, .none }, + .{ .sal, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 4, .rex, .none }, + .{ .sal, .mc, &.{ .rm16, .cl }, &.{ 0xd3 }, 4, .none, .none }, + .{ .sal, .mc, &.{ .rm32, .cl }, &.{ 0xd3 }, 4, .none, .none }, + .{ .sal, .mc, &.{ .rm64, .cl }, &.{ 0xd3 }, 4, .long, .none }, + .{ .sal, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 4, .none, .none }, + .{ .sal, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 4, .rex, .none }, + .{ .sal, .mi, &.{ .rm16, .imm8 }, &.{ 0xc1 }, 4, .none, .none }, + .{ .sal, .mi, &.{ .rm32, .imm8 }, &.{ 0xc1 }, 4, .none, .none }, + .{ .sal, .mi, &.{ .rm64, .imm8 }, &.{ 0xc1 }, 4, .long, .none }, + + .{ .sar, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 7, .none, .none }, + .{ .sar, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 7, .rex, .none }, + .{ .sar, .m1, &.{ .rm16, .unity }, &.{ 0xd1 }, 7, .none, .none }, + .{ .sar, .m1, &.{ .rm32, .unity }, &.{ 0xd1 }, 7, .none, .none }, + .{ .sar, .m1, &.{ .rm64, .unity }, &.{ 0xd1 }, 7, .long, .none }, + .{ .sar, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 7, .none, .none }, + .{ .sar, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 7, .rex, .none }, + .{ .sar, .mc, &.{ .rm16, .cl }, &.{ 0xd3 }, 7, .none, .none }, + .{ .sar, .mc, &.{ .rm32, .cl }, &.{ 0xd3 }, 7, .none, .none }, + .{ .sar, .mc, &.{ .rm64, .cl }, &.{ 0xd3 }, 7, .long, .none }, + .{ .sar, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 7, .none, .none }, + .{ .sar, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 7, .rex, .none }, + .{ .sar, .mi, &.{ .rm16, .imm8 }, &.{ 0xc1 }, 7, .none, .none }, + .{ 
.sar, .mi, &.{ .rm32, .imm8 }, &.{ 0xc1 }, 7, .none, .none }, + .{ .sar, .mi, &.{ .rm64, .imm8 }, &.{ 0xc1 }, 7, .long, .none }, + + .{ .sbb, .zi, &.{ .al, .imm8 }, &.{ 0x1c }, 0, .none, .none }, + .{ .sbb, .zi, &.{ .ax, .imm16 }, &.{ 0x1d }, 0, .none, .none }, + .{ .sbb, .zi, &.{ .eax, .imm32 }, &.{ 0x1d }, 0, .none, .none }, + .{ .sbb, .zi, &.{ .rax, .imm32s }, &.{ 0x1d }, 0, .long, .none }, + .{ .sbb, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 3, .none, .none }, + .{ .sbb, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 3, .rex, .none }, + .{ .sbb, .mi, &.{ .rm16, .imm16 }, &.{ 0x81 }, 3, .none, .none }, + .{ .sbb, .mi, &.{ .rm32, .imm32 }, &.{ 0x81 }, 3, .none, .none }, + .{ .sbb, .mi, &.{ .rm64, .imm32s }, &.{ 0x81 }, 3, .long, .none }, + .{ .sbb, .mi, &.{ .rm16, .imm8s }, &.{ 0x83 }, 3, .none, .none }, + .{ .sbb, .mi, &.{ .rm32, .imm8s }, &.{ 0x83 }, 3, .none, .none }, + .{ .sbb, .mi, &.{ .rm64, .imm8s }, &.{ 0x83 }, 3, .long, .none }, + .{ .sbb, .mr, &.{ .rm8, .r8 }, &.{ 0x18 }, 0, .none, .none }, + .{ .sbb, .mr, &.{ .rm8, .r8 }, &.{ 0x18 }, 0, .rex, .none }, + .{ .sbb, .mr, &.{ .rm16, .r16 }, &.{ 0x19 }, 0, .none, .none }, + .{ .sbb, .mr, &.{ .rm32, .r32 }, &.{ 0x19 }, 0, .none, .none }, + .{ .sbb, .mr, &.{ .rm64, .r64 }, &.{ 0x19 }, 0, .long, .none }, + .{ .sbb, .rm, &.{ .r8, .rm8 }, &.{ 0x1a }, 0, .none, .none }, + .{ .sbb, .rm, &.{ .r8, .rm8 }, &.{ 0x1a }, 0, .rex, .none }, + .{ .sbb, .rm, &.{ .r16, .rm16 }, &.{ 0x1b }, 0, .none, .none }, + .{ .sbb, .rm, &.{ .r32, .rm32 }, &.{ 0x1b }, 0, .none, .none }, + .{ .sbb, .rm, &.{ .r64, .rm64 }, &.{ 0x1b }, 0, .long, .none }, + + .{ .scas, .np, &.{ .m8 }, &.{ 0xae }, 0, .none, .none }, + .{ .scas, .np, &.{ .m16 }, &.{ 0xaf }, 0, .none, .none }, + .{ .scas, .np, &.{ .m32 }, &.{ 0xaf }, 0, .none, .none }, + .{ .scas, .np, &.{ .m64 }, &.{ 0xaf }, 0, .long, .none }, + + .{ .scasb, .np, &.{}, &.{ 0xae }, 0, .none, .none }, + .{ .scasw, .np, &.{}, &.{ 0xaf }, 0, .short, .none }, + .{ .scasd, .np, &.{}, &.{ 0xaf }, 0, .none, .none }, + .{ .scasq, .np, &.{}, &.{ 0xaf }, 0, .long, .none }, + + .{ .seta, .m, &.{ .rm8 }, &.{ 0x0f, 0x97 }, 0, .none, .none }, + .{ .seta, .m, &.{ .rm8 }, &.{ 0x0f, 0x97 }, 0, .rex, .none }, + .{ .setae, .m, &.{ .rm8 }, &.{ 0x0f, 0x93 }, 0, .none, .none }, + .{ .setae, .m, &.{ .rm8 }, &.{ 0x0f, 0x93 }, 0, .rex, .none }, + .{ .setb, .m, &.{ .rm8 }, &.{ 0x0f, 0x92 }, 0, .none, .none }, + .{ .setb, .m, &.{ .rm8 }, &.{ 0x0f, 0x92 }, 0, .rex, .none }, + .{ .setbe, .m, &.{ .rm8 }, &.{ 0x0f, 0x96 }, 0, .none, .none }, + .{ .setbe, .m, &.{ .rm8 }, &.{ 0x0f, 0x96 }, 0, .rex, .none }, + .{ .setc, .m, &.{ .rm8 }, &.{ 0x0f, 0x92 }, 0, .none, .none }, + .{ .setc, .m, &.{ .rm8 }, &.{ 0x0f, 0x92 }, 0, .rex, .none }, + .{ .sete, .m, &.{ .rm8 }, &.{ 0x0f, 0x94 }, 0, .none, .none }, + .{ .sete, .m, &.{ .rm8 }, &.{ 0x0f, 0x94 }, 0, .rex, .none }, + .{ .setg, .m, &.{ .rm8 }, &.{ 0x0f, 0x9f }, 0, .none, .none }, + .{ .setg, .m, &.{ .rm8 }, &.{ 0x0f, 0x9f }, 0, .rex, .none }, + .{ .setge, .m, &.{ .rm8 }, &.{ 0x0f, 0x9d }, 0, .none, .none }, + .{ .setge, .m, &.{ .rm8 }, &.{ 0x0f, 0x9d }, 0, .rex, .none }, + .{ .setl, .m, &.{ .rm8 }, &.{ 0x0f, 0x9c }, 0, .none, .none }, + .{ .setl, .m, &.{ .rm8 }, &.{ 0x0f, 0x9c }, 0, .rex, .none }, + .{ .setle, .m, &.{ .rm8 }, &.{ 0x0f, 0x9e }, 0, .none, .none }, + .{ .setle, .m, &.{ .rm8 }, &.{ 0x0f, 0x9e }, 0, .rex, .none }, + .{ .setna, .m, &.{ .rm8 }, &.{ 0x0f, 0x96 }, 0, .none, .none }, + .{ .setna, .m, &.{ .rm8 }, &.{ 0x0f, 0x96 }, 0, .rex, .none }, + .{ .setnae, .m, &.{ .rm8 }, &.{ 0x0f, 0x92 }, 0, .none, .none }, + .{ 
.setnae, .m, &.{ .rm8 }, &.{ 0x0f, 0x92 }, 0, .rex, .none }, + .{ .setnb, .m, &.{ .rm8 }, &.{ 0x0f, 0x93 }, 0, .none, .none }, + .{ .setnb, .m, &.{ .rm8 }, &.{ 0x0f, 0x93 }, 0, .rex, .none }, + .{ .setnbe, .m, &.{ .rm8 }, &.{ 0x0f, 0x97 }, 0, .none, .none }, + .{ .setnbe, .m, &.{ .rm8 }, &.{ 0x0f, 0x97 }, 0, .rex, .none }, + .{ .setnc, .m, &.{ .rm8 }, &.{ 0x0f, 0x93 }, 0, .none, .none }, + .{ .setnc, .m, &.{ .rm8 }, &.{ 0x0f, 0x93 }, 0, .rex, .none }, + .{ .setne, .m, &.{ .rm8 }, &.{ 0x0f, 0x95 }, 0, .none, .none }, + .{ .setne, .m, &.{ .rm8 }, &.{ 0x0f, 0x95 }, 0, .rex, .none }, + .{ .setng, .m, &.{ .rm8 }, &.{ 0x0f, 0x9e }, 0, .none, .none }, + .{ .setng, .m, &.{ .rm8 }, &.{ 0x0f, 0x9e }, 0, .rex, .none }, + .{ .setnge, .m, &.{ .rm8 }, &.{ 0x0f, 0x9c }, 0, .none, .none }, + .{ .setnge, .m, &.{ .rm8 }, &.{ 0x0f, 0x9c }, 0, .rex, .none }, + .{ .setnl, .m, &.{ .rm8 }, &.{ 0x0f, 0x9d }, 0, .none, .none }, + .{ .setnl, .m, &.{ .rm8 }, &.{ 0x0f, 0x9d }, 0, .rex, .none }, + .{ .setnle, .m, &.{ .rm8 }, &.{ 0x0f, 0x9f }, 0, .none, .none }, + .{ .setnle, .m, &.{ .rm8 }, &.{ 0x0f, 0x9f }, 0, .rex, .none }, + .{ .setno, .m, &.{ .rm8 }, &.{ 0x0f, 0x91 }, 0, .none, .none }, + .{ .setno, .m, &.{ .rm8 }, &.{ 0x0f, 0x91 }, 0, .rex, .none }, + .{ .setnp, .m, &.{ .rm8 }, &.{ 0x0f, 0x9b }, 0, .none, .none }, + .{ .setnp, .m, &.{ .rm8 }, &.{ 0x0f, 0x9b }, 0, .rex, .none }, + .{ .setns, .m, &.{ .rm8 }, &.{ 0x0f, 0x99 }, 0, .none, .none }, + .{ .setns, .m, &.{ .rm8 }, &.{ 0x0f, 0x99 }, 0, .rex, .none }, + .{ .setnz, .m, &.{ .rm8 }, &.{ 0x0f, 0x95 }, 0, .none, .none }, + .{ .setnz, .m, &.{ .rm8 }, &.{ 0x0f, 0x95 }, 0, .rex, .none }, + .{ .seto, .m, &.{ .rm8 }, &.{ 0x0f, 0x90 }, 0, .none, .none }, + .{ .seto, .m, &.{ .rm8 }, &.{ 0x0f, 0x90 }, 0, .rex, .none }, + .{ .setp, .m, &.{ .rm8 }, &.{ 0x0f, 0x9a }, 0, .none, .none }, + .{ .setp, .m, &.{ .rm8 }, &.{ 0x0f, 0x9a }, 0, .rex, .none }, + .{ .setpe, .m, &.{ .rm8 }, &.{ 0x0f, 0x9a }, 0, .none, .none }, + .{ .setpe, .m, &.{ .rm8 }, &.{ 0x0f, 0x9a }, 0, .rex, .none }, + .{ .setpo, .m, &.{ .rm8 }, &.{ 0x0f, 0x9b }, 0, .none, .none }, + .{ .setpo, .m, &.{ .rm8 }, &.{ 0x0f, 0x9b }, 0, .rex, .none }, + .{ .sets, .m, &.{ .rm8 }, &.{ 0x0f, 0x98 }, 0, .none, .none }, + .{ .sets, .m, &.{ .rm8 }, &.{ 0x0f, 0x98 }, 0, .rex, .none }, + .{ .setz, .m, &.{ .rm8 }, &.{ 0x0f, 0x94 }, 0, .none, .none }, + .{ .setz, .m, &.{ .rm8 }, &.{ 0x0f, 0x94 }, 0, .rex, .none }, + + .{ .sfence, .np, &.{}, &.{ 0x0f, 0xae, 0xf8 }, 0, .none, .none }, + + .{ .shl, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 4, .none, .none }, + .{ .shl, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 4, .rex, .none }, + .{ .shl, .m1, &.{ .rm16, .unity }, &.{ 0xd1 }, 4, .none, .none }, + .{ .shl, .m1, &.{ .rm32, .unity }, &.{ 0xd1 }, 4, .none, .none }, + .{ .shl, .m1, &.{ .rm64, .unity }, &.{ 0xd1 }, 4, .long, .none }, + .{ .shl, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 4, .none, .none }, + .{ .shl, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 4, .rex, .none }, + .{ .shl, .mc, &.{ .rm16, .cl }, &.{ 0xd3 }, 4, .none, .none }, + .{ .shl, .mc, &.{ .rm32, .cl }, &.{ 0xd3 }, 4, .none, .none }, + .{ .shl, .mc, &.{ .rm64, .cl }, &.{ 0xd3 }, 4, .long, .none }, + .{ .shl, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 4, .none, .none }, + .{ .shl, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 4, .rex, .none }, + .{ .shl, .mi, &.{ .rm16, .imm8 }, &.{ 0xc1 }, 4, .none, .none }, + .{ .shl, .mi, &.{ .rm32, .imm8 }, &.{ 0xc1 }, 4, .none, .none }, + .{ .shl, .mi, &.{ .rm64, .imm8 }, &.{ 0xc1 }, 4, .long, .none }, + + .{ .shld, .mri, &.{ .rm16, .r16, .imm8 }, &.{ 0x0f, 0xa4 }, 0, .none, 
.none }, + .{ .shld, .mrc, &.{ .rm16, .r16, .cl }, &.{ 0x0f, 0xa5 }, 0, .none, .none }, + .{ .shld, .mri, &.{ .rm32, .r32, .imm8 }, &.{ 0x0f, 0xa4 }, 0, .none, .none }, + .{ .shld, .mri, &.{ .rm64, .r64, .imm8 }, &.{ 0x0f, 0xa4 }, 0, .long, .none }, + .{ .shld, .mrc, &.{ .rm32, .r32, .cl }, &.{ 0x0f, 0xa5 }, 0, .none, .none }, + .{ .shld, .mrc, &.{ .rm64, .r64, .cl }, &.{ 0x0f, 0xa5 }, 0, .long, .none }, + + .{ .shr, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 5, .none, .none }, + .{ .shr, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 5, .rex, .none }, + .{ .shr, .m1, &.{ .rm16, .unity }, &.{ 0xd1 }, 5, .none, .none }, + .{ .shr, .m1, &.{ .rm32, .unity }, &.{ 0xd1 }, 5, .none, .none }, + .{ .shr, .m1, &.{ .rm64, .unity }, &.{ 0xd1 }, 5, .long, .none }, + .{ .shr, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 5, .none, .none }, + .{ .shr, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 5, .rex, .none }, + .{ .shr, .mc, &.{ .rm16, .cl }, &.{ 0xd3 }, 5, .none, .none }, + .{ .shr, .mc, &.{ .rm32, .cl }, &.{ 0xd3 }, 5, .none, .none }, + .{ .shr, .mc, &.{ .rm64, .cl }, &.{ 0xd3 }, 5, .long, .none }, + .{ .shr, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 5, .none, .none }, + .{ .shr, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 5, .rex, .none }, + .{ .shr, .mi, &.{ .rm16, .imm8 }, &.{ 0xc1 }, 5, .none, .none }, + .{ .shr, .mi, &.{ .rm32, .imm8 }, &.{ 0xc1 }, 5, .none, .none }, + .{ .shr, .mi, &.{ .rm64, .imm8 }, &.{ 0xc1 }, 5, .long, .none }, + + .{ .shrd, .mri, &.{ .rm16, .r16, .imm8 }, &.{ 0x0f, 0xac }, 0, .none, .none }, + .{ .shrd, .mrc, &.{ .rm16, .r16, .cl }, &.{ 0x0f, 0xad }, 0, .none, .none }, + .{ .shrd, .mri, &.{ .rm32, .r32, .imm8 }, &.{ 0x0f, 0xac }, 0, .none, .none }, + .{ .shrd, .mri, &.{ .rm64, .r64, .imm8 }, &.{ 0x0f, 0xac }, 0, .long, .none }, + .{ .shrd, .mrc, &.{ .rm32, .r32, .cl }, &.{ 0x0f, 0xad }, 0, .none, .none }, + .{ .shrd, .mrc, &.{ .rm64, .r64, .cl }, &.{ 0x0f, 0xad }, 0, .long, .none }, + + .{ .stos, .np, &.{ .m8 }, &.{ 0xaa }, 0, .none, .none }, + .{ .stos, .np, &.{ .m16 }, &.{ 0xab }, 0, .none, .none }, + .{ .stos, .np, &.{ .m32 }, &.{ 0xab }, 0, .none, .none }, + .{ .stos, .np, &.{ .m64 }, &.{ 0xab }, 0, .long, .none }, + + .{ .stosb, .np, &.{}, &.{ 0xaa }, 0, .none, .none }, + .{ .stosw, .np, &.{}, &.{ 0xab }, 0, .short, .none }, + .{ .stosd, .np, &.{}, &.{ 0xab }, 0, .none, .none }, + .{ .stosq, .np, &.{}, &.{ 0xab }, 0, .long, .none }, + + .{ .sub, .zi, &.{ .al, .imm8 }, &.{ 0x2c }, 0, .none, .none }, + .{ .sub, .zi, &.{ .ax, .imm16 }, &.{ 0x2d }, 0, .none, .none }, + .{ .sub, .zi, &.{ .eax, .imm32 }, &.{ 0x2d }, 0, .none, .none }, + .{ .sub, .zi, &.{ .rax, .imm32s }, &.{ 0x2d }, 0, .long, .none }, + .{ .sub, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 5, .none, .none }, + .{ .sub, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 5, .rex, .none }, + .{ .sub, .mi, &.{ .rm16, .imm16 }, &.{ 0x81 }, 5, .none, .none }, + .{ .sub, .mi, &.{ .rm32, .imm32 }, &.{ 0x81 }, 5, .none, .none }, + .{ .sub, .mi, &.{ .rm64, .imm32s }, &.{ 0x81 }, 5, .long, .none }, + .{ .sub, .mi, &.{ .rm16, .imm8s }, &.{ 0x83 }, 5, .none, .none }, + .{ .sub, .mi, &.{ .rm32, .imm8s }, &.{ 0x83 }, 5, .none, .none }, + .{ .sub, .mi, &.{ .rm64, .imm8s }, &.{ 0x83 }, 5, .long, .none }, + .{ .sub, .mr, &.{ .rm8, .r8 }, &.{ 0x28 }, 0, .none, .none }, + .{ .sub, .mr, &.{ .rm8, .r8 }, &.{ 0x28 }, 0, .rex, .none }, + .{ .sub, .mr, &.{ .rm16, .r16 }, &.{ 0x29 }, 0, .none, .none }, + .{ .sub, .mr, &.{ .rm32, .r32 }, &.{ 0x29 }, 0, .none, .none }, + .{ .sub, .mr, &.{ .rm64, .r64 }, &.{ 0x29 }, 0, .long, .none }, + .{ .sub, .rm, &.{ .r8, .rm8 }, &.{ 0x2a }, 0, .none, .none }, + .{ .sub, 
.rm, &.{ .r8, .rm8 }, &.{ 0x2a }, 0, .rex, .none }, + .{ .sub, .rm, &.{ .r16, .rm16 }, &.{ 0x2b }, 0, .none, .none }, + .{ .sub, .rm, &.{ .r32, .rm32 }, &.{ 0x2b }, 0, .none, .none }, + .{ .sub, .rm, &.{ .r64, .rm64 }, &.{ 0x2b }, 0, .long, .none }, + + .{ .syscall, .np, &.{}, &.{ 0x0f, 0x05 }, 0, .none, .none }, + + .{ .@"test", .zi, &.{ .al, .imm8 }, &.{ 0xa8 }, 0, .none, .none }, + .{ .@"test", .zi, &.{ .ax, .imm16 }, &.{ 0xa9 }, 0, .none, .none }, + .{ .@"test", .zi, &.{ .eax, .imm32 }, &.{ 0xa9 }, 0, .none, .none }, + .{ .@"test", .zi, &.{ .rax, .imm32s }, &.{ 0xa9 }, 0, .long, .none }, + .{ .@"test", .mi, &.{ .rm8, .imm8 }, &.{ 0xf6 }, 0, .none, .none }, + .{ .@"test", .mi, &.{ .rm8, .imm8 }, &.{ 0xf6 }, 0, .rex, .none }, + .{ .@"test", .mi, &.{ .rm16, .imm16 }, &.{ 0xf7 }, 0, .none, .none }, + .{ .@"test", .mi, &.{ .rm32, .imm32 }, &.{ 0xf7 }, 0, .none, .none }, + .{ .@"test", .mi, &.{ .rm64, .imm32s }, &.{ 0xf7 }, 0, .long, .none }, + .{ .@"test", .mr, &.{ .rm8, .r8 }, &.{ 0x84 }, 0, .none, .none }, + .{ .@"test", .mr, &.{ .rm8, .r8 }, &.{ 0x84 }, 0, .rex, .none }, + .{ .@"test", .mr, &.{ .rm16, .r16 }, &.{ 0x85 }, 0, .none, .none }, + .{ .@"test", .mr, &.{ .rm32, .r32 }, &.{ 0x85 }, 0, .none, .none }, + .{ .@"test", .mr, &.{ .rm64, .r64 }, &.{ 0x85 }, 0, .long, .none }, + + .{ .tzcnt, .rm, &.{ .r16, .rm16 }, &.{ 0xf3, 0x0f, 0xbc }, 0, .none, .none }, + .{ .tzcnt, .rm, &.{ .r32, .rm32 }, &.{ 0xf3, 0x0f, 0xbc }, 0, .none, .none }, + .{ .tzcnt, .rm, &.{ .r64, .rm64 }, &.{ 0xf3, 0x0f, 0xbc }, 0, .long, .none }, + + .{ .ud2, .np, &.{}, &.{ 0x0f, 0x0b }, 0, .none, .none }, + + .{ .xadd, .mr, &.{ .rm8, .r8 }, &.{ 0x0f, 0xc0 }, 0, .none, .none }, + .{ .xadd, .mr, &.{ .rm8, .r8 }, &.{ 0x0f, 0xc0 }, 0, .rex, .none }, + .{ .xadd, .mr, &.{ .rm16, .r16 }, &.{ 0x0f, 0xc1 }, 0, .none, .none }, + .{ .xadd, .mr, &.{ .rm32, .r32 }, &.{ 0x0f, 0xc1 }, 0, .none, .none }, + .{ .xadd, .mr, &.{ .rm64, .r64 }, &.{ 0x0f, 0xc1 }, 0, .long, .none }, + + .{ .xchg, .o, &.{ .ax, .r16 }, &.{ 0x90 }, 0, .none, .none }, + .{ .xchg, .o, &.{ .r16, .ax }, &.{ 0x90 }, 0, .none, .none }, + .{ .xchg, .o, &.{ .eax, .r32 }, &.{ 0x90 }, 0, .none, .none }, + .{ .xchg, .o, &.{ .rax, .r64 }, &.{ 0x90 }, 0, .long, .none }, + .{ .xchg, .o, &.{ .r32, .eax }, &.{ 0x90 }, 0, .none, .none }, + .{ .xchg, .o, &.{ .r64, .rax }, &.{ 0x90 }, 0, .long, .none }, + .{ .xchg, .mr, &.{ .rm8, .r8 }, &.{ 0x86 }, 0, .none, .none }, + .{ .xchg, .mr, &.{ .rm8, .r8 }, &.{ 0x86 }, 0, .rex, .none }, + .{ .xchg, .rm, &.{ .r8, .rm8 }, &.{ 0x86 }, 0, .none, .none }, + .{ .xchg, .rm, &.{ .r8, .rm8 }, &.{ 0x86 }, 0, .rex, .none }, + .{ .xchg, .mr, &.{ .rm16, .r16 }, &.{ 0x87 }, 0, .none, .none }, + .{ .xchg, .rm, &.{ .r16, .rm16 }, &.{ 0x87 }, 0, .none, .none }, + .{ .xchg, .mr, &.{ .rm32, .r32 }, &.{ 0x87 }, 0, .none, .none }, + .{ .xchg, .mr, &.{ .rm64, .r64 }, &.{ 0x87 }, 0, .long, .none }, + .{ .xchg, .rm, &.{ .r32, .rm32 }, &.{ 0x87 }, 0, .none, .none }, + .{ .xchg, .rm, &.{ .r64, .rm64 }, &.{ 0x87 }, 0, .long, .none }, + + .{ .xor, .zi, &.{ .al, .imm8 }, &.{ 0x34 }, 0, .none, .none }, + .{ .xor, .zi, &.{ .ax, .imm16 }, &.{ 0x35 }, 0, .none, .none }, + .{ .xor, .zi, &.{ .eax, .imm32 }, &.{ 0x35 }, 0, .none, .none }, + .{ .xor, .zi, &.{ .rax, .imm32s }, &.{ 0x35 }, 0, .long, .none }, + .{ .xor, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 6, .none, .none }, + .{ .xor, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 6, .rex, .none }, + .{ .xor, .mi, &.{ .rm16, .imm16 }, &.{ 0x81 }, 6, .none, .none }, + .{ .xor, .mi, &.{ .rm32, .imm32 }, &.{ 0x81 }, 6, .none, .none }, + .{ 
.xor, .mi, &.{ .rm64, .imm32s }, &.{ 0x81 }, 6, .long, .none }, + .{ .xor, .mi, &.{ .rm16, .imm8s }, &.{ 0x83 }, 6, .none, .none }, + .{ .xor, .mi, &.{ .rm32, .imm8s }, &.{ 0x83 }, 6, .none, .none }, + .{ .xor, .mi, &.{ .rm64, .imm8s }, &.{ 0x83 }, 6, .long, .none }, + .{ .xor, .mr, &.{ .rm8, .r8 }, &.{ 0x30 }, 0, .none, .none }, + .{ .xor, .mr, &.{ .rm8, .r8 }, &.{ 0x30 }, 0, .rex, .none }, + .{ .xor, .mr, &.{ .rm16, .r16 }, &.{ 0x31 }, 0, .none, .none }, + .{ .xor, .mr, &.{ .rm32, .r32 }, &.{ 0x31 }, 0, .none, .none }, + .{ .xor, .mr, &.{ .rm64, .r64 }, &.{ 0x31 }, 0, .long, .none }, + .{ .xor, .rm, &.{ .r8, .rm8 }, &.{ 0x32 }, 0, .none, .none }, + .{ .xor, .rm, &.{ .r8, .rm8 }, &.{ 0x32 }, 0, .rex, .none }, + .{ .xor, .rm, &.{ .r16, .rm16 }, &.{ 0x33 }, 0, .none, .none }, + .{ .xor, .rm, &.{ .r32, .rm32 }, &.{ 0x33 }, 0, .none, .none }, + .{ .xor, .rm, &.{ .r64, .rm64 }, &.{ 0x33 }, 0, .long, .none }, // SSE - .{ .addss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x58 }, 0, .sse }, + .{ .addss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x58 }, 0, .none, .sse }, - .{ .andnps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x55 }, 0, .sse }, + .{ .andnps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x55 }, 0, .none, .sse }, - .{ .andps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x54 }, 0, .sse }, + .{ .andps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x54 }, 0, .none, .sse }, - .{ .cmpss, .rmi, &.{ .xmm, .xmm_m32, .imm8 }, &.{ 0xf3, 0x0f, 0xc2 }, 0, .sse }, + .{ .cmpss, .rmi, &.{ .xmm, .xmm_m32, .imm8 }, &.{ 0xf3, 0x0f, 0xc2 }, 0, .none, .sse }, - .{ .cvtsi2ss, .rm, &.{ .xmm, .rm32 }, &.{ 0xf3, 0x0f, 0x2a }, 0, .sse }, - .{ .cvtsi2ss, .rm, &.{ .xmm, .rm64 }, &.{ 0xf3, 0x0f, 0x2a }, 0, .sse_long }, + .{ .cvtsi2ss, .rm, &.{ .xmm, .rm32 }, &.{ 0xf3, 0x0f, 0x2a }, 0, .none, .sse }, + .{ .cvtsi2ss, .rm, &.{ .xmm, .rm64 }, &.{ 0xf3, 0x0f, 0x2a }, 0, .long, .sse }, - .{ .divss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5e }, 0, .sse }, + .{ .divss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5e }, 0, .none, .sse }, - .{ .maxss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5f }, 0, .sse }, + .{ .maxss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5f }, 0, .none, .sse }, - .{ .minss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5d }, 0, .sse }, + .{ .minss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5d }, 0, .none, .sse }, - .{ .movaps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x28 }, 0, .sse }, - .{ .movaps, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x0f, 0x29 }, 0, .sse }, + .{ .movaps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x28 }, 0, .none, .sse }, + .{ .movaps, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x0f, 0x29 }, 0, .none, .sse }, - .{ .movss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x10 }, 0, .sse }, - .{ .movss, .mr, &.{ .xmm_m32, .xmm }, &.{ 0xf3, 0x0f, 0x11 }, 0, .sse }, + .{ .movss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x10 }, 0, .none, .sse }, + .{ .movss, .mr, &.{ .xmm_m32, .xmm }, &.{ 0xf3, 0x0f, 0x11 }, 0, .none, .sse }, - .{ .movups, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x10 }, 0, .sse }, - .{ .movups, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x0f, 0x11 }, 0, .sse }, + .{ .movups, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x10 }, 0, .none, .sse }, + .{ .movups, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x0f, 0x11 }, 0, .none, .sse }, - .{ .mulss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x59 }, 0, .sse }, + .{ .mulss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x59 }, 0, .none, .sse }, - .{ .orps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x56 }, 0, .sse }, + .{ .orps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x56 }, 0, .none, .sse }, - 
.{ .subss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5c }, 0, .sse }, + .{ .subss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5c }, 0, .none, .sse }, - .{ .sqrtps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x51 }, 0, .sse }, - .{ .sqrtss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x51 }, 0, .sse }, + .{ .sqrtps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x51 }, 0, .none, .sse }, + .{ .sqrtss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x51 }, 0, .none, .sse }, - .{ .ucomiss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0x0f, 0x2e }, 0, .sse }, + .{ .ucomiss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0x0f, 0x2e }, 0, .none, .sse }, - .{ .xorps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x57 }, 0, .sse }, + .{ .xorps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x57 }, 0, .none, .sse }, // SSE2 - .{ .addsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x58 }, 0, .sse2 }, + .{ .addsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x58 }, 0, .none, .sse2 }, - .{ .andnpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x55 }, 0, .sse2 }, + .{ .andnpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x55 }, 0, .none, .sse2 }, - .{ .andpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x54 }, 0, .sse2 }, + .{ .andpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x54 }, 0, .none, .sse2 }, - .{ .cmpsd, .rmi, &.{ .xmm, .xmm_m64, .imm8 }, &.{ 0xf2, 0x0f, 0xc2 }, 0, .sse2 }, + .{ .cmpsd, .rmi, &.{ .xmm, .xmm_m64, .imm8 }, &.{ 0xf2, 0x0f, 0xc2 }, 0, .none, .sse2 }, - .{ .cvtsd2ss, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5a }, 0, .sse2 }, + .{ .cvtsd2ss, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5a }, 0, .none, .sse2 }, - .{ .cvtsi2sd, .rm, &.{ .xmm, .rm32 }, &.{ 0xf2, 0x0f, 0x2a }, 0, .sse2 }, - .{ .cvtsi2sd, .rm, &.{ .xmm, .rm64 }, &.{ 0xf2, 0x0f, 0x2a }, 0, .sse2_long }, + .{ .cvtsi2sd, .rm, &.{ .xmm, .rm32 }, &.{ 0xf2, 0x0f, 0x2a }, 0, .none, .sse2 }, + .{ .cvtsi2sd, .rm, &.{ .xmm, .rm64 }, &.{ 0xf2, 0x0f, 0x2a }, 0, .long, .sse2 }, - .{ .cvtss2sd, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5a }, 0, .sse2 }, + .{ .cvtss2sd, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5a }, 0, .none, .sse2 }, - .{ .divsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5e }, 0, .sse2 }, + .{ .divsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5e }, 0, .none, .sse2 }, - .{ .maxsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5f }, 0, .sse2 }, + .{ .maxsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5f }, 0, .none, .sse2 }, - .{ .minsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5d }, 0, .sse2 }, + .{ .minsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5d }, 0, .none, .sse2 }, - .{ .movapd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x28 }, 0, .sse2 }, - .{ .movapd, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x66, 0x0f, 0x29 }, 0, .sse2 }, + .{ .movapd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x28 }, 0, .none, .sse2 }, + .{ .movapd, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x66, 0x0f, 0x29 }, 0, .none, .sse2 }, - .{ .movd, .rm, &.{ .xmm, .rm32 }, &.{ 0x66, 0x0f, 0x6e }, 0, .sse2 }, - .{ .movd, .mr, &.{ .rm32, .xmm }, &.{ 0x66, 0x0f, 0x7e }, 0, .sse2 }, + .{ .movd, .rm, &.{ .xmm, .rm32 }, &.{ 0x66, 0x0f, 0x6e }, 0, .none, .sse2 }, + .{ .movd, .mr, &.{ .rm32, .xmm }, &.{ 0x66, 0x0f, 0x7e }, 0, .none, .sse2 }, - .{ .movq, .rm, &.{ .xmm, .rm64 }, &.{ 0x66, 0x0f, 0x6e }, 0, .sse2_long }, - .{ .movq, .mr, &.{ .rm64, .xmm }, &.{ 0x66, 0x0f, 0x7e }, 0, .sse2_long }, + .{ .movq, .rm, &.{ .xmm, .rm64 }, &.{ 0x66, 0x0f, 0x6e }, 0, .long, .sse2 }, + .{ .movq, .mr, &.{ .rm64, .xmm }, &.{ 0x66, 0x0f, 0x7e }, 0, .long, .sse2 }, - .{ .movq, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf3, 0x0f, 0x7e 
}, 0, .sse2 }, - .{ .movq, .mr, &.{ .xmm_m64, .xmm }, &.{ 0x66, 0x0f, 0xd6 }, 0, .sse2 }, + .{ .movq, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf3, 0x0f, 0x7e }, 0, .none, .sse2 }, + .{ .movq, .mr, &.{ .xmm_m64, .xmm }, &.{ 0x66, 0x0f, 0xd6 }, 0, .none, .sse2 }, - .{ .movupd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x10 }, 0, .sse2 }, - .{ .movupd, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x66, 0x0f, 0x11 }, 0, .sse2 }, + .{ .movupd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x10 }, 0, .none, .sse2 }, + .{ .movupd, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x66, 0x0f, 0x11 }, 0, .none, .sse2 }, - .{ .mulsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x59 }, 0, .sse2 }, + .{ .mulsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x59 }, 0, .none, .sse2 }, - .{ .orpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x56 }, 0, .sse2 }, + .{ .orpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x56 }, 0, .none, .sse2 }, - .{ .pextrw, .mri, &.{ .r16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0xc5 }, 0, .sse2 }, + .{ .pextrw, .mri, &.{ .r16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0xc5 }, 0, .none, .sse2 }, - .{ .pinsrw, .rmi, &.{ .xmm, .rm16, .imm8 }, &.{ 0x66, 0x0f, 0xc4 }, 0, .sse2 }, + .{ .pinsrw, .rmi, &.{ .xmm, .rm16, .imm8 }, &.{ 0x66, 0x0f, 0xc4 }, 0, .none, .sse2 }, - .{ .sqrtpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x51 }, 0, .sse2 }, - .{ .sqrtsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x51 }, 0, .sse2 }, + .{ .sqrtpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x51 }, 0, .none, .sse2 }, + .{ .sqrtsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x51 }, 0, .none, .sse2 }, - .{ .subsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5c }, 0, .sse2 }, + .{ .subsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5c }, 0, .none, .sse2 }, - .{ .movsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x10 }, 0, .sse2 }, - .{ .movsd, .mr, &.{ .xmm_m64, .xmm }, &.{ 0xf2, 0x0f, 0x11 }, 0, .sse2 }, + .{ .movsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x10 }, 0, .none, .sse2 }, + .{ .movsd, .mr, &.{ .xmm_m64, .xmm }, &.{ 0xf2, 0x0f, 0x11 }, 0, .none, .sse2 }, - .{ .ucomisd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x2e }, 0, .sse2 }, + .{ .ucomisd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x2e }, 0, .none, .sse2 }, - .{ .xorpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x57 }, 0, .sse2 }, + .{ .xorpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x57 }, 0, .none, .sse2 }, // SSE4.1 - .{ .pextrw, .mri, &.{ .rm16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .sse4_1 }, + .{ .pextrw, .mri, &.{ .rm16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .none, .sse4_1 }, - .{ .roundss, .rmi, &.{ .xmm, .xmm_m32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0a }, 0, .sse4_1 }, - .{ .roundsd, .rmi, &.{ .xmm, .xmm_m64, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0b }, 0, .sse4_1 }, + .{ .roundss, .rmi, &.{ .xmm, .xmm_m32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0a }, 0, .none, .sse4_1 }, + .{ .roundsd, .rmi, &.{ .xmm, .xmm_m64, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0b }, 0, .none, .sse4_1 }, + + // F16C + .{ .vcvtph2ps, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x13 }, 0, .vex_128, .f16c }, + + .{ .vcvtps2ph, .mri, &.{ .xmm_m64, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x1d }, 0, .vex_128, .f16c }, }; // zig fmt: on diff --git a/test/behavior/vector.zig b/test/behavior/vector.zig index b0e717d131..41b0bfc39b 100644 --- a/test/behavior/vector.zig +++ b/test/behavior/vector.zig @@ -168,7 +168,8 @@ test "array to vector" { test "array to vector with element type coercion" { if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO - if (builtin.zig_backend == 
.stage2_x86_64) return error.SkipZigTest; // TODO + if (builtin.zig_backend == .stage2_x86_64 and + !comptime std.Target.x86.featureSetHas(builtin.cpu.features, .f16c)) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO -- cgit v1.2.3 From ae588a09f2c2146ada0f914c7d279f69a0d79396 Mon Sep 17 00:00:00 2001 From: Jacob Young Date: Fri, 5 May 2023 22:16:13 -0400 Subject: x86_64: implement f16 cmp --- src/arch/x86_64/CodeGen.zig | 53 +- src/arch/x86_64/Encoding.zig | 163 ++--- src/arch/x86_64/Lower.zig | 42 +- src/arch/x86_64/Mir.zig | 80 ++- src/arch/x86_64/encoder.zig | 49 +- src/arch/x86_64/encodings.zig | 1361 ++++++++++++++++++++++------------------- 6 files changed, 989 insertions(+), 759 deletions(-) (limited to 'src/arch/x86_64/Encoding.zig') diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig index b7fd81db68..d24428467a 100644 --- a/src/arch/x86_64/CodeGen.zig +++ b/src/arch/x86_64/CodeGen.zig @@ -6737,26 +6737,43 @@ fn airCmp(self: *Self, inst: Air.Inst.Index, op: math.CompareOperator) !void { defer if (dst_lock) |lock| self.register_manager.unlockReg(lock); const src_mcv = if (flipped) lhs_mcv else rhs_mcv; - try self.genBinOpMir(switch (ty.zigTypeTag()) { - else => .cmp, + switch (ty.zigTypeTag()) { + else => try self.genBinOpMir(.cmp, ty, dst_mcv, src_mcv), .Float => switch (ty.floatBits(self.target.*)) { - 32 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse)) - .ucomiss - else - return self.fail("TODO implement airCmp for {} without sse", .{ - ty.fmt(self.bin_file.options.module.?), - }), - 64 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse2)) - .ucomisd - else - return self.fail("TODO implement airCmp for {} without sse2", .{ - ty.fmt(self.bin_file.options.module.?), - }), + 16 => if (self.hasFeature(.f16c)) { + const dst_reg = dst_mcv.getReg().?.to128(); + + const tmp_reg = (try self.register_manager.allocReg(null, sse)).to128(); + const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg); + defer self.register_manager.unlockReg(tmp_lock); + + if (src_mcv.isRegister()) + try self.asmRegisterRegisterRegister( + .vpunpcklwd, + dst_reg, + dst_reg, + src_mcv.getReg().?.to128(), + ) + else + try self.asmRegisterMemoryImmediate( + .vpinsrw, + dst_reg, + src_mcv.mem(.word), + Immediate.u(1), + ); + try self.asmRegisterRegister(.vcvtph2ps, dst_reg, dst_reg); + try self.asmRegisterRegister(.vmovshdup, tmp_reg, dst_reg); + try self.genBinOpMir(.ucomiss, ty, dst_mcv, .{ .register = tmp_reg }); + } else return self.fail("TODO implement airCmp for {}", .{ + ty.fmt(self.bin_file.options.module.?), + }), + 32 => try self.genBinOpMir(.ucomiss, ty, dst_mcv, src_mcv), + 64 => try self.genBinOpMir(.ucomisd, ty, dst_mcv, src_mcv), else => return self.fail("TODO implement airCmp for {}", .{ ty.fmt(self.bin_file.options.module.?), }), }, - }, ty, dst_mcv, src_mcv); + } const signedness = if (ty.isAbiInt()) ty.intInfo(self.target.*).signedness else .unsigned; const result = MCValue{ @@ -7834,8 +7851,8 @@ fn genSetReg(self: *Self, dst_reg: Register, ty: Type, src_mcv: MCValue) InnerEr else switch (abi_size) { 2 => return try self.asmRegisterRegisterImmediate( if (dst_reg.class() == .floating_point) .pinsrw else .pextrw, - registerAlias(dst_reg, abi_size), - registerAlias(src_reg, abi_size), + registerAlias(dst_reg, 4), + 
registerAlias(src_reg, 4), Immediate.u(0), ), 4 => .movd, @@ -8045,7 +8062,7 @@ fn genSetMem(self: *Self, base: Memory.Base, disp: i32, ty: Type, src_mcv: MCVal try self.asmMemoryRegisterImmediate( .pextrw, dst_mem, - registerAlias(src_reg, abi_size), + src_reg.to128(), Immediate.u(0), ) else diff --git a/src/arch/x86_64/Encoding.zig b/src/arch/x86_64/Encoding.zig index 05c48ecddf..ada1e891fb 100644 --- a/src/arch/x86_64/Encoding.zig +++ b/src/arch/x86_64/Encoding.zig @@ -58,9 +58,9 @@ pub fn findByMnemonic( var shortest_len: ?usize = null; next: for (mnemonic_to_encodings_map[@enumToInt(mnemonic)]) |data| { switch (data.mode) { - .rex => if (!rex_required) continue, - .long => {}, - else => if (rex_required) continue, + .none, .short => if (rex_required) continue, + .rex, .rex_short => if (!rex_required) continue, + else => {}, } for (input_ops, data.ops) |input_op, data_op| if (!input_op.isSubset(data_op)) continue :next; @@ -90,24 +90,26 @@ pub fn findByOpcode(opc: []const u8, prefixes: struct { if (!std.mem.eql(u8, opc, enc.opcode())) continue; if (prefixes.rex.w) { switch (data.mode) { - .short, .fpu, .sse, .sse2, .sse4_1, .none => continue, - .long, .sse_long, .sse2_long, .rex => {}, + .none, .short, .rex, .rex_short, .vex_128, .vex_256 => continue, + .long, .vex_128_long, .vex_256_long => {}, } } else if (prefixes.rex.present and !prefixes.rex.isSet()) { switch (data.mode) { - .rex => {}, + .rex, .rex_short => {}, else => continue, } } else if (prefixes.legacy.prefix_66) { - switch (enc.operandBitSize()) { - 16 => {}, - else => continue, + switch (data.mode) { + .short, .rex_short => {}, + .none, .rex, .vex_128, .vex_256 => continue, + .long, .vex_128_long, .vex_256_long => continue, } } else { switch (data.mode) { - .none => switch (enc.operandBitSize()) { - 16 => continue, - else => {}, + .none => switch (data.mode) { + .short, .rex_short => continue, + .none, .rex, .vex_128, .vex_256 => {}, + .long, .vex_128_long, .vex_256_long => {}, }, else => continue, } @@ -131,28 +133,11 @@ pub fn mandatoryPrefix(encoding: *const Encoding) ?u8 { pub fn modRmExt(encoding: Encoding) u3 { return switch (encoding.data.op_en) { - .m, .mi, .m1, .mc => encoding.data.modrm_ext, + .m, .mi, .m1, .mc, .vmi => encoding.data.modrm_ext, else => unreachable, }; } -pub fn operandBitSize(encoding: Encoding) u64 { - return switch (encoding.data.mode) { - .short => 16, - .long => 64, - else => switch (encoding.data.op_en) { - .np => switch (encoding.data.ops[0]) { - .o16 => 16, - .o32 => 32, - .o64 => 64, - else => 32, - }, - .td => encoding.data.ops[1].bitSize(), - else => encoding.data.ops[0].bitSize(), - }, - }; -} - pub fn format( encoding: Encoding, comptime fmt: []const u8, @@ -220,17 +205,17 @@ pub fn format( }; try writer.print("+{s} ", .{tag}); }, - .m, .mi, .m1, .mc => try writer.print("/{d} ", .{encoding.modRmExt()}), - .mr, .rm, .rmi, .mri, .mrc, .rrm, .rrmi => try writer.writeAll("/r "), + .m, .mi, .m1, .mc, .vmi => try writer.print("/{d} ", .{encoding.modRmExt()}), + .mr, .rm, .rmi, .mri, .mrc, .rvm, .rvmi => try writer.writeAll("/r "), } switch (encoding.data.op_en) { - .i, .d, .zi, .oi, .mi, .rmi, .mri, .rrmi => { + .i, .d, .zi, .oi, .mi, .rmi, .mri, .vmi, .rvmi => { const op = switch (encoding.data.op_en) { .i, .d => encoding.data.ops[0], .zi, .oi, .mi => encoding.data.ops[1], - .rmi, .mri => encoding.data.ops[2], - .rrmi => encoding.data.ops[3], + .rmi, .mri, .vmi => encoding.data.ops[2], + .rvmi => encoding.data.ops[3], else => unreachable, }; const tag = switch (op) { @@ -245,7 +230,7 @@ 
pub fn format( }; try writer.print("{s} ", .{tag}); }, - .np, .fd, .td, .o, .m, .m1, .mc, .mr, .rm, .mrc, .rrm => {}, + .np, .fd, .td, .o, .m, .m1, .mc, .mr, .rm, .mrc, .rvm => {}, } try writer.print("{s} ", .{@tagName(encoding.mnemonic)}); @@ -315,8 +300,7 @@ pub const Mnemonic = enum { movaps, movss, movups, mulss, orps, - pextrw, - pinsrw, + pextrw, pinsrw, sqrtps, sqrtss, subss, @@ -335,14 +319,25 @@ pub const Mnemonic = enum { movupd, mulsd, orpd, - sqrtpd, - sqrtsd, + pshufhw, pshuflw, + psrld, psrlq, psrlw, + punpckhbw, punpckhdq, punpckhqdq, punpckhwd, + punpcklbw, punpckldq, punpcklqdq, punpcklwd, + sqrtpd, sqrtsd, subsd, ucomisd, xorpd, + // SSE3 + movddup, movshdup, movsldup, // SSE4.1 - roundss, - roundsd, + roundsd, roundss, + // AVX + vmovddup, vmovshdup, vmovsldup, + vpextrw, vpinsrw, + vpshufhw, vpshuflw, + vpsrld, vpsrlq, vpsrlw, + vpunpckhbw, vpunpckhdq, vpunpckhqdq, vpunpckhwd, + vpunpcklbw, vpunpckldq, vpunpcklqdq, vpunpcklwd, // F16C vcvtph2ps, vcvtps2ph, // zig fmt: on @@ -357,7 +352,7 @@ pub const OpEn = enum { fd, td, m1, mc, mi, mr, rm, rmi, mri, mrc, - rrm, rrmi, + vmi, rvm, rvmi, // zig fmt: on }; @@ -372,6 +367,7 @@ pub const Op = enum { cl, r8, r16, r32, r64, rm8, rm16, rm32, rm64, + r32_m16, r64_m16, m8, m16, m32, m64, m80, m128, rel8, rel16, rel32, m, @@ -450,16 +446,49 @@ pub const Op = enum { } } - pub fn bitSize(op: Op) u64 { + pub fn immBitSize(op: Op) u64 { return switch (op) { .none, .o16, .o32, .o64, .moffs, .m, .sreg => unreachable, + .al, .cl, .r8, .rm8 => unreachable, + .ax, .r16, .rm16 => unreachable, + .eax, .r32, .rm32, .r32_m16 => unreachable, + .rax, .r64, .rm64, .r64_m16 => unreachable, + .xmm, .xmm_m32, .xmm_m64, .xmm_m128 => unreachable, + .m8, .m16, .m32, .m64, .m80, .m128 => unreachable, .unity => 1, - .imm8, .imm8s, .al, .cl, .r8, .m8, .rm8, .rel8 => 8, - .imm16, .imm16s, .ax, .r16, .m16, .rm16, .rel16 => 16, - .imm32, .imm32s, .eax, .r32, .m32, .rm32, .rel32, .xmm_m32 => 32, - .imm64, .rax, .r64, .m64, .rm64, .xmm_m64 => 64, + .imm8, .imm8s, .rel8 => 8, + .imm16, .imm16s, .rel16 => 16, + .imm32, .imm32s, .rel32 => 32, + .imm64 => 64, + }; + } + + pub fn regBitSize(op: Op) u64 { + return switch (op) { + .none, .o16, .o32, .o64, .moffs, .m, .sreg => unreachable, + .unity, .imm8, .imm8s, .imm16, .imm16s, .imm32, .imm32s, .imm64 => unreachable, + .rel8, .rel16, .rel32 => unreachable, + .m8, .m16, .m32, .m64, .m80, .m128 => unreachable, + .al, .cl, .r8, .rm8 => 8, + .ax, .r16, .rm16 => 16, + .eax, .r32, .rm32, .r32_m16 => 32, + .rax, .r64, .rm64, .r64_m16 => 64, + .xmm, .xmm_m32, .xmm_m64, .xmm_m128 => 128, + }; + } + + pub fn memBitSize(op: Op) u64 { + return switch (op) { + .none, .o16, .o32, .o64, .moffs, .m, .sreg => unreachable, + .unity, .imm8, .imm8s, .imm16, .imm16s, .imm32, .imm32s, .imm64 => unreachable, + .rel8, .rel16, .rel32 => unreachable, + .al, .cl, .r8, .ax, .r16, .eax, .r32, .rax, .r64, .xmm => unreachable, + .m8, .rm8 => 8, + .m16, .rm16, .r32_m16, .r64_m16 => 16, + .m32, .rm32, .xmm_m32 => 32, + .m64, .rm64, .xmm_m64 => 64, .m80 => 80, - .m128, .xmm, .xmm_m128 => 128, + .m128, .xmm_m128 => 128, }; } @@ -482,6 +511,7 @@ pub const Op = enum { .al, .ax, .eax, .rax, .r8, .r16, .r32, .r64, .rm8, .rm16, .rm32, .rm64, + .r32_m16, .r64_m16, .xmm, .xmm_m32, .xmm_m64, .xmm_m128, => true, else => false, @@ -506,6 +536,7 @@ pub const Op = enum { // zig fmt: off return switch (op) { .rm8, .rm16, .rm32, .rm64, + .r32_m16, .r64_m16, .m8, .m16, .m32, .m64, .m80, .m128, .m, .xmm_m32, .xmm_m64, .xmm_m128, @@ -528,18 +559,12 @@ pub const Op 
= enum { .al, .ax, .eax, .rax, .cl => .general_purpose, .r8, .r16, .r32, .r64 => .general_purpose, .rm8, .rm16, .rm32, .rm64 => .general_purpose, + .r32_m16, .r64_m16 => .general_purpose, .sreg => .segment, .xmm, .xmm_m32, .xmm_m64, .xmm_m128 => .floating_point, }; } - pub fn isFloatingPointRegister(op: Op) bool { - return switch (op) { - .xmm, .xmm_m32, .xmm_m64, .xmm_m128 => true, - else => false, - }; - } - /// Given an operand `op` checks if `target` is a subset for the purposes of the encoding. pub fn isSubset(op: Op, target: Op) bool { switch (op) { @@ -553,30 +578,27 @@ pub const Op = enum { if (op.isRegister() and target.isRegister()) { return switch (target) { .cl, .al, .ax, .eax, .rax => op == target, - else => op.class() == target.class() and switch (target.class()) { - .floating_point => true, - else => op.bitSize() == target.bitSize(), - }, + else => op.class() == target.class() and op.regBitSize() == target.regBitSize(), }; } if (op.isMemory() and target.isMemory()) { switch (target) { .m => return true, - else => return op.bitSize() == target.bitSize(), + else => return op.memBitSize() == target.memBitSize(), } } if (op.isImmediate() and target.isImmediate()) { switch (target) { - .imm64 => if (op.bitSize() <= 64) return true, - .imm32s, .rel32 => if (op.bitSize() < 32 or (op.bitSize() == 32 and op.isSigned())) + .imm64 => if (op.immBitSize() <= 64) return true, + .imm32s, .rel32 => if (op.immBitSize() < 32 or (op.immBitSize() == 32 and op.isSigned())) return true, - .imm32 => if (op.bitSize() <= 32) return true, - .imm16s, .rel16 => if (op.bitSize() < 16 or (op.bitSize() == 16 and op.isSigned())) + .imm32 => if (op.immBitSize() <= 32) return true, + .imm16s, .rel16 => if (op.immBitSize() < 16 or (op.immBitSize() == 16 and op.isSigned())) return true, - .imm16 => if (op.bitSize() <= 16) return true, - .imm8s, .rel8 => if (op.bitSize() < 8 or (op.bitSize() == 8 and op.isSigned())) + .imm16 => if (op.immBitSize() <= 16) return true, + .imm8s, .rel8 => if (op.immBitSize() < 8 or (op.immBitSize() == 8 and op.isSigned())) return true, - .imm8 => if (op.bitSize() <= 8) return true, + .imm8 => if (op.immBitSize() <= 8) return true, else => {}, } return op == target; @@ -590,8 +612,9 @@ pub const Op = enum { pub const Mode = enum { none, short, - rex, long, + rex, + rex_short, vex_128, vex_128_long, vex_256, @@ -600,9 +623,11 @@ pub const Mode = enum { pub const Feature = enum { none, + avx, f16c, sse, sse2, + sse3, sse4_1, x87, }; diff --git a/src/arch/x86_64/Lower.zig b/src/arch/x86_64/Lower.zig index 9571f50e7c..d9482d4b39 100644 --- a/src/arch/x86_64/Lower.zig +++ b/src/arch/x86_64/Lower.zig @@ -108,12 +108,12 @@ pub fn lowerMir(lower: *Lower, inst: Mir.Inst) Error![]const Instruction { .orps, .pextrw, .pinsrw, - .roundss, .sqrtps, .sqrtss, .subss, .ucomiss, .xorps, + .addsd, .andnpd, .andpd, @@ -127,13 +127,51 @@ pub fn lowerMir(lower: *Lower, inst: Mir.Inst) Error![]const Instruction { .movsd, .mulsd, .orpd, - .roundsd, + .pshufhw, + .pshuflw, + .psrld, + .psrlq, + .psrlw, + .punpckhbw, + .punpckhdq, + .punpckhqdq, + .punpckhwd, + .punpcklbw, + .punpckldq, + .punpcklqdq, + .punpcklwd, .sqrtpd, .sqrtsd, .subsd, .ucomisd, .xorpd, + .movddup, + .movshdup, + .movsldup, + + .roundsd, + .roundss, + + .vmovddup, + .vmovshdup, + .vmovsldup, + .vpextrw, + .vpinsrw, + .vpshufhw, + .vpshuflw, + .vpsrld, + .vpsrlq, + .vpsrlw, + .vpunpckhbw, + .vpunpckhdq, + .vpunpckhqdq, + .vpunpckhwd, + .vpunpcklbw, + .vpunpckldq, + .vpunpcklqdq, + .vpunpcklwd, + .vcvtph2ps, .vcvtps2ph, => try 
lower.mirGeneric(inst), diff --git a/src/arch/x86_64/Mir.zig b/src/arch/x86_64/Mir.zig index c4e19fdc0e..9e39d23bd4 100644 --- a/src/arch/x86_64/Mir.zig +++ b/src/arch/x86_64/Mir.zig @@ -196,8 +196,6 @@ pub const Inst = struct { pextrw, /// Insert word pinsrw, - /// Round scalar single-precision floating-point values - roundss, /// Square root of scalar single precision floating-point value sqrtps, /// Subtract scalar single-precision floating-point values @@ -208,6 +206,7 @@ pub const Inst = struct { ucomiss, /// Bitwise logical xor of packed single precision floating-point values xorps, + /// Add double precision floating point values addsd, /// Bitwise logical and not of packed double precision floating-point values @@ -234,8 +233,32 @@ pub const Inst = struct { mulsd, /// Bitwise logical or of packed double precision floating-point values orpd, - /// Round scalar double-precision floating-point values - roundsd, + /// Shuffle packed high words + pshufhw, + /// Shuffle packed low words + pshuflw, + /// Shift packed data right logical + psrld, + /// Shift packed data right logical + psrlq, + /// Shift packed data right logical + psrlw, + /// Unpack high data + punpckhbw, + /// Unpack high data + punpckhdq, + /// Unpack high data + punpckhqdq, + /// Unpack high data + punpckhwd, + /// Unpack low data + punpcklbw, + /// Unpack low data + punpckldq, + /// Unpack low data + punpcklqdq, + /// Unpack low data + punpcklwd, /// Square root of double precision floating-point values sqrtpd, /// Square root of scalar double precision floating-point value @@ -247,6 +270,55 @@ pub const Inst = struct { /// Bitwise logical xor of packed double precision floating-point values xorpd, + /// Replicate double floating-point values + movddup, + /// Replicate single floating-point values + movshdup, + /// Replicate single floating-point values + movsldup, + + /// Round scalar double-precision floating-point values + roundsd, + /// Round scalar single-precision floating-point values + roundss, + + /// Replicate double floating-point values + vmovddup, + /// Replicate single floating-point values + vmovshdup, + /// Replicate single floating-point values + vmovsldup, + /// Extract word + vpextrw, + /// Insert word + vpinsrw, + /// Shuffle packed high words + vpshufhw, + /// Shuffle packed low words + vpshuflw, + /// Shift packed data right logical + vpsrld, + /// Shift packed data right logical + vpsrlq, + /// Shift packed data right logical + vpsrlw, + /// Unpack high data + vpunpckhbw, + /// Unpack high data + vpunpckhdq, + /// Unpack high data + vpunpckhqdq, + /// Unpack high data + vpunpckhwd, + /// Unpack low data + vpunpcklbw, + /// Unpack low data + vpunpckldq, + /// Unpack low data + vpunpcklqdq, + /// Unpack low data + vpunpcklwd, + /// Convert 16-bit floating-point values to single-precision floating-point values vcvtph2ps, /// Convert single-precision floating-point values to 16-bit floating-point values diff --git a/src/arch/x86_64/encoder.zig b/src/arch/x86_64/encoder.zig index 94f4eb56d5..495edb5f2a 100644 --- a/src/arch/x86_64/encoder.zig +++ b/src/arch/x86_64/encoder.zig @@ -151,15 +151,12 @@ pub const Instruction = struct { moffs.offset, }), }, - .imm => |imm| try writer.print("0x{x}", .{imm.asUnsigned(enc_op.bitSize())}), + .imm => |imm| try writer.print("0x{x}", .{imm.asUnsigned(enc_op.immBitSize())}), } } pub fn fmtPrint(op: Operand, enc_op: Encoding.Op) std.fmt.Formatter(fmt) { - return .{ .data = .{ - .op = op, - .enc_op = enc_op, - } }; + return .{ .data = .{ .op = op, .enc_op = enc_op } 
}; } }; @@ -210,7 +207,7 @@ pub const Instruction = struct { const data = enc.data; switch (data.mode) { - .none, .short, .rex, .long => { + .none, .short, .long, .rex, .rex_short => { try inst.encodeLegacyPrefixes(encoder); try inst.encodeMandatoryPrefix(encoder); try inst.encodeRexPrefix(encoder); @@ -232,15 +229,16 @@ pub const Instruction = struct { else => { const mem_op = switch (data.op_en) { .m, .mi, .m1, .mc, .mr, .mri, .mrc => inst.ops[0], - .rm, .rmi => inst.ops[1], + .rm, .rmi, .vmi => inst.ops[1], + .rvm, .rvmi => inst.ops[2], else => unreachable, }; switch (mem_op) { .reg => |reg| { const rm = switch (data.op_en) { - .m, .mi, .m1, .mc => enc.modRmExt(), + .m, .mi, .m1, .mc, .vmi => enc.modRmExt(), .mr, .mri, .mrc => inst.ops[1].reg.lowEnc(), - .rm, .rmi => inst.ops[0].reg.lowEnc(), + .rm, .rmi, .rvm, .rvmi => inst.ops[0].reg.lowEnc(), else => unreachable, }; try encoder.modRm_direct(rm, reg.lowEnc()); @@ -259,7 +257,8 @@ pub const Instruction = struct { switch (data.op_en) { .mi => try encodeImm(inst.ops[1].imm, data.ops[1], encoder), - .rmi, .mri => try encodeImm(inst.ops[2].imm, data.ops[2], encoder), + .rmi, .mri, .vmi => try encodeImm(inst.ops[2].imm, data.ops[2], encoder), + .rvmi => try encodeImm(inst.ops[3].imm, data.ops[3], encoder), else => {}, } }, @@ -291,11 +290,9 @@ pub const Instruction = struct { .rep, .repe, .repz => legacy.prefix_f3 = true, } - if (data.mode == .none) { - const bit_size = enc.operandBitSize(); - if (bit_size == 16) { - legacy.set16BitOverride(); - } + switch (data.mode) { + .short, .rex_short => legacy.set16BitOverride(), + else => {}, } const segment_override: ?Register = switch (op_en) { @@ -318,7 +315,7 @@ pub const Instruction = struct { } else null, - .rrm, .rrmi => unreachable, + .vmi, .rvm, .rvmi => unreachable, }; if (segment_override) |seg| { legacy.setSegmentOverride(seg); @@ -353,7 +350,7 @@ pub const Instruction = struct { rex.b = b_x_op.isBaseExtended(); rex.x = b_x_op.isIndexExtended(); }, - .rrm, .rrmi => unreachable, + .vmi, .rvm, .rvmi => unreachable, } try encoder.rex(rex); @@ -375,18 +372,19 @@ pub const Instruction = struct { switch (op_en) { .np, .i, .zi, .fd, .td, .d => {}, .o, .oi => vex.b = inst.ops[0].reg.isExtended(), - .m, .mi, .m1, .mc, .mr, .rm, .rmi, .mri, .mrc, .rrm, .rrmi => { + .m, .mi, .m1, .mc, .mr, .rm, .rmi, .mri, .mrc, .vmi, .rvm, .rvmi => { const r_op = switch (op_en) { - .rm, .rmi, .rrm, .rrmi => inst.ops[0], + .rm, .rmi, .rvm, .rvmi => inst.ops[0], .mr, .mri, .mrc => inst.ops[1], - else => .none, + .m, .mi, .m1, .mc, .vmi => .none, + else => unreachable, }; vex.r = r_op.isBaseExtended(); const b_x_op = switch (op_en) { - .rm, .rmi => inst.ops[1], + .rm, .rmi, .vmi => inst.ops[1], .m, .mi, .m1, .mc, .mr, .mri, .mrc => inst.ops[0], - .rrm, .rrmi => inst.ops[2], + .rvm, .rvmi => inst.ops[2], else => unreachable, }; vex.b = b_x_op.isBaseExtended(); @@ -417,7 +415,8 @@ pub const Instruction = struct { switch (op_en) { else => {}, - .rrm, .rrmi => vex.v = inst.ops[1].reg, + .vmi => vex.v = inst.ops[0].reg, + .rvm, .rvmi => vex.v = inst.ops[1].reg, } try encoder.vex(vex); @@ -515,8 +514,8 @@ pub const Instruction = struct { } fn encodeImm(imm: Immediate, kind: Encoding.Op, encoder: anytype) !void { - const raw = imm.asUnsigned(kind.bitSize()); - switch (kind.bitSize()) { + const raw = imm.asUnsigned(kind.immBitSize()); + switch (kind.immBitSize()) { 8 => try encoder.imm8(@intCast(u8, raw)), 16 => try encoder.imm16(@intCast(u16, raw)), 32 => try encoder.imm32(@intCast(u32, raw)), diff --git 
a/src/arch/x86_64/encodings.zig b/src/arch/x86_64/encodings.zig index 52b8cc29d6..5d2630e9a8 100644 --- a/src/arch/x86_64/encodings.zig +++ b/src/arch/x86_64/encodings.zig @@ -13,264 +13,264 @@ pub const Entry = struct { Mnemonic, OpEn, []const Op, []const u8, modrm_ext, Mo // zig fmt: off pub const table = [_]Entry{ // General-purpose - .{ .adc, .zi, &.{ .al, .imm8 }, &.{ 0x14 }, 0, .none, .none }, - .{ .adc, .zi, &.{ .ax, .imm16 }, &.{ 0x15 }, 0, .none, .none }, - .{ .adc, .zi, &.{ .eax, .imm32 }, &.{ 0x15 }, 0, .none, .none }, - .{ .adc, .zi, &.{ .rax, .imm32s }, &.{ 0x15 }, 0, .long, .none }, - .{ .adc, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 2, .none, .none }, - .{ .adc, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 2, .rex, .none }, - .{ .adc, .mi, &.{ .rm16, .imm16 }, &.{ 0x81 }, 2, .none, .none }, - .{ .adc, .mi, &.{ .rm32, .imm32 }, &.{ 0x81 }, 2, .none, .none }, - .{ .adc, .mi, &.{ .rm64, .imm32s }, &.{ 0x81 }, 2, .long, .none }, - .{ .adc, .mi, &.{ .rm16, .imm8s }, &.{ 0x83 }, 2, .none, .none }, - .{ .adc, .mi, &.{ .rm32, .imm8s }, &.{ 0x83 }, 2, .none, .none }, - .{ .adc, .mi, &.{ .rm64, .imm8s }, &.{ 0x83 }, 2, .long, .none }, - .{ .adc, .mr, &.{ .rm8, .r8 }, &.{ 0x10 }, 0, .none, .none }, - .{ .adc, .mr, &.{ .rm8, .r8 }, &.{ 0x10 }, 0, .rex, .none }, - .{ .adc, .mr, &.{ .rm16, .r16 }, &.{ 0x11 }, 0, .none, .none }, - .{ .adc, .mr, &.{ .rm32, .r32 }, &.{ 0x11 }, 0, .none, .none }, - .{ .adc, .mr, &.{ .rm64, .r64 }, &.{ 0x11 }, 0, .long, .none }, - .{ .adc, .rm, &.{ .r8, .rm8 }, &.{ 0x12 }, 0, .none, .none }, - .{ .adc, .rm, &.{ .r8, .rm8 }, &.{ 0x12 }, 0, .rex, .none }, - .{ .adc, .rm, &.{ .r16, .rm16 }, &.{ 0x13 }, 0, .none, .none }, - .{ .adc, .rm, &.{ .r32, .rm32 }, &.{ 0x13 }, 0, .none, .none }, - .{ .adc, .rm, &.{ .r64, .rm64 }, &.{ 0x13 }, 0, .long, .none }, - - .{ .add, .zi, &.{ .al, .imm8 }, &.{ 0x04 }, 0, .none, .none }, - .{ .add, .zi, &.{ .ax, .imm16 }, &.{ 0x05 }, 0, .none, .none }, - .{ .add, .zi, &.{ .eax, .imm32 }, &.{ 0x05 }, 0, .none, .none }, - .{ .add, .zi, &.{ .rax, .imm32s }, &.{ 0x05 }, 0, .long, .none }, - .{ .add, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 0, .none, .none }, - .{ .add, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 0, .rex, .none }, - .{ .add, .mi, &.{ .rm16, .imm16 }, &.{ 0x81 }, 0, .none, .none }, - .{ .add, .mi, &.{ .rm32, .imm32 }, &.{ 0x81 }, 0, .none, .none }, - .{ .add, .mi, &.{ .rm64, .imm32s }, &.{ 0x81 }, 0, .long, .none }, - .{ .add, .mi, &.{ .rm16, .imm8s }, &.{ 0x83 }, 0, .none, .none }, - .{ .add, .mi, &.{ .rm32, .imm8s }, &.{ 0x83 }, 0, .none, .none }, - .{ .add, .mi, &.{ .rm64, .imm8s }, &.{ 0x83 }, 0, .long, .none }, - .{ .add, .mr, &.{ .rm8, .r8 }, &.{ 0x00 }, 0, .none, .none }, - .{ .add, .mr, &.{ .rm8, .r8 }, &.{ 0x00 }, 0, .rex, .none }, - .{ .add, .mr, &.{ .rm16, .r16 }, &.{ 0x01 }, 0, .none, .none }, - .{ .add, .mr, &.{ .rm32, .r32 }, &.{ 0x01 }, 0, .none, .none }, - .{ .add, .mr, &.{ .rm64, .r64 }, &.{ 0x01 }, 0, .long, .none }, - .{ .add, .rm, &.{ .r8, .rm8 }, &.{ 0x02 }, 0, .none, .none }, - .{ .add, .rm, &.{ .r8, .rm8 }, &.{ 0x02 }, 0, .rex, .none }, - .{ .add, .rm, &.{ .r16, .rm16 }, &.{ 0x03 }, 0, .none, .none }, - .{ .add, .rm, &.{ .r32, .rm32 }, &.{ 0x03 }, 0, .none, .none }, - .{ .add, .rm, &.{ .r64, .rm64 }, &.{ 0x03 }, 0, .long, .none }, - - .{ .@"and", .zi, &.{ .al, .imm8 }, &.{ 0x24 }, 0, .none, .none }, - .{ .@"and", .zi, &.{ .ax, .imm16 }, &.{ 0x25 }, 0, .none, .none }, - .{ .@"and", .zi, &.{ .eax, .imm32 }, &.{ 0x25 }, 0, .none, .none }, - .{ .@"and", .zi, &.{ .rax, .imm32s }, &.{ 0x25 }, 0, .long, .none }, - .{ .@"and", .mi, &.{ 
.rm8, .imm8 }, &.{ 0x80 }, 4, .none, .none }, - .{ .@"and", .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 4, .rex, .none }, - .{ .@"and", .mi, &.{ .rm16, .imm16 }, &.{ 0x81 }, 4, .none, .none }, - .{ .@"and", .mi, &.{ .rm32, .imm32 }, &.{ 0x81 }, 4, .none, .none }, - .{ .@"and", .mi, &.{ .rm64, .imm32s }, &.{ 0x81 }, 4, .long, .none }, - .{ .@"and", .mi, &.{ .rm16, .imm8s }, &.{ 0x83 }, 4, .none, .none }, - .{ .@"and", .mi, &.{ .rm32, .imm8s }, &.{ 0x83 }, 4, .none, .none }, - .{ .@"and", .mi, &.{ .rm64, .imm8s }, &.{ 0x83 }, 4, .long, .none }, - .{ .@"and", .mr, &.{ .rm8, .r8 }, &.{ 0x20 }, 0, .none, .none }, - .{ .@"and", .mr, &.{ .rm8, .r8 }, &.{ 0x20 }, 0, .rex, .none }, - .{ .@"and", .mr, &.{ .rm16, .r16 }, &.{ 0x21 }, 0, .none, .none }, - .{ .@"and", .mr, &.{ .rm32, .r32 }, &.{ 0x21 }, 0, .none, .none }, - .{ .@"and", .mr, &.{ .rm64, .r64 }, &.{ 0x21 }, 0, .long, .none }, - .{ .@"and", .rm, &.{ .r8, .rm8 }, &.{ 0x22 }, 0, .none, .none }, - .{ .@"and", .rm, &.{ .r8, .rm8 }, &.{ 0x22 }, 0, .rex, .none }, - .{ .@"and", .rm, &.{ .r16, .rm16 }, &.{ 0x23 }, 0, .none, .none }, - .{ .@"and", .rm, &.{ .r32, .rm32 }, &.{ 0x23 }, 0, .none, .none }, - .{ .@"and", .rm, &.{ .r64, .rm64 }, &.{ 0x23 }, 0, .long, .none }, - - .{ .bsf, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0xbc }, 0, .none, .none }, - .{ .bsf, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0xbc }, 0, .none, .none }, - .{ .bsf, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0xbc }, 0, .long, .none }, - - .{ .bsr, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0xbd }, 0, .none, .none }, - .{ .bsr, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0xbd }, 0, .none, .none }, - .{ .bsr, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0xbd }, 0, .long, .none }, + .{ .adc, .zi, &.{ .al, .imm8 }, &.{ 0x14 }, 0, .none, .none }, + .{ .adc, .zi, &.{ .ax, .imm16 }, &.{ 0x15 }, 0, .short, .none }, + .{ .adc, .zi, &.{ .eax, .imm32 }, &.{ 0x15 }, 0, .none, .none }, + .{ .adc, .zi, &.{ .rax, .imm32s }, &.{ 0x15 }, 0, .long, .none }, + .{ .adc, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 2, .none, .none }, + .{ .adc, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 2, .rex, .none }, + .{ .adc, .mi, &.{ .rm16, .imm16 }, &.{ 0x81 }, 2, .short, .none }, + .{ .adc, .mi, &.{ .rm32, .imm32 }, &.{ 0x81 }, 2, .none, .none }, + .{ .adc, .mi, &.{ .rm64, .imm32s }, &.{ 0x81 }, 2, .long, .none }, + .{ .adc, .mi, &.{ .rm16, .imm8s }, &.{ 0x83 }, 2, .short, .none }, + .{ .adc, .mi, &.{ .rm32, .imm8s }, &.{ 0x83 }, 2, .none, .none }, + .{ .adc, .mi, &.{ .rm64, .imm8s }, &.{ 0x83 }, 2, .long, .none }, + .{ .adc, .mr, &.{ .rm8, .r8 }, &.{ 0x10 }, 0, .none, .none }, + .{ .adc, .mr, &.{ .rm8, .r8 }, &.{ 0x10 }, 0, .rex, .none }, + .{ .adc, .mr, &.{ .rm16, .r16 }, &.{ 0x11 }, 0, .short, .none }, + .{ .adc, .mr, &.{ .rm32, .r32 }, &.{ 0x11 }, 0, .none, .none }, + .{ .adc, .mr, &.{ .rm64, .r64 }, &.{ 0x11 }, 0, .long, .none }, + .{ .adc, .rm, &.{ .r8, .rm8 }, &.{ 0x12 }, 0, .none, .none }, + .{ .adc, .rm, &.{ .r8, .rm8 }, &.{ 0x12 }, 0, .rex, .none }, + .{ .adc, .rm, &.{ .r16, .rm16 }, &.{ 0x13 }, 0, .short, .none }, + .{ .adc, .rm, &.{ .r32, .rm32 }, &.{ 0x13 }, 0, .none, .none }, + .{ .adc, .rm, &.{ .r64, .rm64 }, &.{ 0x13 }, 0, .long, .none }, + + .{ .add, .zi, &.{ .al, .imm8 }, &.{ 0x04 }, 0, .none, .none }, + .{ .add, .zi, &.{ .ax, .imm16 }, &.{ 0x05 }, 0, .short, .none }, + .{ .add, .zi, &.{ .eax, .imm32 }, &.{ 0x05 }, 0, .none, .none }, + .{ .add, .zi, &.{ .rax, .imm32s }, &.{ 0x05 }, 0, .long, .none }, + .{ .add, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 0, .none, .none }, + .{ .add, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 0, .rex, .none }, + .{ .add, .mi, &.{ .rm16, .imm16 }, 
&.{ 0x81 }, 0, .short, .none }, + .{ .add, .mi, &.{ .rm32, .imm32 }, &.{ 0x81 }, 0, .none, .none }, + .{ .add, .mi, &.{ .rm64, .imm32s }, &.{ 0x81 }, 0, .long, .none }, + .{ .add, .mi, &.{ .rm16, .imm8s }, &.{ 0x83 }, 0, .short, .none }, + .{ .add, .mi, &.{ .rm32, .imm8s }, &.{ 0x83 }, 0, .none, .none }, + .{ .add, .mi, &.{ .rm64, .imm8s }, &.{ 0x83 }, 0, .long, .none }, + .{ .add, .mr, &.{ .rm8, .r8 }, &.{ 0x00 }, 0, .none, .none }, + .{ .add, .mr, &.{ .rm8, .r8 }, &.{ 0x00 }, 0, .rex, .none }, + .{ .add, .mr, &.{ .rm16, .r16 }, &.{ 0x01 }, 0, .short, .none }, + .{ .add, .mr, &.{ .rm32, .r32 }, &.{ 0x01 }, 0, .none, .none }, + .{ .add, .mr, &.{ .rm64, .r64 }, &.{ 0x01 }, 0, .long, .none }, + .{ .add, .rm, &.{ .r8, .rm8 }, &.{ 0x02 }, 0, .none, .none }, + .{ .add, .rm, &.{ .r8, .rm8 }, &.{ 0x02 }, 0, .rex, .none }, + .{ .add, .rm, &.{ .r16, .rm16 }, &.{ 0x03 }, 0, .short, .none }, + .{ .add, .rm, &.{ .r32, .rm32 }, &.{ 0x03 }, 0, .none, .none }, + .{ .add, .rm, &.{ .r64, .rm64 }, &.{ 0x03 }, 0, .long, .none }, + + .{ .@"and", .zi, &.{ .al, .imm8 }, &.{ 0x24 }, 0, .none, .none }, + .{ .@"and", .zi, &.{ .ax, .imm16 }, &.{ 0x25 }, 0, .short, .none }, + .{ .@"and", .zi, &.{ .eax, .imm32 }, &.{ 0x25 }, 0, .none, .none }, + .{ .@"and", .zi, &.{ .rax, .imm32s }, &.{ 0x25 }, 0, .long, .none }, + .{ .@"and", .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 4, .none, .none }, + .{ .@"and", .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 4, .rex, .none }, + .{ .@"and", .mi, &.{ .rm16, .imm16 }, &.{ 0x81 }, 4, .short, .none }, + .{ .@"and", .mi, &.{ .rm32, .imm32 }, &.{ 0x81 }, 4, .none, .none }, + .{ .@"and", .mi, &.{ .rm64, .imm32s }, &.{ 0x81 }, 4, .long, .none }, + .{ .@"and", .mi, &.{ .rm16, .imm8s }, &.{ 0x83 }, 4, .short, .none }, + .{ .@"and", .mi, &.{ .rm32, .imm8s }, &.{ 0x83 }, 4, .none, .none }, + .{ .@"and", .mi, &.{ .rm64, .imm8s }, &.{ 0x83 }, 4, .long, .none }, + .{ .@"and", .mr, &.{ .rm8, .r8 }, &.{ 0x20 }, 0, .none, .none }, + .{ .@"and", .mr, &.{ .rm8, .r8 }, &.{ 0x20 }, 0, .rex, .none }, + .{ .@"and", .mr, &.{ .rm16, .r16 }, &.{ 0x21 }, 0, .short, .none }, + .{ .@"and", .mr, &.{ .rm32, .r32 }, &.{ 0x21 }, 0, .none, .none }, + .{ .@"and", .mr, &.{ .rm64, .r64 }, &.{ 0x21 }, 0, .long, .none }, + .{ .@"and", .rm, &.{ .r8, .rm8 }, &.{ 0x22 }, 0, .none, .none }, + .{ .@"and", .rm, &.{ .r8, .rm8 }, &.{ 0x22 }, 0, .rex, .none }, + .{ .@"and", .rm, &.{ .r16, .rm16 }, &.{ 0x23 }, 0, .short, .none }, + .{ .@"and", .rm, &.{ .r32, .rm32 }, &.{ 0x23 }, 0, .none, .none }, + .{ .@"and", .rm, &.{ .r64, .rm64 }, &.{ 0x23 }, 0, .long, .none }, + + .{ .bsf, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0xbc }, 0, .short, .none }, + .{ .bsf, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0xbc }, 0, .none, .none }, + .{ .bsf, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0xbc }, 0, .long, .none }, + + .{ .bsr, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0xbd }, 0, .short, .none }, + .{ .bsr, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0xbd }, 0, .none, .none }, + .{ .bsr, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0xbd }, 0, .long, .none }, .{ .bswap, .o, &.{ .r32 }, &.{ 0x0f, 0xc8 }, 0, .none, .none }, .{ .bswap, .o, &.{ .r64 }, &.{ 0x0f, 0xc8 }, 0, .long, .none }, - .{ .bt, .mr, &.{ .rm16, .r16 }, &.{ 0x0f, 0xa3 }, 0, .none, .none }, - .{ .bt, .mr, &.{ .rm32, .r32 }, &.{ 0x0f, 0xa3 }, 0, .none, .none }, - .{ .bt, .mr, &.{ .rm64, .r64 }, &.{ 0x0f, 0xa3 }, 0, .long, .none }, - .{ .bt, .mi, &.{ .rm16, .imm8 }, &.{ 0x0f, 0xba }, 4, .none, .none }, - .{ .bt, .mi, &.{ .rm32, .imm8 }, &.{ 0x0f, 0xba }, 4, .none, .none }, - .{ .bt, .mi, &.{ .rm64, .imm8 }, &.{ 0x0f, 0xba }, 4, .long, .none }, - - 
.{ .btc, .mr, &.{ .rm16, .r16 }, &.{ 0x0f, 0xbb }, 0, .none, .none }, - .{ .btc, .mr, &.{ .rm32, .r32 }, &.{ 0x0f, 0xbb }, 0, .none, .none }, - .{ .btc, .mr, &.{ .rm64, .r64 }, &.{ 0x0f, 0xbb }, 0, .long, .none }, - .{ .btc, .mi, &.{ .rm16, .imm8 }, &.{ 0x0f, 0xba }, 7, .none, .none }, - .{ .btc, .mi, &.{ .rm32, .imm8 }, &.{ 0x0f, 0xba }, 7, .none, .none }, - .{ .btc, .mi, &.{ .rm64, .imm8 }, &.{ 0x0f, 0xba }, 7, .long, .none }, - - .{ .btr, .mr, &.{ .rm16, .r16 }, &.{ 0x0f, 0xb3 }, 0, .none, .none }, - .{ .btr, .mr, &.{ .rm32, .r32 }, &.{ 0x0f, 0xb3 }, 0, .none, .none }, - .{ .btr, .mr, &.{ .rm64, .r64 }, &.{ 0x0f, 0xb3 }, 0, .long, .none }, - .{ .btr, .mi, &.{ .rm16, .imm8 }, &.{ 0x0f, 0xba }, 6, .none, .none }, - .{ .btr, .mi, &.{ .rm32, .imm8 }, &.{ 0x0f, 0xba }, 6, .none, .none }, - .{ .btr, .mi, &.{ .rm64, .imm8 }, &.{ 0x0f, 0xba }, 6, .long, .none }, - - .{ .bts, .mr, &.{ .rm16, .r16 }, &.{ 0x0f, 0xab }, 0, .none, .none }, - .{ .bts, .mr, &.{ .rm32, .r32 }, &.{ 0x0f, 0xab }, 0, .none, .none }, - .{ .bts, .mr, &.{ .rm64, .r64 }, &.{ 0x0f, 0xab }, 0, .long, .none }, - .{ .bts, .mi, &.{ .rm16, .imm8 }, &.{ 0x0f, 0xba }, 5, .none, .none }, - .{ .bts, .mi, &.{ .rm32, .imm8 }, &.{ 0x0f, 0xba }, 5, .none, .none }, - .{ .bts, .mi, &.{ .rm64, .imm8 }, &.{ 0x0f, 0xba }, 5, .long, .none }, + .{ .bt, .mr, &.{ .rm16, .r16 }, &.{ 0x0f, 0xa3 }, 0, .short, .none }, + .{ .bt, .mr, &.{ .rm32, .r32 }, &.{ 0x0f, 0xa3 }, 0, .none, .none }, + .{ .bt, .mr, &.{ .rm64, .r64 }, &.{ 0x0f, 0xa3 }, 0, .long, .none }, + .{ .bt, .mi, &.{ .rm16, .imm8 }, &.{ 0x0f, 0xba }, 4, .short, .none }, + .{ .bt, .mi, &.{ .rm32, .imm8 }, &.{ 0x0f, 0xba }, 4, .none, .none }, + .{ .bt, .mi, &.{ .rm64, .imm8 }, &.{ 0x0f, 0xba }, 4, .long, .none }, + + .{ .btc, .mr, &.{ .rm16, .r16 }, &.{ 0x0f, 0xbb }, 0, .short, .none }, + .{ .btc, .mr, &.{ .rm32, .r32 }, &.{ 0x0f, 0xbb }, 0, .none, .none }, + .{ .btc, .mr, &.{ .rm64, .r64 }, &.{ 0x0f, 0xbb }, 0, .long, .none }, + .{ .btc, .mi, &.{ .rm16, .imm8 }, &.{ 0x0f, 0xba }, 7, .short, .none }, + .{ .btc, .mi, &.{ .rm32, .imm8 }, &.{ 0x0f, 0xba }, 7, .none, .none }, + .{ .btc, .mi, &.{ .rm64, .imm8 }, &.{ 0x0f, 0xba }, 7, .long, .none }, + + .{ .btr, .mr, &.{ .rm16, .r16 }, &.{ 0x0f, 0xb3 }, 0, .short, .none }, + .{ .btr, .mr, &.{ .rm32, .r32 }, &.{ 0x0f, 0xb3 }, 0, .none, .none }, + .{ .btr, .mr, &.{ .rm64, .r64 }, &.{ 0x0f, 0xb3 }, 0, .long, .none }, + .{ .btr, .mi, &.{ .rm16, .imm8 }, &.{ 0x0f, 0xba }, 6, .short, .none }, + .{ .btr, .mi, &.{ .rm32, .imm8 }, &.{ 0x0f, 0xba }, 6, .none, .none }, + .{ .btr, .mi, &.{ .rm64, .imm8 }, &.{ 0x0f, 0xba }, 6, .long, .none }, + + .{ .bts, .mr, &.{ .rm16, .r16 }, &.{ 0x0f, 0xab }, 0, .short, .none }, + .{ .bts, .mr, &.{ .rm32, .r32 }, &.{ 0x0f, 0xab }, 0, .none, .none }, + .{ .bts, .mr, &.{ .rm64, .r64 }, &.{ 0x0f, 0xab }, 0, .long, .none }, + .{ .bts, .mi, &.{ .rm16, .imm8 }, &.{ 0x0f, 0xba }, 5, .short, .none }, + .{ .bts, .mi, &.{ .rm32, .imm8 }, &.{ 0x0f, 0xba }, 5, .none, .none }, + .{ .bts, .mi, &.{ .rm64, .imm8 }, &.{ 0x0f, 0xba }, 5, .long, .none }, // This is M encoding according to Intel, but D makes more sense here. 
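    // The rel32 form carries its target as an immediate displacement after the 0xe8 opcode
    // byte, with no ModRM involved; the indirect call through rm64 still uses M encoding
    // with ModRM extension /2.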
.{ .call, .d, &.{ .rel32 }, &.{ 0xe8 }, 0, .none, .none }, .{ .call, .m, &.{ .rm64 }, &.{ 0xff }, 2, .none, .none }, - .{ .cbw, .np, &.{ .o16 }, &.{ 0x98 }, 0, .none, .none }, - .{ .cwde, .np, &.{ .o32 }, &.{ 0x98 }, 0, .none, .none }, - .{ .cdqe, .np, &.{ .o64 }, &.{ 0x98 }, 0, .long, .none }, - - .{ .cwd, .np, &.{ .o16 }, &.{ 0x99 }, 0, .none, .none }, - .{ .cdq, .np, &.{ .o32 }, &.{ 0x99 }, 0, .none, .none }, - .{ .cqo, .np, &.{ .o64 }, &.{ 0x99 }, 0, .long, .none }, - - .{ .cmova, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x47 }, 0, .none, .none }, - .{ .cmova, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x47 }, 0, .none, .none }, - .{ .cmova, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x47 }, 0, .long, .none }, - .{ .cmovae, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x43 }, 0, .none, .none }, - .{ .cmovae, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x43 }, 0, .none, .none }, - .{ .cmovae, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x43 }, 0, .long, .none }, - .{ .cmovb, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x42 }, 0, .none, .none }, - .{ .cmovb, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x42 }, 0, .none, .none }, - .{ .cmovb, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x42 }, 0, .long, .none }, - .{ .cmovbe, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x46 }, 0, .none, .none }, - .{ .cmovbe, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x46 }, 0, .none, .none }, - .{ .cmovbe, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x46 }, 0, .long, .none }, - .{ .cmovc, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x42 }, 0, .none, .none }, - .{ .cmovc, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x42 }, 0, .none, .none }, - .{ .cmovc, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x42 }, 0, .long, .none }, - .{ .cmove, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x44 }, 0, .none, .none }, - .{ .cmove, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x44 }, 0, .none, .none }, - .{ .cmove, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x44 }, 0, .long, .none }, - .{ .cmovg, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4f }, 0, .none, .none }, - .{ .cmovg, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4f }, 0, .none, .none }, - .{ .cmovg, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4f }, 0, .long, .none }, - .{ .cmovge, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4d }, 0, .none, .none }, - .{ .cmovge, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4d }, 0, .none, .none }, - .{ .cmovge, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4d }, 0, .long, .none }, - .{ .cmovl, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4c }, 0, .none, .none }, - .{ .cmovl, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4c }, 0, .none, .none }, - .{ .cmovl, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4c }, 0, .long, .none }, - .{ .cmovle, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4e }, 0, .none, .none }, - .{ .cmovle, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4e }, 0, .none, .none }, - .{ .cmovle, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4e }, 0, .long, .none }, - .{ .cmovna, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x46 }, 0, .none, .none }, - .{ .cmovna, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x46 }, 0, .none, .none }, - .{ .cmovna, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x46 }, 0, .long, .none }, - .{ .cmovnae, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x42 }, 0, .none, .none }, - .{ .cmovnae, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x42 }, 0, .none, .none }, - .{ .cmovnae, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x42 }, 0, .long, .none }, - .{ .cmovnb, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x43 }, 0, .none, .none }, - .{ .cmovnb, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x43 }, 0, .none, .none }, - .{ .cmovnb, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x43 }, 0, .long, .none }, - .{ .cmovnbe, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x47 }, 0, .none, .none }, - .{ .cmovnbe, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x47 }, 0, .none, .none }, - .{ .cmovnbe, 
.rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x47 }, 0, .long, .none }, - .{ .cmovnc, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x43 }, 0, .none, .none }, - .{ .cmovnc, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x43 }, 0, .none, .none }, - .{ .cmovnc, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x43 }, 0, .long, .none }, - .{ .cmovne, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x45 }, 0, .none, .none }, - .{ .cmovne, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x45 }, 0, .none, .none }, - .{ .cmovne, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x45 }, 0, .long, .none }, - .{ .cmovng, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4e }, 0, .none, .none }, - .{ .cmovng, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4e }, 0, .none, .none }, - .{ .cmovng, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4e }, 0, .long, .none }, - .{ .cmovnge, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4c }, 0, .none, .none }, - .{ .cmovnge, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4c }, 0, .none, .none }, - .{ .cmovnge, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4c }, 0, .long, .none }, - .{ .cmovnl, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4d }, 0, .none, .none }, - .{ .cmovnl, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4d }, 0, .none, .none }, - .{ .cmovnl, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4d }, 0, .long, .none }, - .{ .cmovnle, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4f }, 0, .none, .none }, - .{ .cmovnle, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4f }, 0, .none, .none }, - .{ .cmovnle, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4f }, 0, .long, .none }, - .{ .cmovno, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x41 }, 0, .none, .none }, - .{ .cmovno, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x41 }, 0, .none, .none }, - .{ .cmovno, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x41 }, 0, .long, .none }, - .{ .cmovnp, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4b }, 0, .none, .none }, - .{ .cmovnp, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4b }, 0, .none, .none }, - .{ .cmovnp, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4b }, 0, .long, .none }, - .{ .cmovns, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x49 }, 0, .none, .none }, - .{ .cmovns, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x49 }, 0, .none, .none }, - .{ .cmovns, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x49 }, 0, .long, .none }, - .{ .cmovnz, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x45 }, 0, .none, .none }, - .{ .cmovnz, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x45 }, 0, .none, .none }, - .{ .cmovnz, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x45 }, 0, .long, .none }, - .{ .cmovo, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x40 }, 0, .none, .none }, - .{ .cmovo, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x40 }, 0, .none, .none }, - .{ .cmovo, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x40 }, 0, .long, .none }, - .{ .cmovp, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4a }, 0, .none, .none }, - .{ .cmovp, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4a }, 0, .none, .none }, - .{ .cmovp, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4a }, 0, .long, .none }, - .{ .cmovpe, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4a }, 0, .none, .none }, - .{ .cmovpe, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4a }, 0, .none, .none }, - .{ .cmovpe, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4a }, 0, .long, .none }, - .{ .cmovpo, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4b }, 0, .none, .none }, - .{ .cmovpo, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4b }, 0, .none, .none }, - .{ .cmovpo, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4b }, 0, .long, .none }, - .{ .cmovs, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x48 }, 0, .none, .none }, - .{ .cmovs, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x48 }, 0, .none, .none }, - .{ .cmovs, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x48 }, 0, .long, .none }, - .{ .cmovz, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x44 }, 0, .none, .none }, - .{ .cmovz, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 
0x44 }, 0, .none, .none }, - .{ .cmovz, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x44 }, 0, .long, .none }, - - .{ .cmp, .zi, &.{ .al, .imm8 }, &.{ 0x3c }, 0, .none, .none }, - .{ .cmp, .zi, &.{ .ax, .imm16 }, &.{ 0x3d }, 0, .none, .none }, - .{ .cmp, .zi, &.{ .eax, .imm32 }, &.{ 0x3d }, 0, .none, .none }, - .{ .cmp, .zi, &.{ .rax, .imm32s }, &.{ 0x3d }, 0, .long, .none }, - .{ .cmp, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 7, .none, .none }, - .{ .cmp, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 7, .rex, .none }, - .{ .cmp, .mi, &.{ .rm16, .imm16 }, &.{ 0x81 }, 7, .none, .none }, - .{ .cmp, .mi, &.{ .rm32, .imm32 }, &.{ 0x81 }, 7, .none, .none }, - .{ .cmp, .mi, &.{ .rm64, .imm32s }, &.{ 0x81 }, 7, .long, .none }, - .{ .cmp, .mi, &.{ .rm16, .imm8s }, &.{ 0x83 }, 7, .none, .none }, - .{ .cmp, .mi, &.{ .rm32, .imm8s }, &.{ 0x83 }, 7, .none, .none }, - .{ .cmp, .mi, &.{ .rm64, .imm8s }, &.{ 0x83 }, 7, .long, .none }, - .{ .cmp, .mr, &.{ .rm8, .r8 }, &.{ 0x38 }, 0, .none, .none }, - .{ .cmp, .mr, &.{ .rm8, .r8 }, &.{ 0x38 }, 0, .rex, .none }, - .{ .cmp, .mr, &.{ .rm16, .r16 }, &.{ 0x39 }, 0, .none, .none }, - .{ .cmp, .mr, &.{ .rm32, .r32 }, &.{ 0x39 }, 0, .none, .none }, - .{ .cmp, .mr, &.{ .rm64, .r64 }, &.{ 0x39 }, 0, .long, .none }, - .{ .cmp, .rm, &.{ .r8, .rm8 }, &.{ 0x3a }, 0, .none, .none }, - .{ .cmp, .rm, &.{ .r8, .rm8 }, &.{ 0x3a }, 0, .rex, .none }, - .{ .cmp, .rm, &.{ .r16, .rm16 }, &.{ 0x3b }, 0, .none, .none }, - .{ .cmp, .rm, &.{ .r32, .rm32 }, &.{ 0x3b }, 0, .none, .none }, - .{ .cmp, .rm, &.{ .r64, .rm64 }, &.{ 0x3b }, 0, .long, .none }, - - .{ .cmps, .np, &.{ .m8, .m8 }, &.{ 0xa6 }, 0, .none, .none }, - .{ .cmps, .np, &.{ .m16, .m16 }, &.{ 0xa7 }, 0, .none, .none }, - .{ .cmps, .np, &.{ .m32, .m32 }, &.{ 0xa7 }, 0, .none, .none }, - .{ .cmps, .np, &.{ .m64, .m64 }, &.{ 0xa7 }, 0, .long, .none }, + .{ .cbw, .np, &.{ .o16 }, &.{ 0x98 }, 0, .short, .none }, + .{ .cwde, .np, &.{ .o32 }, &.{ 0x98 }, 0, .none, .none }, + .{ .cdqe, .np, &.{ .o64 }, &.{ 0x98 }, 0, .long, .none }, + + .{ .cwd, .np, &.{ .o16 }, &.{ 0x99 }, 0, .short, .none }, + .{ .cdq, .np, &.{ .o32 }, &.{ 0x99 }, 0, .none, .none }, + .{ .cqo, .np, &.{ .o64 }, &.{ 0x99 }, 0, .long, .none }, + + .{ .cmova, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x47 }, 0, .short, .none }, + .{ .cmova, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x47 }, 0, .none, .none }, + .{ .cmova, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x47 }, 0, .long, .none }, + .{ .cmovae, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x43 }, 0, .short, .none }, + .{ .cmovae, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x43 }, 0, .none, .none }, + .{ .cmovae, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x43 }, 0, .long, .none }, + .{ .cmovb, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x42 }, 0, .short, .none }, + .{ .cmovb, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x42 }, 0, .none, .none }, + .{ .cmovb, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x42 }, 0, .long, .none }, + .{ .cmovbe, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x46 }, 0, .short, .none }, + .{ .cmovbe, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x46 }, 0, .none, .none }, + .{ .cmovbe, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x46 }, 0, .long, .none }, + .{ .cmovc, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x42 }, 0, .short, .none }, + .{ .cmovc, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x42 }, 0, .none, .none }, + .{ .cmovc, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x42 }, 0, .long, .none }, + .{ .cmove, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x44 }, 0, .short, .none }, + .{ .cmove, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x44 }, 0, .none, .none }, + .{ .cmove, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x44 }, 0, .long, .none }, + .{ .cmovg, .rm, &.{ 
.r16, .rm16 }, &.{ 0x0f, 0x4f }, 0, .short, .none }, + .{ .cmovg, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4f }, 0, .none, .none }, + .{ .cmovg, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4f }, 0, .long, .none }, + .{ .cmovge, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4d }, 0, .short, .none }, + .{ .cmovge, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4d }, 0, .none, .none }, + .{ .cmovge, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4d }, 0, .long, .none }, + .{ .cmovl, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4c }, 0, .short, .none }, + .{ .cmovl, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4c }, 0, .none, .none }, + .{ .cmovl, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4c }, 0, .long, .none }, + .{ .cmovle, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4e }, 0, .short, .none }, + .{ .cmovle, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4e }, 0, .none, .none }, + .{ .cmovle, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4e }, 0, .long, .none }, + .{ .cmovna, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x46 }, 0, .short, .none }, + .{ .cmovna, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x46 }, 0, .none, .none }, + .{ .cmovna, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x46 }, 0, .long, .none }, + .{ .cmovnae, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x42 }, 0, .short, .none }, + .{ .cmovnae, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x42 }, 0, .none, .none }, + .{ .cmovnae, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x42 }, 0, .long, .none }, + .{ .cmovnb, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x43 }, 0, .short, .none }, + .{ .cmovnb, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x43 }, 0, .none, .none }, + .{ .cmovnb, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x43 }, 0, .long, .none }, + .{ .cmovnbe, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x47 }, 0, .short, .none }, + .{ .cmovnbe, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x47 }, 0, .none, .none }, + .{ .cmovnbe, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x47 }, 0, .long, .none }, + .{ .cmovnc, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x43 }, 0, .short, .none }, + .{ .cmovnc, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x43 }, 0, .none, .none }, + .{ .cmovnc, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x43 }, 0, .long, .none }, + .{ .cmovne, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x45 }, 0, .short, .none }, + .{ .cmovne, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x45 }, 0, .none, .none }, + .{ .cmovne, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x45 }, 0, .long, .none }, + .{ .cmovng, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4e }, 0, .short, .none }, + .{ .cmovng, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4e }, 0, .none, .none }, + .{ .cmovng, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4e }, 0, .long, .none }, + .{ .cmovnge, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4c }, 0, .short, .none }, + .{ .cmovnge, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4c }, 0, .none, .none }, + .{ .cmovnge, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4c }, 0, .long, .none }, + .{ .cmovnl, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4d }, 0, .short, .none }, + .{ .cmovnl, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4d }, 0, .none, .none }, + .{ .cmovnl, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4d }, 0, .long, .none }, + .{ .cmovnle, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4f }, 0, .short, .none }, + .{ .cmovnle, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4f }, 0, .none, .none }, + .{ .cmovnle, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4f }, 0, .long, .none }, + .{ .cmovno, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x41 }, 0, .short, .none }, + .{ .cmovno, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x41 }, 0, .none, .none }, + .{ .cmovno, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x41 }, 0, .long, .none }, + .{ .cmovnp, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4b }, 0, .short, .none }, + .{ .cmovnp, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4b }, 0, .none, .none }, + .{ .cmovnp, .rm, &.{ .r64, 
.rm64 }, &.{ 0x0f, 0x4b }, 0, .long, .none }, + .{ .cmovns, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x49 }, 0, .short, .none }, + .{ .cmovns, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x49 }, 0, .none, .none }, + .{ .cmovns, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x49 }, 0, .long, .none }, + .{ .cmovnz, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x45 }, 0, .short, .none }, + .{ .cmovnz, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x45 }, 0, .none, .none }, + .{ .cmovnz, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x45 }, 0, .long, .none }, + .{ .cmovo, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x40 }, 0, .short, .none }, + .{ .cmovo, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x40 }, 0, .none, .none }, + .{ .cmovo, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x40 }, 0, .long, .none }, + .{ .cmovp, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4a }, 0, .short, .none }, + .{ .cmovp, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4a }, 0, .none, .none }, + .{ .cmovp, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4a }, 0, .long, .none }, + .{ .cmovpe, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4a }, 0, .short, .none }, + .{ .cmovpe, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4a }, 0, .none, .none }, + .{ .cmovpe, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4a }, 0, .long, .none }, + .{ .cmovpo, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x4b }, 0, .short, .none }, + .{ .cmovpo, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x4b }, 0, .none, .none }, + .{ .cmovpo, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x4b }, 0, .long, .none }, + .{ .cmovs, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x48 }, 0, .short, .none }, + .{ .cmovs, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x48 }, 0, .none, .none }, + .{ .cmovs, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x48 }, 0, .long, .none }, + .{ .cmovz, .rm, &.{ .r16, .rm16 }, &.{ 0x0f, 0x44 }, 0, .short, .none }, + .{ .cmovz, .rm, &.{ .r32, .rm32 }, &.{ 0x0f, 0x44 }, 0, .none, .none }, + .{ .cmovz, .rm, &.{ .r64, .rm64 }, &.{ 0x0f, 0x44 }, 0, .long, .none }, + + .{ .cmp, .zi, &.{ .al, .imm8 }, &.{ 0x3c }, 0, .none, .none }, + .{ .cmp, .zi, &.{ .ax, .imm16 }, &.{ 0x3d }, 0, .short, .none }, + .{ .cmp, .zi, &.{ .eax, .imm32 }, &.{ 0x3d }, 0, .none, .none }, + .{ .cmp, .zi, &.{ .rax, .imm32s }, &.{ 0x3d }, 0, .long, .none }, + .{ .cmp, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 7, .none, .none }, + .{ .cmp, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 7, .rex, .none }, + .{ .cmp, .mi, &.{ .rm16, .imm16 }, &.{ 0x81 }, 7, .short, .none }, + .{ .cmp, .mi, &.{ .rm32, .imm32 }, &.{ 0x81 }, 7, .none, .none }, + .{ .cmp, .mi, &.{ .rm64, .imm32s }, &.{ 0x81 }, 7, .long, .none }, + .{ .cmp, .mi, &.{ .rm16, .imm8s }, &.{ 0x83 }, 7, .short, .none }, + .{ .cmp, .mi, &.{ .rm32, .imm8s }, &.{ 0x83 }, 7, .none, .none }, + .{ .cmp, .mi, &.{ .rm64, .imm8s }, &.{ 0x83 }, 7, .long, .none }, + .{ .cmp, .mr, &.{ .rm8, .r8 }, &.{ 0x38 }, 0, .none, .none }, + .{ .cmp, .mr, &.{ .rm8, .r8 }, &.{ 0x38 }, 0, .rex, .none }, + .{ .cmp, .mr, &.{ .rm16, .r16 }, &.{ 0x39 }, 0, .short, .none }, + .{ .cmp, .mr, &.{ .rm32, .r32 }, &.{ 0x39 }, 0, .none, .none }, + .{ .cmp, .mr, &.{ .rm64, .r64 }, &.{ 0x39 }, 0, .long, .none }, + .{ .cmp, .rm, &.{ .r8, .rm8 }, &.{ 0x3a }, 0, .none, .none }, + .{ .cmp, .rm, &.{ .r8, .rm8 }, &.{ 0x3a }, 0, .rex, .none }, + .{ .cmp, .rm, &.{ .r16, .rm16 }, &.{ 0x3b }, 0, .short, .none }, + .{ .cmp, .rm, &.{ .r32, .rm32 }, &.{ 0x3b }, 0, .none, .none }, + .{ .cmp, .rm, &.{ .r64, .rm64 }, &.{ 0x3b }, 0, .long, .none }, + + .{ .cmps, .np, &.{ .m8, .m8 }, &.{ 0xa6 }, 0, .none, .none }, + .{ .cmps, .np, &.{ .m16, .m16 }, &.{ 0xa7 }, 0, .short, .none }, + .{ .cmps, .np, &.{ .m32, .m32 }, &.{ 0xa7 }, 0, .none, .none }, + .{ .cmps, .np, &.{ .m64, .m64 }, &.{ 0xa7 }, 0, 
.long, .none }, .{ .cmpsb, .np, &.{}, &.{ 0xa6 }, 0, .none, .none }, .{ .cmpsw, .np, &.{}, &.{ 0xa7 }, 0, .short, .none }, .{ .cmpsd, .np, &.{}, &.{ 0xa7 }, 0, .none, .none }, .{ .cmpsq, .np, &.{}, &.{ 0xa7 }, 0, .long, .none }, - .{ .cmpxchg, .mr, &.{ .rm8, .r8 }, &.{ 0x0f, 0xb0 }, 0, .none, .none }, - .{ .cmpxchg, .mr, &.{ .rm8, .r8 }, &.{ 0x0f, 0xb0 }, 0, .rex, .none }, - .{ .cmpxchg, .mr, &.{ .rm16, .r16 }, &.{ 0x0f, 0xb1 }, 0, .none, .none }, - .{ .cmpxchg, .mr, &.{ .rm32, .r32 }, &.{ 0x0f, 0xb1 }, 0, .none, .none }, - .{ .cmpxchg, .mr, &.{ .rm64, .r64 }, &.{ 0x0f, 0xb1 }, 0, .long, .none }, + .{ .cmpxchg, .mr, &.{ .rm8, .r8 }, &.{ 0x0f, 0xb0 }, 0, .none, .none }, + .{ .cmpxchg, .mr, &.{ .rm8, .r8 }, &.{ 0x0f, 0xb0 }, 0, .rex, .none }, + .{ .cmpxchg, .mr, &.{ .rm16, .r16 }, &.{ 0x0f, 0xb1 }, 0, .short, .none }, + .{ .cmpxchg, .mr, &.{ .rm32, .r32 }, &.{ 0x0f, 0xb1 }, 0, .none, .none }, + .{ .cmpxchg, .mr, &.{ .rm64, .r64 }, &.{ 0x0f, 0xb1 }, 0, .long, .none }, .{ .cmpxchg8b, .m, &.{ .m64 }, &.{ 0x0f, 0xc7 }, 1, .none, .none }, .{ .cmpxchg16b, .m, &.{ .m128 }, &.{ 0x0f, 0xc7 }, 1, .long, .none }, - .{ .div, .m, &.{ .rm8 }, &.{ 0xf6 }, 6, .none, .none }, - .{ .div, .m, &.{ .rm8 }, &.{ 0xf6 }, 6, .rex, .none }, - .{ .div, .m, &.{ .rm16 }, &.{ 0xf7 }, 6, .none, .none }, - .{ .div, .m, &.{ .rm32 }, &.{ 0xf7 }, 6, .none, .none }, - .{ .div, .m, &.{ .rm64 }, &.{ 0xf7 }, 6, .long, .none }, + .{ .div, .m, &.{ .rm8 }, &.{ 0xf6 }, 6, .none, .none }, + .{ .div, .m, &.{ .rm8 }, &.{ 0xf6 }, 6, .rex, .none }, + .{ .div, .m, &.{ .rm16 }, &.{ 0xf7 }, 6, .short, .none }, + .{ .div, .m, &.{ .rm32 }, &.{ 0xf7 }, 6, .none, .none }, + .{ .div, .m, &.{ .rm64 }, &.{ 0xf7 }, 6, .long, .none }, .{ .fisttp, .m, &.{ .m16 }, &.{ 0xdf }, 1, .none, .x87 }, .{ .fisttp, .m, &.{ .m32 }, &.{ 0xdb }, 1, .none, .x87 }, @@ -280,26 +280,26 @@ pub const table = [_]Entry{ .{ .fld, .m, &.{ .m64 }, &.{ 0xdd }, 0, .none, .x87 }, .{ .fld, .m, &.{ .m80 }, &.{ 0xdb }, 5, .none, .x87 }, - .{ .idiv, .m, &.{ .rm8 }, &.{ 0xf6 }, 7, .none, .none }, - .{ .idiv, .m, &.{ .rm8 }, &.{ 0xf6 }, 7, .rex, .none }, - .{ .idiv, .m, &.{ .rm16 }, &.{ 0xf7 }, 7, .none, .none }, - .{ .idiv, .m, &.{ .rm32 }, &.{ 0xf7 }, 7, .none, .none }, - .{ .idiv, .m, &.{ .rm64 }, &.{ 0xf7 }, 7, .long, .none }, - - .{ .imul, .m, &.{ .rm8 }, &.{ 0xf6 }, 5, .none, .none }, - .{ .imul, .m, &.{ .rm8 }, &.{ 0xf6 }, 5, .rex, .none }, - .{ .imul, .m, &.{ .rm16, }, &.{ 0xf7 }, 5, .none, .none }, - .{ .imul, .m, &.{ .rm32, }, &.{ 0xf7 }, 5, .none, .none }, - .{ .imul, .m, &.{ .rm64, }, &.{ 0xf7 }, 5, .long, .none }, - .{ .imul, .rm, &.{ .r16, .rm16, }, &.{ 0x0f, 0xaf }, 0, .none, .none }, - .{ .imul, .rm, &.{ .r32, .rm32, }, &.{ 0x0f, 0xaf }, 0, .none, .none }, - .{ .imul, .rm, &.{ .r64, .rm64, }, &.{ 0x0f, 0xaf }, 0, .long, .none }, - .{ .imul, .rmi, &.{ .r16, .rm16, .imm8s }, &.{ 0x6b }, 0, .none, .none }, - .{ .imul, .rmi, &.{ .r32, .rm32, .imm8s }, &.{ 0x6b }, 0, .none, .none }, - .{ .imul, .rmi, &.{ .r64, .rm64, .imm8s }, &.{ 0x6b }, 0, .long, .none }, - .{ .imul, .rmi, &.{ .r16, .rm16, .imm16 }, &.{ 0x69 }, 0, .none, .none }, - .{ .imul, .rmi, &.{ .r32, .rm32, .imm32 }, &.{ 0x69 }, 0, .none, .none }, - .{ .imul, .rmi, &.{ .r64, .rm64, .imm32 }, &.{ 0x69 }, 0, .long, .none }, + .{ .idiv, .m, &.{ .rm8 }, &.{ 0xf6 }, 7, .none, .none }, + .{ .idiv, .m, &.{ .rm8 }, &.{ 0xf6 }, 7, .rex, .none }, + .{ .idiv, .m, &.{ .rm16 }, &.{ 0xf7 }, 7, .short, .none }, + .{ .idiv, .m, &.{ .rm32 }, &.{ 0xf7 }, 7, .none, .none }, + .{ .idiv, .m, &.{ .rm64 }, &.{ 0xf7 }, 7, .long, .none }, 
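+    // Opcodes 0xf6/0xf7 form a ModRM-extension group: the /digit column selects the
+    // operation (/2 not, /3 neg, /4 mul, /5 imul, /6 div, /7 idiv).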
+ + .{ .imul, .m, &.{ .rm8 }, &.{ 0xf6 }, 5, .none, .none }, + .{ .imul, .m, &.{ .rm8 }, &.{ 0xf6 }, 5, .rex, .none }, + .{ .imul, .m, &.{ .rm16, }, &.{ 0xf7 }, 5, .short, .none }, + .{ .imul, .m, &.{ .rm32, }, &.{ 0xf7 }, 5, .none, .none }, + .{ .imul, .m, &.{ .rm64, }, &.{ 0xf7 }, 5, .long, .none }, + .{ .imul, .rm, &.{ .r16, .rm16, }, &.{ 0x0f, 0xaf }, 0, .short, .none }, + .{ .imul, .rm, &.{ .r32, .rm32, }, &.{ 0x0f, 0xaf }, 0, .none, .none }, + .{ .imul, .rm, &.{ .r64, .rm64, }, &.{ 0x0f, 0xaf }, 0, .long, .none }, + .{ .imul, .rmi, &.{ .r16, .rm16, .imm8s }, &.{ 0x6b }, 0, .short, .none }, + .{ .imul, .rmi, &.{ .r32, .rm32, .imm8s }, &.{ 0x6b }, 0, .none, .none }, + .{ .imul, .rmi, &.{ .r64, .rm64, .imm8s }, &.{ 0x6b }, 0, .long, .none }, + .{ .imul, .rmi, &.{ .r16, .rm16, .imm16 }, &.{ 0x69 }, 0, .short, .none }, + .{ .imul, .rmi, &.{ .r32, .rm32, .imm32 }, &.{ 0x69 }, 0, .none, .none }, + .{ .imul, .rmi, &.{ .r64, .rm64, .imm32 }, &.{ 0x69 }, 0, .long, .none }, .{ .int3, .np, &.{}, &.{ 0xcc }, 0, .none, .none }, @@ -338,281 +338,283 @@ pub const table = [_]Entry{ .{ .jmp, .d, &.{ .rel32 }, &.{ 0xe9 }, 0, .none, .none }, .{ .jmp, .m, &.{ .rm64 }, &.{ 0xff }, 4, .none, .none }, - .{ .lea, .rm, &.{ .r16, .m }, &.{ 0x8d }, 0, .none, .none }, - .{ .lea, .rm, &.{ .r32, .m }, &.{ 0x8d }, 0, .none, .none }, - .{ .lea, .rm, &.{ .r64, .m }, &.{ 0x8d }, 0, .long, .none }, + .{ .lea, .rm, &.{ .r16, .m }, &.{ 0x8d }, 0, .short, .none }, + .{ .lea, .rm, &.{ .r32, .m }, &.{ 0x8d }, 0, .none, .none }, + .{ .lea, .rm, &.{ .r64, .m }, &.{ 0x8d }, 0, .long, .none }, .{ .lfence, .np, &.{}, &.{ 0x0f, 0xae, 0xe8 }, 0, .none, .none }, - .{ .lods, .np, &.{ .m8 }, &.{ 0xac }, 0, .none, .none }, - .{ .lods, .np, &.{ .m16 }, &.{ 0xad }, 0, .none, .none }, - .{ .lods, .np, &.{ .m32 }, &.{ 0xad }, 0, .none, .none }, - .{ .lods, .np, &.{ .m64 }, &.{ 0xad }, 0, .long, .none }, + .{ .lods, .np, &.{ .m8 }, &.{ 0xac }, 0, .none, .none }, + .{ .lods, .np, &.{ .m16 }, &.{ 0xad }, 0, .short, .none }, + .{ .lods, .np, &.{ .m32 }, &.{ 0xad }, 0, .none, .none }, + .{ .lods, .np, &.{ .m64 }, &.{ 0xad }, 0, .long, .none }, .{ .lodsb, .np, &.{}, &.{ 0xac }, 0, .none, .none }, .{ .lodsw, .np, &.{}, &.{ 0xad }, 0, .short, .none }, .{ .lodsd, .np, &.{}, &.{ 0xad }, 0, .none, .none }, .{ .lodsq, .np, &.{}, &.{ 0xad }, 0, .long, .none }, - .{ .lzcnt, .rm, &.{ .r16, .rm16 }, &.{ 0xf3, 0x0f, 0xbd }, 0, .none, .none }, - .{ .lzcnt, .rm, &.{ .r32, .rm32 }, &.{ 0xf3, 0x0f, 0xbd }, 0, .none, .none }, - .{ .lzcnt, .rm, &.{ .r64, .rm64 }, &.{ 0xf3, 0x0f, 0xbd }, 0, .long, .none }, + .{ .lzcnt, .rm, &.{ .r16, .rm16 }, &.{ 0xf3, 0x0f, 0xbd }, 0, .short, .none }, + .{ .lzcnt, .rm, &.{ .r32, .rm32 }, &.{ 0xf3, 0x0f, 0xbd }, 0, .none, .none }, + .{ .lzcnt, .rm, &.{ .r64, .rm64 }, &.{ 0xf3, 0x0f, 0xbd }, 0, .long, .none }, .{ .mfence, .np, &.{}, &.{ 0x0f, 0xae, 0xf0 }, 0, .none, .none }, - .{ .mov, .mr, &.{ .rm8, .r8 }, &.{ 0x88 }, 0, .none, .none }, - .{ .mov, .mr, &.{ .rm8, .r8 }, &.{ 0x88 }, 0, .rex, .none }, - .{ .mov, .mr, &.{ .rm16, .r16 }, &.{ 0x89 }, 0, .none, .none }, - .{ .mov, .mr, &.{ .rm32, .r32 }, &.{ 0x89 }, 0, .none, .none }, - .{ .mov, .mr, &.{ .rm64, .r64 }, &.{ 0x89 }, 0, .long, .none }, - .{ .mov, .rm, &.{ .r8, .rm8 }, &.{ 0x8a }, 0, .none, .none }, - .{ .mov, .rm, &.{ .r8, .rm8 }, &.{ 0x8a }, 0, .rex, .none }, - .{ .mov, .rm, &.{ .r16, .rm16 }, &.{ 0x8b }, 0, .none, .none }, - .{ .mov, .rm, &.{ .r32, .rm32 }, &.{ 0x8b }, 0, .none, .none }, - .{ .mov, .rm, &.{ .r64, .rm64 }, &.{ 0x8b }, 0, .long, .none }, - .{ .mov, .mr, 
&.{ .rm16, .sreg }, &.{ 0x8c }, 0, .none, .none }, - .{ .mov, .mr, &.{ .rm64, .sreg }, &.{ 0x8c }, 0, .long, .none }, - .{ .mov, .rm, &.{ .sreg, .rm16 }, &.{ 0x8e }, 0, .none, .none }, - .{ .mov, .rm, &.{ .sreg, .rm64 }, &.{ 0x8e }, 0, .long, .none }, - .{ .mov, .fd, &.{ .al, .moffs }, &.{ 0xa0 }, 0, .none, .none }, - .{ .mov, .fd, &.{ .ax, .moffs }, &.{ 0xa1 }, 0, .none, .none }, - .{ .mov, .fd, &.{ .eax, .moffs }, &.{ 0xa1 }, 0, .none, .none }, - .{ .mov, .fd, &.{ .rax, .moffs }, &.{ 0xa1 }, 0, .long, .none }, - .{ .mov, .td, &.{ .moffs, .al }, &.{ 0xa2 }, 0, .none, .none }, - .{ .mov, .td, &.{ .moffs, .ax }, &.{ 0xa3 }, 0, .none, .none }, - .{ .mov, .td, &.{ .moffs, .eax }, &.{ 0xa3 }, 0, .none, .none }, - .{ .mov, .td, &.{ .moffs, .rax }, &.{ 0xa3 }, 0, .long, .none }, - .{ .mov, .oi, &.{ .r8, .imm8 }, &.{ 0xb0 }, 0, .none, .none }, - .{ .mov, .oi, &.{ .r8, .imm8 }, &.{ 0xb0 }, 0, .rex, .none }, - .{ .mov, .oi, &.{ .r16, .imm16 }, &.{ 0xb8 }, 0, .none, .none }, - .{ .mov, .oi, &.{ .r32, .imm32 }, &.{ 0xb8 }, 0, .none, .none }, - .{ .mov, .oi, &.{ .r64, .imm64 }, &.{ 0xb8 }, 0, .long, .none }, - .{ .mov, .mi, &.{ .rm8, .imm8 }, &.{ 0xc6 }, 0, .none, .none }, - .{ .mov, .mi, &.{ .rm8, .imm8 }, &.{ 0xc6 }, 0, .rex, .none }, - .{ .mov, .mi, &.{ .rm16, .imm16 }, &.{ 0xc7 }, 0, .none, .none }, - .{ .mov, .mi, &.{ .rm32, .imm32 }, &.{ 0xc7 }, 0, .none, .none }, - .{ .mov, .mi, &.{ .rm64, .imm32s }, &.{ 0xc7 }, 0, .long, .none }, - - .{ .movbe, .rm, &.{ .r16, .m16 }, &.{ 0x0f, 0x38, 0xf0 }, 0, .none, .none }, - .{ .movbe, .rm, &.{ .r32, .m32 }, &.{ 0x0f, 0x38, 0xf0 }, 0, .none, .none }, - .{ .movbe, .rm, &.{ .r64, .m64 }, &.{ 0x0f, 0x38, 0xf0 }, 0, .long, .none }, - .{ .movbe, .mr, &.{ .m16, .r16 }, &.{ 0x0f, 0x38, 0xf1 }, 0, .none, .none }, - .{ .movbe, .mr, &.{ .m32, .r32 }, &.{ 0x0f, 0x38, 0xf1 }, 0, .none, .none }, - .{ .movbe, .mr, &.{ .m64, .r64 }, &.{ 0x0f, 0x38, 0xf1 }, 0, .long, .none }, - - .{ .movs, .np, &.{ .m8, .m8 }, &.{ 0xa4 }, 0, .none, .none }, - .{ .movs, .np, &.{ .m16, .m16 }, &.{ 0xa5 }, 0, .none, .none }, - .{ .movs, .np, &.{ .m32, .m32 }, &.{ 0xa5 }, 0, .none, .none }, - .{ .movs, .np, &.{ .m64, .m64 }, &.{ 0xa5 }, 0, .long, .none }, + .{ .mov, .mr, &.{ .rm8, .r8 }, &.{ 0x88 }, 0, .none, .none }, + .{ .mov, .mr, &.{ .rm8, .r8 }, &.{ 0x88 }, 0, .rex, .none }, + .{ .mov, .mr, &.{ .rm16, .r16 }, &.{ 0x89 }, 0, .short, .none }, + .{ .mov, .mr, &.{ .rm32, .r32 }, &.{ 0x89 }, 0, .none, .none }, + .{ .mov, .mr, &.{ .rm64, .r64 }, &.{ 0x89 }, 0, .long, .none }, + .{ .mov, .rm, &.{ .r8, .rm8 }, &.{ 0x8a }, 0, .none, .none }, + .{ .mov, .rm, &.{ .r8, .rm8 }, &.{ 0x8a }, 0, .rex, .none }, + .{ .mov, .rm, &.{ .r16, .rm16 }, &.{ 0x8b }, 0, .short, .none }, + .{ .mov, .rm, &.{ .r32, .rm32 }, &.{ 0x8b }, 0, .none, .none }, + .{ .mov, .rm, &.{ .r64, .rm64 }, &.{ 0x8b }, 0, .long, .none }, + .{ .mov, .mr, &.{ .rm16, .sreg }, &.{ 0x8c }, 0, .short, .none }, + .{ .mov, .mr, &.{ .r32_m16, .sreg }, &.{ 0x8c }, 0, .none, .none }, + .{ .mov, .mr, &.{ .r64_m16, .sreg }, &.{ 0x8c }, 0, .long, .none }, + .{ .mov, .rm, &.{ .sreg, .rm16 }, &.{ 0x8e }, 0, .short, .none }, + .{ .mov, .rm, &.{ .sreg, .r32_m16 }, &.{ 0x8e }, 0, .none, .none }, + .{ .mov, .rm, &.{ .sreg, .r64_m16 }, &.{ 0x8e }, 0, .long, .none }, + .{ .mov, .fd, &.{ .al, .moffs }, &.{ 0xa0 }, 0, .none, .none }, + .{ .mov, .fd, &.{ .ax, .moffs }, &.{ 0xa1 }, 0, .none, .none }, + .{ .mov, .fd, &.{ .eax, .moffs }, &.{ 0xa1 }, 0, .none, .none }, + .{ .mov, .fd, &.{ .rax, .moffs }, &.{ 0xa1 }, 0, .long, .none }, + .{ .mov, .td, &.{ .moffs, 
.al }, &.{ 0xa2 }, 0, .none, .none }, + .{ .mov, .td, &.{ .moffs, .ax }, &.{ 0xa3 }, 0, .none, .none }, + .{ .mov, .td, &.{ .moffs, .eax }, &.{ 0xa3 }, 0, .none, .none }, + .{ .mov, .td, &.{ .moffs, .rax }, &.{ 0xa3 }, 0, .long, .none }, + .{ .mov, .oi, &.{ .r8, .imm8 }, &.{ 0xb0 }, 0, .none, .none }, + .{ .mov, .oi, &.{ .r8, .imm8 }, &.{ 0xb0 }, 0, .rex, .none }, + .{ .mov, .oi, &.{ .r16, .imm16 }, &.{ 0xb8 }, 0, .short, .none }, + .{ .mov, .oi, &.{ .r32, .imm32 }, &.{ 0xb8 }, 0, .none, .none }, + .{ .mov, .oi, &.{ .r64, .imm64 }, &.{ 0xb8 }, 0, .long, .none }, + .{ .mov, .mi, &.{ .rm8, .imm8 }, &.{ 0xc6 }, 0, .none, .none }, + .{ .mov, .mi, &.{ .rm8, .imm8 }, &.{ 0xc6 }, 0, .rex, .none }, + .{ .mov, .mi, &.{ .rm16, .imm16 }, &.{ 0xc7 }, 0, .short, .none }, + .{ .mov, .mi, &.{ .rm32, .imm32 }, &.{ 0xc7 }, 0, .none, .none }, + .{ .mov, .mi, &.{ .rm64, .imm32s }, &.{ 0xc7 }, 0, .long, .none }, + + .{ .movbe, .rm, &.{ .r16, .m16 }, &.{ 0x0f, 0x38, 0xf0 }, 0, .short, .none }, + .{ .movbe, .rm, &.{ .r32, .m32 }, &.{ 0x0f, 0x38, 0xf0 }, 0, .none, .none }, + .{ .movbe, .rm, &.{ .r64, .m64 }, &.{ 0x0f, 0x38, 0xf0 }, 0, .long, .none }, + .{ .movbe, .mr, &.{ .m16, .r16 }, &.{ 0x0f, 0x38, 0xf1 }, 0, .short, .none }, + .{ .movbe, .mr, &.{ .m32, .r32 }, &.{ 0x0f, 0x38, 0xf1 }, 0, .none, .none }, + .{ .movbe, .mr, &.{ .m64, .r64 }, &.{ 0x0f, 0x38, 0xf1 }, 0, .long, .none }, + + .{ .movs, .np, &.{ .m8, .m8 }, &.{ 0xa4 }, 0, .none, .none }, + .{ .movs, .np, &.{ .m16, .m16 }, &.{ 0xa5 }, 0, .short, .none }, + .{ .movs, .np, &.{ .m32, .m32 }, &.{ 0xa5 }, 0, .none, .none }, + .{ .movs, .np, &.{ .m64, .m64 }, &.{ 0xa5 }, 0, .long, .none }, .{ .movsb, .np, &.{}, &.{ 0xa4 }, 0, .none, .none }, .{ .movsw, .np, &.{}, &.{ 0xa5 }, 0, .short, .none }, .{ .movsd, .np, &.{}, &.{ 0xa5 }, 0, .none, .none }, .{ .movsq, .np, &.{}, &.{ 0xa5 }, 0, .long, .none }, - .{ .movsx, .rm, &.{ .r16, .rm8 }, &.{ 0x0f, 0xbe }, 0, .none, .none }, - .{ .movsx, .rm, &.{ .r16, .rm8 }, &.{ 0x0f, 0xbe }, 0, .rex, .none }, - .{ .movsx, .rm, &.{ .r32, .rm8 }, &.{ 0x0f, 0xbe }, 0, .none, .none }, - .{ .movsx, .rm, &.{ .r32, .rm8 }, &.{ 0x0f, 0xbe }, 0, .rex, .none }, - .{ .movsx, .rm, &.{ .r64, .rm8 }, &.{ 0x0f, 0xbe }, 0, .long, .none }, - .{ .movsx, .rm, &.{ .r32, .rm16 }, &.{ 0x0f, 0xbf }, 0, .none, .none }, - .{ .movsx, .rm, &.{ .r64, .rm16 }, &.{ 0x0f, 0xbf }, 0, .long, .none }, + .{ .movsx, .rm, &.{ .r16, .rm8 }, &.{ 0x0f, 0xbe }, 0, .short, .none }, + .{ .movsx, .rm, &.{ .r16, .rm8 }, &.{ 0x0f, 0xbe }, 0, .rex_short, .none }, + .{ .movsx, .rm, &.{ .r32, .rm8 }, &.{ 0x0f, 0xbe }, 0, .none, .none }, + .{ .movsx, .rm, &.{ .r32, .rm8 }, &.{ 0x0f, 0xbe }, 0, .rex, .none }, + .{ .movsx, .rm, &.{ .r64, .rm8 }, &.{ 0x0f, 0xbe }, 0, .long, .none }, + .{ .movsx, .rm, &.{ .r32, .rm16 }, &.{ 0x0f, 0xbf }, 0, .none, .none }, + .{ .movsx, .rm, &.{ .r64, .rm16 }, &.{ 0x0f, 0xbf }, 0, .long, .none }, // This instruction is discouraged. 
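    // Without REX.W, movsxd r32, rm32 copies the low 32 bits exactly like a plain mov;
    // only the .long form below performs a useful sign extension.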
.{ .movsxd, .rm, &.{ .r32, .rm32 }, &.{ 0x63 }, 0, .none, .none }, .{ .movsxd, .rm, &.{ .r64, .rm32 }, &.{ 0x63 }, 0, .long, .none }, - .{ .movzx, .rm, &.{ .r16, .rm8 }, &.{ 0x0f, 0xb6 }, 0, .none, .none }, - .{ .movzx, .rm, &.{ .r32, .rm8 }, &.{ 0x0f, 0xb6 }, 0, .none, .none }, - .{ .movzx, .rm, &.{ .r64, .rm8 }, &.{ 0x0f, 0xb6 }, 0, .long, .none }, - .{ .movzx, .rm, &.{ .r32, .rm16 }, &.{ 0x0f, 0xb7 }, 0, .none, .none }, - .{ .movzx, .rm, &.{ .r64, .rm16 }, &.{ 0x0f, 0xb7 }, 0, .long, .none }, + .{ .movzx, .rm, &.{ .r16, .rm8 }, &.{ 0x0f, 0xb6 }, 0, .short, .none }, + .{ .movzx, .rm, &.{ .r32, .rm8 }, &.{ 0x0f, 0xb6 }, 0, .none, .none }, + .{ .movzx, .rm, &.{ .r64, .rm8 }, &.{ 0x0f, 0xb6 }, 0, .long, .none }, + .{ .movzx, .rm, &.{ .r32, .rm16 }, &.{ 0x0f, 0xb7 }, 0, .none, .none }, + .{ .movzx, .rm, &.{ .r64, .rm16 }, &.{ 0x0f, 0xb7 }, 0, .long, .none }, - .{ .mul, .m, &.{ .rm8 }, &.{ 0xf6 }, 4, .none, .none }, - .{ .mul, .m, &.{ .rm8 }, &.{ 0xf6 }, 4, .rex, .none }, - .{ .mul, .m, &.{ .rm16 }, &.{ 0xf7 }, 4, .none, .none }, - .{ .mul, .m, &.{ .rm32 }, &.{ 0xf7 }, 4, .none, .none }, - .{ .mul, .m, &.{ .rm64 }, &.{ 0xf7 }, 4, .long, .none }, + .{ .mul, .m, &.{ .rm8 }, &.{ 0xf6 }, 4, .none, .none }, + .{ .mul, .m, &.{ .rm8 }, &.{ 0xf6 }, 4, .rex, .none }, + .{ .mul, .m, &.{ .rm16 }, &.{ 0xf7 }, 4, .short, .none }, + .{ .mul, .m, &.{ .rm32 }, &.{ 0xf7 }, 4, .none, .none }, + .{ .mul, .m, &.{ .rm64 }, &.{ 0xf7 }, 4, .long, .none }, - .{ .neg, .m, &.{ .rm8 }, &.{ 0xf6 }, 3, .none, .none }, - .{ .neg, .m, &.{ .rm8 }, &.{ 0xf6 }, 3, .rex, .none }, - .{ .neg, .m, &.{ .rm16 }, &.{ 0xf7 }, 3, .none, .none }, - .{ .neg, .m, &.{ .rm32 }, &.{ 0xf7 }, 3, .none, .none }, - .{ .neg, .m, &.{ .rm64 }, &.{ 0xf7 }, 3, .long, .none }, + .{ .neg, .m, &.{ .rm8 }, &.{ 0xf6 }, 3, .none, .none }, + .{ .neg, .m, &.{ .rm8 }, &.{ 0xf6 }, 3, .rex, .none }, + .{ .neg, .m, &.{ .rm16 }, &.{ 0xf7 }, 3, .short, .none }, + .{ .neg, .m, &.{ .rm32 }, &.{ 0xf7 }, 3, .none, .none }, + .{ .neg, .m, &.{ .rm64 }, &.{ 0xf7 }, 3, .long, .none }, .{ .nop, .np, &.{}, &.{ 0x90 }, 0, .none, .none }, - .{ .not, .m, &.{ .rm8 }, &.{ 0xf6 }, 2, .none, .none }, - .{ .not, .m, &.{ .rm8 }, &.{ 0xf6 }, 2, .rex, .none }, - .{ .not, .m, &.{ .rm16 }, &.{ 0xf7 }, 2, .none, .none }, - .{ .not, .m, &.{ .rm32 }, &.{ 0xf7 }, 2, .none, .none }, - .{ .not, .m, &.{ .rm64 }, &.{ 0xf7 }, 2, .long, .none }, - - .{ .@"or", .zi, &.{ .al, .imm8 }, &.{ 0x0c }, 0, .none, .none }, - .{ .@"or", .zi, &.{ .ax, .imm16 }, &.{ 0x0d }, 0, .none, .none }, - .{ .@"or", .zi, &.{ .eax, .imm32 }, &.{ 0x0d }, 0, .none, .none }, - .{ .@"or", .zi, &.{ .rax, .imm32s }, &.{ 0x0d }, 0, .long, .none }, - .{ .@"or", .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 1, .none, .none }, - .{ .@"or", .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 1, .rex, .none }, - .{ .@"or", .mi, &.{ .rm16, .imm16 }, &.{ 0x81 }, 1, .none, .none }, - .{ .@"or", .mi, &.{ .rm32, .imm32 }, &.{ 0x81 }, 1, .none, .none }, - .{ .@"or", .mi, &.{ .rm64, .imm32s }, &.{ 0x81 }, 1, .long, .none }, - .{ .@"or", .mi, &.{ .rm16, .imm8s }, &.{ 0x83 }, 1, .none, .none }, - .{ .@"or", .mi, &.{ .rm32, .imm8s }, &.{ 0x83 }, 1, .none, .none }, - .{ .@"or", .mi, &.{ .rm64, .imm8s }, &.{ 0x83 }, 1, .long, .none }, - .{ .@"or", .mr, &.{ .rm8, .r8 }, &.{ 0x08 }, 0, .none, .none }, - .{ .@"or", .mr, &.{ .rm8, .r8 }, &.{ 0x08 }, 0, .rex, .none }, - .{ .@"or", .mr, &.{ .rm16, .r16 }, &.{ 0x09 }, 0, .none, .none }, - .{ .@"or", .mr, &.{ .rm32, .r32 }, &.{ 0x09 }, 0, .none, .none }, - .{ .@"or", .mr, &.{ .rm64, .r64 }, &.{ 0x09 }, 0, .long, .none }, - .{ 
.@"or", .rm, &.{ .r8, .rm8 }, &.{ 0x0a }, 0, .none, .none }, - .{ .@"or", .rm, &.{ .r8, .rm8 }, &.{ 0x0a }, 0, .rex, .none }, - .{ .@"or", .rm, &.{ .r16, .rm16 }, &.{ 0x0b }, 0, .none, .none }, - .{ .@"or", .rm, &.{ .r32, .rm32 }, &.{ 0x0b }, 0, .none, .none }, - .{ .@"or", .rm, &.{ .r64, .rm64 }, &.{ 0x0b }, 0, .long, .none }, - - .{ .pop, .o, &.{ .r16 }, &.{ 0x58 }, 0, .none, .none }, - .{ .pop, .o, &.{ .r64 }, &.{ 0x58 }, 0, .none, .none }, - .{ .pop, .m, &.{ .rm16 }, &.{ 0x8f }, 0, .none, .none }, - .{ .pop, .m, &.{ .rm64 }, &.{ 0x8f }, 0, .none, .none }, - - .{ .popcnt, .rm, &.{ .r16, .rm16 }, &.{ 0xf3, 0x0f, 0xb8 }, 0, .none, .none }, - .{ .popcnt, .rm, &.{ .r32, .rm32 }, &.{ 0xf3, 0x0f, 0xb8 }, 0, .none, .none }, - .{ .popcnt, .rm, &.{ .r64, .rm64 }, &.{ 0xf3, 0x0f, 0xb8 }, 0, .long, .none }, - - .{ .push, .o, &.{ .r16 }, &.{ 0x50 }, 0, .none, .none }, - .{ .push, .o, &.{ .r64 }, &.{ 0x50 }, 0, .none, .none }, - .{ .push, .m, &.{ .rm16 }, &.{ 0xff }, 6, .none, .none }, - .{ .push, .m, &.{ .rm64 }, &.{ 0xff }, 6, .none, .none }, - .{ .push, .i, &.{ .imm8 }, &.{ 0x6a }, 0, .none, .none }, - .{ .push, .i, &.{ .imm16 }, &.{ 0x68 }, 0, .none, .none }, - .{ .push, .i, &.{ .imm32 }, &.{ 0x68 }, 0, .none, .none }, + .{ .not, .m, &.{ .rm8 }, &.{ 0xf6 }, 2, .none, .none }, + .{ .not, .m, &.{ .rm8 }, &.{ 0xf6 }, 2, .rex, .none }, + .{ .not, .m, &.{ .rm16 }, &.{ 0xf7 }, 2, .short, .none }, + .{ .not, .m, &.{ .rm32 }, &.{ 0xf7 }, 2, .none, .none }, + .{ .not, .m, &.{ .rm64 }, &.{ 0xf7 }, 2, .long, .none }, + + .{ .@"or", .zi, &.{ .al, .imm8 }, &.{ 0x0c }, 0, .none, .none }, + .{ .@"or", .zi, &.{ .ax, .imm16 }, &.{ 0x0d }, 0, .short, .none }, + .{ .@"or", .zi, &.{ .eax, .imm32 }, &.{ 0x0d }, 0, .none, .none }, + .{ .@"or", .zi, &.{ .rax, .imm32s }, &.{ 0x0d }, 0, .long, .none }, + .{ .@"or", .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 1, .none, .none }, + .{ .@"or", .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 1, .rex, .none }, + .{ .@"or", .mi, &.{ .rm16, .imm16 }, &.{ 0x81 }, 1, .short, .none }, + .{ .@"or", .mi, &.{ .rm32, .imm32 }, &.{ 0x81 }, 1, .none, .none }, + .{ .@"or", .mi, &.{ .rm64, .imm32s }, &.{ 0x81 }, 1, .long, .none }, + .{ .@"or", .mi, &.{ .rm16, .imm8s }, &.{ 0x83 }, 1, .short, .none }, + .{ .@"or", .mi, &.{ .rm32, .imm8s }, &.{ 0x83 }, 1, .none, .none }, + .{ .@"or", .mi, &.{ .rm64, .imm8s }, &.{ 0x83 }, 1, .long, .none }, + .{ .@"or", .mr, &.{ .rm8, .r8 }, &.{ 0x08 }, 0, .none, .none }, + .{ .@"or", .mr, &.{ .rm8, .r8 }, &.{ 0x08 }, 0, .rex, .none }, + .{ .@"or", .mr, &.{ .rm16, .r16 }, &.{ 0x09 }, 0, .short, .none }, + .{ .@"or", .mr, &.{ .rm32, .r32 }, &.{ 0x09 }, 0, .none, .none }, + .{ .@"or", .mr, &.{ .rm64, .r64 }, &.{ 0x09 }, 0, .long, .none }, + .{ .@"or", .rm, &.{ .r8, .rm8 }, &.{ 0x0a }, 0, .none, .none }, + .{ .@"or", .rm, &.{ .r8, .rm8 }, &.{ 0x0a }, 0, .rex, .none }, + .{ .@"or", .rm, &.{ .r16, .rm16 }, &.{ 0x0b }, 0, .short, .none }, + .{ .@"or", .rm, &.{ .r32, .rm32 }, &.{ 0x0b }, 0, .none, .none }, + .{ .@"or", .rm, &.{ .r64, .rm64 }, &.{ 0x0b }, 0, .long, .none }, + + .{ .pop, .o, &.{ .r16 }, &.{ 0x58 }, 0, .short, .none }, + .{ .pop, .o, &.{ .r64 }, &.{ 0x58 }, 0, .none, .none }, + .{ .pop, .m, &.{ .rm16 }, &.{ 0x8f }, 0, .short, .none }, + .{ .pop, .m, &.{ .rm64 }, &.{ 0x8f }, 0, .none, .none }, + + .{ .popcnt, .rm, &.{ .r16, .rm16 }, &.{ 0xf3, 0x0f, 0xb8 }, 0, .short, .none }, + .{ .popcnt, .rm, &.{ .r32, .rm32 }, &.{ 0xf3, 0x0f, 0xb8 }, 0, .none, .none }, + .{ .popcnt, .rm, &.{ .r64, .rm64 }, &.{ 0xf3, 0x0f, 0xb8 }, 0, .long, .none }, + + .{ .push, .o, &.{ .r16 }, &.{ 
0x50 }, 0, .short, .none }, + .{ .push, .o, &.{ .r64 }, &.{ 0x50 }, 0, .none, .none }, + .{ .push, .m, &.{ .rm16 }, &.{ 0xff }, 6, .short, .none }, + .{ .push, .m, &.{ .rm64 }, &.{ 0xff }, 6, .none, .none }, + .{ .push, .i, &.{ .imm8 }, &.{ 0x6a }, 0, .none, .none }, + .{ .push, .i, &.{ .imm16 }, &.{ 0x68 }, 0, .short, .none }, + .{ .push, .i, &.{ .imm32 }, &.{ 0x68 }, 0, .none, .none }, .{ .ret, .np, &.{}, &.{ 0xc3 }, 0, .none, .none }, - .{ .rcl, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 2, .none, .none }, - .{ .rcl, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 2, .rex, .none }, - .{ .rcl, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 2, .none, .none }, - .{ .rcl, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 2, .rex, .none }, - .{ .rcl, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 2, .none, .none }, - .{ .rcl, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 2, .rex, .none }, - .{ .rcl, .m1, &.{ .rm16, .unity }, &.{ 0xd1 }, 2, .none, .none }, - .{ .rcl, .mc, &.{ .rm16, .cl }, &.{ 0xd3 }, 2, .none, .none }, - .{ .rcl, .mi, &.{ .rm16, .imm8 }, &.{ 0xc1 }, 2, .none, .none }, - .{ .rcl, .m1, &.{ .rm32, .unity }, &.{ 0xd1 }, 2, .none, .none }, - .{ .rcl, .m1, &.{ .rm64, .unity }, &.{ 0xd1 }, 2, .long, .none }, - .{ .rcl, .mc, &.{ .rm32, .cl }, &.{ 0xd3 }, 2, .none, .none }, - .{ .rcl, .mc, &.{ .rm64, .cl }, &.{ 0xd3 }, 2, .long, .none }, - .{ .rcl, .mi, &.{ .rm32, .imm8 }, &.{ 0xc1 }, 2, .none, .none }, - .{ .rcl, .mi, &.{ .rm64, .imm8 }, &.{ 0xc1 }, 2, .long, .none }, - - .{ .rcr, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 3, .none, .none }, - .{ .rcr, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 3, .rex, .none }, - .{ .rcr, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 3, .none, .none }, - .{ .rcr, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 3, .rex, .none }, - .{ .rcr, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 3, .none, .none }, - .{ .rcr, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 3, .rex, .none }, - .{ .rcr, .m1, &.{ .rm16, .unity }, &.{ 0xd1 }, 3, .none, .none }, - .{ .rcr, .mc, &.{ .rm16, .cl }, &.{ 0xd3 }, 3, .none, .none }, - .{ .rcr, .mi, &.{ .rm16, .imm8 }, &.{ 0xc1 }, 3, .none, .none }, - .{ .rcr, .m1, &.{ .rm32, .unity }, &.{ 0xd1 }, 3, .none, .none }, - .{ .rcr, .m1, &.{ .rm64, .unity }, &.{ 0xd1 }, 3, .long, .none }, - .{ .rcr, .mc, &.{ .rm32, .cl }, &.{ 0xd3 }, 3, .none, .none }, - .{ .rcr, .mc, &.{ .rm64, .cl }, &.{ 0xd3 }, 3, .long, .none }, - .{ .rcr, .mi, &.{ .rm32, .imm8 }, &.{ 0xc1 }, 3, .none, .none }, - .{ .rcr, .mi, &.{ .rm64, .imm8 }, &.{ 0xc1 }, 3, .long, .none }, - - .{ .rol, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 0, .none, .none }, - .{ .rol, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 0, .rex, .none }, - .{ .rol, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 0, .none, .none }, - .{ .rol, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 0, .rex, .none }, - .{ .rol, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 0, .none, .none }, - .{ .rol, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 0, .rex, .none }, - .{ .rol, .m1, &.{ .rm16, .unity }, &.{ 0xd1 }, 0, .none, .none }, - .{ .rol, .mc, &.{ .rm16, .cl }, &.{ 0xd3 }, 0, .none, .none }, - .{ .rol, .mi, &.{ .rm16, .imm8 }, &.{ 0xc1 }, 0, .none, .none }, - .{ .rol, .m1, &.{ .rm32, .unity }, &.{ 0xd1 }, 0, .none, .none }, - .{ .rol, .m1, &.{ .rm64, .unity }, &.{ 0xd1 }, 0, .long, .none }, - .{ .rol, .mc, &.{ .rm32, .cl }, &.{ 0xd3 }, 0, .none, .none }, - .{ .rol, .mc, &.{ .rm64, .cl }, &.{ 0xd3 }, 0, .long, .none }, - .{ .rol, .mi, &.{ .rm32, .imm8 }, &.{ 0xc1 }, 0, .none, .none }, - .{ .rol, .mi, &.{ .rm64, .imm8 }, &.{ 0xc1 }, 0, .long, .none }, - - .{ .ror, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 1, .none, .none }, - .{ .ror, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 1, .rex, .none }, 
- .{ .ror, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 1, .none, .none }, - .{ .ror, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 1, .rex, .none }, - .{ .ror, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 1, .none, .none }, - .{ .ror, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 1, .rex, .none }, - .{ .ror, .m1, &.{ .rm16, .unity }, &.{ 0xd1 }, 1, .none, .none }, - .{ .ror, .mc, &.{ .rm16, .cl }, &.{ 0xd3 }, 1, .none, .none }, - .{ .ror, .mi, &.{ .rm16, .imm8 }, &.{ 0xc1 }, 1, .none, .none }, - .{ .ror, .m1, &.{ .rm32, .unity }, &.{ 0xd1 }, 1, .none, .none }, - .{ .ror, .m1, &.{ .rm64, .unity }, &.{ 0xd1 }, 1, .long, .none }, - .{ .ror, .mc, &.{ .rm32, .cl }, &.{ 0xd3 }, 1, .none, .none }, - .{ .ror, .mc, &.{ .rm64, .cl }, &.{ 0xd3 }, 1, .long, .none }, - .{ .ror, .mi, &.{ .rm32, .imm8 }, &.{ 0xc1 }, 1, .none, .none }, - .{ .ror, .mi, &.{ .rm64, .imm8 }, &.{ 0xc1 }, 1, .long, .none }, - - .{ .sal, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 4, .none, .none }, - .{ .sal, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 4, .rex, .none }, - .{ .sal, .m1, &.{ .rm16, .unity }, &.{ 0xd1 }, 4, .none, .none }, - .{ .sal, .m1, &.{ .rm32, .unity }, &.{ 0xd1 }, 4, .none, .none }, - .{ .sal, .m1, &.{ .rm64, .unity }, &.{ 0xd1 }, 4, .long, .none }, - .{ .sal, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 4, .none, .none }, - .{ .sal, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 4, .rex, .none }, - .{ .sal, .mc, &.{ .rm16, .cl }, &.{ 0xd3 }, 4, .none, .none }, - .{ .sal, .mc, &.{ .rm32, .cl }, &.{ 0xd3 }, 4, .none, .none }, - .{ .sal, .mc, &.{ .rm64, .cl }, &.{ 0xd3 }, 4, .long, .none }, - .{ .sal, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 4, .none, .none }, - .{ .sal, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 4, .rex, .none }, - .{ .sal, .mi, &.{ .rm16, .imm8 }, &.{ 0xc1 }, 4, .none, .none }, - .{ .sal, .mi, &.{ .rm32, .imm8 }, &.{ 0xc1 }, 4, .none, .none }, - .{ .sal, .mi, &.{ .rm64, .imm8 }, &.{ 0xc1 }, 4, .long, .none }, - - .{ .sar, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 7, .none, .none }, - .{ .sar, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 7, .rex, .none }, - .{ .sar, .m1, &.{ .rm16, .unity }, &.{ 0xd1 }, 7, .none, .none }, - .{ .sar, .m1, &.{ .rm32, .unity }, &.{ 0xd1 }, 7, .none, .none }, - .{ .sar, .m1, &.{ .rm64, .unity }, &.{ 0xd1 }, 7, .long, .none }, - .{ .sar, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 7, .none, .none }, - .{ .sar, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 7, .rex, .none }, - .{ .sar, .mc, &.{ .rm16, .cl }, &.{ 0xd3 }, 7, .none, .none }, - .{ .sar, .mc, &.{ .rm32, .cl }, &.{ 0xd3 }, 7, .none, .none }, - .{ .sar, .mc, &.{ .rm64, .cl }, &.{ 0xd3 }, 7, .long, .none }, - .{ .sar, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 7, .none, .none }, - .{ .sar, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 7, .rex, .none }, - .{ .sar, .mi, &.{ .rm16, .imm8 }, &.{ 0xc1 }, 7, .none, .none }, - .{ .sar, .mi, &.{ .rm32, .imm8 }, &.{ 0xc1 }, 7, .none, .none }, - .{ .sar, .mi, &.{ .rm64, .imm8 }, &.{ 0xc1 }, 7, .long, .none }, - - .{ .sbb, .zi, &.{ .al, .imm8 }, &.{ 0x1c }, 0, .none, .none }, - .{ .sbb, .zi, &.{ .ax, .imm16 }, &.{ 0x1d }, 0, .none, .none }, - .{ .sbb, .zi, &.{ .eax, .imm32 }, &.{ 0x1d }, 0, .none, .none }, - .{ .sbb, .zi, &.{ .rax, .imm32s }, &.{ 0x1d }, 0, .long, .none }, - .{ .sbb, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 3, .none, .none }, - .{ .sbb, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 3, .rex, .none }, - .{ .sbb, .mi, &.{ .rm16, .imm16 }, &.{ 0x81 }, 3, .none, .none }, - .{ .sbb, .mi, &.{ .rm32, .imm32 }, &.{ 0x81 }, 3, .none, .none }, - .{ .sbb, .mi, &.{ .rm64, .imm32s }, &.{ 0x81 }, 3, .long, .none }, - .{ .sbb, .mi, &.{ .rm16, .imm8s }, &.{ 0x83 }, 3, .none, .none }, - .{ .sbb, .mi, &.{ .rm32, .imm8s }, 
&.{ 0x83 }, 3, .none, .none }, - .{ .sbb, .mi, &.{ .rm64, .imm8s }, &.{ 0x83 }, 3, .long, .none }, - .{ .sbb, .mr, &.{ .rm8, .r8 }, &.{ 0x18 }, 0, .none, .none }, - .{ .sbb, .mr, &.{ .rm8, .r8 }, &.{ 0x18 }, 0, .rex, .none }, - .{ .sbb, .mr, &.{ .rm16, .r16 }, &.{ 0x19 }, 0, .none, .none }, - .{ .sbb, .mr, &.{ .rm32, .r32 }, &.{ 0x19 }, 0, .none, .none }, - .{ .sbb, .mr, &.{ .rm64, .r64 }, &.{ 0x19 }, 0, .long, .none }, - .{ .sbb, .rm, &.{ .r8, .rm8 }, &.{ 0x1a }, 0, .none, .none }, - .{ .sbb, .rm, &.{ .r8, .rm8 }, &.{ 0x1a }, 0, .rex, .none }, - .{ .sbb, .rm, &.{ .r16, .rm16 }, &.{ 0x1b }, 0, .none, .none }, - .{ .sbb, .rm, &.{ .r32, .rm32 }, &.{ 0x1b }, 0, .none, .none }, - .{ .sbb, .rm, &.{ .r64, .rm64 }, &.{ 0x1b }, 0, .long, .none }, - - .{ .scas, .np, &.{ .m8 }, &.{ 0xae }, 0, .none, .none }, - .{ .scas, .np, &.{ .m16 }, &.{ 0xaf }, 0, .none, .none }, - .{ .scas, .np, &.{ .m32 }, &.{ 0xaf }, 0, .none, .none }, - .{ .scas, .np, &.{ .m64 }, &.{ 0xaf }, 0, .long, .none }, + .{ .rcl, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 2, .none, .none }, + .{ .rcl, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 2, .rex, .none }, + .{ .rcl, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 2, .none, .none }, + .{ .rcl, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 2, .rex, .none }, + .{ .rcl, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 2, .none, .none }, + .{ .rcl, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 2, .rex, .none }, + .{ .rcl, .m1, &.{ .rm16, .unity }, &.{ 0xd1 }, 2, .short, .none }, + .{ .rcl, .mc, &.{ .rm16, .cl }, &.{ 0xd3 }, 2, .short, .none }, + .{ .rcl, .mi, &.{ .rm16, .imm8 }, &.{ 0xc1 }, 2, .short, .none }, + .{ .rcl, .m1, &.{ .rm32, .unity }, &.{ 0xd1 }, 2, .none, .none }, + .{ .rcl, .m1, &.{ .rm64, .unity }, &.{ 0xd1 }, 2, .long, .none }, + .{ .rcl, .mc, &.{ .rm32, .cl }, &.{ 0xd3 }, 2, .none, .none }, + .{ .rcl, .mc, &.{ .rm64, .cl }, &.{ 0xd3 }, 2, .long, .none }, + .{ .rcl, .mi, &.{ .rm32, .imm8 }, &.{ 0xc1 }, 2, .none, .none }, + .{ .rcl, .mi, &.{ .rm64, .imm8 }, &.{ 0xc1 }, 2, .long, .none }, + + .{ .rcr, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 3, .none, .none }, + .{ .rcr, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 3, .rex, .none }, + .{ .rcr, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 3, .none, .none }, + .{ .rcr, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 3, .rex, .none }, + .{ .rcr, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 3, .none, .none }, + .{ .rcr, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 3, .rex, .none }, + .{ .rcr, .m1, &.{ .rm16, .unity }, &.{ 0xd1 }, 3, .short, .none }, + .{ .rcr, .mc, &.{ .rm16, .cl }, &.{ 0xd3 }, 3, .short, .none }, + .{ .rcr, .mi, &.{ .rm16, .imm8 }, &.{ 0xc1 }, 3, .short, .none }, + .{ .rcr, .m1, &.{ .rm32, .unity }, &.{ 0xd1 }, 3, .none, .none }, + .{ .rcr, .m1, &.{ .rm64, .unity }, &.{ 0xd1 }, 3, .long, .none }, + .{ .rcr, .mc, &.{ .rm32, .cl }, &.{ 0xd3 }, 3, .none, .none }, + .{ .rcr, .mc, &.{ .rm64, .cl }, &.{ 0xd3 }, 3, .long, .none }, + .{ .rcr, .mi, &.{ .rm32, .imm8 }, &.{ 0xc1 }, 3, .none, .none }, + .{ .rcr, .mi, &.{ .rm64, .imm8 }, &.{ 0xc1 }, 3, .long, .none }, + + .{ .rol, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 0, .none, .none }, + .{ .rol, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 0, .rex, .none }, + .{ .rol, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 0, .none, .none }, + .{ .rol, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 0, .rex, .none }, + .{ .rol, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 0, .none, .none }, + .{ .rol, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 0, .rex, .none }, + .{ .rol, .m1, &.{ .rm16, .unity }, &.{ 0xd1 }, 0, .short, .none }, + .{ .rol, .mc, &.{ .rm16, .cl }, &.{ 0xd3 }, 0, .short, .none }, + .{ .rol, .mi, &.{ .rm16, .imm8 }, &.{ 0xc1 
}, 0, .short, .none }, + .{ .rol, .m1, &.{ .rm32, .unity }, &.{ 0xd1 }, 0, .none, .none }, + .{ .rol, .m1, &.{ .rm64, .unity }, &.{ 0xd1 }, 0, .long, .none }, + .{ .rol, .mc, &.{ .rm32, .cl }, &.{ 0xd3 }, 0, .none, .none }, + .{ .rol, .mc, &.{ .rm64, .cl }, &.{ 0xd3 }, 0, .long, .none }, + .{ .rol, .mi, &.{ .rm32, .imm8 }, &.{ 0xc1 }, 0, .none, .none }, + .{ .rol, .mi, &.{ .rm64, .imm8 }, &.{ 0xc1 }, 0, .long, .none }, + + .{ .ror, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 1, .none, .none }, + .{ .ror, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 1, .rex, .none }, + .{ .ror, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 1, .none, .none }, + .{ .ror, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 1, .rex, .none }, + .{ .ror, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 1, .none, .none }, + .{ .ror, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 1, .rex, .none }, + .{ .ror, .m1, &.{ .rm16, .unity }, &.{ 0xd1 }, 1, .short, .none }, + .{ .ror, .mc, &.{ .rm16, .cl }, &.{ 0xd3 }, 1, .short, .none }, + .{ .ror, .mi, &.{ .rm16, .imm8 }, &.{ 0xc1 }, 1, .short, .none }, + .{ .ror, .m1, &.{ .rm32, .unity }, &.{ 0xd1 }, 1, .none, .none }, + .{ .ror, .m1, &.{ .rm64, .unity }, &.{ 0xd1 }, 1, .long, .none }, + .{ .ror, .mc, &.{ .rm32, .cl }, &.{ 0xd3 }, 1, .none, .none }, + .{ .ror, .mc, &.{ .rm64, .cl }, &.{ 0xd3 }, 1, .long, .none }, + .{ .ror, .mi, &.{ .rm32, .imm8 }, &.{ 0xc1 }, 1, .none, .none }, + .{ .ror, .mi, &.{ .rm64, .imm8 }, &.{ 0xc1 }, 1, .long, .none }, + + .{ .sal, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 4, .none, .none }, + .{ .sal, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 4, .rex, .none }, + .{ .sal, .m1, &.{ .rm16, .unity }, &.{ 0xd1 }, 4, .short, .none }, + .{ .sal, .m1, &.{ .rm32, .unity }, &.{ 0xd1 }, 4, .none, .none }, + .{ .sal, .m1, &.{ .rm64, .unity }, &.{ 0xd1 }, 4, .long, .none }, + .{ .sal, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 4, .none, .none }, + .{ .sal, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 4, .rex, .none }, + .{ .sal, .mc, &.{ .rm16, .cl }, &.{ 0xd3 }, 4, .short, .none }, + .{ .sal, .mc, &.{ .rm32, .cl }, &.{ 0xd3 }, 4, .none, .none }, + .{ .sal, .mc, &.{ .rm64, .cl }, &.{ 0xd3 }, 4, .long, .none }, + .{ .sal, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 4, .none, .none }, + .{ .sal, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 4, .rex, .none }, + .{ .sal, .mi, &.{ .rm16, .imm8 }, &.{ 0xc1 }, 4, .short, .none }, + .{ .sal, .mi, &.{ .rm32, .imm8 }, &.{ 0xc1 }, 4, .none, .none }, + .{ .sal, .mi, &.{ .rm64, .imm8 }, &.{ 0xc1 }, 4, .long, .none }, + + .{ .sar, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 7, .none, .none }, + .{ .sar, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 7, .rex, .none }, + .{ .sar, .m1, &.{ .rm16, .unity }, &.{ 0xd1 }, 7, .short, .none }, + .{ .sar, .m1, &.{ .rm32, .unity }, &.{ 0xd1 }, 7, .none, .none }, + .{ .sar, .m1, &.{ .rm64, .unity }, &.{ 0xd1 }, 7, .long, .none }, + .{ .sar, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 7, .none, .none }, + .{ .sar, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 7, .rex, .none }, + .{ .sar, .mc, &.{ .rm16, .cl }, &.{ 0xd3 }, 7, .short, .none }, + .{ .sar, .mc, &.{ .rm32, .cl }, &.{ 0xd3 }, 7, .none, .none }, + .{ .sar, .mc, &.{ .rm64, .cl }, &.{ 0xd3 }, 7, .long, .none }, + .{ .sar, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 7, .none, .none }, + .{ .sar, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 7, .rex, .none }, + .{ .sar, .mi, &.{ .rm16, .imm8 }, &.{ 0xc1 }, 7, .short, .none }, + .{ .sar, .mi, &.{ .rm32, .imm8 }, &.{ 0xc1 }, 7, .none, .none }, + .{ .sar, .mi, &.{ .rm64, .imm8 }, &.{ 0xc1 }, 7, .long, .none }, + + .{ .sbb, .zi, &.{ .al, .imm8 }, &.{ 0x1c }, 0, .none, .none }, + .{ .sbb, .zi, &.{ .ax, .imm16 }, &.{ 0x1d }, 0, .short, .none }, + .{ 
.sbb, .zi, &.{ .eax, .imm32 }, &.{ 0x1d }, 0, .none, .none }, + .{ .sbb, .zi, &.{ .rax, .imm32s }, &.{ 0x1d }, 0, .long, .none }, + .{ .sbb, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 3, .none, .none }, + .{ .sbb, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 3, .rex, .none }, + .{ .sbb, .mi, &.{ .rm16, .imm16 }, &.{ 0x81 }, 3, .short, .none }, + .{ .sbb, .mi, &.{ .rm32, .imm32 }, &.{ 0x81 }, 3, .none, .none }, + .{ .sbb, .mi, &.{ .rm64, .imm32s }, &.{ 0x81 }, 3, .long, .none }, + .{ .sbb, .mi, &.{ .rm16, .imm8s }, &.{ 0x83 }, 3, .short, .none }, + .{ .sbb, .mi, &.{ .rm32, .imm8s }, &.{ 0x83 }, 3, .none, .none }, + .{ .sbb, .mi, &.{ .rm64, .imm8s }, &.{ 0x83 }, 3, .long, .none }, + .{ .sbb, .mr, &.{ .rm8, .r8 }, &.{ 0x18 }, 0, .none, .none }, + .{ .sbb, .mr, &.{ .rm8, .r8 }, &.{ 0x18 }, 0, .rex, .none }, + .{ .sbb, .mr, &.{ .rm16, .r16 }, &.{ 0x19 }, 0, .short, .none }, + .{ .sbb, .mr, &.{ .rm32, .r32 }, &.{ 0x19 }, 0, .none, .none }, + .{ .sbb, .mr, &.{ .rm64, .r64 }, &.{ 0x19 }, 0, .long, .none }, + .{ .sbb, .rm, &.{ .r8, .rm8 }, &.{ 0x1a }, 0, .none, .none }, + .{ .sbb, .rm, &.{ .r8, .rm8 }, &.{ 0x1a }, 0, .rex, .none }, + .{ .sbb, .rm, &.{ .r16, .rm16 }, &.{ 0x1b }, 0, .short, .none }, + .{ .sbb, .rm, &.{ .r32, .rm32 }, &.{ 0x1b }, 0, .none, .none }, + .{ .sbb, .rm, &.{ .r64, .rm64 }, &.{ 0x1b }, 0, .long, .none }, + + .{ .scas, .np, &.{ .m8 }, &.{ 0xae }, 0, .none, .none }, + .{ .scas, .np, &.{ .m16 }, &.{ 0xaf }, 0, .short, .none }, + .{ .scas, .np, &.{ .m32 }, &.{ 0xaf }, 0, .none, .none }, + .{ .scas, .np, &.{ .m64 }, &.{ 0xaf }, 0, .long, .none }, .{ .scasb, .np, &.{}, &.{ 0xae }, 0, .none, .none }, .{ .scasw, .np, &.{}, &.{ 0xaf }, 0, .short, .none }, @@ -682,153 +684,153 @@ pub const table = [_]Entry{ .{ .sfence, .np, &.{}, &.{ 0x0f, 0xae, 0xf8 }, 0, .none, .none }, - .{ .shl, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 4, .none, .none }, - .{ .shl, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 4, .rex, .none }, - .{ .shl, .m1, &.{ .rm16, .unity }, &.{ 0xd1 }, 4, .none, .none }, - .{ .shl, .m1, &.{ .rm32, .unity }, &.{ 0xd1 }, 4, .none, .none }, - .{ .shl, .m1, &.{ .rm64, .unity }, &.{ 0xd1 }, 4, .long, .none }, - .{ .shl, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 4, .none, .none }, - .{ .shl, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 4, .rex, .none }, - .{ .shl, .mc, &.{ .rm16, .cl }, &.{ 0xd3 }, 4, .none, .none }, - .{ .shl, .mc, &.{ .rm32, .cl }, &.{ 0xd3 }, 4, .none, .none }, - .{ .shl, .mc, &.{ .rm64, .cl }, &.{ 0xd3 }, 4, .long, .none }, - .{ .shl, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 4, .none, .none }, - .{ .shl, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 4, .rex, .none }, - .{ .shl, .mi, &.{ .rm16, .imm8 }, &.{ 0xc1 }, 4, .none, .none }, - .{ .shl, .mi, &.{ .rm32, .imm8 }, &.{ 0xc1 }, 4, .none, .none }, - .{ .shl, .mi, &.{ .rm64, .imm8 }, &.{ 0xc1 }, 4, .long, .none }, - - .{ .shld, .mri, &.{ .rm16, .r16, .imm8 }, &.{ 0x0f, 0xa4 }, 0, .none, .none }, - .{ .shld, .mrc, &.{ .rm16, .r16, .cl }, &.{ 0x0f, 0xa5 }, 0, .none, .none }, - .{ .shld, .mri, &.{ .rm32, .r32, .imm8 }, &.{ 0x0f, 0xa4 }, 0, .none, .none }, - .{ .shld, .mri, &.{ .rm64, .r64, .imm8 }, &.{ 0x0f, 0xa4 }, 0, .long, .none }, - .{ .shld, .mrc, &.{ .rm32, .r32, .cl }, &.{ 0x0f, 0xa5 }, 0, .none, .none }, - .{ .shld, .mrc, &.{ .rm64, .r64, .cl }, &.{ 0x0f, 0xa5 }, 0, .long, .none }, - - .{ .shr, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 5, .none, .none }, - .{ .shr, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 5, .rex, .none }, - .{ .shr, .m1, &.{ .rm16, .unity }, &.{ 0xd1 }, 5, .none, .none }, - .{ .shr, .m1, &.{ .rm32, .unity }, &.{ 0xd1 }, 5, .none, .none }, - .{ .shr, .m1, &.{ 
.rm64, .unity }, &.{ 0xd1 }, 5, .long, .none }, - .{ .shr, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 5, .none, .none }, - .{ .shr, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 5, .rex, .none }, - .{ .shr, .mc, &.{ .rm16, .cl }, &.{ 0xd3 }, 5, .none, .none }, - .{ .shr, .mc, &.{ .rm32, .cl }, &.{ 0xd3 }, 5, .none, .none }, - .{ .shr, .mc, &.{ .rm64, .cl }, &.{ 0xd3 }, 5, .long, .none }, - .{ .shr, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 5, .none, .none }, - .{ .shr, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 5, .rex, .none }, - .{ .shr, .mi, &.{ .rm16, .imm8 }, &.{ 0xc1 }, 5, .none, .none }, - .{ .shr, .mi, &.{ .rm32, .imm8 }, &.{ 0xc1 }, 5, .none, .none }, - .{ .shr, .mi, &.{ .rm64, .imm8 }, &.{ 0xc1 }, 5, .long, .none }, - - .{ .shrd, .mri, &.{ .rm16, .r16, .imm8 }, &.{ 0x0f, 0xac }, 0, .none, .none }, - .{ .shrd, .mrc, &.{ .rm16, .r16, .cl }, &.{ 0x0f, 0xad }, 0, .none, .none }, - .{ .shrd, .mri, &.{ .rm32, .r32, .imm8 }, &.{ 0x0f, 0xac }, 0, .none, .none }, - .{ .shrd, .mri, &.{ .rm64, .r64, .imm8 }, &.{ 0x0f, 0xac }, 0, .long, .none }, - .{ .shrd, .mrc, &.{ .rm32, .r32, .cl }, &.{ 0x0f, 0xad }, 0, .none, .none }, - .{ .shrd, .mrc, &.{ .rm64, .r64, .cl }, &.{ 0x0f, 0xad }, 0, .long, .none }, - - .{ .stos, .np, &.{ .m8 }, &.{ 0xaa }, 0, .none, .none }, - .{ .stos, .np, &.{ .m16 }, &.{ 0xab }, 0, .none, .none }, - .{ .stos, .np, &.{ .m32 }, &.{ 0xab }, 0, .none, .none }, - .{ .stos, .np, &.{ .m64 }, &.{ 0xab }, 0, .long, .none }, + .{ .shl, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 4, .none, .none }, + .{ .shl, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 4, .rex, .none }, + .{ .shl, .m1, &.{ .rm16, .unity }, &.{ 0xd1 }, 4, .short, .none }, + .{ .shl, .m1, &.{ .rm32, .unity }, &.{ 0xd1 }, 4, .none, .none }, + .{ .shl, .m1, &.{ .rm64, .unity }, &.{ 0xd1 }, 4, .long, .none }, + .{ .shl, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 4, .none, .none }, + .{ .shl, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 4, .rex, .none }, + .{ .shl, .mc, &.{ .rm16, .cl }, &.{ 0xd3 }, 4, .short, .none }, + .{ .shl, .mc, &.{ .rm32, .cl }, &.{ 0xd3 }, 4, .none, .none }, + .{ .shl, .mc, &.{ .rm64, .cl }, &.{ 0xd3 }, 4, .long, .none }, + .{ .shl, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 4, .none, .none }, + .{ .shl, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 4, .rex, .none }, + .{ .shl, .mi, &.{ .rm16, .imm8 }, &.{ 0xc1 }, 4, .short, .none }, + .{ .shl, .mi, &.{ .rm32, .imm8 }, &.{ 0xc1 }, 4, .none, .none }, + .{ .shl, .mi, &.{ .rm64, .imm8 }, &.{ 0xc1 }, 4, .long, .none }, + + .{ .shld, .mri, &.{ .rm16, .r16, .imm8 }, &.{ 0x0f, 0xa4 }, 0, .short, .none }, + .{ .shld, .mrc, &.{ .rm16, .r16, .cl }, &.{ 0x0f, 0xa5 }, 0, .short, .none }, + .{ .shld, .mri, &.{ .rm32, .r32, .imm8 }, &.{ 0x0f, 0xa4 }, 0, .none, .none }, + .{ .shld, .mri, &.{ .rm64, .r64, .imm8 }, &.{ 0x0f, 0xa4 }, 0, .long, .none }, + .{ .shld, .mrc, &.{ .rm32, .r32, .cl }, &.{ 0x0f, 0xa5 }, 0, .none, .none }, + .{ .shld, .mrc, &.{ .rm64, .r64, .cl }, &.{ 0x0f, 0xa5 }, 0, .long, .none }, + + .{ .shr, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 5, .none, .none }, + .{ .shr, .m1, &.{ .rm8, .unity }, &.{ 0xd0 }, 5, .rex, .none }, + .{ .shr, .m1, &.{ .rm16, .unity }, &.{ 0xd1 }, 5, .short, .none }, + .{ .shr, .m1, &.{ .rm32, .unity }, &.{ 0xd1 }, 5, .none, .none }, + .{ .shr, .m1, &.{ .rm64, .unity }, &.{ 0xd1 }, 5, .long, .none }, + .{ .shr, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 5, .none, .none }, + .{ .shr, .mc, &.{ .rm8, .cl }, &.{ 0xd2 }, 5, .rex, .none }, + .{ .shr, .mc, &.{ .rm16, .cl }, &.{ 0xd3 }, 5, .short, .none }, + .{ .shr, .mc, &.{ .rm32, .cl }, &.{ 0xd3 }, 5, .none, .none }, + .{ .shr, .mc, &.{ .rm64, .cl }, &.{ 0xd3 }, 5, 
.long, .none }, + .{ .shr, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 5, .none, .none }, + .{ .shr, .mi, &.{ .rm8, .imm8 }, &.{ 0xc0 }, 5, .rex, .none }, + .{ .shr, .mi, &.{ .rm16, .imm8 }, &.{ 0xc1 }, 5, .short, .none }, + .{ .shr, .mi, &.{ .rm32, .imm8 }, &.{ 0xc1 }, 5, .none, .none }, + .{ .shr, .mi, &.{ .rm64, .imm8 }, &.{ 0xc1 }, 5, .long, .none }, + + .{ .shrd, .mri, &.{ .rm16, .r16, .imm8 }, &.{ 0x0f, 0xac }, 0, .short, .none }, + .{ .shrd, .mrc, &.{ .rm16, .r16, .cl }, &.{ 0x0f, 0xad }, 0, .short, .none }, + .{ .shrd, .mri, &.{ .rm32, .r32, .imm8 }, &.{ 0x0f, 0xac }, 0, .none, .none }, + .{ .shrd, .mri, &.{ .rm64, .r64, .imm8 }, &.{ 0x0f, 0xac }, 0, .long, .none }, + .{ .shrd, .mrc, &.{ .rm32, .r32, .cl }, &.{ 0x0f, 0xad }, 0, .none, .none }, + .{ .shrd, .mrc, &.{ .rm64, .r64, .cl }, &.{ 0x0f, 0xad }, 0, .long, .none }, + + .{ .stos, .np, &.{ .m8 }, &.{ 0xaa }, 0, .none, .none }, + .{ .stos, .np, &.{ .m16 }, &.{ 0xab }, 0, .short, .none }, + .{ .stos, .np, &.{ .m32 }, &.{ 0xab }, 0, .none, .none }, + .{ .stos, .np, &.{ .m64 }, &.{ 0xab }, 0, .long, .none }, .{ .stosb, .np, &.{}, &.{ 0xaa }, 0, .none, .none }, .{ .stosw, .np, &.{}, &.{ 0xab }, 0, .short, .none }, .{ .stosd, .np, &.{}, &.{ 0xab }, 0, .none, .none }, .{ .stosq, .np, &.{}, &.{ 0xab }, 0, .long, .none }, - .{ .sub, .zi, &.{ .al, .imm8 }, &.{ 0x2c }, 0, .none, .none }, - .{ .sub, .zi, &.{ .ax, .imm16 }, &.{ 0x2d }, 0, .none, .none }, - .{ .sub, .zi, &.{ .eax, .imm32 }, &.{ 0x2d }, 0, .none, .none }, - .{ .sub, .zi, &.{ .rax, .imm32s }, &.{ 0x2d }, 0, .long, .none }, - .{ .sub, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 5, .none, .none }, - .{ .sub, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 5, .rex, .none }, - .{ .sub, .mi, &.{ .rm16, .imm16 }, &.{ 0x81 }, 5, .none, .none }, - .{ .sub, .mi, &.{ .rm32, .imm32 }, &.{ 0x81 }, 5, .none, .none }, - .{ .sub, .mi, &.{ .rm64, .imm32s }, &.{ 0x81 }, 5, .long, .none }, - .{ .sub, .mi, &.{ .rm16, .imm8s }, &.{ 0x83 }, 5, .none, .none }, - .{ .sub, .mi, &.{ .rm32, .imm8s }, &.{ 0x83 }, 5, .none, .none }, - .{ .sub, .mi, &.{ .rm64, .imm8s }, &.{ 0x83 }, 5, .long, .none }, - .{ .sub, .mr, &.{ .rm8, .r8 }, &.{ 0x28 }, 0, .none, .none }, - .{ .sub, .mr, &.{ .rm8, .r8 }, &.{ 0x28 }, 0, .rex, .none }, - .{ .sub, .mr, &.{ .rm16, .r16 }, &.{ 0x29 }, 0, .none, .none }, - .{ .sub, .mr, &.{ .rm32, .r32 }, &.{ 0x29 }, 0, .none, .none }, - .{ .sub, .mr, &.{ .rm64, .r64 }, &.{ 0x29 }, 0, .long, .none }, - .{ .sub, .rm, &.{ .r8, .rm8 }, &.{ 0x2a }, 0, .none, .none }, - .{ .sub, .rm, &.{ .r8, .rm8 }, &.{ 0x2a }, 0, .rex, .none }, - .{ .sub, .rm, &.{ .r16, .rm16 }, &.{ 0x2b }, 0, .none, .none }, - .{ .sub, .rm, &.{ .r32, .rm32 }, &.{ 0x2b }, 0, .none, .none }, - .{ .sub, .rm, &.{ .r64, .rm64 }, &.{ 0x2b }, 0, .long, .none }, + .{ .sub, .zi, &.{ .al, .imm8 }, &.{ 0x2c }, 0, .none, .none }, + .{ .sub, .zi, &.{ .ax, .imm16 }, &.{ 0x2d }, 0, .short, .none }, + .{ .sub, .zi, &.{ .eax, .imm32 }, &.{ 0x2d }, 0, .none, .none }, + .{ .sub, .zi, &.{ .rax, .imm32s }, &.{ 0x2d }, 0, .long, .none }, + .{ .sub, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 5, .none, .none }, + .{ .sub, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 5, .rex, .none }, + .{ .sub, .mi, &.{ .rm16, .imm16 }, &.{ 0x81 }, 5, .short, .none }, + .{ .sub, .mi, &.{ .rm32, .imm32 }, &.{ 0x81 }, 5, .none, .none }, + .{ .sub, .mi, &.{ .rm64, .imm32s }, &.{ 0x81 }, 5, .long, .none }, + .{ .sub, .mi, &.{ .rm16, .imm8s }, &.{ 0x83 }, 5, .short, .none }, + .{ .sub, .mi, &.{ .rm32, .imm8s }, &.{ 0x83 }, 5, .none, .none }, + .{ .sub, .mi, &.{ .rm64, .imm8s }, &.{ 0x83 }, 5, .long, .none }, + 
.{ .sub, .mr, &.{ .rm8, .r8 }, &.{ 0x28 }, 0, .none, .none }, + .{ .sub, .mr, &.{ .rm8, .r8 }, &.{ 0x28 }, 0, .rex, .none }, + .{ .sub, .mr, &.{ .rm16, .r16 }, &.{ 0x29 }, 0, .short, .none }, + .{ .sub, .mr, &.{ .rm32, .r32 }, &.{ 0x29 }, 0, .none, .none }, + .{ .sub, .mr, &.{ .rm64, .r64 }, &.{ 0x29 }, 0, .long, .none }, + .{ .sub, .rm, &.{ .r8, .rm8 }, &.{ 0x2a }, 0, .none, .none }, + .{ .sub, .rm, &.{ .r8, .rm8 }, &.{ 0x2a }, 0, .rex, .none }, + .{ .sub, .rm, &.{ .r16, .rm16 }, &.{ 0x2b }, 0, .short, .none }, + .{ .sub, .rm, &.{ .r32, .rm32 }, &.{ 0x2b }, 0, .none, .none }, + .{ .sub, .rm, &.{ .r64, .rm64 }, &.{ 0x2b }, 0, .long, .none }, .{ .syscall, .np, &.{}, &.{ 0x0f, 0x05 }, 0, .none, .none }, - .{ .@"test", .zi, &.{ .al, .imm8 }, &.{ 0xa8 }, 0, .none, .none }, - .{ .@"test", .zi, &.{ .ax, .imm16 }, &.{ 0xa9 }, 0, .none, .none }, - .{ .@"test", .zi, &.{ .eax, .imm32 }, &.{ 0xa9 }, 0, .none, .none }, - .{ .@"test", .zi, &.{ .rax, .imm32s }, &.{ 0xa9 }, 0, .long, .none }, - .{ .@"test", .mi, &.{ .rm8, .imm8 }, &.{ 0xf6 }, 0, .none, .none }, - .{ .@"test", .mi, &.{ .rm8, .imm8 }, &.{ 0xf6 }, 0, .rex, .none }, - .{ .@"test", .mi, &.{ .rm16, .imm16 }, &.{ 0xf7 }, 0, .none, .none }, - .{ .@"test", .mi, &.{ .rm32, .imm32 }, &.{ 0xf7 }, 0, .none, .none }, - .{ .@"test", .mi, &.{ .rm64, .imm32s }, &.{ 0xf7 }, 0, .long, .none }, - .{ .@"test", .mr, &.{ .rm8, .r8 }, &.{ 0x84 }, 0, .none, .none }, - .{ .@"test", .mr, &.{ .rm8, .r8 }, &.{ 0x84 }, 0, .rex, .none }, - .{ .@"test", .mr, &.{ .rm16, .r16 }, &.{ 0x85 }, 0, .none, .none }, - .{ .@"test", .mr, &.{ .rm32, .r32 }, &.{ 0x85 }, 0, .none, .none }, - .{ .@"test", .mr, &.{ .rm64, .r64 }, &.{ 0x85 }, 0, .long, .none }, - - .{ .tzcnt, .rm, &.{ .r16, .rm16 }, &.{ 0xf3, 0x0f, 0xbc }, 0, .none, .none }, - .{ .tzcnt, .rm, &.{ .r32, .rm32 }, &.{ 0xf3, 0x0f, 0xbc }, 0, .none, .none }, - .{ .tzcnt, .rm, &.{ .r64, .rm64 }, &.{ 0xf3, 0x0f, 0xbc }, 0, .long, .none }, + .{ .@"test", .zi, &.{ .al, .imm8 }, &.{ 0xa8 }, 0, .none, .none }, + .{ .@"test", .zi, &.{ .ax, .imm16 }, &.{ 0xa9 }, 0, .short, .none }, + .{ .@"test", .zi, &.{ .eax, .imm32 }, &.{ 0xa9 }, 0, .none, .none }, + .{ .@"test", .zi, &.{ .rax, .imm32s }, &.{ 0xa9 }, 0, .long, .none }, + .{ .@"test", .mi, &.{ .rm8, .imm8 }, &.{ 0xf6 }, 0, .none, .none }, + .{ .@"test", .mi, &.{ .rm8, .imm8 }, &.{ 0xf6 }, 0, .rex, .none }, + .{ .@"test", .mi, &.{ .rm16, .imm16 }, &.{ 0xf7 }, 0, .short, .none }, + .{ .@"test", .mi, &.{ .rm32, .imm32 }, &.{ 0xf7 }, 0, .none, .none }, + .{ .@"test", .mi, &.{ .rm64, .imm32s }, &.{ 0xf7 }, 0, .long, .none }, + .{ .@"test", .mr, &.{ .rm8, .r8 }, &.{ 0x84 }, 0, .none, .none }, + .{ .@"test", .mr, &.{ .rm8, .r8 }, &.{ 0x84 }, 0, .rex, .none }, + .{ .@"test", .mr, &.{ .rm16, .r16 }, &.{ 0x85 }, 0, .short, .none }, + .{ .@"test", .mr, &.{ .rm32, .r32 }, &.{ 0x85 }, 0, .none, .none }, + .{ .@"test", .mr, &.{ .rm64, .r64 }, &.{ 0x85 }, 0, .long, .none }, + + .{ .tzcnt, .rm, &.{ .r16, .rm16 }, &.{ 0xf3, 0x0f, 0xbc }, 0, .short, .none }, + .{ .tzcnt, .rm, &.{ .r32, .rm32 }, &.{ 0xf3, 0x0f, 0xbc }, 0, .none, .none }, + .{ .tzcnt, .rm, &.{ .r64, .rm64 }, &.{ 0xf3, 0x0f, 0xbc }, 0, .long, .none }, .{ .ud2, .np, &.{}, &.{ 0x0f, 0x0b }, 0, .none, .none }, - .{ .xadd, .mr, &.{ .rm8, .r8 }, &.{ 0x0f, 0xc0 }, 0, .none, .none }, - .{ .xadd, .mr, &.{ .rm8, .r8 }, &.{ 0x0f, 0xc0 }, 0, .rex, .none }, - .{ .xadd, .mr, &.{ .rm16, .r16 }, &.{ 0x0f, 0xc1 }, 0, .none, .none }, - .{ .xadd, .mr, &.{ .rm32, .r32 }, &.{ 0x0f, 0xc1 }, 0, .none, .none }, - .{ .xadd, .mr, &.{ .rm64, .r64 }, &.{ 0x0f, 
0xc1 }, 0, .long, .none }, - - .{ .xchg, .o, &.{ .ax, .r16 }, &.{ 0x90 }, 0, .none, .none }, - .{ .xchg, .o, &.{ .r16, .ax }, &.{ 0x90 }, 0, .none, .none }, - .{ .xchg, .o, &.{ .eax, .r32 }, &.{ 0x90 }, 0, .none, .none }, - .{ .xchg, .o, &.{ .rax, .r64 }, &.{ 0x90 }, 0, .long, .none }, - .{ .xchg, .o, &.{ .r32, .eax }, &.{ 0x90 }, 0, .none, .none }, - .{ .xchg, .o, &.{ .r64, .rax }, &.{ 0x90 }, 0, .long, .none }, - .{ .xchg, .mr, &.{ .rm8, .r8 }, &.{ 0x86 }, 0, .none, .none }, - .{ .xchg, .mr, &.{ .rm8, .r8 }, &.{ 0x86 }, 0, .rex, .none }, - .{ .xchg, .rm, &.{ .r8, .rm8 }, &.{ 0x86 }, 0, .none, .none }, - .{ .xchg, .rm, &.{ .r8, .rm8 }, &.{ 0x86 }, 0, .rex, .none }, - .{ .xchg, .mr, &.{ .rm16, .r16 }, &.{ 0x87 }, 0, .none, .none }, - .{ .xchg, .rm, &.{ .r16, .rm16 }, &.{ 0x87 }, 0, .none, .none }, - .{ .xchg, .mr, &.{ .rm32, .r32 }, &.{ 0x87 }, 0, .none, .none }, - .{ .xchg, .mr, &.{ .rm64, .r64 }, &.{ 0x87 }, 0, .long, .none }, - .{ .xchg, .rm, &.{ .r32, .rm32 }, &.{ 0x87 }, 0, .none, .none }, - .{ .xchg, .rm, &.{ .r64, .rm64 }, &.{ 0x87 }, 0, .long, .none }, - - .{ .xor, .zi, &.{ .al, .imm8 }, &.{ 0x34 }, 0, .none, .none }, - .{ .xor, .zi, &.{ .ax, .imm16 }, &.{ 0x35 }, 0, .none, .none }, - .{ .xor, .zi, &.{ .eax, .imm32 }, &.{ 0x35 }, 0, .none, .none }, - .{ .xor, .zi, &.{ .rax, .imm32s }, &.{ 0x35 }, 0, .long, .none }, - .{ .xor, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 6, .none, .none }, - .{ .xor, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 6, .rex, .none }, - .{ .xor, .mi, &.{ .rm16, .imm16 }, &.{ 0x81 }, 6, .none, .none }, - .{ .xor, .mi, &.{ .rm32, .imm32 }, &.{ 0x81 }, 6, .none, .none }, - .{ .xor, .mi, &.{ .rm64, .imm32s }, &.{ 0x81 }, 6, .long, .none }, - .{ .xor, .mi, &.{ .rm16, .imm8s }, &.{ 0x83 }, 6, .none, .none }, - .{ .xor, .mi, &.{ .rm32, .imm8s }, &.{ 0x83 }, 6, .none, .none }, - .{ .xor, .mi, &.{ .rm64, .imm8s }, &.{ 0x83 }, 6, .long, .none }, - .{ .xor, .mr, &.{ .rm8, .r8 }, &.{ 0x30 }, 0, .none, .none }, - .{ .xor, .mr, &.{ .rm8, .r8 }, &.{ 0x30 }, 0, .rex, .none }, - .{ .xor, .mr, &.{ .rm16, .r16 }, &.{ 0x31 }, 0, .none, .none }, - .{ .xor, .mr, &.{ .rm32, .r32 }, &.{ 0x31 }, 0, .none, .none }, - .{ .xor, .mr, &.{ .rm64, .r64 }, &.{ 0x31 }, 0, .long, .none }, - .{ .xor, .rm, &.{ .r8, .rm8 }, &.{ 0x32 }, 0, .none, .none }, - .{ .xor, .rm, &.{ .r8, .rm8 }, &.{ 0x32 }, 0, .rex, .none }, - .{ .xor, .rm, &.{ .r16, .rm16 }, &.{ 0x33 }, 0, .none, .none }, - .{ .xor, .rm, &.{ .r32, .rm32 }, &.{ 0x33 }, 0, .none, .none }, - .{ .xor, .rm, &.{ .r64, .rm64 }, &.{ 0x33 }, 0, .long, .none }, + .{ .xadd, .mr, &.{ .rm8, .r8 }, &.{ 0x0f, 0xc0 }, 0, .none, .none }, + .{ .xadd, .mr, &.{ .rm8, .r8 }, &.{ 0x0f, 0xc0 }, 0, .rex, .none }, + .{ .xadd, .mr, &.{ .rm16, .r16 }, &.{ 0x0f, 0xc1 }, 0, .short, .none }, + .{ .xadd, .mr, &.{ .rm32, .r32 }, &.{ 0x0f, 0xc1 }, 0, .none, .none }, + .{ .xadd, .mr, &.{ .rm64, .r64 }, &.{ 0x0f, 0xc1 }, 0, .long, .none }, + + .{ .xchg, .o, &.{ .ax, .r16 }, &.{ 0x90 }, 0, .short, .none }, + .{ .xchg, .o, &.{ .r16, .ax }, &.{ 0x90 }, 0, .short, .none }, + .{ .xchg, .o, &.{ .eax, .r32 }, &.{ 0x90 }, 0, .none, .none }, + .{ .xchg, .o, &.{ .rax, .r64 }, &.{ 0x90 }, 0, .long, .none }, + .{ .xchg, .o, &.{ .r32, .eax }, &.{ 0x90 }, 0, .none, .none }, + .{ .xchg, .o, &.{ .r64, .rax }, &.{ 0x90 }, 0, .long, .none }, + .{ .xchg, .mr, &.{ .rm8, .r8 }, &.{ 0x86 }, 0, .none, .none }, + .{ .xchg, .mr, &.{ .rm8, .r8 }, &.{ 0x86 }, 0, .rex, .none }, + .{ .xchg, .rm, &.{ .r8, .rm8 }, &.{ 0x86 }, 0, .none, .none }, + .{ .xchg, .rm, &.{ .r8, .rm8 }, &.{ 0x86 }, 0, .rex, .none }, + .{ 
.xchg, .mr, &.{ .rm16, .r16 }, &.{ 0x87 }, 0, .short, .none }, + .{ .xchg, .rm, &.{ .r16, .rm16 }, &.{ 0x87 }, 0, .short, .none }, + .{ .xchg, .mr, &.{ .rm32, .r32 }, &.{ 0x87 }, 0, .none, .none }, + .{ .xchg, .mr, &.{ .rm64, .r64 }, &.{ 0x87 }, 0, .long, .none }, + .{ .xchg, .rm, &.{ .r32, .rm32 }, &.{ 0x87 }, 0, .none, .none }, + .{ .xchg, .rm, &.{ .r64, .rm64 }, &.{ 0x87 }, 0, .long, .none }, + + .{ .xor, .zi, &.{ .al, .imm8 }, &.{ 0x34 }, 0, .none, .none }, + .{ .xor, .zi, &.{ .ax, .imm16 }, &.{ 0x35 }, 0, .short, .none }, + .{ .xor, .zi, &.{ .eax, .imm32 }, &.{ 0x35 }, 0, .none, .none }, + .{ .xor, .zi, &.{ .rax, .imm32s }, &.{ 0x35 }, 0, .long, .none }, + .{ .xor, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 6, .none, .none }, + .{ .xor, .mi, &.{ .rm8, .imm8 }, &.{ 0x80 }, 6, .rex, .none }, + .{ .xor, .mi, &.{ .rm16, .imm16 }, &.{ 0x81 }, 6, .short, .none }, + .{ .xor, .mi, &.{ .rm32, .imm32 }, &.{ 0x81 }, 6, .none, .none }, + .{ .xor, .mi, &.{ .rm64, .imm32s }, &.{ 0x81 }, 6, .long, .none }, + .{ .xor, .mi, &.{ .rm16, .imm8s }, &.{ 0x83 }, 6, .short, .none }, + .{ .xor, .mi, &.{ .rm32, .imm8s }, &.{ 0x83 }, 6, .none, .none }, + .{ .xor, .mi, &.{ .rm64, .imm8s }, &.{ 0x83 }, 6, .long, .none }, + .{ .xor, .mr, &.{ .rm8, .r8 }, &.{ 0x30 }, 0, .none, .none }, + .{ .xor, .mr, &.{ .rm8, .r8 }, &.{ 0x30 }, 0, .rex, .none }, + .{ .xor, .mr, &.{ .rm16, .r16 }, &.{ 0x31 }, 0, .short, .none }, + .{ .xor, .mr, &.{ .rm32, .r32 }, &.{ 0x31 }, 0, .none, .none }, + .{ .xor, .mr, &.{ .rm64, .r64 }, &.{ 0x31 }, 0, .long, .none }, + .{ .xor, .rm, &.{ .r8, .rm8 }, &.{ 0x32 }, 0, .none, .none }, + .{ .xor, .rm, &.{ .r8, .rm8 }, &.{ 0x32 }, 0, .rex, .none }, + .{ .xor, .rm, &.{ .r16, .rm16 }, &.{ 0x33 }, 0, .short, .none }, + .{ .xor, .rm, &.{ .r32, .rm32 }, &.{ 0x33 }, 0, .none, .none }, + .{ .xor, .rm, &.{ .r64, .rm64 }, &.{ 0x33 }, 0, .long, .none }, // SSE .{ .addss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x58 }, 0, .none, .sse }, @@ -911,9 +913,39 @@ pub const table = [_]Entry{ .{ .orpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x56 }, 0, .none, .sse2 }, - .{ .pextrw, .mri, &.{ .r16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0xc5 }, 0, .none, .sse2 }, + .{ .pextrw, .rmi, &.{ .r32, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0xc5 }, 0, .none, .sse2 }, + .{ .pextrw, .rmi, &.{ .r64, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0xc5 }, 0, .long, .sse2 }, - .{ .pinsrw, .rmi, &.{ .xmm, .rm16, .imm8 }, &.{ 0x66, 0x0f, 0xc4 }, 0, .none, .sse2 }, + .{ .pinsrw, .rmi, &.{ .xmm, .r32_m16, .imm8 }, &.{ 0x66, 0x0f, 0xc4 }, 0, .none, .sse2 }, + + .{ .pshufhw, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0xf3, 0x0f, 0x70 }, 0, .none, .sse2 }, + + .{ .pshuflw, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0xf2, 0x0f, 0x70 }, 0, .none, .sse2 }, + + .{ .psrld, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd2 }, 0, .none, .sse2 }, + .{ .psrld, .mi, &.{ .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x72 }, 2, .none, .sse2 }, + + .{ .psrlq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd3 }, 0, .none, .sse2 }, + .{ .psrlq, .mi, &.{ .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x73 }, 2, .none, .sse2 }, + + .{ .psrlw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd1 }, 0, .none, .sse2 }, + .{ .psrlw, .mi, &.{ .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x71 }, 2, .none, .sse2 }, + + .{ .punpckhbw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x68 }, 0, .none, .sse2 }, + + .{ .punpckhdq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x6a }, 0, .none, .sse2 }, + + .{ .punpckhqdq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x6d }, 0, .none, .sse2 }, + + .{ .punpckhwd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x69 
}, 0, .none, .sse2 }, + + .{ .punpcklbw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x60 }, 0, .none, .sse2 }, + + .{ .punpckldq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x62 }, 0, .none, .sse2 }, + + .{ .punpcklqdq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x6c }, 0, .none, .sse2 }, + + .{ .punpcklwd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x61 }, 0, .none, .sse2 }, .{ .sqrtpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x51 }, 0, .none, .sse2 }, .{ .sqrtsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x51 }, 0, .none, .sse2 }, @@ -927,12 +959,59 @@ pub const table = [_]Entry{ .{ .xorpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x57 }, 0, .none, .sse2 }, + // SSE3 + .{ .movddup, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x12 }, 0, .none, .sse3 }, + + .{ .movshdup, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0xf3, 0x0f, 0x16 }, 0, .none, .sse3 }, + + .{ .movsldup, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0xf3, 0x0f, 0x12 }, 0, .none, .sse3 }, + // SSE4.1 - .{ .pextrw, .mri, &.{ .rm16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .none, .sse4_1 }, + .{ .pextrw, .mri, &.{ .r32_m16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .none, .sse4_1 }, + .{ .pextrw, .mri, &.{ .r64_m16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .long, .sse4_1 }, .{ .roundss, .rmi, &.{ .xmm, .xmm_m32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0a }, 0, .none, .sse4_1 }, .{ .roundsd, .rmi, &.{ .xmm, .xmm_m64, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0b }, 0, .none, .sse4_1 }, + // AVX + .{ .vmovddup, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x12 }, 0, .vex_128, .avx }, + + .{ .vmovshdup, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0xf3, 0x0f, 0x16 }, 0, .vex_128, .avx }, + + .{ .vmovsldup, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0xf3, 0x0f, 0x12 }, 0, .vex_128, .avx }, + + .{ .vpextrw, .mri, &.{ .r32, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x15 }, 0, .vex_128, .avx }, + .{ .vpextrw, .mri, &.{ .r64, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x15 }, 0, .vex_128_long, .avx }, + .{ .vpextrw, .mri, &.{ .r32_m16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .vex_128, .avx }, + .{ .vpextrw, .mri, &.{ .r64_m16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .vex_128_long, .avx }, + + .{ .vpinsrw, .rvmi, &.{ .xmm, .xmm, .r32_m16, .imm8 }, &.{ 0x66, 0x0f, 0xc4 }, 0, .vex_128, .avx }, + + .{ .vpsrld, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd2 }, 0, .vex_128, .avx }, + .{ .vpsrld, .vmi, &.{ .xmm, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x72 }, 2, .vex_128, .avx }, + + .{ .vpsrlq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd3 }, 0, .vex_128, .avx }, + .{ .vpsrlq, .vmi, &.{ .xmm, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x73 }, 2, .vex_128, .avx }, + + .{ .vpsrlw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd1 }, 0, .vex_128, .avx }, + .{ .vpsrlw, .vmi, &.{ .xmm, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x71 }, 2, .vex_128, .avx }, + + .{ .vpunpckhbw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x68 }, 0, .vex_128, .avx }, + + .{ .vpunpckhdq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x6a }, 0, .vex_128, .avx }, + + .{ .vpunpckhqdq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x6d }, 0, .vex_128, .avx }, + + .{ .vpunpckhwd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x69 }, 0, .vex_128, .avx }, + + .{ .vpunpcklbw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x60 }, 0, .vex_128, .avx }, + + .{ .vpunpckldq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x62 }, 0, .vex_128, .avx }, + + .{ .vpunpcklqdq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x6c }, 0, .vex_128, .avx }, + + .{ .vpunpcklwd, .rvm, &.{ .xmm, .xmm, .xmm_m128 
}, &.{ 0x66, 0x0f, 0x61 }, 0, .vex_128, .avx }, + // F16C .{ .vcvtph2ps, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x13 }, 0, .vex_128, .f16c }, -- cgit v1.2.3 From 3a5e3c52e0f09112989a2a40345305bfe9508431 Mon Sep 17 00:00:00 2001 From: Jacob Young Date: Sat, 6 May 2023 20:31:48 -0400 Subject: x86_64: implement `@mulAdd` --- src/arch/x86_64/CodeGen.zig | 169 +++++++++++++++++++++++++++++++++++++++++- src/arch/x86_64/Encoding.zig | 24 ++++-- src/arch/x86_64/Lower.zig | 22 ++++++ src/arch/x86_64/Mir.zig | 31 ++++++++ src/arch/x86_64/bits.zig | 16 +++- src/arch/x86_64/encodings.zig | 23 ++++++ test/behavior/muladd.zig | 6 +- 7 files changed, 277 insertions(+), 14 deletions(-) (limited to 'src/arch/x86_64/Encoding.zig') diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig index befd5be0fd..fffb814d7f 100644 --- a/src/arch/x86_64/CodeGen.zig +++ b/src/arch/x86_64/CodeGen.zig @@ -1200,6 +1200,32 @@ fn asmRegisterRegisterImmediate( }); } +fn asmRegisterRegisterMemory( + self: *Self, + tag: Mir.Inst.Tag, + reg1: Register, + reg2: Register, + m: Memory, +) !void { + _ = try self.addInst(.{ + .tag = tag, + .ops = switch (m) { + .sib => .rrm_sib, + .rip => .rrm_rip, + else => unreachable, + }, + .data = .{ .rrx = .{ + .r1 = reg1, + .r2 = reg2, + .payload = switch (m) { + .sib => try self.addExtra(Mir.MemorySib.encode(m)), + .rip => try self.addExtra(Mir.MemoryRip.encode(m)), + else => unreachable, + }, + } }, + }); +} + fn asmMemory(self: *Self, tag: Mir.Inst.Tag, m: Memory) !void { _ = try self.addInst(.{ .tag = tag, @@ -9369,9 +9395,146 @@ fn airPrefetch(self: *Self, inst: Air.Inst.Index) !void { fn airMulAdd(self: *Self, inst: Air.Inst.Index) !void { const pl_op = self.air.instructions.items(.data)[inst].pl_op; const extra = self.air.extraData(Air.Bin, pl_op.payload).data; - _ = extra; - return self.fail("TODO implement airMulAdd for x86_64", .{}); - //return self.finishAir(inst, result, .{ extra.lhs, extra.rhs, pl_op.operand }); + const ty = self.air.typeOfIndex(inst); + + if (!self.hasFeature(.fma)) return self.fail("TODO implement airMulAdd for {}", .{ + ty.fmt(self.bin_file.options.module.?), + }); + + const ops = [3]Air.Inst.Ref{ extra.lhs, extra.rhs, pl_op.operand }; + var mcvs: [3]MCValue = undefined; + var locks = [1]?RegisterManager.RegisterLock{null} ** 3; + defer for (locks) |reg_lock| if (reg_lock) |lock| self.register_manager.unlockReg(lock); + var order = [1]u2{0} ** 3; + var unused = std.StaticBitSet(3).initFull(); + for (ops, &mcvs, &locks, 0..) 
|op, *mcv, *lock, op_i| { + const op_index = @intCast(u2, op_i); + mcv.* = try self.resolveInst(op); + if (unused.isSet(0) and mcv.isRegister() and self.reuseOperand(inst, op, op_index, mcv.*)) { + order[op_index] = 1; + unused.unset(0); + } else if (unused.isSet(2) and mcv.isMemory()) { + order[op_index] = 3; + unused.unset(2); + } + switch (mcv.*) { + .register => |reg| lock.* = self.register_manager.lockReg(reg), + else => {}, + } + } + for (&order, &mcvs, &locks) |*mop_index, *mcv, *lock| { + if (mop_index.* != 0) continue; + mop_index.* = 1 + @intCast(u2, unused.toggleFirstSet().?); + if (mop_index.* > 1 and mcv.isRegister()) continue; + const reg = try self.copyToTmpRegister(ty, mcv.*); + mcv.* = .{ .register = reg }; + if (lock.*) |old_lock| self.register_manager.unlockReg(old_lock); + lock.* = self.register_manager.lockRegAssumeUnused(reg); + } + + const tag: ?Mir.Inst.Tag = + if (mem.eql(u2, &order, &.{ 1, 3, 2 }) or mem.eql(u2, &order, &.{ 3, 1, 2 })) + switch (ty.zigTypeTag()) { + .Float => switch (ty.floatBits(self.target.*)) { + 32 => .vfmadd132ss, + 64 => .vfmadd132sd, + else => null, + }, + .Vector => switch (ty.childType().zigTypeTag()) { + .Float => switch (ty.childType().floatBits(self.target.*)) { + 32 => switch (ty.vectorLen()) { + 1 => .vfmadd132ss, + 2...8 => .vfmadd132ps, + else => null, + }, + 64 => switch (ty.vectorLen()) { + 1 => .vfmadd132sd, + 2...4 => .vfmadd132pd, + else => null, + }, + else => null, + }, + else => null, + }, + else => unreachable, + } + else if (mem.eql(u2, &order, &.{ 2, 1, 3 }) or mem.eql(u2, &order, &.{ 1, 2, 3 })) + switch (ty.zigTypeTag()) { + .Float => switch (ty.floatBits(self.target.*)) { + 32 => .vfmadd213ss, + 64 => .vfmadd213sd, + else => null, + }, + .Vector => switch (ty.childType().zigTypeTag()) { + .Float => switch (ty.childType().floatBits(self.target.*)) { + 32 => switch (ty.vectorLen()) { + 1 => .vfmadd213ss, + 2...8 => .vfmadd213ps, + else => null, + }, + 64 => switch (ty.vectorLen()) { + 1 => .vfmadd213sd, + 2...4 => .vfmadd213pd, + else => null, + }, + else => null, + }, + else => null, + }, + else => unreachable, + } + else if (mem.eql(u2, &order, &.{ 2, 3, 1 }) or mem.eql(u2, &order, &.{ 3, 2, 1 })) + switch (ty.zigTypeTag()) { + .Float => switch (ty.floatBits(self.target.*)) { + 32 => .vfmadd231ss, + 64 => .vfmadd231sd, + else => null, + }, + .Vector => switch (ty.childType().zigTypeTag()) { + .Float => switch (ty.childType().floatBits(self.target.*)) { + 32 => switch (ty.vectorLen()) { + 1 => .vfmadd231ss, + 2...8 => .vfmadd231ps, + else => null, + }, + 64 => switch (ty.vectorLen()) { + 1 => .vfmadd231sd, + 2...4 => .vfmadd231pd, + else => null, + }, + else => null, + }, + else => null, + }, + else => null, + } + else + unreachable; + if (tag == null) return self.fail("TODO implement airMulAdd for {}", .{ + ty.fmt(self.bin_file.options.module.?), + }); + + var mops: [3]MCValue = undefined; + for (order, mcvs) |mop_index, mcv| mops[mop_index - 1] = mcv; + + const abi_size = @intCast(u32, ty.abiSize(self.target.*)); + const mop1_reg = registerAlias(mops[0].getReg().?, abi_size); + const mop2_reg = registerAlias(mops[1].getReg().?, abi_size); + if (mops[2].isRegister()) + try self.asmRegisterRegisterRegister( + tag.?, + mop1_reg, + mop2_reg, + registerAlias(mops[2].getReg().?, abi_size), + ) + else + try self.asmRegisterRegisterMemory( + tag.?, + mop1_reg, + mop2_reg, + mops[2].mem(Memory.PtrSize.fromSize(abi_size)), + ); + return self.finishAir(inst, mops[0], ops); } fn resolveInst(self: *Self, ref: Air.Inst.Ref) 
InnerError!MCValue { diff --git a/src/arch/x86_64/Encoding.zig b/src/arch/x86_64/Encoding.zig index ada1e891fb..94bfa63999 100644 --- a/src/arch/x86_64/Encoding.zig +++ b/src/arch/x86_64/Encoding.zig @@ -340,6 +340,11 @@ pub const Mnemonic = enum { vpunpcklbw, vpunpckldq, vpunpcklqdq, vpunpcklwd, // F16C vcvtph2ps, vcvtps2ph, + // FMA + vfmadd132pd, vfmadd213pd, vfmadd231pd, + vfmadd132ps, vfmadd213ps, vfmadd231ps, + vfmadd132sd, vfmadd213sd, vfmadd231sd, + vfmadd132ss, vfmadd213ss, vfmadd231ss, // zig fmt: on }; @@ -368,12 +373,13 @@ pub const Op = enum { r8, r16, r32, r64, rm8, rm16, rm32, rm64, r32_m16, r64_m16, - m8, m16, m32, m64, m80, m128, + m8, m16, m32, m64, m80, m128, m256, rel8, rel16, rel32, m, moffs, sreg, xmm, xmm_m32, xmm_m64, xmm_m128, + ymm, ymm_m256, // zig fmt: on pub fn fromOperand(operand: Instruction.Operand) Op { @@ -385,6 +391,7 @@ pub const Op = enum { .segment => return .sreg, .floating_point => return switch (reg.bitSize()) { 128 => .xmm, + 256 => .ymm, else => unreachable, }, .general_purpose => { @@ -418,6 +425,7 @@ pub const Op = enum { 64 => .m64, 80 => .m80, 128 => .m128, + 256 => .m256, else => unreachable, }; }, @@ -454,7 +462,8 @@ pub const Op = enum { .eax, .r32, .rm32, .r32_m16 => unreachable, .rax, .r64, .rm64, .r64_m16 => unreachable, .xmm, .xmm_m32, .xmm_m64, .xmm_m128 => unreachable, - .m8, .m16, .m32, .m64, .m80, .m128 => unreachable, + .ymm, .ymm_m256 => unreachable, + .m8, .m16, .m32, .m64, .m80, .m128, .m256 => unreachable, .unity => 1, .imm8, .imm8s, .rel8 => 8, .imm16, .imm16s, .rel16 => 16, @@ -468,12 +477,13 @@ pub const Op = enum { .none, .o16, .o32, .o64, .moffs, .m, .sreg => unreachable, .unity, .imm8, .imm8s, .imm16, .imm16s, .imm32, .imm32s, .imm64 => unreachable, .rel8, .rel16, .rel32 => unreachable, - .m8, .m16, .m32, .m64, .m80, .m128 => unreachable, + .m8, .m16, .m32, .m64, .m80, .m128, .m256 => unreachable, .al, .cl, .r8, .rm8 => 8, .ax, .r16, .rm16 => 16, .eax, .r32, .rm32, .r32_m16 => 32, .rax, .r64, .rm64, .r64_m16 => 64, .xmm, .xmm_m32, .xmm_m64, .xmm_m128 => 128, + .ymm, .ymm_m256 => 256, }; } @@ -482,13 +492,14 @@ pub const Op = enum { .none, .o16, .o32, .o64, .moffs, .m, .sreg => unreachable, .unity, .imm8, .imm8s, .imm16, .imm16s, .imm32, .imm32s, .imm64 => unreachable, .rel8, .rel16, .rel32 => unreachable, - .al, .cl, .r8, .ax, .r16, .eax, .r32, .rax, .r64, .xmm => unreachable, + .al, .cl, .r8, .ax, .r16, .eax, .r32, .rax, .r64, .xmm, .ymm => unreachable, .m8, .rm8 => 8, .m16, .rm16, .r32_m16, .r64_m16 => 16, .m32, .rm32, .xmm_m32 => 32, .m64, .rm64, .xmm_m64 => 64, .m80 => 80, .m128, .xmm_m128 => 128, + .m256, .ymm_m256 => 256, }; } @@ -513,6 +524,7 @@ pub const Op = enum { .rm8, .rm16, .rm32, .rm64, .r32_m16, .r64_m16, .xmm, .xmm_m32, .xmm_m64, .xmm_m128, + .ymm, .ymm_m256, => true, else => false, }; @@ -539,7 +551,7 @@ pub const Op = enum { .r32_m16, .r64_m16, .m8, .m16, .m32, .m64, .m80, .m128, .m, - .xmm_m32, .xmm_m64, .xmm_m128, + .xmm_m32, .xmm_m64, .xmm_m128, .ymm_m256, => true, else => false, }; @@ -562,6 +574,7 @@ pub const Op = enum { .r32_m16, .r64_m16 => .general_purpose, .sreg => .segment, .xmm, .xmm_m32, .xmm_m64, .xmm_m128 => .floating_point, + .ymm, .ymm_m256 => .floating_point, }; } @@ -625,6 +638,7 @@ pub const Feature = enum { none, avx, f16c, + fma, sse, sse2, sse3, diff --git a/src/arch/x86_64/Lower.zig b/src/arch/x86_64/Lower.zig index d82d5ec300..a37f28c0c3 100644 --- a/src/arch/x86_64/Lower.zig +++ b/src/arch/x86_64/Lower.zig @@ -205,6 +205,19 @@ pub fn lowerMir(lower: *Lower, index: 
Mir.Inst.Index) Error!struct { .vcvtph2ps, .vcvtps2ph, + + .vfmadd132pd, + .vfmadd213pd, + .vfmadd231pd, + .vfmadd132ps, + .vfmadd213ps, + .vfmadd231ps, + .vfmadd132sd, + .vfmadd213sd, + .vfmadd231sd, + .vfmadd132ss, + .vfmadd213ss, + .vfmadd231ss, => try lower.mirGeneric(inst), .cmps, @@ -288,6 +301,8 @@ fn imm(lower: Lower, ops: Mir.Inst.Ops, i: u32) Immediate { .rmi_rip, .mri_sib, .mri_rip, + .rrm_sib, + .rrm_rip, .rrmi_sib, .rrmi_rip, => Immediate.u(i), @@ -310,6 +325,7 @@ fn mem(lower: Lower, ops: Mir.Inst.Ops, payload: u32) Memory { .mr_sib, .mrr_sib, .mri_sib, + .rrm_sib, .rrmi_sib, .lock_m_sib, .lock_mi_sib_u, @@ -327,6 +343,7 @@ fn mem(lower: Lower, ops: Mir.Inst.Ops, payload: u32) Memory { .mr_rip, .mrr_rip, .mri_rip, + .rrm_rip, .rrmi_rip, .lock_m_rip, .lock_mi_rip_u, @@ -449,6 +466,11 @@ fn mirGeneric(lower: *Lower, inst: Mir.Inst) Error!void { .{ .reg = inst.data.rix.r }, .{ .imm = lower.imm(inst.ops, inst.data.rix.i) }, }, + .rrm_sib, .rrm_rip => &.{ + .{ .reg = inst.data.rrx.r1 }, + .{ .reg = inst.data.rrx.r2 }, + .{ .mem = lower.mem(inst.ops, inst.data.rrx.payload) }, + }, .rrmi_sib, .rrmi_rip => &.{ .{ .reg = inst.data.rrix.r1 }, .{ .reg = inst.data.rrix.r2 }, diff --git a/src/arch/x86_64/Mir.zig b/src/arch/x86_64/Mir.zig index e261f6dc38..92a9a74fbb 100644 --- a/src/arch/x86_64/Mir.zig +++ b/src/arch/x86_64/Mir.zig @@ -324,6 +324,31 @@ pub const Inst = struct { /// Convert single-precision floating-point values to 16-bit floating-point values vcvtps2ph, + /// Fused multiply-add of packed double-precision floating-point values + vfmadd132pd, + /// Fused multiply-add of packed double-precision floating-point values + vfmadd213pd, + /// Fused multiply-add of packed double-precision floating-point values + vfmadd231pd, + /// Fused multiply-add of packed single-precision floating-point values + vfmadd132ps, + /// Fused multiply-add of packed single-precision floating-point values + vfmadd213ps, + /// Fused multiply-add of packed single-precision floating-point values + vfmadd231ps, + /// Fused multiply-add of scalar double-precision floating-point values + vfmadd132sd, + /// Fused multiply-add of scalar double-precision floating-point values + vfmadd213sd, + /// Fused multiply-add of scalar double-precision floating-point values + vfmadd231sd, + /// Fused multiply-add of scalar single-precision floating-point values + vfmadd132ss, + /// Fused multiply-add of scalar single-precision floating-point values + vfmadd213ss, + /// Fused multiply-add of scalar single-precision floating-point values + vfmadd231ss, + /// Compare string operands cmps, /// Load string @@ -434,6 +459,12 @@ pub const Inst = struct { /// Register, memory (SIB), immediate (byte) operands. /// Uses `rix` payload with extra data of type `MemorySib`. rmi_sib, + /// Register, register, memory (RIP). + /// Uses `rrix` payload with extra data of type `MemoryRip`. + rrm_rip, + /// Register, register, memory (SIB). + /// Uses `rrix` payload with extra data of type `MemorySib`. + rrm_sib, /// Register, register, memory (RIP), immediate (byte) operands. /// Uses `rrix` payload with extra data of type `MemoryRip`. 
rrmi_rip, diff --git a/src/arch/x86_64/bits.zig b/src/arch/x86_64/bits.zig index 77dc0cfb7c..b73a37d6cb 100644 --- a/src/arch/x86_64/bits.zig +++ b/src/arch/x86_64/bits.zig @@ -485,7 +485,9 @@ pub const Memory = union(enum) { dword, qword, tbyte, - dqword, + xword, + yword, + zword, pub fn fromSize(size: u32) PtrSize { return switch (size) { @@ -493,7 +495,9 @@ pub const Memory = union(enum) { 2...2 => .word, 3...4 => .dword, 5...8 => .qword, - 9...16 => .dqword, + 9...16 => .xword, + 17...32 => .yword, + 33...64 => .zword, else => unreachable, }; } @@ -505,7 +509,9 @@ pub const Memory = union(enum) { 32 => .dword, 64 => .qword, 80 => .tbyte, - 128 => .dqword, + 128 => .xword, + 256 => .yword, + 512 => .zword, else => unreachable, }; } @@ -517,7 +523,9 @@ pub const Memory = union(enum) { .dword => 32, .qword => 64, .tbyte => 80, - .dqword => 128, + .xword => 128, + .yword => 256, + .zword => 512, }; } }; diff --git a/src/arch/x86_64/encodings.zig b/src/arch/x86_64/encodings.zig index 5d2630e9a8..dd05728e24 100644 --- a/src/arch/x86_64/encodings.zig +++ b/src/arch/x86_64/encodings.zig @@ -1016,5 +1016,28 @@ pub const table = [_]Entry{ .{ .vcvtph2ps, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x13 }, 0, .vex_128, .f16c }, .{ .vcvtps2ph, .mri, &.{ .xmm_m64, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x1d }, 0, .vex_128, .f16c }, + + // FMA + .{ .vfmadd132pd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x98 }, 0, .vex_128_long, .fma }, + .{ .vfmadd132pd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x98 }, 0, .vex_256_long, .fma }, + .{ .vfmadd213pd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0xa8 }, 0, .vex_128_long, .fma }, + .{ .vfmadd213pd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0xa8 }, 0, .vex_256_long, .fma }, + .{ .vfmadd231pd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0xb8 }, 0, .vex_128_long, .fma }, + .{ .vfmadd231pd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0xb8 }, 0, .vex_256_long, .fma }, + + .{ .vfmadd132ps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x98 }, 0, .vex_128, .fma }, + .{ .vfmadd132ps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x98 }, 0, .vex_256, .fma }, + .{ .vfmadd213ps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0xa8 }, 0, .vex_128, .fma }, + .{ .vfmadd213ps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0xa8 }, 0, .vex_256, .fma }, + .{ .vfmadd231ps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0xb8 }, 0, .vex_128, .fma }, + .{ .vfmadd231ps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0xb8 }, 0, .vex_256, .fma }, + + .{ .vfmadd132sd, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x99 }, 0, .vex_128_long, .fma }, + .{ .vfmadd213sd, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0xa9 }, 0, .vex_128_long, .fma }, + .{ .vfmadd231sd, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0xb9 }, 0, .vex_128_long, .fma }, + + .{ .vfmadd132ss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0x99 }, 0, .vex_128, .fma }, + .{ .vfmadd213ss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0xa9 }, 0, .vex_128, .fma }, + .{ .vfmadd231ss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0xb9 }, 0, .vex_128, .fma }, }; // zig fmt: on diff --git a/test/behavior/muladd.zig b/test/behavior/muladd.zig index aa36c99784..8656dc4f45 100644 --- a/test/behavior/muladd.zig +++ b/test/behavior/muladd.zig @@ -1,8 +1,10 @@ +const std = @import("std"); const builtin = 
@import("builtin"); -const expect = @import("std").testing.expect; +const expect = std.testing.expect; test "@mulAdd" { - if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO + if (builtin.zig_backend == .stage2_x86_64 and + !comptime std.Target.x86.featureSetHas(builtin.cpu.features, .fma)) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO -- cgit v1.2.3 From cba195c1170fff77c5210f023e019d72f13b9614 Mon Sep 17 00:00:00 2001 From: Jacob Young Date: Sat, 6 May 2023 22:27:39 -0400 Subject: x86_64: implement some float and float vector movement This allows actually storing value of these supported types in registers, and not restricting them to stack slots. --- src/arch/x86_64/CodeGen.zig | 127 +++++++++++++++++++++++++++++++++--------- src/arch/x86_64/Encoding.zig | 18 ++++-- src/arch/x86_64/Lower.zig | 6 ++ src/arch/x86_64/Mir.zig | 12 ++++ src/arch/x86_64/encoder.zig | 13 +++-- src/arch/x86_64/encodings.zig | 30 ++++++++++ test/behavior/math.zig | 3 +- test/behavior/muladd.zig | 10 ++-- 8 files changed, 176 insertions(+), 43 deletions(-) (limited to 'src/arch/x86_64/Encoding.zig') diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig index fffb814d7f..3e47ef63f6 100644 --- a/src/arch/x86_64/CodeGen.zig +++ b/src/arch/x86_64/CodeGen.zig @@ -2008,6 +2008,11 @@ fn computeFrameLayout(self: *Self) !FrameLayout { }; } +fn getFrameAddrAlignment(self: *Self, frame_addr: FrameAddr) u32 { + const alloc_align = @as(u32, 1) << self.frame_allocs.get(@enumToInt(frame_addr.index)).abi_align; + return @min(alloc_align, @bitCast(u32, frame_addr.off) & (alloc_align - 1)); +} + fn allocFrameIndex(self: *Self, alloc: FrameAlloc) !FrameIndex { const frame_allocs_slice = self.frame_allocs.slice(); const frame_size = frame_allocs_slice.items(.abi_size); @@ -2051,24 +2056,36 @@ fn allocTempRegOrMem(self: *Self, elem_ty: Type, reg_ok: bool) !MCValue { return self.allocRegOrMemAdvanced(elem_ty, null, reg_ok); } -fn allocRegOrMemAdvanced(self: *Self, elem_ty: Type, inst: ?Air.Inst.Index, reg_ok: bool) !MCValue { - const abi_size = math.cast(u32, elem_ty.abiSize(self.target.*)) orelse { +fn allocRegOrMemAdvanced(self: *Self, ty: Type, inst: ?Air.Inst.Index, reg_ok: bool) !MCValue { + const abi_size = math.cast(u32, ty.abiSize(self.target.*)) orelse { const mod = self.bin_file.options.module.?; - return self.fail("type '{}' too big to fit into stack frame", .{elem_ty.fmt(mod)}); + return self.fail("type '{}' too big to fit into stack frame", .{ty.fmt(mod)}); }; - if (reg_ok) { - // Make sure the type can fit in a register before we try to allocate one. 
- const ptr_bits = self.target.cpu.arch.ptrBitWidth(); - const ptr_bytes: u64 = @divExact(ptr_bits, 8); - if (abi_size <= ptr_bytes) { - if (self.register_manager.tryAllocReg(inst, regClassForType(elem_ty))) |reg| { + if (reg_ok) need_mem: { + if (abi_size <= @as(u32, switch (ty.zigTypeTag()) { + .Float => switch (ty.floatBits(self.target.*)) { + 16, 32, 64, 128 => 16, + 80 => break :need_mem, + else => unreachable, + }, + .Vector => switch (ty.childType().zigTypeTag()) { + .Float => switch (ty.childType().floatBits(self.target.*)) { + 16, 32, 64 => if (self.hasFeature(.avx)) 32 else 16, + 80, 128 => break :need_mem, + else => unreachable, + }, + else => break :need_mem, + }, + else => 8, + })) { + if (self.register_manager.tryAllocReg(inst, regClassForType(ty))) |reg| { return MCValue{ .register = registerAlias(reg, abi_size) }; } } } - const frame_index = try self.allocFrameIndex(FrameAlloc.initType(elem_ty, self.target.*)); + const frame_index = try self.allocFrameIndex(FrameAlloc.initType(ty, self.target.*)); return .{ .load_frame = .{ .index = frame_index } }; } @@ -4442,12 +4459,19 @@ fn airRound(self: *Self, inst: Air.Inst.Index, mode: Immediate) !void { }), }; assert(dst_mcv.isRegister()); + const abi_size = @intCast(u32, ty.abiSize(self.target.*)); + const dst_reg = registerAlias(dst_mcv.getReg().?, abi_size); if (src_mcv.isRegister()) - try self.asmRegisterRegisterImmediate(mir_tag, dst_mcv.getReg().?, src_mcv.getReg().?, mode) + try self.asmRegisterRegisterImmediate( + mir_tag, + dst_reg, + registerAlias(src_mcv.getReg().?, abi_size), + mode, + ) else try self.asmRegisterMemoryImmediate( mir_tag, - dst_mcv.getReg().?, + dst_reg, src_mcv.mem(Memory.PtrSize.fromSize(@intCast(u32, ty.abiSize(self.target.*)))), mode, ); @@ -7847,19 +7871,43 @@ fn airAsm(self: *Self, inst: Air.Inst.Index) !void { return self.finishAirResult(inst, result); } -fn movMirTag(self: *Self, ty: Type) !Mir.Inst.Tag { - return switch (ty.zigTypeTag()) { - else => .mov, +fn movMirTag(self: *Self, ty: Type, aligned: bool) !Mir.Inst.Tag { + switch (ty.zigTypeTag()) { + else => return .mov, .Float => switch (ty.floatBits(self.target.*)) { 16 => unreachable, // needs special handling - 32 => .movss, - 64 => .movsd, - 128 => .movaps, - else => return self.fail("TODO movMirTag from {}", .{ - ty.fmt(self.bin_file.options.module.?), - }), + 32 => return if (self.hasFeature(.avx)) .vmovss else .movss, + 64 => return if (self.hasFeature(.avx)) .vmovsd else .movsd, + 128 => return if (self.hasFeature(.avx)) + if (aligned) .vmovaps else .vmovups + else if (aligned) .movaps else .movups, + else => {}, }, - }; + .Vector => switch (ty.childType().zigTypeTag()) { + .Float => switch (ty.childType().floatBits(self.target.*)) { + 16 => unreachable, // needs special handling + 32 => switch (ty.vectorLen()) { + 1 => return if (self.hasFeature(.avx)) .vmovss else .movss, + 2...4 => return if (self.hasFeature(.avx)) + if (aligned) .vmovaps else .vmovups + else if (aligned) .movaps else .movups, + 5...8 => if (self.hasFeature(.avx)) return if (aligned) .vmovaps else .vmovups, + else => {}, + }, + 64 => switch (ty.vectorLen()) { + 1 => return if (self.hasFeature(.avx)) .vmovsd else .movsd, + 2 => return if (self.hasFeature(.avx)) + if (aligned) .vmovaps else .vmovups + else if (aligned) .movaps else .movups, + 3...4 => if (self.hasFeature(.avx)) return if (aligned) .vmovaps else .vmovups, + else => {}, + }, + else => {}, + }, + else => {}, + }, + } + return self.fail("TODO movMirTag for {}", .{ty.fmt(self.bin_file.options.module.?)}); 
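    // Selection summary: scalar f32/f64 use (v)movss/(v)movsd; values that
    // fill a whole XMM register use (v)movaps when the operand is known to
    // satisfy the type's ABI alignment and (v)movups otherwise; 32-byte
    // vectors are only handled via the AVX forms; f16 is special-cased by
    // callers, and everything else falls through to the TODO above.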
} fn genCopy(self: *Self, ty: Type, dst_mcv: MCValue, src_mcv: MCValue) InnerError!void { @@ -8016,7 +8064,11 @@ fn genSetReg(self: *Self, dst_reg: Register, ty: Type, src_mcv: MCValue) InnerEr 0 => return self.genSetReg(dst_reg, ty, .{ .register = reg_off.reg }), else => .lea, }, - .indirect, .load_frame => try self.movMirTag(ty), + .indirect => try self.movMirTag(ty, false), + .load_frame => |frame_addr| try self.movMirTag( + ty, + self.getFrameAddrAlignment(frame_addr) >= ty.abiAlignment(self.target.*), + ), .lea_frame => .lea, else => unreachable, }, @@ -8040,7 +8092,11 @@ fn genSetReg(self: *Self, dst_reg: Register, ty: Type, src_mcv: MCValue) InnerEr ) else self.asmRegisterMemory( - try self.movMirTag(ty), + try self.movMirTag(ty, mem.isAlignedGeneric( + u32, + @bitCast(u32, small_addr), + ty.abiAlignment(self.target.*), + )), registerAlias(dst_reg, abi_size), src_mem, ); @@ -8080,7 +8136,7 @@ fn genSetReg(self: *Self, dst_reg: Register, ty: Type, src_mcv: MCValue) InnerEr ) else try self.asmRegisterMemory( - try self.movMirTag(ty), + try self.movMirTag(ty, false), registerAlias(dst_reg, abi_size), src_mem, ); @@ -8194,7 +8250,24 @@ fn genSetMem(self: *Self, base: Memory.Base, disp: i32, ty: Type, src_mcv: MCVal ) else try self.asmMemoryRegister( - try self.movMirTag(ty), + try self.movMirTag(ty, switch (base) { + .none => mem.isAlignedGeneric( + u32, + @bitCast(u32, disp), + ty.abiAlignment(self.target.*), + ), + .reg => |reg| switch (reg) { + .es, .cs, .ss, .ds => mem.isAlignedGeneric( + u32, + @bitCast(u32, disp), + ty.abiAlignment(self.target.*), + ), + else => false, + }, + .frame => |frame_index| self.getFrameAddrAlignment( + .{ .index = frame_index, .off = disp }, + ) >= ty.abiAlignment(self.target.*), + }), dst_mem, registerAlias(src_reg, abi_size), ); @@ -8415,7 +8488,7 @@ fn airBitCast(self: *Self, inst: Air.Inst.Index) !void { defer if (operand_lock) |lock| self.register_manager.unlockReg(lock); const dest = try self.allocRegOrMem(inst, true); - try self.genCopy(self.air.typeOfIndex(inst), dest, operand); + try self.genCopy(if (!dest.isMemory() or operand.isMemory()) dst_ty else src_ty, dest, operand); break :result dest; }; return self.finishAir(inst, result, .{ ty_op.operand, .none, .none }); diff --git a/src/arch/x86_64/Encoding.zig b/src/arch/x86_64/Encoding.zig index 94bfa63999..1fd1112aaf 100644 --- a/src/arch/x86_64/Encoding.zig +++ b/src/arch/x86_64/Encoding.zig @@ -206,7 +206,7 @@ pub fn format( try writer.print("+{s} ", .{tag}); }, .m, .mi, .m1, .mc, .vmi => try writer.print("/{d} ", .{encoding.modRmExt()}), - .mr, .rm, .rmi, .mri, .mrc, .rvm, .rvmi => try writer.writeAll("/r "), + .mr, .rm, .rmi, .mri, .mrc, .rvm, .rvmi, .mvr => try writer.writeAll("/r "), } switch (encoding.data.op_en) { @@ -230,7 +230,7 @@ pub fn format( }; try writer.print("{s} ", .{tag}); }, - .np, .fd, .td, .o, .m, .m1, .mc, .mr, .rm, .mrc, .rvm => {}, + .np, .fd, .td, .o, .m, .m1, .mc, .mr, .rm, .mrc, .rvm, .mvr => {}, } try writer.print("{s} ", .{@tagName(encoding.mnemonic)}); @@ -332,7 +332,12 @@ pub const Mnemonic = enum { // SSE4.1 roundsd, roundss, // AVX - vmovddup, vmovshdup, vmovsldup, + vmovapd, vmovaps, + vmovddup, + vmovsd, + vmovshdup, vmovsldup, + vmovss, + vmovupd, vmovups, vpextrw, vpinsrw, vpshufhw, vpshuflw, vpsrld, vpsrlq, vpsrlw, @@ -357,7 +362,7 @@ pub const OpEn = enum { fd, td, m1, mc, mi, mr, rm, rmi, mri, mrc, - vmi, rvm, rvmi, + vmi, rvm, rvmi, mvr, // zig fmt: on }; @@ -549,9 +554,10 @@ pub const Op = enum { return switch (op) { .rm8, .rm16, .rm32, .rm64, 
.r32_m16, .r64_m16, - .m8, .m16, .m32, .m64, .m80, .m128, + .m8, .m16, .m32, .m64, .m80, .m128, .m256, .m, - .xmm_m32, .xmm_m64, .xmm_m128, .ymm_m256, + .xmm_m32, .xmm_m64, .xmm_m128, + .ymm_m256, => true, else => false, }; diff --git a/src/arch/x86_64/Lower.zig b/src/arch/x86_64/Lower.zig index a37f28c0c3..a246a97d4b 100644 --- a/src/arch/x86_64/Lower.zig +++ b/src/arch/x86_64/Lower.zig @@ -184,9 +184,15 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct { .roundsd, .roundss, + .vmovapd, + .vmovaps, .vmovddup, + .vmovsd, .vmovshdup, .vmovsldup, + .vmovss, + .vmovupd, + .vmovups, .vpextrw, .vpinsrw, .vpshufhw, diff --git a/src/arch/x86_64/Mir.zig b/src/arch/x86_64/Mir.zig index 92a9a74fbb..de7f2cff53 100644 --- a/src/arch/x86_64/Mir.zig +++ b/src/arch/x86_64/Mir.zig @@ -282,12 +282,24 @@ pub const Inst = struct { /// Round scalar single-precision floating-point values roundss, + /// Move aligned packed double-precision floating-point values + vmovapd, + /// Move aligned packed single-precision floating-point values + vmovaps, /// Replicate double floating-point values vmovddup, + /// Move or merge scalar double-precision floating-point value + vmovsd, /// Replicate single floating-point values vmovshdup, /// Replicate single floating-point values vmovsldup, + /// Move or merge scalar single-precision floating-point value + vmovss, + /// Move unaligned packed double-precision floating-point values + vmovupd, + /// Move unaligned packed single-precision floating-point values + vmovups, /// Extract word vpextrw, /// Insert word diff --git a/src/arch/x86_64/encoder.zig b/src/arch/x86_64/encoder.zig index 47211591ec..fa6ce676cb 100644 --- a/src/arch/x86_64/encoder.zig +++ b/src/arch/x86_64/encoder.zig @@ -228,7 +228,7 @@ pub const Instruction = struct { .td => try encoder.imm64(inst.ops[0].mem.moffs.offset), else => { const mem_op = switch (data.op_en) { - .m, .mi, .m1, .mc, .mr, .mri, .mrc => inst.ops[0], + .m, .mi, .m1, .mc, .mr, .mri, .mrc, .mvr => inst.ops[0], .rm, .rmi, .vmi => inst.ops[1], .rvm, .rvmi => inst.ops[2], else => unreachable, @@ -239,6 +239,7 @@ pub const Instruction = struct { .m, .mi, .m1, .mc, .vmi => enc.modRmExt(), .mr, .mri, .mrc => inst.ops[1].reg.lowEnc(), .rm, .rmi, .rvm, .rvmi => inst.ops[0].reg.lowEnc(), + .mvr => inst.ops[2].reg.lowEnc(), else => unreachable, }; try encoder.modRm_direct(rm, reg.lowEnc()); @@ -248,6 +249,7 @@ pub const Instruction = struct { .m, .mi, .m1, .mc, .vmi => .none, .mr, .mri, .mrc => inst.ops[1], .rm, .rmi, .rvm, .rvmi => inst.ops[0], + .mvr => inst.ops[2], else => unreachable, }; try encodeMemory(enc, mem, op, encoder); @@ -315,7 +317,7 @@ pub const Instruction = struct { } else null, - .vmi, .rvm, .rvmi => unreachable, + .vmi, .rvm, .rvmi, .mvr => unreachable, }; if (segment_override) |seg| { legacy.setSegmentOverride(seg); @@ -350,7 +352,7 @@ pub const Instruction = struct { rex.b = b_x_op.isBaseExtended(); rex.x = b_x_op.isIndexExtended(); }, - .vmi, .rvm, .rvmi => unreachable, + .vmi, .rvm, .rvmi, .mvr => unreachable, } try encoder.rex(rex); @@ -372,10 +374,11 @@ pub const Instruction = struct { switch (op_en) { .np, .i, .zi, .fd, .td, .d => {}, .o, .oi => vex.b = inst.ops[0].reg.isExtended(), - .m, .mi, .m1, .mc, .mr, .rm, .rmi, .mri, .mrc, .vmi, .rvm, .rvmi => { + .m, .mi, .m1, .mc, .mr, .rm, .rmi, .mri, .mrc, .vmi, .rvm, .rvmi, .mvr => { const r_op = switch (op_en) { .rm, .rmi, .rvm, .rvmi => inst.ops[0], .mr, .mri, .mrc => inst.ops[1], + .mvr => inst.ops[2], .m, .mi, .m1, .mc, .vmi => .none, else => unreachable, 
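                        // For .mvr (e.g. the three-operand `vmovss` store
                        // form), ops[0] is the ModRM r/m destination, ops[1]
                        // is the middle (VEX.vvvv) operand, and ops[2]
                        // supplies ModRM.reg as selected here; under this
                        // scheme `vmovss xmm1, xmm2, xmm3` should encode as
                        // c5 ea 11 d9.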
}; @@ -383,7 +386,7 @@ pub const Instruction = struct { const b_x_op = switch (op_en) { .rm, .rmi, .vmi => inst.ops[1], - .m, .mi, .m1, .mc, .mr, .mri, .mrc => inst.ops[0], + .m, .mi, .m1, .mc, .mr, .mri, .mrc, .mvr => inst.ops[0], .rvm, .rvmi => inst.ops[2], else => unreachable, }; diff --git a/src/arch/x86_64/encodings.zig b/src/arch/x86_64/encodings.zig index dd05728e24..607a87b8d9 100644 --- a/src/arch/x86_64/encodings.zig +++ b/src/arch/x86_64/encodings.zig @@ -974,12 +974,42 @@ pub const table = [_]Entry{ .{ .roundsd, .rmi, &.{ .xmm, .xmm_m64, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0b }, 0, .none, .sse4_1 }, // AVX + .{ .vmovapd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x28 }, 0, .vex_128, .avx }, + .{ .vmovapd, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x66, 0x0f, 0x29 }, 0, .vex_128, .avx }, + .{ .vmovapd, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x28 }, 0, .vex_256, .avx }, + .{ .vmovapd, .mr, &.{ .ymm_m256, .ymm }, &.{ 0x66, 0x0f, 0x29 }, 0, .vex_256, .avx }, + + .{ .vmovaps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x28 }, 0, .vex_128, .avx }, + .{ .vmovaps, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x0f, 0x29 }, 0, .vex_128, .avx }, + .{ .vmovaps, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x0f, 0x28 }, 0, .vex_256, .avx }, + .{ .vmovaps, .mr, &.{ .ymm_m256, .ymm }, &.{ 0x0f, 0x29 }, 0, .vex_256, .avx }, + .{ .vmovddup, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x12 }, 0, .vex_128, .avx }, + .{ .vmovsd, .rvm, &.{ .xmm, .xmm, .xmm }, &.{ 0xf2, 0x0f, 0x10 }, 0, .vex_128, .avx }, + .{ .vmovsd, .rm, &.{ .xmm, .m64 }, &.{ 0xf2, 0x0f, 0x10 }, 0, .vex_128, .avx }, + .{ .vmovsd, .mvr, &.{ .xmm, .xmm, .xmm }, &.{ 0xf2, 0x0f, 0x11 }, 0, .vex_128, .avx }, + .{ .vmovsd, .mr, &.{ .m64, .xmm }, &.{ 0xf2, 0x0f, 0x11 }, 0, .vex_128, .avx }, + .{ .vmovshdup, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0xf3, 0x0f, 0x16 }, 0, .vex_128, .avx }, .{ .vmovsldup, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0xf3, 0x0f, 0x12 }, 0, .vex_128, .avx }, + .{ .vmovss, .rvm, &.{ .xmm, .xmm, .xmm }, &.{ 0xf3, 0x0f, 0x10 }, 0, .vex_128, .avx }, + .{ .vmovss, .rm, &.{ .xmm, .m32 }, &.{ 0xf3, 0x0f, 0x10 }, 0, .vex_128, .avx }, + .{ .vmovss, .mvr, &.{ .xmm, .xmm, .xmm }, &.{ 0xf3, 0x0f, 0x11 }, 0, .vex_128, .avx }, + .{ .vmovss, .mr, &.{ .m32, .xmm }, &.{ 0xf3, 0x0f, 0x11 }, 0, .vex_128, .avx }, + + .{ .vmovupd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x10 }, 0, .vex_128, .avx }, + .{ .vmovupd, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x66, 0x0f, 0x11 }, 0, .vex_128, .avx }, + .{ .vmovupd, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x10 }, 0, .vex_256, .avx }, + .{ .vmovupd, .mr, &.{ .ymm_m256, .ymm }, &.{ 0x66, 0x0f, 0x11 }, 0, .vex_256, .avx }, + + .{ .vmovups, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x10 }, 0, .vex_128, .avx }, + .{ .vmovups, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x0f, 0x11 }, 0, .vex_128, .avx }, + .{ .vmovups, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x0f, 0x10 }, 0, .vex_256, .avx }, + .{ .vmovups, .mr, &.{ .ymm_m256, .ymm }, &.{ 0x0f, 0x11 }, 0, .vex_256, .avx }, + .{ .vpextrw, .mri, &.{ .r32, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x15 }, 0, .vex_128, .avx }, .{ .vpextrw, .mri, &.{ .r64, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x15 }, 0, .vex_128_long, .avx }, .{ .vpextrw, .mri, &.{ .r32_m16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .vex_128, .avx }, diff --git a/test/behavior/math.zig b/test/behavior/math.zig index 0362bd3a2b..7e16111059 100644 --- a/test/behavior/math.zig +++ b/test/behavior/math.zig @@ -399,7 +399,8 @@ fn testBinaryNot128(comptime Type: type, x: Type) !void { test "division" { if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO - if 
(builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO + if (builtin.zig_backend == .stage2_x86_64 and + !comptime std.Target.x86.featureSetHas(builtin.cpu.features, .sse4_1)) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO diff --git a/test/behavior/muladd.zig b/test/behavior/muladd.zig index 8656dc4f45..bfb94de270 100644 --- a/test/behavior/muladd.zig +++ b/test/behavior/muladd.zig @@ -2,9 +2,11 @@ const std = @import("std"); const builtin = @import("builtin"); const expect = std.testing.expect; +const stage2_x86_64_without_hardware_fma_support = builtin.zig_backend == .stage2_x86_64 and + !std.Target.x86.featureSetHas(builtin.cpu.features, .fma); + test "@mulAdd" { - if (builtin.zig_backend == .stage2_x86_64 and - !comptime std.Target.x86.featureSetHas(builtin.cpu.features, .fma)) return error.SkipZigTest; // TODO + if (stage2_x86_64_without_hardware_fma_support) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO @@ -118,7 +120,7 @@ fn vector32() !void { test "vector f32" { if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO - if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO + if (stage2_x86_64_without_hardware_fma_support) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO @@ -141,7 +143,7 @@ fn vector64() !void { test "vector f64" { if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO - if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO + if (stage2_x86_64_without_hardware_fma_support) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO -- cgit v1.2.3 From 05580b9453e4ae2d9b62fe4178651937d8b73989 Mon Sep 17 00:00:00 2001 From: Jacob Young Date: Sun, 7 May 2023 03:14:31 -0400 Subject: x86_64: implement float cast from `f16` to `f64` --- src/arch/x86_64/CodeGen.zig | 95 ++++++++++++++------ src/arch/x86_64/Encoding.zig | 165 ++++++++++++++++++++--------------- src/arch/x86_64/Lower.zig | 4 + src/arch/x86_64/Mir.zig | 8 ++ src/arch/x86_64/encoder.zig | 33 +++---- src/arch/x86_64/encodings.zig | 195 ++++++++++++++++++++++-------------------- test/behavior/floatop.zig | 3 +- 7 files changed, 292 insertions(+), 211 deletions(-) (limited to 'src/arch/x86_64/Encoding.zig') diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig index 3e47ef63f6..38497400f2 100644 --- a/src/arch/x86_64/CodeGen.zig +++ b/src/arch/x86_64/CodeGen.zig @@ -2287,26 +2287,46 @@ fn airFptrunc(self: *Self, inst: Air.Inst.Index) !void { src_mcv else try self.copyToRegisterWithInstTracking(inst, dst_ty, src_mcv); - const dst_lock = self.register_manager.lockReg(dst_mcv.register); + const dst_reg = dst_mcv.getReg().?.to128(); + const dst_lock = 
self.register_manager.lockReg(dst_reg); defer if (dst_lock) |lock| self.register_manager.unlockReg(lock); - if (src_bits == 32 and dst_bits == 16 and self.hasFeature(.f16c)) - try self.asmRegisterRegisterImmediate( - .vcvtps2ph, - dst_mcv.register, - if (src_mcv.isRegister()) src_mcv.getReg().? else src_reg: { - const src_reg = dst_mcv.register; - try self.genSetReg(src_reg, src_ty, src_mcv); - break :src_reg src_reg; + if (dst_bits == 16 and self.hasFeature(.f16c)) { + switch (src_bits) { + 32 => { + const mat_src_reg = if (src_mcv.isRegister()) + src_mcv.getReg().? + else + try self.copyToTmpRegister(src_ty, src_mcv); + try self.asmRegisterRegisterImmediate( + .vcvtps2ph, + dst_reg, + mat_src_reg.to128(), + Immediate.u(0b1_00), + ); }, - Immediate.u(0b1_00), - ) - else if (src_bits == 64 and dst_bits == 32) - try self.genBinOpMir(.cvtsd2ss, src_ty, dst_mcv, src_mcv) - else - return self.fail("TODO implement airFptrunc from {} to {}", .{ - src_ty.fmt(self.bin_file.options.module.?), dst_ty.fmt(self.bin_file.options.module.?), - }); + else => return self.fail("TODO implement airFptrunc from {} to {}", .{ + src_ty.fmt(self.bin_file.options.module.?), dst_ty.fmt(self.bin_file.options.module.?), + }), + } + } else if (src_bits == 64 and dst_bits == 32) { + if (self.hasFeature(.avx)) if (src_mcv.isRegister()) try self.asmRegisterRegisterRegister( + .vcvtsd2ss, + dst_reg, + dst_reg, + src_mcv.getReg().?.to128(), + ) else try self.asmRegisterRegisterMemory( + .vcvtsd2ss, + dst_reg, + dst_reg, + src_mcv.mem(.qword), + ) else if (src_mcv.isRegister()) + try self.asmRegisterRegister(.cvtsd2ss, dst_reg, src_mcv.getReg().?.to128()) + else + try self.asmRegisterMemory(.cvtsd2ss, dst_reg, src_mcv.mem(.qword)); + } else return self.fail("TODO implement airFptrunc from {} to {}", .{ + src_ty.fmt(self.bin_file.options.module.?), dst_ty.fmt(self.bin_file.options.module.?), + }); return self.finishAir(inst, dst_mcv, .{ ty_op.operand, .none, .none }); } @@ -2322,22 +2342,41 @@ fn airFpext(self: *Self, inst: Air.Inst.Index) !void { src_mcv else try self.copyToRegisterWithInstTracking(inst, dst_ty, src_mcv); - const dst_lock = self.register_manager.lockReg(dst_mcv.register); + const dst_reg = dst_mcv.getReg().?.to128(); + const dst_lock = self.register_manager.lockReg(dst_reg); defer if (dst_lock) |lock| self.register_manager.unlockReg(lock); - try self.genBinOpMir( - if (src_bits == 16 and dst_bits == 32 and self.hasFeature(.f16c)) - .vcvtph2ps - else if (src_bits == 32 and dst_bits == 64) - .cvtss2sd + if (src_bits == 16 and self.hasFeature(.f16c)) { + const mat_src_reg = if (src_mcv.isRegister()) + src_mcv.getReg().? 
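            // (vcvtph2ps below widens the materialized f16 source to f32;
            // when the destination is f64, a second vcvtss2sd step then
            // widens the f32 result in place)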
else - return self.fail("TODO implement airFpext from {} to {}", .{ + try self.copyToTmpRegister(src_ty, src_mcv); + try self.asmRegisterRegister(.vcvtph2ps, dst_reg, mat_src_reg.to128()); + switch (dst_bits) { + 32 => {}, + 64 => try self.asmRegisterRegisterRegister(.vcvtss2sd, dst_reg, dst_reg, dst_reg), + else => return self.fail("TODO implement airFpext from {} to {}", .{ src_ty.fmt(self.bin_file.options.module.?), dst_ty.fmt(self.bin_file.options.module.?), }), - src_ty, - dst_mcv, - src_mcv, - ); + } + } else if (src_bits == 32 and dst_bits == 64) { + if (self.hasFeature(.avx)) if (src_mcv.isRegister()) try self.asmRegisterRegisterRegister( + .vcvtss2sd, + dst_reg, + dst_reg, + src_mcv.getReg().?.to128(), + ) else try self.asmRegisterRegisterMemory( + .vcvtss2sd, + dst_reg, + dst_reg, + src_mcv.mem(.dword), + ) else if (src_mcv.isRegister()) + try self.asmRegisterRegister(.cvtss2sd, dst_reg, src_mcv.getReg().?.to128()) + else + try self.asmRegisterMemory(.cvtss2sd, dst_reg, src_mcv.mem(.dword)); + } else return self.fail("TODO implement airFpext from {} to {}", .{ + src_ty.fmt(self.bin_file.options.module.?), dst_ty.fmt(self.bin_file.options.module.?), + }); return self.finishAir(inst, dst_mcv, .{ ty_op.operand, .none, .none }); } diff --git a/src/arch/x86_64/Encoding.zig b/src/arch/x86_64/Encoding.zig index 1fd1112aaf..bd6e70c975 100644 --- a/src/arch/x86_64/Encoding.zig +++ b/src/arch/x86_64/Encoding.zig @@ -89,30 +89,13 @@ pub fn findByOpcode(opc: []const u8, prefixes: struct { if (modrm_ext) |ext| if (ext != data.modrm_ext) continue; if (!std.mem.eql(u8, opc, enc.opcode())) continue; if (prefixes.rex.w) { - switch (data.mode) { - .none, .short, .rex, .rex_short, .vex_128, .vex_256 => continue, - .long, .vex_128_long, .vex_256_long => {}, - } + if (!data.mode.isLong()) continue; } else if (prefixes.rex.present and !prefixes.rex.isSet()) { - switch (data.mode) { - .rex, .rex_short => {}, - else => continue, - } + if (!data.mode.isRex()) continue; } else if (prefixes.legacy.prefix_66) { - switch (data.mode) { - .short, .rex_short => {}, - .none, .rex, .vex_128, .vex_256 => continue, - .long, .vex_128_long, .vex_256_long => continue, - } + if (!data.mode.isShort()) continue; } else { - switch (data.mode) { - .none => switch (data.mode) { - .short, .rex_short => continue, - .none, .rex, .vex_128, .vex_256 => {}, - .long, .vex_128_long, .vex_256_long => {}, - }, - else => continue, - } + if (data.mode.isShort()) continue; } return enc; }; @@ -148,50 +131,39 @@ pub fn format( _ = fmt; var opc = encoding.opcode(); - switch (encoding.data.mode) { - else => {}, - .long => try writer.writeAll("REX.W + "), - .vex_128, .vex_128_long, .vex_256, .vex_256_long => { - try writer.writeAll("VEX."); - - switch (encoding.data.mode) { - .vex_128, .vex_128_long => try writer.writeAll("128"), - .vex_256, .vex_256_long => try writer.writeAll("256"), - else => unreachable, - } - - switch (opc[0]) { - else => {}, - 0x66, 0xf3, 0xf2 => { - try writer.print(".{X:0>2}", .{opc[0]}); - opc = opc[1..]; - }, - } + if (encoding.data.mode.isVex()) { + try writer.writeAll("VEX."); + + try writer.writeAll(switch (encoding.data.mode) { + .vex_128_w0, .vex_128_w1, .vex_128_wig => "128", + .vex_256_w0, .vex_256_w1, .vex_256_wig => "256", + .vex_lig_w0, .vex_lig_w1, .vex_lig_wig => "LIG", + .vex_lz_w0, .vex_lz_w1, .vex_lz_wig => "LZ", + else => unreachable, + }); - try writer.print(".{X:0>2}", .{opc[0]}); - opc = opc[1..]; + switch (opc[0]) { + else => {}, + 0x66, 0xf3, 0xf2 => { + try writer.print(".{X:0>2}", 
.{opc[0]}); + opc = opc[1..]; + }, + } - switch (opc[0]) { - else => {}, - 0x38, 0x3A => { - try writer.print("{X:0>2}", .{opc[0]}); - opc = opc[1..]; - }, - } + try writer.print(".{}", .{std.fmt.fmtSliceHexUpper(opc[0 .. opc.len - 1])}); + opc = opc[opc.len - 1 ..]; - try writer.writeByte('.'); - try writer.writeAll(switch (encoding.data.mode) { - .vex_128, .vex_256 => "W0", - .vex_128_long, .vex_256_long => "W1", - else => unreachable, - }); - try writer.writeByte(' '); - }, - } + try writer.writeAll(".W"); + try writer.writeAll(switch (encoding.data.mode) { + .vex_128_w0, .vex_256_w0, .vex_lig_w0, .vex_lz_w0 => "0", + .vex_128_w1, .vex_256_w1, .vex_lig_w1, .vex_lz_w1 => "1", + .vex_128_wig, .vex_256_wig, .vex_lig_wig, .vex_lz_wig => "IG", + else => unreachable, + }); - for (opc) |byte| { - try writer.print("{x:0>2} ", .{byte}); - } + try writer.writeByte(' '); + } else if (encoding.data.mode.isLong()) try writer.writeAll("REX.W + "); + for (opc) |byte| try writer.print("{x:0>2} ", .{byte}); switch (encoding.data.op_en) { .np, .fd, .td, .i, .zi, .d => {}, @@ -332,6 +304,7 @@ pub const Mnemonic = enum { // SSE4.1 roundsd, roundss, // AVX + vcvtsd2ss, vcvtsi2sd, vcvtsi2ss, vcvtss2sd, vmovapd, vmovaps, vmovddup, vmovsd, @@ -629,20 +602,74 @@ pub const Op = enum { }; pub const Mode = enum { + // zig fmt: off none, - short, - long, - rex, - rex_short, - vex_128, - vex_128_long, - vex_256, - vex_256_long, + short, long, + rex, rex_short, + vex_128_w0, vex_128_w1, vex_128_wig, + vex_256_w0, vex_256_w1, vex_256_wig, + vex_lig_w0, vex_lig_w1, vex_lig_wig, + vex_lz_w0, vex_lz_w1, vex_lz_wig, + // zig fmt: on + + pub fn isShort(mode: Mode) bool { + return switch (mode) { + .short, .rex_short => true, + else => false, + }; + } + + pub fn isLong(mode: Mode) bool { + return switch (mode) { + .long, + .vex_128_w1, + .vex_256_w1, + .vex_lig_w1, + .vex_lz_w1, + => true, + else => false, + }; + } + + pub fn isRex(mode: Mode) bool { + return switch (mode) { + else => false, + .rex, .rex_short => true, + }; + } + + pub fn isVex(mode: Mode) bool { + return switch (mode) { + // zig fmt: off + else => false, + .vex_128_w0, .vex_128_w1, .vex_128_wig, + .vex_256_w0, .vex_256_w1, .vex_256_wig, + .vex_lig_w0, .vex_lig_w1, .vex_lig_wig, + .vex_lz_w0, .vex_lz_w1, .vex_lz_wig, + => true, + // zig fmt: on + }; + } + + pub fn isVecLong(mode: Mode) bool { + return switch (mode) { + // zig fmt: off + else => unreachable, + .vex_128_w0, .vex_128_w1, .vex_128_wig, + .vex_lig_w0, .vex_lig_w1, .vex_lig_wig, + .vex_lz_w0, .vex_lz_w1, .vex_lz_wig, + => false, + .vex_256_w0, .vex_256_w1, .vex_256_wig, + => true, + // zig fmt: on + }; + } }; pub const Feature = enum { none, avx, + avx2, f16c, fma, sse, diff --git a/src/arch/x86_64/Lower.zig b/src/arch/x86_64/Lower.zig index a246a97d4b..40a5ccdb10 100644 --- a/src/arch/x86_64/Lower.zig +++ b/src/arch/x86_64/Lower.zig @@ -184,6 +184,10 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct { .roundsd, .roundss, + .vcvtsd2ss, + .vcvtsi2sd, + .vcvtsi2ss, + .vcvtss2sd, .vmovapd, .vmovaps, .vmovddup, diff --git a/src/arch/x86_64/Mir.zig b/src/arch/x86_64/Mir.zig index de7f2cff53..cb1a578bb6 100644 --- a/src/arch/x86_64/Mir.zig +++ b/src/arch/x86_64/Mir.zig @@ -282,6 +282,14 @@ pub const Inst = struct { /// Round scalar single-precision floating-point values roundss, + /// Convert scalar double-precision floating-point value to scalar single-precision floating-point value + vcvtsd2ss, + /// Convert doubleword integer to scalar double-precision floating-point value + 
vcvtsi2sd, + /// Convert doubleword integer to scalar single-precision floating-point value + vcvtsi2ss, + /// Convert scalar single-precision floating-point value to scalar double-precision floating-point value + vcvtss2sd, /// Move aligned packed double-precision floating-point values vmovapd, /// Move aligned packed single-precision floating-point values diff --git a/src/arch/x86_64/encoder.zig b/src/arch/x86_64/encoder.zig index fa6ce676cb..0ce875240d 100644 --- a/src/arch/x86_64/encoder.zig +++ b/src/arch/x86_64/encoder.zig @@ -206,18 +206,15 @@ pub const Instruction = struct { const enc = inst.encoding; const data = enc.data; - switch (data.mode) { - .none, .short, .long, .rex, .rex_short => { - try inst.encodeLegacyPrefixes(encoder); - try inst.encodeMandatoryPrefix(encoder); - try inst.encodeRexPrefix(encoder); - try inst.encodeOpcode(encoder); - }, - .vex_128, .vex_128_long, .vex_256, .vex_256_long => { - try inst.encodeVexPrefix(encoder); - const opc = inst.encoding.opcode(); - try encoder.opcode_1byte(opc[opc.len - 1]); - }, + if (data.mode.isVex()) { + try inst.encodeVexPrefix(encoder); + const opc = inst.encoding.opcode(); + try encoder.opcode_1byte(opc[opc.len - 1]); + } else { + try inst.encodeLegacyPrefixes(encoder); + try inst.encodeMandatoryPrefix(encoder); + try inst.encodeRexPrefix(encoder); + try inst.encodeOpcode(encoder); } switch (data.op_en) { @@ -365,11 +362,7 @@ pub const Instruction = struct { var vex = Vex{}; - vex.w = switch (inst.encoding.data.mode) { - .vex_128, .vex_256 => false, - .vex_128_long, .vex_256_long => true, - else => unreachable, - }; + vex.w = inst.encoding.data.mode.isLong(); switch (op_en) { .np, .i, .zi, .fd, .td, .d => {}, @@ -395,11 +388,7 @@ pub const Instruction = struct { }, } - vex.l = switch (inst.encoding.data.mode) { - .vex_128, .vex_128_long => false, - .vex_256, .vex_256_long => true, - else => unreachable, - }; + vex.l = inst.encoding.data.mode.isVecLong(); vex.p = if (mand_pre) |mand| switch (mand) { 0x66 => .@"66", diff --git a/src/arch/x86_64/encodings.zig b/src/arch/x86_64/encodings.zig index 5096ca5627..5e4dc2f04b 100644 --- a/src/arch/x86_64/encodings.zig +++ b/src/arch/x86_64/encodings.zig @@ -918,7 +918,6 @@ pub const table = [_]Entry{ .{ .orpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x56 }, 0, .none, .sse2 }, .{ .pextrw, .rmi, &.{ .r32, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0xc5 }, 0, .none, .sse2 }, - .{ .pextrw, .rmi, &.{ .r64, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0xc5 }, 0, .long, .sse2 }, .{ .pinsrw, .rmi, &.{ .xmm, .r32_m16, .imm8 }, &.{ 0x66, 0x0f, 0xc4 }, 0, .none, .sse2 }, @@ -926,31 +925,23 @@ pub const table = [_]Entry{ .{ .pshuflw, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0xf2, 0x0f, 0x70 }, 0, .none, .sse2 }, + .{ .psrlw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd1 }, 0, .none, .sse2 }, + .{ .psrlw, .mi, &.{ .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x71 }, 2, .none, .sse2 }, .{ .psrld, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd2 }, 0, .none, .sse2 }, .{ .psrld, .mi, &.{ .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x72 }, 2, .none, .sse2 }, - .{ .psrlq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd3 }, 0, .none, .sse2 }, .{ .psrlq, .mi, &.{ .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x73 }, 2, .none, .sse2 }, - .{ .psrlw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd1 }, 0, .none, .sse2 }, - .{ .psrlw, .mi, &.{ .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x71 }, 2, .none, .sse2 }, - - .{ .punpckhbw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x68 }, 0, .none, .sse2 }, - - .{ .punpckhdq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x6a }, 0, .none, 
.sse2 }, - + .{ .punpckhbw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x68 }, 0, .none, .sse2 }, + .{ .punpckhwd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x69 }, 0, .none, .sse2 }, + .{ .punpckhdq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x6a }, 0, .none, .sse2 }, .{ .punpckhqdq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x6d }, 0, .none, .sse2 }, - .{ .punpckhwd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x69 }, 0, .none, .sse2 }, - - .{ .punpcklbw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x60 }, 0, .none, .sse2 }, - - .{ .punpckldq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x62 }, 0, .none, .sse2 }, - + .{ .punpcklbw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x60 }, 0, .none, .sse2 }, + .{ .punpcklwd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x61 }, 0, .none, .sse2 }, + .{ .punpckldq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x62 }, 0, .none, .sse2 }, .{ .punpcklqdq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x6c }, 0, .none, .sse2 }, - .{ .punpcklwd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x61 }, 0, .none, .sse2 }, - .{ .sqrtpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x51 }, 0, .none, .sse2 }, .{ .sqrtsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x51 }, 0, .none, .sse2 }, @@ -972,106 +963,128 @@ pub const table = [_]Entry{ // SSE4.1 .{ .pextrw, .mri, &.{ .r32_m16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .none, .sse4_1 }, - .{ .pextrw, .mri, &.{ .r64_m16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .long, .sse4_1 }, .{ .roundss, .rmi, &.{ .xmm, .xmm_m32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0a }, 0, .none, .sse4_1 }, .{ .roundsd, .rmi, &.{ .xmm, .xmm_m64, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0b }, 0, .none, .sse4_1 }, // AVX - .{ .vmovapd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x28 }, 0, .vex_128, .avx }, - .{ .vmovapd, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x66, 0x0f, 0x29 }, 0, .vex_128, .avx }, - .{ .vmovapd, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x28 }, 0, .vex_256, .avx }, - .{ .vmovapd, .mr, &.{ .ymm_m256, .ymm }, &.{ 0x66, 0x0f, 0x29 }, 0, .vex_256, .avx }, - - .{ .vmovaps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x28 }, 0, .vex_128, .avx }, - .{ .vmovaps, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x0f, 0x29 }, 0, .vex_128, .avx }, - .{ .vmovaps, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x0f, 0x28 }, 0, .vex_256, .avx }, - .{ .vmovaps, .mr, &.{ .ymm_m256, .ymm }, &.{ 0x0f, 0x29 }, 0, .vex_256, .avx }, - - .{ .vmovddup, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x12 }, 0, .vex_128, .avx }, - - .{ .vmovsd, .rvm, &.{ .xmm, .xmm, .xmm }, &.{ 0xf2, 0x0f, 0x10 }, 0, .vex_128, .avx }, - .{ .vmovsd, .rm, &.{ .xmm, .m64 }, &.{ 0xf2, 0x0f, 0x10 }, 0, .vex_128, .avx }, - .{ .vmovsd, .mvr, &.{ .xmm, .xmm, .xmm }, &.{ 0xf2, 0x0f, 0x11 }, 0, .vex_128, .avx }, - .{ .vmovsd, .mr, &.{ .m64, .xmm }, &.{ 0xf2, 0x0f, 0x11 }, 0, .vex_128, .avx }, - - .{ .vmovshdup, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0xf3, 0x0f, 0x16 }, 0, .vex_128, .avx }, + .{ .vcvtsd2ss, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5a }, 0, .vex_lig_wig, .avx }, - .{ .vmovsldup, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0xf3, 0x0f, 0x12 }, 0, .vex_128, .avx }, + .{ .vcvtsi2sd, .rvm, &.{ .xmm, .xmm, .rm32 }, &.{ 0xf2, 0x0f, 0x2a }, 0, .vex_lig_w0, .avx }, + .{ .vcvtsi2sd, .rvm, &.{ .xmm, .xmm, .rm64 }, &.{ 0xf2, 0x0f, 0x2a }, 0, .vex_lig_w1, .avx }, - .{ .vmovss, .rvm, &.{ .xmm, .xmm, .xmm }, &.{ 0xf3, 0x0f, 0x10 }, 0, .vex_128, .avx }, - .{ .vmovss, .rm, &.{ .xmm, .m32 }, &.{ 0xf3, 0x0f, 0x10 }, 0, .vex_128, .avx }, - .{ .vmovss, .mvr, &.{ .xmm, .xmm, .xmm }, &.{ 0xf3, 0x0f, 0x11 }, 0, .vex_128, .avx }, - 
.{ .vmovss, .mr, &.{ .m32, .xmm }, &.{ 0xf3, 0x0f, 0x11 }, 0, .vex_128, .avx },
+    .{ .vcvtsi2ss, .rvm, &.{ .xmm, .xmm, .rm32 }, &.{ 0xf3, 0x0f, 0x2a }, 0, .vex_lig_w0, .avx },
+    .{ .vcvtsi2ss, .rvm, &.{ .xmm, .xmm, .rm64 }, &.{ 0xf3, 0x0f, 0x2a }, 0, .vex_lig_w1, .avx },
 
-    .{ .vmovupd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x10 }, 0, .vex_128, .avx },
-    .{ .vmovupd, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x66, 0x0f, 0x11 }, 0, .vex_128, .avx },
-    .{ .vmovupd, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x10 }, 0, .vex_256, .avx },
-    .{ .vmovupd, .mr, &.{ .ymm_m256, .ymm }, &.{ 0x66, 0x0f, 0x11 }, 0, .vex_256, .avx },
+    .{ .vcvtss2sd, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5a }, 0, .vex_lig_wig, .avx },
 
-    .{ .vmovups, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x10 }, 0, .vex_128, .avx },
-    .{ .vmovups, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x0f, 0x11 }, 0, .vex_128, .avx },
-    .{ .vmovups, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x0f, 0x10 }, 0, .vex_256, .avx },
-    .{ .vmovups, .mr, &.{ .ymm_m256, .ymm }, &.{ 0x0f, 0x11 }, 0, .vex_256, .avx },
+    .{ .vmovapd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x28 }, 0, .vex_128_wig, .avx },
+    .{ .vmovapd, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x66, 0x0f, 0x29 }, 0, .vex_128_wig, .avx },
+    .{ .vmovapd, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x28 }, 0, .vex_256_wig, .avx },
+    .{ .vmovapd, .mr, &.{ .ymm_m256, .ymm }, &.{ 0x66, 0x0f, 0x29 }, 0, .vex_256_wig, .avx },
 
-    .{ .vpextrw, .mri, &.{ .r32, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x15 }, 0, .vex_128, .avx },
-    .{ .vpextrw, .mri, &.{ .r64, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x15 }, 0, .vex_128_long, .avx },
-    .{ .vpextrw, .mri, &.{ .r32_m16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .vex_128, .avx },
-    .{ .vpextrw, .mri, &.{ .r64_m16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .vex_128_long, .avx },
+    .{ .vmovaps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x28 }, 0, .vex_128_wig, .avx },
+    .{ .vmovaps, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x0f, 0x29 }, 0, .vex_128_wig, .avx },
+    .{ .vmovaps, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x0f, 0x28 }, 0, .vex_256_wig, .avx },
+    .{ .vmovaps, .mr, &.{ .ymm_m256, .ymm }, &.{ 0x0f, 0x29 }, 0, .vex_256_wig, .avx },
 
-    .{ .vpinsrw, .rvmi, &.{ .xmm, .xmm, .r32_m16, .imm8 }, &.{ 0x66, 0x0f, 0xc4 }, 0, .vex_128, .avx },
+    .{ .vmovddup, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x12 }, 0, .vex_128_wig, .avx },
+    .{ .vmovddup, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0xf2, 0x0f, 0x12 }, 0, .vex_256_wig, .avx },
 
-    .{ .vpsrld, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd2 }, 0, .vex_128, .avx },
-    .{ .vpsrld, .vmi, &.{ .xmm, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x72 }, 2, .vex_128, .avx },
+    .{ .vmovsd, .rvm, &.{ .xmm, .xmm, .xmm }, &.{ 0xf2, 0x0f, 0x10 }, 0, .vex_lig_wig, .avx },
+    .{ .vmovsd, .rm, &.{ .xmm, .m64 }, &.{ 0xf2, 0x0f, 0x10 }, 0, .vex_lig_wig, .avx },
+    .{ .vmovsd, .mvr, &.{ .xmm, .xmm, .xmm }, &.{ 0xf2, 0x0f, 0x11 }, 0, .vex_lig_wig, .avx },
+    .{ .vmovsd, .mr, &.{ .m64, .xmm }, &.{ 0xf2, 0x0f, 0x11 }, 0, .vex_lig_wig, .avx },
 
-    .{ .vpsrlq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd3 }, 0, .vex_128, .avx },
-    .{ .vpsrlq, .vmi, &.{ .xmm, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x73 }, 2, .vex_128, .avx },
+    .{ .vmovshdup, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0xf3, 0x0f, 0x16 }, 0, .vex_128_wig, .avx },
+    .{ .vmovshdup, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0xf3, 0x0f, 0x16 }, 0, .vex_256_wig, .avx },
 
-    .{ .vpsrlw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd1 }, 0, .vex_128, .avx },
-    .{ .vpsrlw, .vmi, &.{ .xmm, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x71 }, 2, .vex_128, .avx },
+    .{
.vmovsldup, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0xf3, 0x0f, 0x12 }, 0, .vex_128_wig, .avx }, + .{ .vmovsldup, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0xf3, 0x0f, 0x12 }, 0, .vex_256_wig, .avx }, - .{ .vpunpckhbw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x68 }, 0, .vex_128, .avx }, + .{ .vmovss, .rvm, &.{ .xmm, .xmm, .xmm }, &.{ 0xf3, 0x0f, 0x10 }, 0, .vex_lig_wig, .avx }, + .{ .vmovss, .rm, &.{ .xmm, .m32 }, &.{ 0xf3, 0x0f, 0x10 }, 0, .vex_lig_wig, .avx }, + .{ .vmovss, .mvr, &.{ .xmm, .xmm, .xmm }, &.{ 0xf3, 0x0f, 0x11 }, 0, .vex_lig_wig, .avx }, + .{ .vmovss, .mr, &.{ .m32, .xmm }, &.{ 0xf3, 0x0f, 0x11 }, 0, .vex_lig_wig, .avx }, - .{ .vpunpckhdq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x6a }, 0, .vex_128, .avx }, + .{ .vmovupd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x10 }, 0, .vex_128_wig, .avx }, + .{ .vmovupd, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x66, 0x0f, 0x11 }, 0, .vex_128_wig, .avx }, + .{ .vmovupd, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x10 }, 0, .vex_256_wig, .avx }, + .{ .vmovupd, .mr, &.{ .ymm_m256, .ymm }, &.{ 0x66, 0x0f, 0x11 }, 0, .vex_256_wig, .avx }, - .{ .vpunpckhqdq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x6d }, 0, .vex_128, .avx }, + .{ .vmovups, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x10 }, 0, .vex_128_wig, .avx }, + .{ .vmovups, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x0f, 0x11 }, 0, .vex_128_wig, .avx }, + .{ .vmovups, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x0f, 0x10 }, 0, .vex_256_wig, .avx }, + .{ .vmovups, .mr, &.{ .ymm_m256, .ymm }, &.{ 0x0f, 0x11 }, 0, .vex_256_wig, .avx }, - .{ .vpunpckhwd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x69 }, 0, .vex_128, .avx }, + .{ .vpextrw, .rmi, &.{ .r32, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x15 }, 0, .vex_128_wig, .avx }, + .{ .vpextrw, .mri, &.{ .r32_m16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .vex_128_wig, .avx }, - .{ .vpunpcklbw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x60 }, 0, .vex_128, .avx }, + .{ .vpinsrw, .rvmi, &.{ .xmm, .xmm, .r32_m16, .imm8 }, &.{ 0x66, 0x0f, 0xc4 }, 0, .vex_128_wig, .avx }, - .{ .vpunpckldq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x62 }, 0, .vex_128, .avx }, + .{ .vpsrlw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd1 }, 0, .vex_128_wig, .avx }, + .{ .vpsrlw, .vmi, &.{ .xmm, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x71 }, 2, .vex_128_wig, .avx }, + .{ .vpsrld, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd2 }, 0, .vex_128_wig, .avx }, + .{ .vpsrld, .vmi, &.{ .xmm, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x72 }, 2, .vex_128_wig, .avx }, + .{ .vpsrlq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd3 }, 0, .vex_128_wig, .avx }, + .{ .vpsrlq, .vmi, &.{ .xmm, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x73 }, 2, .vex_128_wig, .avx }, - .{ .vpunpcklqdq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x6c }, 0, .vex_128, .avx }, + .{ .vpunpckhbw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x68 }, 0, .vex_128_wig, .avx }, + .{ .vpunpckhwd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x69 }, 0, .vex_128_wig, .avx }, + .{ .vpunpckhdq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x6a }, 0, .vex_128_wig, .avx }, + .{ .vpunpckhqdq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x6d }, 0, .vex_128_wig, .avx }, - .{ .vpunpcklwd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x61 }, 0, .vex_128, .avx }, + .{ .vpunpcklbw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x60 }, 0, .vex_128_wig, .avx }, + .{ .vpunpcklwd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x61 }, 0, .vex_128_wig, .avx }, + .{ .vpunpckldq, 
.rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x62 }, 0, .vex_128_wig, .avx }, + .{ .vpunpcklqdq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x6c }, 0, .vex_128_wig, .avx }, // F16C - .{ .vcvtph2ps, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x13 }, 0, .vex_128, .f16c }, + .{ .vcvtph2ps, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x13 }, 0, .vex_128_w0, .f16c }, + .{ .vcvtph2ps, .rm, &.{ .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x13 }, 0, .vex_256_w0, .f16c }, - .{ .vcvtps2ph, .mri, &.{ .xmm_m64, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x1d }, 0, .vex_128, .f16c }, + .{ .vcvtps2ph, .mri, &.{ .xmm_m64, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x1d }, 0, .vex_128_w0, .f16c }, + .{ .vcvtps2ph, .mri, &.{ .xmm_m128, .ymm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x1d }, 0, .vex_256_w0, .f16c }, // FMA - .{ .vfmadd132pd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x98 }, 0, .vex_128_long, .fma }, - .{ .vfmadd132pd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x98 }, 0, .vex_256_long, .fma }, - .{ .vfmadd213pd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0xa8 }, 0, .vex_128_long, .fma }, - .{ .vfmadd213pd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0xa8 }, 0, .vex_256_long, .fma }, - .{ .vfmadd231pd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0xb8 }, 0, .vex_128_long, .fma }, - .{ .vfmadd231pd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0xb8 }, 0, .vex_256_long, .fma }, - - .{ .vfmadd132ps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x98 }, 0, .vex_128, .fma }, - .{ .vfmadd132ps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x98 }, 0, .vex_256, .fma }, - .{ .vfmadd213ps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0xa8 }, 0, .vex_128, .fma }, - .{ .vfmadd213ps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0xa8 }, 0, .vex_256, .fma }, - .{ .vfmadd231ps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0xb8 }, 0, .vex_128, .fma }, - .{ .vfmadd231ps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0xb8 }, 0, .vex_256, .fma }, - - .{ .vfmadd132sd, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x99 }, 0, .vex_128_long, .fma }, - .{ .vfmadd213sd, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0xa9 }, 0, .vex_128_long, .fma }, - .{ .vfmadd231sd, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0xb9 }, 0, .vex_128_long, .fma }, - - .{ .vfmadd132ss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0x99 }, 0, .vex_128, .fma }, - .{ .vfmadd213ss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0xa9 }, 0, .vex_128, .fma }, - .{ .vfmadd231ss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0xb9 }, 0, .vex_128, .fma }, + .{ .vfmadd132pd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x98 }, 0, .vex_128_w1, .fma }, + .{ .vfmadd213pd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0xa8 }, 0, .vex_128_w1, .fma }, + .{ .vfmadd231pd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0xb8 }, 0, .vex_128_w1, .fma }, + .{ .vfmadd132pd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x98 }, 0, .vex_256_w1, .fma }, + .{ .vfmadd213pd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0xa8 }, 0, .vex_256_w1, .fma }, + .{ .vfmadd231pd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0xb8 }, 0, .vex_256_w1, .fma }, + + .{ .vfmadd132ps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x98 }, 0, .vex_128_w0, .fma }, + .{ .vfmadd213ps, .rvm, &.{ .xmm, .xmm, 
.xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0xa8 }, 0, .vex_128_w0, .fma }, + .{ .vfmadd231ps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0xb8 }, 0, .vex_128_w0, .fma }, + .{ .vfmadd132ps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x98 }, 0, .vex_256_w0, .fma }, + .{ .vfmadd213ps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0xa8 }, 0, .vex_256_w0, .fma }, + .{ .vfmadd231ps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0xb8 }, 0, .vex_256_w0, .fma }, + + .{ .vfmadd132sd, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x99 }, 0, .vex_lig_w1, .fma }, + .{ .vfmadd213sd, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0xa9 }, 0, .vex_lig_w1, .fma }, + .{ .vfmadd231sd, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0xb9 }, 0, .vex_lig_w1, .fma }, + + .{ .vfmadd132ss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0x99 }, 0, .vex_lig_w0, .fma }, + .{ .vfmadd213ss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0xa9 }, 0, .vex_lig_w0, .fma }, + .{ .vfmadd231ss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0xb9 }, 0, .vex_lig_w0, .fma }, + + // AVX2 + .{ .vpsrlw, .rvm, &.{ .ymm, .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd1 }, 0, .vex_256_wig, .avx2 }, + .{ .vpsrlw, .vmi, &.{ .ymm, .ymm, .imm8 }, &.{ 0x66, 0x0f, 0x71 }, 2, .vex_256_wig, .avx2 }, + .{ .vpsrld, .rvm, &.{ .ymm, .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd2 }, 0, .vex_256_wig, .avx2 }, + .{ .vpsrld, .vmi, &.{ .ymm, .ymm, .imm8 }, &.{ 0x66, 0x0f, 0x72 }, 2, .vex_256_wig, .avx2 }, + .{ .vpsrlq, .rvm, &.{ .ymm, .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd3 }, 0, .vex_256_wig, .avx2 }, + .{ .vpsrlq, .vmi, &.{ .ymm, .ymm, .imm8 }, &.{ 0x66, 0x0f, 0x73 }, 2, .vex_256_wig, .avx2 }, + + .{ .vpunpckhbw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x68 }, 0, .vex_256_wig, .avx2 }, + .{ .vpunpckhwd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x69 }, 0, .vex_256_wig, .avx2 }, + .{ .vpunpckhdq, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x6a }, 0, .vex_256_wig, .avx2 }, + .{ .vpunpckhqdq, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x6d }, 0, .vex_256_wig, .avx2 }, + + .{ .vpunpcklbw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x60 }, 0, .vex_256_wig, .avx2 }, + .{ .vpunpcklwd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x61 }, 0, .vex_256_wig, .avx2 }, + .{ .vpunpckldq, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x62 }, 0, .vex_256_wig, .avx2 }, + .{ .vpunpcklqdq, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x6c }, 0, .vex_256_wig, .avx2 }, }; // zig fmt: on diff --git a/test/behavior/floatop.zig b/test/behavior/floatop.zig index b98d782da1..ec24407d9f 100644 --- a/test/behavior/floatop.zig +++ b/test/behavior/floatop.zig @@ -52,7 +52,8 @@ fn testFloatComparisons() !void { } test "different sized float comparisons" { - if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO + if (builtin.zig_backend == .stage2_x86_64 and + !comptime std.Target.x86.featureSetHas(builtin.cpu.features, .f16c)) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO -- cgit v1.2.3 From 5c5da179fb930c9d8be9366a851eb4a36f4044f1 Mon Sep 17 00:00:00 2001 From: Jacob Young Date: Sun, 7 May 2023 03:47:56 -0400 Subject: x86_64: implement `@sqrt` for vectors --- src/arch/x86_64/CodeGen.zig | 221 
++++++++++++++++++++++++++-----------------
 src/arch/x86_64/Encoding.zig  |   1 +
 src/arch/x86_64/Lower.zig     |   4 +
 src/arch/x86_64/Mir.zig       |   8 ++
 src/arch/x86_64/encodings.zig |  18 +++-
 5 files changed, 164 insertions(+), 88 deletions(-)

(limited to 'src/arch/x86_64/Encoding.zig')

diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig
index 38497400f2..19878bae17 100644
--- a/src/arch/x86_64/CodeGen.zig
+++ b/src/arch/x86_64/CodeGen.zig
@@ -4520,25 +4520,69 @@ fn airRound(self: *Self, inst: Air.Inst.Index, mode: Immediate) !void {
 
 fn airSqrt(self: *Self, inst: Air.Inst.Index) !void {
     const un_op = self.air.instructions.items(.data)[inst].un_op;
     const ty = self.air.typeOf(un_op);
+    const abi_size = @intCast(u32, ty.abiSize(self.target.*));
 
     const src_mcv = try self.resolveInst(un_op);
     const dst_mcv = if (src_mcv.isRegister() and self.reuseOperand(inst, un_op, 0, src_mcv))
         src_mcv
     else
         try self.copyToRegisterWithInstTracking(inst, ty, src_mcv);
+    const dst_reg = registerAlias(dst_mcv.getReg().?, abi_size);
+    const dst_lock = self.register_manager.lockReg(dst_reg);
+    defer if (dst_lock) |lock| self.register_manager.unlockReg(lock);
 
-    try self.genBinOpMir(switch (ty.zigTypeTag()) {
-        .Float => switch (ty.floatBits(self.target.*)) {
-            32 => .sqrtss,
-            64 => .sqrtsd,
-            else => return self.fail("TODO implement airSqrt for {}", .{
-                ty.fmt(self.bin_file.options.module.?),
-            }),
+    const tag = if (@as(?Mir.Inst.Tag, switch (ty.zigTypeTag()) {
+        .Float => switch (ty.floatBits(self.target.*)) {
+            32 => if (self.hasFeature(.avx)) .vsqrtss else .sqrtss,
+            64 => if (self.hasFeature(.avx)) .vsqrtsd else .sqrtsd,
+            16, 80, 128 => null,
+            else => unreachable,
         },
-        else => return self.fail("TODO implement airSqrt for {}", .{
-            ty.fmt(self.bin_file.options.module.?),
-        }),
-    }, ty, dst_mcv, src_mcv);
+        .Vector => switch (ty.childType().zigTypeTag()) {
+            .Float => switch (ty.childType().floatBits(self.target.*)) {
+                32 => switch (ty.vectorLen()) {
+                    1 => if (self.hasFeature(.avx)) .vsqrtss else .sqrtss,
+                    2...4 => if (self.hasFeature(.avx)) .vsqrtps else .sqrtps,
+                    5...8 => if (self.hasFeature(.avx)) .vsqrtps else null,
+                    else => null,
+                },
+                64 => switch (ty.vectorLen()) {
+                    1 => if (self.hasFeature(.avx)) .vsqrtsd else .sqrtsd,
+                    2 => if (self.hasFeature(.avx)) .vsqrtpd else .sqrtpd,
+                    3...4 => if (self.hasFeature(.avx)) .vsqrtpd else null,
+                    else => null,
+                },
+                16, 80, 128 => null,
+                else => unreachable,
+            },
+            else => unreachable,
+        },
+        else => unreachable,
+    })) |tag| tag else return self.fail("TODO implement airSqrt for {}", .{
+        ty.fmt(self.bin_file.options.module.?),
+    });
+    switch (tag) {
+        .vsqrtss, .vsqrtsd => if (src_mcv.isRegister()) try self.asmRegisterRegisterRegister(
+            tag,
+            dst_reg,
+            dst_reg,
+            registerAlias(src_mcv.getReg().?, abi_size),
+        ) else try self.asmRegisterRegisterMemory(
+            tag,
+            dst_reg,
+            dst_reg,
+            src_mcv.mem(Memory.PtrSize.fromSize(abi_size)),
+        ),
+        else => if (src_mcv.isRegister()) try self.asmRegisterRegister(
+            tag,
+            dst_reg,
+            registerAlias(src_mcv.getReg().?, abi_size),
+        ) else try self.asmRegisterMemory(
+            tag,
+            dst_reg,
+            src_mcv.mem(Memory.PtrSize.fromSize(abi_size)),
+        ),
+    }
 
     return self.finishAir(inst, dst_mcv, .{ un_op, .none, .none });
 }
 
@@ -9544,85 +9588,92 @@ fn airMulAdd(self: *Self, inst: Air.Inst.Index) !void {
         lock.* = self.register_manager.lockRegAssumeUnused(reg);
     }
 
-    const tag: ?Mir.Inst.Tag =
+    const tag = if (@as(
+        ?Mir.Inst.Tag,
         if (mem.eql(u2, &order, &.{ 1, 3, 2 }) or mem.eql(u2, &order, &.{ 3, 1, 2 }))
-            switch
(ty.zigTypeTag()) { - .Float => switch (ty.floatBits(self.target.*)) { - 32 => .vfmadd132ss, - 64 => .vfmadd132sd, - else => null, - }, - .Vector => switch (ty.childType().zigTypeTag()) { - .Float => switch (ty.childType().floatBits(self.target.*)) { - 32 => switch (ty.vectorLen()) { - 1 => .vfmadd132ss, - 2...8 => .vfmadd132ps, - else => null, - }, - 64 => switch (ty.vectorLen()) { - 1 => .vfmadd132sd, - 2...4 => .vfmadd132pd, - else => null, - }, - else => null, + switch (ty.zigTypeTag()) { + .Float => switch (ty.floatBits(self.target.*)) { + 32 => .vfmadd132ss, + 64 => .vfmadd132sd, + 16, 80, 128 => null, + else => unreachable, }, - else => null, - }, - else => unreachable, - } - else if (mem.eql(u2, &order, &.{ 2, 1, 3 }) or mem.eql(u2, &order, &.{ 1, 2, 3 })) - switch (ty.zigTypeTag()) { - .Float => switch (ty.floatBits(self.target.*)) { - 32 => .vfmadd213ss, - 64 => .vfmadd213sd, - else => null, - }, - .Vector => switch (ty.childType().zigTypeTag()) { - .Float => switch (ty.childType().floatBits(self.target.*)) { - 32 => switch (ty.vectorLen()) { - 1 => .vfmadd213ss, - 2...8 => .vfmadd213ps, - else => null, - }, - 64 => switch (ty.vectorLen()) { - 1 => .vfmadd213sd, - 2...4 => .vfmadd213pd, - else => null, + .Vector => switch (ty.childType().zigTypeTag()) { + .Float => switch (ty.childType().floatBits(self.target.*)) { + 32 => switch (ty.vectorLen()) { + 1 => .vfmadd132ss, + 2...8 => .vfmadd132ps, + else => null, + }, + 64 => switch (ty.vectorLen()) { + 1 => .vfmadd132sd, + 2...4 => .vfmadd132pd, + else => null, + }, + 16, 80, 128 => null, + else => unreachable, }, - else => null, + else => unreachable, }, - else => null, - }, - else => unreachable, - } - else if (mem.eql(u2, &order, &.{ 2, 3, 1 }) or mem.eql(u2, &order, &.{ 3, 2, 1 })) - switch (ty.zigTypeTag()) { - .Float => switch (ty.floatBits(self.target.*)) { - 32 => .vfmadd231ss, - 64 => .vfmadd231sd, - else => null, - }, - .Vector => switch (ty.childType().zigTypeTag()) { - .Float => switch (ty.childType().floatBits(self.target.*)) { - 32 => switch (ty.vectorLen()) { - 1 => .vfmadd231ss, - 2...8 => .vfmadd231ps, - else => null, + else => unreachable, + } + else if (mem.eql(u2, &order, &.{ 2, 1, 3 }) or mem.eql(u2, &order, &.{ 1, 2, 3 })) + switch (ty.zigTypeTag()) { + .Float => switch (ty.floatBits(self.target.*)) { + 32 => .vfmadd213ss, + 64 => .vfmadd213sd, + 16, 80, 128 => null, + else => unreachable, + }, + .Vector => switch (ty.childType().zigTypeTag()) { + .Float => switch (ty.childType().floatBits(self.target.*)) { + 32 => switch (ty.vectorLen()) { + 1 => .vfmadd213ss, + 2...8 => .vfmadd213ps, + else => null, + }, + 64 => switch (ty.vectorLen()) { + 1 => .vfmadd213sd, + 2...4 => .vfmadd213pd, + else => null, + }, + 16, 80, 128 => null, + else => unreachable, }, - 64 => switch (ty.vectorLen()) { - 1 => .vfmadd231sd, - 2...4 => .vfmadd231pd, - else => null, + else => unreachable, + }, + else => unreachable, + } + else if (mem.eql(u2, &order, &.{ 2, 3, 1 }) or mem.eql(u2, &order, &.{ 3, 2, 1 })) + switch (ty.zigTypeTag()) { + .Float => switch (ty.floatBits(self.target.*)) { + 32 => .vfmadd231ss, + 64 => .vfmadd231sd, + 16, 80, 128 => null, + else => unreachable, + }, + .Vector => switch (ty.childType().zigTypeTag()) { + .Float => switch (ty.childType().floatBits(self.target.*)) { + 32 => switch (ty.vectorLen()) { + 1 => .vfmadd231ss, + 2...8 => .vfmadd231ps, + else => null, + }, + 64 => switch (ty.vectorLen()) { + 1 => .vfmadd231sd, + 2...4 => .vfmadd231pd, + else => null, + }, + 16, 80, 128 => null, + else => 
unreachable, }, - else => null, + else => unreachable, }, - else => null, - }, - else => null, - } - else - unreachable; - if (tag == null) return self.fail("TODO implement airMulAdd for {}", .{ + else => unreachable, + } + else + unreachable, + )) |tag| tag else return self.fail("TODO implement airMulAdd for {}", .{ ty.fmt(self.bin_file.options.module.?), }); @@ -9634,14 +9685,14 @@ fn airMulAdd(self: *Self, inst: Air.Inst.Index) !void { const mop2_reg = registerAlias(mops[1].getReg().?, abi_size); if (mops[2].isRegister()) try self.asmRegisterRegisterRegister( - tag.?, + tag, mop1_reg, mop2_reg, registerAlias(mops[2].getReg().?, abi_size), ) else try self.asmRegisterRegisterMemory( - tag.?, + tag, mop1_reg, mop2_reg, mops[2].mem(Memory.PtrSize.fromSize(abi_size)), diff --git a/src/arch/x86_64/Encoding.zig b/src/arch/x86_64/Encoding.zig index bd6e70c975..b242c98bdc 100644 --- a/src/arch/x86_64/Encoding.zig +++ b/src/arch/x86_64/Encoding.zig @@ -316,6 +316,7 @@ pub const Mnemonic = enum { vpsrld, vpsrlq, vpsrlw, vpunpckhbw, vpunpckhdq, vpunpckhqdq, vpunpckhwd, vpunpcklbw, vpunpckldq, vpunpcklqdq, vpunpcklwd, + vsqrtpd, vsqrtps, vsqrtsd, vsqrtss, // F16C vcvtph2ps, vcvtps2ph, // FMA diff --git a/src/arch/x86_64/Lower.zig b/src/arch/x86_64/Lower.zig index 40a5ccdb10..39ad2313e7 100644 --- a/src/arch/x86_64/Lower.zig +++ b/src/arch/x86_64/Lower.zig @@ -212,6 +212,10 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct { .vpunpckldq, .vpunpcklqdq, .vpunpcklwd, + .vsqrtpd, + .vsqrtps, + .vsqrtsd, + .vsqrtss, .vcvtph2ps, .vcvtps2ph, diff --git a/src/arch/x86_64/Mir.zig b/src/arch/x86_64/Mir.zig index cb1a578bb6..b6df0fff09 100644 --- a/src/arch/x86_64/Mir.zig +++ b/src/arch/x86_64/Mir.zig @@ -338,6 +338,14 @@ pub const Inst = struct { vpunpcklqdq, /// Unpack low data vpunpcklwd, + /// Square root of packed double-precision floating-point value + vsqrtpd, + /// Square root of packed single-precision floating-point value + vsqrtps, + /// Square root of scalar double-precision floating-point value + vsqrtsd, + /// Square root of scalar single-precision floating-point value + vsqrtss, /// Convert 16-bit floating-point values to single-precision floating-point values vcvtph2ps, diff --git a/src/arch/x86_64/encodings.zig b/src/arch/x86_64/encodings.zig index 5e4dc2f04b..49ebc344fd 100644 --- a/src/arch/x86_64/encodings.zig +++ b/src/arch/x86_64/encodings.zig @@ -869,8 +869,9 @@ pub const table = [_]Entry{ .{ .subss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5c }, 0, .none, .sse }, - .{ .sqrtps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x51 }, 0, .none, .sse }, - .{ .sqrtss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x51 }, 0, .none, .sse }, + .{ .sqrtps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x51 }, 0, .none, .sse }, + + .{ .sqrtss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x51 }, 0, .none, .sse }, .{ .ucomiss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0x0f, 0x2e }, 0, .none, .sse }, @@ -943,7 +944,8 @@ pub const table = [_]Entry{ .{ .punpcklqdq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x6c }, 0, .none, .sse2 }, .{ .sqrtpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x51 }, 0, .none, .sse2 }, - .{ .sqrtsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x51 }, 0, .none, .sse2 }, + + .{ .sqrtsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x51 }, 0, .none, .sse2 }, .{ .subsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5c }, 0, .none, .sse2 }, @@ -1039,6 +1041,16 @@ pub const table = [_]Entry{ .{ .vpunpckldq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x62 }, 0, .vex_128_wig, .avx }, 
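    // (Entry layout, for the rows below: mnemonic, operand encoding, operand
    // set, opcode bytes, ModRM /digit, encoding mode, required CPU feature.
    // In the mode names, `128`/`256` set VEX.L, `lig` means VEX.L is ignored
    // by the instruction, and `w0`/`w1`/`wig` give the VEX.W handling, `wig`
    // meaning either value is accepted.)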
.{ .vpunpcklqdq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x6c }, 0, .vex_128_wig, .avx },

+    .{ .vsqrtpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x51 }, 0, .vex_128_wig, .avx },
+    .{ .vsqrtpd, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x51 }, 0, .vex_256_wig, .avx },
+
+    .{ .vsqrtps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x51 }, 0, .vex_128_wig, .avx },
+    .{ .vsqrtps, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x0f, 0x51 }, 0, .vex_256_wig, .avx },
+
+    .{ .vsqrtsd, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x51 }, 0, .vex_lig_wig, .avx },
+
+    .{ .vsqrtss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x51 }, 0, .vex_lig_wig, .avx },
+
     // F16C
     .{ .vcvtph2ps, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x13 }, 0, .vex_128_w0, .f16c },
     .{ .vcvtph2ps, .rm, &.{ .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x13 }, 0, .vex_256_w0, .f16c },
-- cgit v1.2.3


From 057139fda575e0e6038b821256a45669cd70a073 Mon Sep 17 00:00:00 2001
From: Jacob Young
Date: Sun, 7 May 2023 09:06:12 -0400
Subject: x86_64: implement binary operations for float vectors

---
 src/arch/x86_64/CodeGen.zig   | 642 +++++++++++++++++++++++++-----------------
 src/arch/x86_64/Encoding.zig  |  34 ++-
 src/arch/x86_64/Lower.zig     |  49 ++++
 src/arch/x86_64/Mir.zig       | 115 +++++++-
 src/arch/x86_64/encodings.zig | 101 ++++++-
 5 files changed, 651 insertions(+), 290 deletions(-)

(limited to 'src/arch/x86_64/Encoding.zig')

diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig
index 6337ad23f5..8c6f14ec3a 100644
--- a/src/arch/x86_64/CodeGen.zig
+++ b/src/arch/x86_64/CodeGen.zig
@@ -1176,6 +1176,21 @@ fn asmRegisterRegisterRegister(
     });
 }
 
+fn asmRegisterRegisterRegisterImmediate(
+    self: *Self,
+    tag: Mir.Inst.Tag,
+    reg1: Register,
+    reg2: Register,
+    reg3: Register,
+    imm: Immediate,
+) !void {
+    _ = try self.addInst(.{
+        .tag = tag,
+        .ops = .rrri,
+        .data = .{ .rrri = .{ .r1 = reg1, .r2 = reg2, .r3 = reg3, .i = @intCast(u8, imm.unsigned) } },
+    });
+}
+
 fn asmRegisterRegisterImmediate(
     self: *Self,
     tag: Mir.Inst.Tag,
@@ -2310,20 +2325,31 @@ fn airFptrunc(self: *Self, inst: Air.Inst.Index) !void {
             }),
         }
     } else if (src_bits == 64 and dst_bits == 32) {
-        if (self.hasFeature(.avx)) if (src_mcv.isRegister()) try self.asmRegisterRegisterRegister(
+        if (self.hasFeature(.avx)) if (src_mcv.isMemory()) try self.asmRegisterRegisterMemory(
             .vcvtsd2ss,
             dst_reg,
             dst_reg,
-            src_mcv.getReg().?.to128(),
-        ) else try self.asmRegisterRegisterMemory(
+            src_mcv.mem(.qword),
+        ) else try self.asmRegisterRegisterRegister(
             .vcvtsd2ss,
             dst_reg,
             dst_reg,
+            (if (src_mcv.isRegister())
+                src_mcv.getReg().?
+            else
+                try self.copyToTmpRegister(src_ty, src_mcv)).to128(),
+        ) else if (src_mcv.isMemory()) try self.asmRegisterMemory(
+            .cvtsd2ss,
+            dst_reg,
             src_mcv.mem(.qword),
-        ) else if (src_mcv.isRegister())
-            try self.asmRegisterRegister(.cvtsd2ss, dst_reg, src_mcv.getReg().?.to128())
-        else
-            try self.asmRegisterMemory(.cvtsd2ss, dst_reg, src_mcv.mem(.qword));
+        ) else try self.asmRegisterRegister(
+            .cvtsd2ss,
+            dst_reg,
+            (if (src_mcv.isRegister())
+                src_mcv.getReg().?
+ else + try self.copyToTmpRegister(src_ty, src_mcv)).to128(), + ); } else return self.fail("TODO implement airFptrunc from {} to {}", .{ src_ty.fmt(self.bin_file.options.module.?), dst_ty.fmt(self.bin_file.options.module.?), }); @@ -2360,20 +2386,31 @@ fn airFpext(self: *Self, inst: Air.Inst.Index) !void { }), } } else if (src_bits == 32 and dst_bits == 64) { - if (self.hasFeature(.avx)) if (src_mcv.isRegister()) try self.asmRegisterRegisterRegister( + if (self.hasFeature(.avx)) if (src_mcv.isMemory()) try self.asmRegisterRegisterMemory( .vcvtss2sd, dst_reg, dst_reg, - src_mcv.getReg().?.to128(), - ) else try self.asmRegisterRegisterMemory( + src_mcv.mem(.dword), + ) else try self.asmRegisterRegisterRegister( .vcvtss2sd, dst_reg, dst_reg, + (if (src_mcv.isRegister()) + src_mcv.getReg().? + else + try self.copyToTmpRegister(src_ty, src_mcv)).to128(), + ) else if (src_mcv.isMemory()) try self.asmRegisterMemory( + .cvtss2sd, + dst_reg, src_mcv.mem(.dword), - ) else if (src_mcv.isRegister()) - try self.asmRegisterRegister(.cvtss2sd, dst_reg, src_mcv.getReg().?.to128()) - else - try self.asmRegisterMemory(.cvtss2sd, dst_reg, src_mcv.mem(.dword)); + ) else try self.asmRegisterRegister( + .cvtss2sd, + dst_reg, + (if (src_mcv.isRegister()) + src_mcv.getReg().? + else + try self.copyToTmpRegister(src_ty, src_mcv)).to128(), + ); } else return self.fail("TODO implement airFpext from {} to {}", .{ src_ty.fmt(self.bin_file.options.module.?), dst_ty.fmt(self.bin_file.options.module.?), }); @@ -4532,7 +4569,7 @@ fn airSqrt(self: *Self, inst: Air.Inst.Index) !void { defer if (dst_lock) |lock| self.register_manager.unlockReg(lock); const result: MCValue = result: { - const tag = if (@as(?Mir.Inst.Tag, switch (ty.zigTypeTag()) { + const mir_tag = if (@as(?Mir.Inst.Tag, switch (ty.zigTypeTag()) { .Float => switch (ty.floatBits(self.target.*)) { 16 => if (self.hasFeature(.f16c)) { const mat_src_reg = if (src_mcv.isRegister()) @@ -4558,11 +4595,14 @@ fn airSqrt(self: *Self, inst: Air.Inst.Index) !void { .Float => switch (ty.childType().floatBits(self.target.*)) { 16 => if (self.hasFeature(.f16c)) switch (ty.vectorLen()) { 1 => { - const mat_src_reg = if (src_mcv.isRegister()) - src_mcv.getReg().? - else - try self.copyToTmpRegister(ty, src_mcv); - try self.asmRegisterRegister(.vcvtph2ps, dst_reg, mat_src_reg.to128()); + try self.asmRegisterRegister( + .vcvtph2ps, + dst_reg, + (if (src_mcv.isRegister()) + src_mcv.getReg().? + else + try self.copyToTmpRegister(ty, src_mcv)).to128(), + ); try self.asmRegisterRegisterRegister(.vsqrtss, dst_reg, dst_reg, dst_reg); try self.asmRegisterRegisterImmediate( .vcvtps2ph, @@ -4574,16 +4614,19 @@ fn airSqrt(self: *Self, inst: Air.Inst.Index) !void { }, 2...8 => { const wide_reg = registerAlias(dst_reg, abi_size * 2); - if (src_mcv.isRegister()) try self.asmRegisterRegister( - .vcvtph2ps, - wide_reg, - src_mcv.getReg().?.to128(), - ) else try self.asmRegisterMemory( + if (src_mcv.isMemory()) try self.asmRegisterMemory( .vcvtph2ps, wide_reg, src_mcv.mem(Memory.PtrSize.fromSize( @intCast(u32, @divExact(wide_reg.bitSize(), 16)), )), + ) else try self.asmRegisterRegister( + .vcvtph2ps, + wide_reg, + (if (src_mcv.isRegister()) + src_mcv.getReg().? 
+ else + try self.copyToTmpRegister(ty, src_mcv)).to128(), ); try self.asmRegisterRegister(.vsqrtps, wide_reg, wide_reg); try self.asmRegisterRegisterImmediate( @@ -4617,26 +4660,32 @@ fn airSqrt(self: *Self, inst: Air.Inst.Index) !void { })) |tag| tag else return self.fail("TODO implement airSqrt for {}", .{ ty.fmt(self.bin_file.options.module.?), }); - switch (tag) { - .vsqrtss, .vsqrtsd => if (src_mcv.isRegister()) try self.asmRegisterRegisterRegister( - tag, + switch (mir_tag) { + .vsqrtss, .vsqrtsd => if (src_mcv.isMemory()) try self.asmRegisterRegisterMemory( + mir_tag, dst_reg, dst_reg, - registerAlias(src_mcv.getReg().?, abi_size), - ) else try self.asmRegisterRegisterMemory( - tag, + src_mcv.mem(Memory.PtrSize.fromSize(abi_size)), + ) else try self.asmRegisterRegisterRegister( + mir_tag, dst_reg, dst_reg, - src_mcv.mem(Memory.PtrSize.fromSize(abi_size)), + registerAlias(if (src_mcv.isRegister()) + src_mcv.getReg().? + else + try self.copyToTmpRegister(ty, src_mcv), abi_size), ), - else => if (src_mcv.isRegister()) try self.asmRegisterRegister( - tag, - dst_reg, - registerAlias(src_mcv.getReg().?, abi_size), - ) else try self.asmRegisterMemory( - tag, + else => if (src_mcv.isMemory()) try self.asmRegisterMemory( + mir_tag, dst_reg, src_mcv.mem(Memory.PtrSize.fromSize(abi_size)), + ) else try self.asmRegisterRegister( + mir_tag, + dst_reg, + registerAlias(if (src_mcv.isRegister()) + src_mcv.getReg().? + else + try self.copyToTmpRegister(ty, src_mcv), abi_size), ), } break :result dst_mcv; @@ -5800,25 +5849,22 @@ fn genMulDivBinOp( } } -/// Result is always a register. fn genBinOp( self: *Self, maybe_inst: ?Air.Inst.Index, - tag: Air.Inst.Tag, + air_tag: Air.Inst.Tag, lhs_air: Air.Inst.Ref, rhs_air: Air.Inst.Ref, ) !MCValue { - const lhs = try self.resolveInst(lhs_air); - const rhs = try self.resolveInst(rhs_air); + const lhs_mcv = try self.resolveInst(lhs_air); + const rhs_mcv = try self.resolveInst(rhs_air); const lhs_ty = self.air.typeOf(lhs_air); const rhs_ty = self.air.typeOf(rhs_air); - if (lhs_ty.zigTypeTag() == .Vector) { - return self.fail("TODO implement genBinOp for {}", .{lhs_ty.fmt(self.bin_file.options.module.?)}); - } + const abi_size = @intCast(u32, lhs_ty.abiSize(self.target.*)); - switch (lhs) { + switch (lhs_mcv) { .immediate => |imm| switch (imm) { - 0 => switch (tag) { + 0 => switch (air_tag) { .sub, .subwrap => return self.genUnOp(maybe_inst, .neg, rhs_air), else => {}, }, @@ -5827,9 +5873,10 @@ fn genBinOp( else => {}, } - const is_commutative = switch (tag) { + const is_commutative = switch (air_tag) { .add, .addwrap, + .mul, .bool_or, .bit_or, .bool_and, @@ -5841,48 +5888,42 @@ fn genBinOp( else => false, }; - const dst_mem_ok = switch (tag) { - .add, - .addwrap, - .sub, - .subwrap, - .mul, - .div_float, - .div_exact, - .div_trunc, - .div_floor, - => !lhs_ty.isRuntimeFloat(), - - else => true, + const vec_op = switch (lhs_ty.zigTypeTag()) { + else => false, + .Float, .Vector => true, }; - const lhs_lock: ?RegisterLock = switch (lhs) { + const lhs_lock: ?RegisterLock = switch (lhs_mcv) { .register => |reg| self.register_manager.lockRegAssumeUnused(reg), else => null, }; defer if (lhs_lock) |lock| self.register_manager.unlockReg(lock); - const rhs_lock: ?RegisterLock = switch (rhs) { + const rhs_lock: ?RegisterLock = switch (rhs_mcv) { .register => |reg| self.register_manager.lockReg(reg), else => null, }; defer if (rhs_lock) |lock| self.register_manager.unlockReg(lock); - var flipped: bool = false; + var flipped = false; + var copied_to_dst = true; const 
dst_mcv: MCValue = dst: { if (maybe_inst) |inst| { - if ((dst_mem_ok or lhs.isRegister()) and self.reuseOperand(inst, lhs_air, 0, lhs)) { - break :dst lhs; + if ((!vec_op or lhs_mcv.isRegister()) and self.reuseOperand(inst, lhs_air, 0, lhs_mcv)) { + break :dst lhs_mcv; } - if (is_commutative and (dst_mem_ok or rhs.isRegister()) and - self.reuseOperand(inst, rhs_air, 1, rhs)) + if (is_commutative and (!vec_op or rhs_mcv.isRegister()) and + self.reuseOperand(inst, rhs_air, 1, rhs_mcv)) { flipped = true; - break :dst rhs; + break :dst rhs_mcv; } } const dst_mcv = try self.allocRegOrMemAdvanced(lhs_ty, maybe_inst, true); - try self.genCopy(lhs_ty, dst_mcv, lhs); + if (vec_op and lhs_mcv.isRegister() and self.hasFeature(.avx)) + copied_to_dst = false + else + try self.genCopy(lhs_ty, dst_mcv, lhs_mcv); break :dst dst_mcv; }; const dst_lock: ?RegisterLock = switch (dst_mcv) { @@ -5891,160 +5932,47 @@ fn genBinOp( }; defer if (dst_lock) |lock| self.register_manager.unlockReg(lock); - const src_mcv = if (flipped) lhs else rhs; - switch (tag) { - .add, - .addwrap, - => try self.genBinOpMir(switch (lhs_ty.zigTypeTag()) { - else => .add, - .Float => switch (lhs_ty.floatBits(self.target.*)) { - 32 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse)) - .addss - else - return self.fail("TODO implement genBinOp for {s} {} without sse", .{ - @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?), - }), - 64 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse2)) - .addsd - else - return self.fail("TODO implement genBinOp for {s} {} without sse2", .{ - @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?), - }), - else => return self.fail("TODO implement genBinOp for {s} {}", .{ - @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?), - }), - }, - }, lhs_ty, dst_mcv, src_mcv), - - .sub, - .subwrap, - => try self.genBinOpMir(switch (lhs_ty.zigTypeTag()) { - else => .sub, - .Float => switch (lhs_ty.floatBits(self.target.*)) { - 32 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse)) - .subss - else - return self.fail("TODO implement genBinOp for {s} {} without sse", .{ - @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?), - }), - 64 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse2)) - .subsd - else - return self.fail("TODO implement genBinOp for {s} {} without sse2", .{ - @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?), - }), - else => return self.fail("TODO implement genBinOp for {s} {}", .{ - @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?), - }), - }, - }, lhs_ty, dst_mcv, src_mcv), - - .mul => try self.genBinOpMir(switch (lhs_ty.zigTypeTag()) { - else => return self.fail("TODO implement genBinOp for {s} {}", .{ - @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?), - }), - .Float => switch (lhs_ty.floatBits(self.target.*)) { - 32 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse)) - .mulss - else - return self.fail("TODO implement genBinOp for {s} {} without sse", .{ - @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?), - }), - 64 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse2)) - .mulsd - else - return self.fail("TODO implement genBinOp for {s} {} without sse2", .{ - @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?), - }), - else => return self.fail("TODO implement genBinOp for {s} {}", .{ - @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?), - }), - }, - }, lhs_ty, dst_mcv, src_mcv), + const src_mcv = if (flipped) lhs_mcv else rhs_mcv; + if (!vec_op) { + switch 
(air_tag) { + .add, + .addwrap, + => try self.genBinOpMir(.add, lhs_ty, dst_mcv, src_mcv), - .div_float, - .div_exact, - .div_trunc, - .div_floor, - => { - try self.genBinOpMir(switch (lhs_ty.zigTypeTag()) { - else => return self.fail("TODO implement genBinOp for {s} {}", .{ - @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?), - }), - .Float => switch (lhs_ty.floatBits(self.target.*)) { - 32 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse)) - .divss - else - return self.fail("TODO implement genBinOp for {s} {} without sse", .{ - @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?), - }), - 64 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse2)) - .divsd - else - return self.fail("TODO implement genBinOp for {s} {} without sse2", .{ - @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?), - }), - else => return self.fail("TODO implement genBinOp for {s} {}", .{ - @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?), - }), - }, - }, lhs_ty, dst_mcv, src_mcv); - switch (tag) { - .div_float, - .div_exact, - => {}, - .div_trunc, - .div_floor, - => if (self.hasFeature(.sse4_1)) { - const abi_size = @intCast(u32, lhs_ty.abiSize(self.target.*)); - const dst_alias = registerAlias(dst_mcv.register, abi_size); - try self.asmRegisterRegisterImmediate(switch (lhs_ty.floatBits(self.target.*)) { - 32 => .roundss, - 64 => .roundsd, - else => unreachable, - }, dst_alias, dst_alias, Immediate.u(switch (tag) { - .div_trunc => 0b1_0_11, - .div_floor => 0b1_0_01, - else => unreachable, - })); - } else return self.fail("TODO implement genBinOp for {s} {} without sse4_1", .{ - @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?), - }), - else => unreachable, - } - }, + .sub, + .subwrap, + => try self.genBinOpMir(.sub, lhs_ty, dst_mcv, src_mcv), - .ptr_add, - .ptr_sub, - => { - const tmp_reg = try self.copyToTmpRegister(rhs_ty, src_mcv); - const tmp_mcv = MCValue{ .register = tmp_reg }; - const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg); - defer self.register_manager.unlockReg(tmp_lock); + .ptr_add, + .ptr_sub, + => { + const tmp_reg = try self.copyToTmpRegister(rhs_ty, src_mcv); + const tmp_mcv = MCValue{ .register = tmp_reg }; + const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg); + defer self.register_manager.unlockReg(tmp_lock); - const elem_size = lhs_ty.elemType2().abiSize(self.target.*); - try self.genIntMulComplexOpMir(rhs_ty, tmp_mcv, .{ .immediate = elem_size }); - try self.genBinOpMir(switch (tag) { - .ptr_add => .add, - .ptr_sub => .sub, - else => unreachable, - }, lhs_ty, dst_mcv, tmp_mcv); - }, + const elem_size = lhs_ty.elemType2().abiSize(self.target.*); + try self.genIntMulComplexOpMir(rhs_ty, tmp_mcv, .{ .immediate = elem_size }); + try self.genBinOpMir(switch (air_tag) { + .ptr_add => .add, + .ptr_sub => .sub, + else => unreachable, + }, lhs_ty, dst_mcv, tmp_mcv); + }, - .bool_or, - .bit_or, - => try self.genBinOpMir(.@"or", lhs_ty, dst_mcv, src_mcv), + .bool_or, + .bit_or, + => try self.genBinOpMir(.@"or", lhs_ty, dst_mcv, src_mcv), - .bool_and, - .bit_and, - => try self.genBinOpMir(.@"and", lhs_ty, dst_mcv, src_mcv), + .bool_and, + .bit_and, + => try self.genBinOpMir(.@"and", lhs_ty, dst_mcv, src_mcv), - .xor => try self.genBinOpMir(.xor, lhs_ty, dst_mcv, src_mcv), + .xor => try self.genBinOpMir(.xor, lhs_ty, dst_mcv, src_mcv), - .min, - .max, - => switch (lhs_ty.zigTypeTag()) { - .Int => { + .min, + .max, + => { const mat_src_mcv: MCValue = if (switch (src_mcv) { .immediate, .eflags, @@ -6070,12 
+5998,12 @@ fn genBinOp( const int_info = lhs_ty.intInfo(self.target.*); const cc: Condition = switch (int_info.signedness) { - .unsigned => switch (tag) { + .unsigned => switch (air_tag) { .min => .a, .max => .b, else => unreachable, }, - .signed => switch (tag) { + .signed => switch (air_tag) { .min => .g, .max => .l, else => unreachable, @@ -6134,26 +6062,222 @@ fn genBinOp( } try self.genCopy(lhs_ty, dst_mcv, .{ .register = tmp_reg }); }, - .Float => try self.genBinOpMir(switch (lhs_ty.floatBits(self.target.*)) { - 32 => switch (tag) { - .min => .minss, - .max => .maxss, - else => unreachable, - }, - 64 => switch (tag) { - .min => .minsd, - .max => .maxsd, - else => unreachable, - }, - else => return self.fail("TODO implement genBinOp for {s} {}", .{ - @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?), - }), - }, lhs_ty, dst_mcv, src_mcv), + else => return self.fail("TODO implement genBinOp for {s} {}", .{ - @tagName(tag), lhs_ty.fmt(self.bin_file.options.module.?), + @tagName(air_tag), lhs_ty.fmt(self.bin_file.options.module.?), }), - }, + } + return dst_mcv; + } + const mir_tag = if (@as(?Mir.Inst.Tag, switch (lhs_ty.zigTypeTag()) { + else => unreachable, + .Float => switch (lhs_ty.floatBits(self.target.*)) { + 32 => switch (air_tag) { + .add => if (self.hasFeature(.avx)) .vaddss else .addss, + .sub => if (self.hasFeature(.avx)) .vsubss else .subss, + .mul => if (self.hasFeature(.avx)) .vmulss else .mulss, + .div_float, + .div_trunc, + .div_floor, + .div_exact, + => if (self.hasFeature(.avx)) .vdivss else .divss, + .max => if (self.hasFeature(.avx)) .vmaxss else .maxss, + .min => if (self.hasFeature(.avx)) .vminss else .minss, + else => unreachable, + }, + 64 => switch (air_tag) { + .add => if (self.hasFeature(.avx)) .vaddsd else .addsd, + .sub => if (self.hasFeature(.avx)) .vsubsd else .subsd, + .mul => if (self.hasFeature(.avx)) .vmulsd else .mulsd, + .div_float, + .div_trunc, + .div_floor, + .div_exact, + => if (self.hasFeature(.avx)) .vdivsd else .divsd, + .max => if (self.hasFeature(.avx)) .vmaxsd else .maxsd, + .min => if (self.hasFeature(.avx)) .vminsd else .minsd, + else => unreachable, + }, + 16, 80, 128 => null, + else => unreachable, + }, + .Vector => switch (lhs_ty.childType().zigTypeTag()) { + else => null, + .Float => switch (lhs_ty.childType().floatBits(self.target.*)) { + 32 => switch (lhs_ty.vectorLen()) { + 1 => switch (air_tag) { + .add => if (self.hasFeature(.avx)) .vaddss else .addss, + .sub => if (self.hasFeature(.avx)) .vsubss else .subss, + .mul => if (self.hasFeature(.avx)) .vmulss else .mulss, + .div_float, + .div_trunc, + .div_floor, + .div_exact, + => if (self.hasFeature(.avx)) .vdivss else .divss, + .max => if (self.hasFeature(.avx)) .vmaxss else .maxss, + .min => if (self.hasFeature(.avx)) .vminss else .minss, + else => unreachable, + }, + 2...4 => switch (air_tag) { + .add => if (self.hasFeature(.avx)) .vaddps else .addps, + .sub => if (self.hasFeature(.avx)) .vsubps else .subps, + .mul => if (self.hasFeature(.avx)) .vmulps else .mulps, + .div_float, + .div_trunc, + .div_floor, + .div_exact, + => if (self.hasFeature(.avx)) .vdivps else .divps, + .max => if (self.hasFeature(.avx)) .vmaxps else .maxps, + .min => if (self.hasFeature(.avx)) .vminps else .minps, + else => unreachable, + }, + 5...8 => if (self.hasFeature(.avx)) switch (air_tag) { + .add => .vaddps, + .sub => .vsubps, + .mul => .vmulps, + .div_float, .div_trunc, .div_floor, .div_exact => .vdivps, + .max => .vmaxps, + .min => .vminps, + else => unreachable, + } else null, + else => 
null, + }, + 64 => switch (lhs_ty.vectorLen()) { + 1 => switch (air_tag) { + .add => if (self.hasFeature(.avx)) .vaddsd else .addsd, + .sub => if (self.hasFeature(.avx)) .vsubsd else .subsd, + .mul => if (self.hasFeature(.avx)) .vmulsd else .mulsd, + .div_float, + .div_trunc, + .div_floor, + .div_exact, + => if (self.hasFeature(.avx)) .vdivsd else .divsd, + .max => if (self.hasFeature(.avx)) .vmaxsd else .maxsd, + .min => if (self.hasFeature(.avx)) .vminsd else .minsd, + else => unreachable, + }, + 2 => switch (air_tag) { + .add => if (self.hasFeature(.avx)) .vaddpd else .addpd, + .sub => if (self.hasFeature(.avx)) .vsubpd else .subpd, + .mul => if (self.hasFeature(.avx)) .vmulpd else .mulpd, + .div_float, + .div_trunc, + .div_floor, + .div_exact, + => if (self.hasFeature(.avx)) .vdivpd else .divpd, + .max => if (self.hasFeature(.avx)) .vmaxpd else .maxpd, + .min => if (self.hasFeature(.avx)) .vminpd else .minpd, + else => unreachable, + }, + 3...4 => if (self.hasFeature(.avx)) switch (air_tag) { + .add => .vaddpd, + .sub => .vsubpd, + .mul => .vmulpd, + .div_float, .div_trunc, .div_floor, .div_exact => .vdivpd, + .max => .vmaxpd, + .min => .vminpd, + else => unreachable, + } else null, + else => null, + }, + 16, 80, 128 => null, + else => unreachable, + }, + }, + })) |tag| tag else return self.fail("TODO implement genBinOp for {s} {}", .{ + @tagName(air_tag), lhs_ty.fmt(self.bin_file.options.module.?), + }); + const dst_alias = registerAlias(dst_mcv.getReg().?, abi_size); + if (self.hasFeature(.avx)) { + const src1_alias = + if (copied_to_dst) dst_alias else registerAlias(lhs_mcv.getReg().?, abi_size); + if (src_mcv.isMemory()) try self.asmRegisterRegisterMemory( + mir_tag, + dst_alias, + src1_alias, + src_mcv.mem(Memory.PtrSize.fromSize(abi_size)), + ) else try self.asmRegisterRegisterRegister( + mir_tag, + dst_alias, + src1_alias, + registerAlias(if (src_mcv.isRegister()) + src_mcv.getReg().? + else + try self.copyToTmpRegister(rhs_ty, src_mcv), abi_size), + ); + } else { + assert(copied_to_dst); + if (src_mcv.isMemory()) try self.asmRegisterMemory( + mir_tag, + dst_alias, + src_mcv.mem(Memory.PtrSize.fromSize(abi_size)), + ) else try self.asmRegisterRegister( + mir_tag, + dst_alias, + registerAlias(if (src_mcv.isRegister()) + src_mcv.getReg().? 
+ else + try self.copyToTmpRegister(rhs_ty, src_mcv), abi_size), + ); + } + switch (air_tag) { + .add, .sub, .mul, .div_float, .div_exact => {}, + .div_trunc, .div_floor => if (self.hasFeature(.sse4_1)) { + const round_tag = if (@as(?Mir.Inst.Tag, switch (lhs_ty.zigTypeTag()) { + .Float => switch (lhs_ty.floatBits(self.target.*)) { + 32 => if (self.hasFeature(.avx)) .vroundss else .roundss, + 64 => if (self.hasFeature(.avx)) .vroundsd else .roundsd, + 16, 80, 128 => null, + else => unreachable, + }, + .Vector => switch (lhs_ty.childType().zigTypeTag()) { + .Float => switch (lhs_ty.childType().floatBits(self.target.*)) { + 32 => switch (lhs_ty.vectorLen()) { + 1 => if (self.hasFeature(.avx)) .vroundss else .roundss, + 2...4 => if (self.hasFeature(.avx)) .vroundps else .roundps, + 5...8 => if (self.hasFeature(.avx)) .vroundps else null, + else => null, + }, + 64 => switch (lhs_ty.vectorLen()) { + 1 => if (self.hasFeature(.avx)) .vroundsd else .roundsd, + 2 => if (self.hasFeature(.avx)) .vroundpd else .roundpd, + 3...4 => if (self.hasFeature(.avx)) .vroundpd else null, + else => null, + }, + 16, 80, 128 => null, + else => unreachable, + }, + else => null, + }, + else => unreachable, + })) |tag| tag else return self.fail("TODO implement genBinOp for {s} {}", .{ + @tagName(air_tag), lhs_ty.fmt(self.bin_file.options.module.?), + }); + const round_mode = Immediate.u(switch (air_tag) { + .div_trunc => 0b1_0_11, + .div_floor => 0b1_0_01, + else => unreachable, + }); + switch (round_tag) { + .vroundss, .vroundsd => try self.asmRegisterRegisterRegisterImmediate( + round_tag, + dst_alias, + dst_alias, + dst_alias, + round_mode, + ), + else => try self.asmRegisterRegisterImmediate( + round_tag, + dst_alias, + dst_alias, + round_mode, + ), + } + } else return self.fail("TODO implement genBinOp for {s} {} without sse4_1", .{ + @tagName(air_tag), lhs_ty.fmt(self.bin_file.options.module.?), + }), + .max, .min => {}, // TODO: unordered select else => unreachable, } return dst_mcv; @@ -6186,20 +6310,11 @@ fn genBinOpMir(self: *Self, mir_tag: Mir.Inst.Tag, ty: Type, dst_mcv: MCValue, s .register_overflow, .reserved_frame, => unreachable, - .register => |src_reg| switch (ty.zigTypeTag()) { - .Float => { - if (!Target.x86.featureSetHas(self.target.cpu.features, .sse)) - return self.fail("TODO genBinOpMir for {s} {} without sse", .{ - @tagName(mir_tag), ty.fmt(self.bin_file.options.module.?), - }); - return self.asmRegisterRegister(mir_tag, dst_reg.to128(), src_reg.to128()); - }, - else => try self.asmRegisterRegister( - mir_tag, - dst_alias, - registerAlias(src_reg, abi_size), - ), - }, + .register => |src_reg| try self.asmRegisterRegister( + mir_tag, + dst_alias, + registerAlias(src_reg, abi_size), + ), .immediate => |imm| switch (self.regBitSize(ty)) { 8 => try self.asmRegisterImmediate( mir_tag, @@ -9646,7 +9761,7 @@ fn airMulAdd(self: *Self, inst: Air.Inst.Index) !void { lock.* = self.register_manager.lockRegAssumeUnused(reg); } - const tag = if (@as( + const mir_tag = if (@as( ?Mir.Inst.Tag, if (mem.eql(u2, &order, &.{ 1, 3, 2 }) or mem.eql(u2, &order, &.{ 3, 1, 2 })) switch (ty.zigTypeTag()) { @@ -9741,20 +9856,17 @@ fn airMulAdd(self: *Self, inst: Air.Inst.Index) !void { const abi_size = @intCast(u32, ty.abiSize(self.target.*)); const mop1_reg = registerAlias(mops[0].getReg().?, abi_size); const mop2_reg = registerAlias(mops[1].getReg().?, abi_size); - if (mops[2].isRegister()) - try self.asmRegisterRegisterRegister( - tag, - mop1_reg, - mop2_reg, - registerAlias(mops[2].getReg().?, abi_size), - ) - else 
- try self.asmRegisterRegisterMemory( - tag, - mop1_reg, - mop2_reg, - mops[2].mem(Memory.PtrSize.fromSize(abi_size)), - ); + if (mops[2].isRegister()) try self.asmRegisterRegisterRegister( + mir_tag, + mop1_reg, + mop2_reg, + registerAlias(mops[2].getReg().?, abi_size), + ) else try self.asmRegisterRegisterMemory( + mir_tag, + mop1_reg, + mop2_reg, + mops[2].mem(Memory.PtrSize.fromSize(abi_size)), + ); return self.finishAir(inst, mops[0], ops); } diff --git a/src/arch/x86_64/Encoding.zig b/src/arch/x86_64/Encoding.zig index b242c98bdc..b8ccc9efba 100644 --- a/src/arch/x86_64/Encoding.zig +++ b/src/arch/x86_64/Encoding.zig @@ -262,61 +262,69 @@ pub const Mnemonic = enum { // MMX movd, // SSE - addss, + addps, addss, andps, andnps, cmpss, cvtsi2ss, - divss, - maxss, minss, + divps, divss, + maxps, maxss, + minps, minss, movaps, movss, movups, - mulss, + mulps, mulss, orps, pextrw, pinsrw, - sqrtps, - sqrtss, - subss, + sqrtps, sqrtss, + subps, subss, ucomiss, xorps, // SSE2 - addsd, + addpd, addsd, andpd, andnpd, //cmpsd, cvtsd2ss, cvtsi2sd, cvtss2sd, - divsd, - maxsd, minsd, + divpd, divsd, + maxpd, maxsd, + minpd, minsd, movapd, movq, //movd, movsd, movupd, - mulsd, + mulpd, mulsd, orpd, pshufhw, pshuflw, psrld, psrlq, psrlw, punpckhbw, punpckhdq, punpckhqdq, punpckhwd, punpcklbw, punpckldq, punpcklqdq, punpcklwd, sqrtpd, sqrtsd, - subsd, + subpd, subsd, ucomisd, xorpd, // SSE3 movddup, movshdup, movsldup, // SSE4.1 - roundsd, roundss, + roundpd, roundps, roundsd, roundss, // AVX + vaddpd, vaddps, vaddsd, vaddss, vcvtsd2ss, vcvtsi2sd, vcvtsi2ss, vcvtss2sd, + vdivpd, vdivps, vdivsd, vdivss, + vmaxpd, vmaxps, vmaxsd, vmaxss, + vminpd, vminps, vminsd, vminss, vmovapd, vmovaps, vmovddup, vmovsd, vmovshdup, vmovsldup, vmovss, vmovupd, vmovups, + vmulpd, vmulps, vmulsd, vmulss, vpextrw, vpinsrw, vpshufhw, vpshuflw, vpsrld, vpsrlq, vpsrlw, vpunpckhbw, vpunpckhdq, vpunpckhqdq, vpunpckhwd, vpunpcklbw, vpunpckldq, vpunpcklqdq, vpunpcklwd, + vroundpd, vroundps, vroundsd, vroundss, vsqrtpd, vsqrtps, vsqrtsd, vsqrtss, + vsubpd, vsubps, vsubsd, vsubss, // F16C vcvtph2ps, vcvtps2ph, // FMA diff --git a/src/arch/x86_64/Lower.zig b/src/arch/x86_64/Lower.zig index 39ad2313e7..2cfa25ac84 100644 --- a/src/arch/x86_64/Lower.zig +++ b/src/arch/x86_64/Lower.zig @@ -124,27 +124,34 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct { .xchg, .xor, + .addps, .addss, .andnps, .andps, .cmpss, .cvtsi2ss, + .divps, .divss, + .maxps, .maxss, + .minps, .minss, .movaps, .movss, .movups, + .mulps, .mulss, .orps, .pextrw, .pinsrw, .sqrtps, .sqrtss, + .subps, .subss, .ucomiss, .xorps, + .addpd, .addsd, .andnpd, .andpd, @@ -152,10 +159,14 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct { .cvtsd2ss, .cvtsi2sd, .cvtss2sd, + .divpd, .divsd, + .maxpd, .maxsd, + .minpd, .minsd, .movsd, + .mulpd, .mulsd, .orpd, .pshufhw, @@ -173,6 +184,7 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct { .punpcklwd, .sqrtpd, .sqrtsd, + .subpd, .subsd, .ucomisd, .xorpd, @@ -181,13 +193,31 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct { .movshdup, .movsldup, + .roundpd, + .roundps, .roundsd, .roundss, + .vaddpd, + .vaddps, + .vaddsd, + .vaddss, .vcvtsd2ss, .vcvtsi2sd, .vcvtsi2ss, .vcvtss2sd, + .vdivpd, + .vdivps, + .vdivsd, + .vdivss, + .vmaxpd, + .vmaxps, + .vmaxsd, + .vmaxss, + .vminpd, + .vminps, + .vminsd, + .vminss, .vmovapd, .vmovaps, .vmovddup, @@ -197,6 +227,10 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct { .vmovss, .vmovupd, 
.vmovups, + .vmulpd, + .vmulps, + .vmulsd, + .vmulss, .vpextrw, .vpinsrw, .vpshufhw, @@ -212,10 +246,18 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct { .vpunpckldq, .vpunpcklqdq, .vpunpcklwd, + .vroundpd, + .vroundps, + .vroundsd, + .vroundss, .vsqrtpd, .vsqrtps, .vsqrtsd, .vsqrtss, + .vsubpd, + .vsubps, + .vsubsd, + .vsubss, .vcvtph2ps, .vcvtps2ph, @@ -304,6 +346,7 @@ fn imm(lower: Lower, ops: Mir.Inst.Ops, i: u32) Immediate { .lock_mi_rip_s, => Immediate.s(@bitCast(i32, i)), + .rrri, .rri_u, .ri_u, .i_u, @@ -429,6 +472,12 @@ fn mirGeneric(lower: *Lower, inst: Mir.Inst) Error!void { .{ .reg = inst.data.rrr.r2 }, .{ .reg = inst.data.rrr.r3 }, }, + .rrri => &.{ + .{ .reg = inst.data.rrri.r1 }, + .{ .reg = inst.data.rrri.r2 }, + .{ .reg = inst.data.rrri.r3 }, + .{ .imm = lower.imm(inst.ops, inst.data.rrri.i) }, + }, .ri_s, .ri_u => &.{ .{ .reg = inst.data.ri.r }, .{ .imm = lower.imm(inst.ops, inst.data.ri.i) }, diff --git a/src/arch/x86_64/Mir.zig b/src/arch/x86_64/Mir.zig index b6df0fff09..c0450406cf 100644 --- a/src/arch/x86_64/Mir.zig +++ b/src/arch/x86_64/Mir.zig @@ -166,7 +166,9 @@ pub const Inst = struct { /// Logical exclusive-or xor, - /// Add single precision floating point values + /// Add packed single-precision floating-point values + addps, + /// Add scalar single-precision floating-point values addss, /// Bitwise logical and of packed single precision floating-point values andps, @@ -176,11 +178,17 @@ pub const Inst = struct { cmpss, /// Convert doubleword integer to scalar single-precision floating-point value cvtsi2ss, + /// Divide packed single-precision floating-point values + divps, /// Divide scalar single-precision floating-point values divss, - /// Return maximum single-precision floating-point value + /// Maximum of packed single-precision floating-point values + maxps, + /// Maximum of scalar single-precision floating-point values maxss, - /// Return minimum single-precision floating-point value + /// Minimum of packed single-precision floating-point values + minps, + /// Minimum of scalar single-precision floating-point values minss, /// Move aligned packed single-precision floating-point values movaps, @@ -188,6 +196,8 @@ pub const Inst = struct { movss, /// Move unaligned packed single-precision floating-point values movups, + /// Multiply packed single-precision floating-point values + mulps, /// Multiply scalar single-precision floating-point values mulss, /// Bitwise logical or of packed single precision floating-point values @@ -196,18 +206,22 @@ pub const Inst = struct { pextrw, /// Insert word pinsrw, - /// Square root of scalar single precision floating-point value + /// Square root of packed single-precision floating-point values sqrtps, - /// Subtract scalar single-precision floating-point values + /// Square root of scalar single-precision floating-point value sqrtss, - /// Square root of single precision floating-point values + /// Subtract packed single-precision floating-point values + subps, + /// Subtract scalar single-precision floating-point values subss, /// Unordered compare scalar single-precision floating-point values ucomiss, /// Bitwise logical xor of packed single precision floating-point values xorps, - /// Add double precision floating point values + /// Add packed double-precision floating-point values + addpd, + /// Add scalar double-precision floating-point values addsd, /// Bitwise logical and not of packed double precision floating-point values andnpd, @@ -221,14 +235,22 @@ pub const Inst = struct { cvtsi2sd, 
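    // Aside on the new `rrri` plumbing this commit introduces: it exists so
    // that four-operand AVX forms can travel through Mir as one instruction.
    // A minimal sketch, assuming the asmRegisterRegisterRegisterImmediate
    // helper added in CodeGen.zig above, with `dst_reg` standing in for any
    // locked xmm register:
    //
    //     // floor dst in place: vroundsd dst, dst, dst, 0b1_0_01
    //     try self.asmRegisterRegisterRegisterImmediate(
    //         .vroundsd, dst_reg, dst_reg, dst_reg, Immediate.u(0b1_0_01));
    //
    // The imm8 follows the SSE4.1 ROUND* convention: bits 1:0 pick the
    // rounding mode (00 nearest, 01 down, 10 up, 11 truncate), bit 2 selects
    // MXCSR.RC instead of the immediate mode, and bit 3 suppresses the
    // inexact exception -- hence 0b1_0_01 for div_floor and 0b1_0_11 for
    // div_trunc in genBinOp.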
/// Convert scalar single-precision floating-point value to scalar double-precision floating-point value cvtss2sd, + /// Divide packed double-precision floating-point values + divpd, /// Divide scalar double-precision floating-point values divsd, - /// Return maximum double-precision floating-point value + /// Maximum of packed double-precision floating-point values + maxpd, + /// Maximum of scalar double-precision floating-point values maxsd, - /// Return minimum double-precision floating-point value + /// Minimum of packed double-precision floating-point values + minpd, + /// Minimum of scalar double-precision floating-point values minsd, /// Move scalar double-precision floating-point value movsd, + /// Multiply packed double-precision floating-point values + mulpd, /// Multiply scalar double-precision floating-point values mulsd, /// Bitwise logical or of packed double precision floating-point values @@ -263,6 +285,8 @@ pub const Inst = struct { sqrtpd, /// Square root of scalar double precision floating-point value sqrtsd, + /// Subtract packed double-precision floating-point values + subpd, /// Subtract scalar double-precision floating-point values subsd, /// Unordered compare scalar double-precision floating-point values @@ -277,11 +301,23 @@ pub const Inst = struct { /// Replicate single floating-point values movsldup, - /// Round scalar double-precision floating-point values + /// Round packed double-precision floating-point values + roundpd, + /// Round packed single-precision floating-point values + roundps, + /// Round scalar double-precision floating-point value roundsd, - /// Round scalar single-precision floating-point values + /// Round scalar single-precision floating-point value roundss, + /// Add packed double-precision floating-point values + vaddpd, + /// Add packed single-precision floating-point values + vaddps, + /// Add scalar double-precision floating-point values + vaddsd, + /// Add scalar single-precision floating-point values + vaddss, /// Convert scalar double-precision floating-point value to scalar single-precision floating-point value vcvtsd2ss, /// Convert doubleword integer to scalar double-precision floating-point value @@ -290,6 +326,30 @@ pub const Inst = struct { vcvtsi2ss, /// Convert scalar single-precision floating-point value to scalar double-precision floating-point value vcvtss2sd, + /// Divide packed double-precision floating-point values + vdivpd, + /// Divide packed single-precision floating-point values + vdivps, + /// Divide scalar double-precision floating-point values + vdivsd, + /// Divide scalar single-precision floating-point values + vdivss, + /// Maximum of packed double-precision floating-point values + vmaxpd, + /// Maximum of packed single-precision floating-point values + vmaxps, + /// Maximum of scalar double-precision floating-point values + vmaxsd, + /// Maximum of scalar single-precision floating-point values + vmaxss, + /// Minimum of packed double-precision floating-point values + vminpd, + /// Minimum of packed single-precision floating-point values + vminps, + /// Minimum of scalar double-precision floating-point values + vminsd, + /// Minimum of scalar single-precision floating-point values + vminss, /// Move aligned packed double-precision floating-point values vmovapd, /// Move aligned packed single-precision floating-point values @@ -308,6 +368,14 @@ pub const Inst = struct { vmovupd, /// Move unaligned packed single-precision floating-point values vmovups, + /// Multiply packed double-precision floating-point values 
+ vmulpd, + /// Multiply packed single-precision floating-point values + vmulps, + /// Multiply scalar double-precision floating-point values + vmulsd, + /// Multiply scalar single-precision floating-point values + vmulss, /// Extract word vpextrw, /// Insert word @@ -338,6 +406,14 @@ pub const Inst = struct { vpunpcklqdq, /// Unpack low data vpunpcklwd, + /// Round packed double-precision floating-point values + vroundpd, + /// Round packed single-precision floating-point values + vroundps, + /// Round scalar double-precision floating-point value + vroundsd, + /// Round scalar single-precision floating-point value + vroundss, /// Square root of packed double-precision floating-point value vsqrtpd, /// Square root of packed single-precision floating-point value @@ -346,6 +422,14 @@ pub const Inst = struct { vsqrtsd, /// Square root of scalar single-precision floating-point value vsqrtss, + /// Subtract packed double-precision floating-point values + vsubpd, + /// Subtract packed single-precision floating-point values + vsubps, + /// Subtract scalar double-precision floating-point values + vsubsd, + /// Subtract scalar single-precision floating-point values + vsubss, /// Convert 16-bit floating-point values to single-precision floating-point values vcvtph2ps, @@ -442,6 +526,9 @@ pub const Inst = struct { /// Register, register, register operands. /// Uses `rrr` payload. rrr, + /// Register, register, register, immediate (byte) operands. + /// Uses `rrri` payload. + rrri, /// Register, register, immediate (sign-extended) operands. /// Uses `rri` payload. rri_s, @@ -625,6 +712,12 @@ pub const Inst = struct { r2: Register, r3: Register, }, + rrri: struct { + r1: Register, + r2: Register, + r3: Register, + i: u8, + }, rri: struct { r1: Register, r2: Register, diff --git a/src/arch/x86_64/encodings.zig b/src/arch/x86_64/encodings.zig index 78bda4fc76..c41f0ea4e7 100644 --- a/src/arch/x86_64/encodings.zig +++ b/src/arch/x86_64/encodings.zig @@ -837,6 +837,8 @@ pub const table = [_]Entry{ .{ .xor, .rm, &.{ .r64, .rm64 }, &.{ 0x33 }, 0, .long, .none }, // SSE + .{ .addps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x58 }, 0, .none, .sse }, + .{ .addss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x58 }, 0, .none, .sse }, .{ .andnps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x55 }, 0, .none, .sse }, @@ -848,10 +850,16 @@ pub const table = [_]Entry{ .{ .cvtsi2ss, .rm, &.{ .xmm, .rm32 }, &.{ 0xf3, 0x0f, 0x2a }, 0, .none, .sse }, .{ .cvtsi2ss, .rm, &.{ .xmm, .rm64 }, &.{ 0xf3, 0x0f, 0x2a }, 0, .long, .sse }, + .{ .divps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x5e }, 0, .none, .sse }, + .{ .divss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5e }, 0, .none, .sse }, + .{ .maxps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x5f }, 0, .none, .sse }, + .{ .maxss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5f }, 0, .none, .sse }, + .{ .minps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x5d }, 0, .none, .sse }, + .{ .minss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5d }, 0, .none, .sse }, .{ .movaps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x28 }, 0, .none, .sse }, @@ -863,10 +871,14 @@ pub const table = [_]Entry{ .{ .movups, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x10 }, 0, .none, .sse }, .{ .movups, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x0f, 0x11 }, 0, .none, .sse }, + .{ .mulps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x59 }, 0, .none, .sse }, + .{ .mulss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x59 }, 0, .none, .sse }, .{ .orps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x56 }, 0, .none, .sse }, + .{ .subps, .rm, &.{ .xmm, 
.xmm_m128 }, &.{ 0x0f, 0x5c }, 0, .none, .sse }, + .{ .subss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5c }, 0, .none, .sse }, .{ .sqrtps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x51 }, 0, .none, .sse }, @@ -878,6 +890,8 @@ pub const table = [_]Entry{ .{ .xorps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x57 }, 0, .none, .sse }, // SSE2 + .{ .addpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x58 }, 0, .none, .sse2 }, + .{ .addsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x58 }, 0, .none, .sse2 }, .{ .andnpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x55 }, 0, .none, .sse2 }, @@ -893,10 +907,16 @@ pub const table = [_]Entry{ .{ .cvtss2sd, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5a }, 0, .none, .sse2 }, + .{ .divpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5e }, 0, .none, .sse2 }, + .{ .divsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5e }, 0, .none, .sse2 }, + .{ .maxpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5f }, 0, .none, .sse2 }, + .{ .maxsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5f }, 0, .none, .sse2 }, + .{ .minpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5d }, 0, .none, .sse2 }, + .{ .minsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5d }, 0, .none, .sse2 }, .{ .movapd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x28 }, 0, .none, .sse2 }, @@ -914,6 +934,8 @@ pub const table = [_]Entry{ .{ .movupd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x10 }, 0, .none, .sse2 }, .{ .movupd, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x66, 0x0f, 0x11 }, 0, .none, .sse2 }, + .{ .mulpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x59 }, 0, .none, .sse2 }, + .{ .mulsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x59 }, 0, .none, .sse2 }, .{ .orpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x56 }, 0, .none, .sse2 }, @@ -947,6 +969,8 @@ pub const table = [_]Entry{ .{ .sqrtsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x51 }, 0, .none, .sse2 }, + .{ .subpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5c }, 0, .none, .sse2 }, + .{ .subsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5c }, 0, .none, .sse2 }, .{ .movsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x10 }, 0, .none, .sse2 }, @@ -966,10 +990,25 @@ pub const table = [_]Entry{ // SSE4.1 .{ .pextrw, .mri, &.{ .r32_m16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .none, .sse4_1 }, - .{ .roundss, .rmi, &.{ .xmm, .xmm_m32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0a }, 0, .none, .sse4_1 }, + .{ .roundpd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x09 }, 0, .none, .sse4_1 }, + + .{ .roundps, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x08 }, 0, .none, .sse4_1 }, + .{ .roundsd, .rmi, &.{ .xmm, .xmm_m64, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0b }, 0, .none, .sse4_1 }, + .{ .roundss, .rmi, &.{ .xmm, .xmm_m32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0a }, 0, .none, .sse4_1 }, + // AVX + .{ .vaddpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x58 }, 0, .vex_128_wig, .avx }, + .{ .vaddpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x58 }, 0, .vex_256_wig, .avx }, + + .{ .vaddps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x0f, 0x58 }, 0, .vex_128_wig, .avx }, + .{ .vaddps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x0f, 0x58 }, 0, .vex_256_wig, .avx }, + + .{ .vaddsd, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x58 }, 0, .vex_lig_wig, .avx }, + + .{ .vaddss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x58 }, 0, .vex_lig_wig, .avx }, + .{ .vcvtsd2ss, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5a }, 0, .vex_lig_wig, .avx }, .{ .vcvtsi2sd, .rvm, &.{ .xmm, .xmm, .rm32 }, &.{ 
0xf2, 0x0f, 0x2a }, 0, .vex_lig_w0, .avx },
@@ -980,6 +1019,36 @@ pub const table = [_]Entry{
 
     .{ .vcvtss2sd, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5a }, 0, .vex_lig_wig, .avx },
 
+    .{ .vdivpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5e }, 0, .vex_128_wig, .avx },
+    .{ .vdivpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x5e }, 0, .vex_256_wig, .avx },
+
+    .{ .vdivps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x0f, 0x5e }, 0, .vex_128_wig, .avx },
+    .{ .vdivps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x0f, 0x5e }, 0, .vex_256_wig, .avx },
+
+    .{ .vdivsd, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5e }, 0, .vex_lig_wig, .avx },
+
+    .{ .vdivss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5e }, 0, .vex_lig_wig, .avx },
+
+    .{ .vmaxpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5f }, 0, .vex_128_wig, .avx },
+    .{ .vmaxpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x5f }, 0, .vex_256_wig, .avx },
+
+    .{ .vmaxps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x0f, 0x5f }, 0, .vex_128_wig, .avx },
+    .{ .vmaxps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x0f, 0x5f }, 0, .vex_256_wig, .avx },
+
+    .{ .vmaxsd, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5f }, 0, .vex_lig_wig, .avx },
+
+    .{ .vmaxss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5f }, 0, .vex_lig_wig, .avx },
+
+    .{ .vminpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5d }, 0, .vex_128_wig, .avx },
+    .{ .vminpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x5d }, 0, .vex_256_wig, .avx },
+
+    .{ .vminps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x0f, 0x5d }, 0, .vex_128_wig, .avx },
+    .{ .vminps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x0f, 0x5d }, 0, .vex_256_wig, .avx },
+
+    .{ .vminsd, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5d }, 0, .vex_lig_wig, .avx },
+
+    .{ .vminss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5d }, 0, .vex_lig_wig, .avx },
+
     .{ .vmovapd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x28 }, 0, .vex_128_wig, .avx },
     .{ .vmovapd, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x66, 0x0f, 0x29 }, 0, .vex_128_wig, .avx },
     .{ .vmovapd, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x28 }, 0, .vex_256_wig, .avx },
@@ -1019,6 +1088,16 @@ pub const table = [_]Entry{
     .{ .vmovups, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x0f, 0x10 }, 0, .vex_256_wig, .avx },
     .{ .vmovups, .mr, &.{ .ymm_m256, .ymm }, &.{ 0x0f, 0x11 }, 0, .vex_256_wig, .avx },
 
+    .{ .vmulpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x59 }, 0, .vex_128_wig, .avx },
+    .{ .vmulpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x59 }, 0, .vex_256_wig, .avx },
+
+    .{ .vmulps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x0f, 0x59 }, 0, .vex_128_wig, .avx },
+    .{ .vmulps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x0f, 0x59 }, 0, .vex_256_wig, .avx },
+
+    .{ .vmulsd, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x59 }, 0, .vex_lig_wig, .avx },
+
+    .{ .vmulss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x59 }, 0, .vex_lig_wig, .avx },
+
     .{ .vpextrw, .rmi, &.{ .r32, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0xc5 }, 0, .vex_128_wig, .avx },
     .{ .vpextrw, .mri, &.{ .r32_m16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .vex_128_wig, .avx },
 
@@ -1041,6 +1120,16 @@ pub const table = [_]Entry{
     .{ .vpunpckldq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x62 }, 0, .vex_128_wig, .avx },
     .{ .vpunpcklqdq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x6c }, 0, .vex_128_wig, .avx },
 
+    .{ .vroundpd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x09 }, 0,
.vex_128_wig, .avx }, + .{ .vroundpd, .rmi, &.{ .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x09 }, 0, .vex_256_wig, .avx }, + + .{ .vroundps, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x08 }, 0, .vex_128_wig, .avx }, + .{ .vroundps, .rmi, &.{ .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x08 }, 0, .vex_256_wig, .avx }, + + .{ .vroundsd, .rvmi, &.{ .xmm, .xmm, .xmm_m64, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0b }, 0, .vex_lig_wig, .avx }, + + .{ .vroundss, .rvmi, &.{ .xmm, .xmm, .xmm_m32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0a }, 0, .vex_lig_wig, .avx }, + .{ .vsqrtpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x51 }, 0, .vex_128_wig, .avx }, .{ .vsqrtpd, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x51 }, 0, .vex_256_wig, .avx }, @@ -1051,6 +1140,16 @@ pub const table = [_]Entry{ .{ .vsqrtss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x51 }, 0, .vex_lig_wig, .avx }, + .{ .vsubpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5c }, 0, .vex_128_wig, .avx }, + .{ .vsubpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x5c }, 0, .vex_256_wig, .avx }, + + .{ .vsubps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x0f, 0x5c }, 0, .vex_128_wig, .avx }, + .{ .vsubps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x0f, 0x5c }, 0, .vex_256_wig, .avx }, + + .{ .vsubsd, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5c }, 0, .vex_lig_wig, .avx }, + + .{ .vsubss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5c }, 0, .vex_lig_wig, .avx }, + // F16C .{ .vcvtph2ps, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x13 }, 0, .vex_128_w0, .f16c }, .{ .vcvtph2ps, .rm, &.{ .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x13 }, 0, .vex_256_w0, .f16c }, -- cgit v1.2.3 From 6778da4516e68c271cb50fe9c252ab4084daf16b Mon Sep 17 00:00:00 2001 From: Jacob Young Date: Sun, 7 May 2023 20:42:46 -0400 Subject: x86_64: implement binary operations for `f16` and `f16` vectors --- src/arch/x86_64/CodeGen.zig | 261 ++++++++++++++++++++++++++++++++++++++---- src/arch/x86_64/Encoding.zig | 23 ++-- src/arch/x86_64/Lower.zig | 22 ++++ src/arch/x86_64/Mir.zig | 44 +++++++ src/arch/x86_64/encodings.zig | 20 ++++ test/behavior/floatop.zig | 22 ++-- test/behavior/muladd.zig | 8 +- 7 files changed, 354 insertions(+), 46 deletions(-) (limited to 'src/arch/x86_64/Encoding.zig') diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig index 3e2d418105..154b909a21 100644 --- a/src/arch/x86_64/CodeGen.zig +++ b/src/arch/x86_64/CodeGen.zig @@ -4497,14 +4497,15 @@ fn airFloatSign(self: *Self, inst: Air.Inst.Index) !void { const tag = self.air.instructions.items(.tag)[inst]; try self.genBinOpMir(switch (ty_bits) { // No point using an extra prefix byte for *pd which performs the same operation. 
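        // A minimal sketch of the f16 strategy used in the hunks below,
        // inferred from the code rather than stated by the author: F16C
        // provides no f16 arithmetic, only conversions, so every binary op is
        // done as widen / compute / narrow, roughly (register names are
        // placeholders):
        //
        //     vcvtph2ps dst, dst        ; f16 lanes -> f32 lanes
        //     vaddps    dst, dst, tmp   ; or vsubps/vmulps/vdivps/...
        //     vcvtps2ph dst, dst, 0b100 ; f32 -> f16; imm bit 2 = use MXCSR.RC
        //
        // which is why Immediate.u(0b1_00) appears after each operation.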
-        32, 64 => switch (tag) {
+        16, 32, 64, 128 => switch (tag) {
             .neg => .xorps,
             .fabs => .andnps,
             else => unreachable,
         },
-        else => return self.fail("TODO implement airFloatSign for {}", .{
+        80 => return self.fail("TODO implement airFloatSign for {}", .{
             ty.fmt(self.bin_file.options.module.?),
         }),
+        else => unreachable,
     }, vec_ty, dst_mcv, sign_mcv);
     return self.finishAir(inst, dst_mcv, .{ un_op, .none, .none });
 }
@@ -6112,9 +6113,53 @@ fn genBinOp(
         return dst_mcv;
     }
 
+    const dst_reg = registerAlias(dst_mcv.getReg().?, abi_size);
     const mir_tag = if (@as(?Mir.Inst.Tag, switch (lhs_ty.zigTypeTag()) {
         else => unreachable,
         .Float => switch (lhs_ty.floatBits(self.target.*)) {
+            16 => if (self.hasFeature(.f16c)) {
+                const tmp_reg = (try self.register_manager.allocReg(null, sse)).to128();
+                const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
+                defer self.register_manager.unlockReg(tmp_lock);
+
+                if (src_mcv.isMemory()) try self.asmRegisterRegisterMemoryImmediate(
+                    .vpinsrw,
+                    dst_reg,
+                    dst_reg,
+                    src_mcv.mem(.word),
+                    Immediate.u(1),
+                ) else try self.asmRegisterRegisterRegister(
+                    .vpunpcklwd,
+                    dst_reg,
+                    dst_reg,
+                    (if (src_mcv.isRegister())
+                        src_mcv.getReg().?
+                    else
+                        try self.copyToTmpRegister(rhs_ty, src_mcv)).to128(),
+                );
+                try self.asmRegisterRegister(.vcvtph2ps, dst_reg, dst_reg);
+                try self.asmRegisterRegister(.vmovshdup, tmp_reg, dst_reg);
+                try self.asmRegisterRegisterRegister(
+                    switch (air_tag) {
+                        .add => .vaddss,
+                        .sub => .vsubss,
+                        .mul => .vmulss,
+                        .div_float, .div_trunc, .div_floor, .div_exact => .vdivss,
+                        .max => .vmaxss,
+                        .min => .vminss,
+                        else => unreachable,
+                    },
+                    dst_reg,
+                    dst_reg,
+                    tmp_reg,
+                );
+                try self.asmRegisterRegisterImmediate(
+                    .vcvtps2ph,
+                    dst_reg,
+                    dst_reg,
+                    Immediate.u(0b1_00),
+                );
+                return dst_mcv;
+            } else null,
             32 => switch (air_tag) {
                 .add => if (self.hasFeature(.avx)) .vaddss else .addss,
                 .sub => if (self.hasFeature(.avx)) .vsubss else .subss,
@@ -6141,12 +6186,178 @@ fn genBinOp(
                 .min => if (self.hasFeature(.avx)) .vminsd else .minsd,
                 else => unreachable,
             },
-            16, 80, 128 => null,
+            80, 128 => null,
             else => unreachable,
         },
         .Vector => switch (lhs_ty.childType().zigTypeTag()) {
             else => null,
             .Float => switch (lhs_ty.childType().floatBits(self.target.*)) {
+                16 => if (self.hasFeature(.f16c)) switch (lhs_ty.vectorLen()) {
+                    1 => {
+                        const tmp_reg = (try self.register_manager.allocReg(null, sse)).to128();
+                        const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
+                        defer self.register_manager.unlockReg(tmp_lock);
+
+                        if (src_mcv.isMemory()) try self.asmRegisterRegisterMemoryImmediate(
+                            .vpinsrw,
+                            dst_reg,
+                            dst_reg,
+                            src_mcv.mem(.word),
+                            Immediate.u(1),
+                        ) else try self.asmRegisterRegisterRegister(
+                            .vpunpcklwd,
+                            dst_reg,
+                            dst_reg,
+                            (if (src_mcv.isRegister())
+                                src_mcv.getReg().?
+                            else
+                                try self.copyToTmpRegister(rhs_ty, src_mcv)).to128(),
+                        );
+                        try self.asmRegisterRegister(.vcvtph2ps, dst_reg, dst_reg);
+                        try self.asmRegisterRegister(.vmovshdup, tmp_reg, dst_reg);
+                        try self.asmRegisterRegisterRegister(
+                            switch (air_tag) {
+                                .add => .vaddss,
+                                .sub => .vsubss,
+                                .mul => .vmulss,
+                                .div_float, .div_trunc, .div_floor, .div_exact => .vdivss,
+                                .max => .vmaxss,
+                                .min => .vminss,
+                                else => unreachable,
+                            },
+                            dst_reg,
+                            dst_reg,
+                            tmp_reg,
+                        );
+                        try self.asmRegisterRegisterImmediate(
+                            .vcvtps2ph,
+                            dst_reg,
+                            dst_reg,
+                            Immediate.u(0b1_00),
+                        );
+                        return dst_mcv;
+                    },
+                    2 => {
+                        const tmp_reg = (try self.register_manager.allocReg(null, sse)).to128();
+                        const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
+                        defer self.register_manager.unlockReg(tmp_lock);
+
+                        if (src_mcv.isMemory()) try self.asmRegisterMemoryImmediate(
+                            .vpinsrd,
+                            dst_reg,
+                            src_mcv.mem(.dword),
+                            Immediate.u(1),
+                        ) else try self.asmRegisterRegisterRegister(
+                            .vunpcklps,
+                            dst_reg,
+                            dst_reg,
+                            (if (src_mcv.isRegister())
+                                src_mcv.getReg().?
+                            else
+                                try self.copyToTmpRegister(rhs_ty, src_mcv)).to128(),
+                        );
+                        try self.asmRegisterRegister(.vcvtph2ps, dst_reg, dst_reg);
+                        try self.asmRegisterRegisterRegister(.vmovhlps, tmp_reg, dst_reg, dst_reg);
+                        try self.asmRegisterRegisterRegister(
+                            switch (air_tag) {
+                                .add => .vaddps,
+                                .sub => .vsubps,
+                                .mul => .vmulps,
+                                .div_float, .div_trunc, .div_floor, .div_exact => .vdivps,
+                                .max => .vmaxps,
+                                .min => .vminps,
+                                else => unreachable,
+                            },
+                            dst_reg,
+                            dst_reg,
+                            tmp_reg,
+                        );
+                        try self.asmRegisterRegisterImmediate(
+                            .vcvtps2ph,
+                            dst_reg,
+                            dst_reg,
+                            Immediate.u(0b1_00),
+                        );
+                        return dst_mcv;
+                    },
+                    3...4 => {
+                        const tmp_reg = (try self.register_manager.allocReg(null, sse)).to128();
+                        const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
+                        defer self.register_manager.unlockReg(tmp_lock);
+
+                        try self.asmRegisterRegister(.vcvtph2ps, dst_reg, dst_reg);
+                        if (src_mcv.isMemory()) try self.asmRegisterMemory(
+                            .vcvtph2ps,
+                            tmp_reg,
+                            src_mcv.mem(.qword),
+                        ) else try self.asmRegisterRegister(
+                            .vcvtph2ps,
+                            tmp_reg,
+                            (if (src_mcv.isRegister())
+                                src_mcv.getReg().?
+                            else
+                                try self.copyToTmpRegister(rhs_ty, src_mcv)).to128(),
+                        );
+                        try self.asmRegisterRegisterRegister(
+                            switch (air_tag) {
+                                .add => .vaddps,
+                                .sub => .vsubps,
+                                .mul => .vmulps,
+                                .div_float, .div_trunc, .div_floor, .div_exact => .vdivps,
+                                .max => .vmaxps,
+                                .min => .vminps,
+                                else => unreachable,
+                            },
+                            dst_reg,
+                            dst_reg,
+                            tmp_reg,
+                        );
+                        try self.asmRegisterRegisterImmediate(
+                            .vcvtps2ph,
+                            dst_reg,
+                            dst_reg,
+                            Immediate.u(0b1_00),
+                        );
+                        return dst_mcv;
+                    },
+                    5...8 => {
+                        const tmp_reg = (try self.register_manager.allocReg(null, sse)).to256();
+                        const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
+                        defer self.register_manager.unlockReg(tmp_lock);
+
+                        try self.asmRegisterRegister(.vcvtph2ps, dst_reg.to256(), dst_reg);
+                        if (src_mcv.isMemory()) try self.asmRegisterMemory(
+                            .vcvtph2ps,
+                            tmp_reg,
+                            src_mcv.mem(.xword),
+                        ) else try self.asmRegisterRegister(
+                            .vcvtph2ps,
+                            tmp_reg,
+                            (if (src_mcv.isRegister())
+                                src_mcv.getReg().?
+                            else
+                                try self.copyToTmpRegister(rhs_ty, src_mcv)).to128(),
+                        );
+                        try self.asmRegisterRegisterRegister(
+                            switch (air_tag) {
+                                .add => .vaddps,
+                                .sub => .vsubps,
+                                .mul => .vmulps,
+                                .div_float, .div_trunc, .div_floor, .div_exact => .vdivps,
+                                .max => .vmaxps,
+                                .min => .vminps,
+                                else => unreachable,
+                            },
+                            dst_reg.to256(),
+                            dst_reg.to256(),
+                            tmp_reg,
+                        );
+                        try self.asmRegisterRegisterImmediate(
+                            .vcvtps2ph,
+                            dst_reg,
+                            dst_reg.to256(),
+                            Immediate.u(0b1_00),
+                        );
+                        return dst_mcv;
+                    },
+                    else => null,
+                } else null,
                 32 => switch (lhs_ty.vectorLen()) {
                     1 => switch (air_tag) {
                         .add => if (self.hasFeature(.avx)) .vaddss else .addss,
@@ -6223,14 +6434,13 @@
                     } else null,
                     else => null,
                 },
-                16, 80, 128 => null,
+                80, 128 => null,
                 else => unreachable,
             },
         },
     })) |tag| tag else return self.fail("TODO implement genBinOp for {s} {}", .{
         @tagName(air_tag), lhs_ty.fmt(self.bin_file.options.module.?),
     });
-    const dst_reg = registerAlias(dst_mcv.getReg().?, abi_size);
     if (self.hasFeature(.avx)) {
         const src1_alias =
             if (copied_to_dst) dst_reg else registerAlias(lhs_mcv.getReg().?, abi_size);
@@ -7139,21 +7349,21 @@ fn airCmp(self: *Self, inst: Air.Inst.Index, op: math.CompareOperator) !void {
             const tmp2_lock = self.register_manager.lockRegAssumeUnused(tmp2_reg);
             defer self.register_manager.unlockReg(tmp2_lock);
 
-            if (src_mcv.isRegister())
-                try self.asmRegisterRegisterRegister(
-                    .vpunpcklwd,
-                    tmp1_reg,
-                    dst_reg.to128(),
-                    src_mcv.getReg().?.to128(),
-                )
-            else
-                try self.asmRegisterRegisterMemoryImmediate(
-                    .vpinsrw,
-                    tmp1_reg,
-                    dst_reg.to128(),
-                    src_mcv.mem(.word),
-                    Immediate.u(1),
-                );
+            if (src_mcv.isMemory()) try self.asmRegisterRegisterMemoryImmediate(
+                .vpinsrw,
+                tmp1_reg,
+                dst_reg.to128(),
+                src_mcv.mem(.word),
+                Immediate.u(1),
+            ) else try self.asmRegisterRegisterRegister(
+                .vpunpcklwd,
+                tmp1_reg,
+                dst_reg.to128(),
+                (if (src_mcv.isRegister())
+                    src_mcv.getReg().?
+ else + try self.copyToTmpRegister(ty, src_mcv)).to128(), + ); try self.asmRegisterRegister(.vcvtph2ps, tmp1_reg, tmp1_reg); try self.asmRegisterRegister(.vmovshdup, tmp2_reg, tmp1_reg); try self.genBinOpMir(.ucomiss, ty, tmp1_mcv, tmp2_mcv); @@ -8139,7 +8349,16 @@ fn movMirTag(self: *Self, ty: Type, aligned: bool) !Mir.Inst.Tag { }, .Vector => switch (ty.childType().zigTypeTag()) { .Float => switch (ty.childType().floatBits(self.target.*)) { - 16 => unreachable, // needs special handling + 16 => switch (ty.vectorLen()) { + 1 => unreachable, // needs special handling + 2 => return if (self.hasFeature(.avx)) .vmovss else .movss, + 3...4 => return if (self.hasFeature(.avx)) .vmovsd else .movsd, + 5...8 => return if (self.hasFeature(.avx)) + if (aligned) .vmovaps else .vmovups + else if (aligned) .movaps else .movups, + 9...16 => if (self.hasFeature(.avx)) return if (aligned) .vmovaps else .vmovups, + else => {}, + }, 32 => switch (ty.vectorLen()) { 1 => return if (self.hasFeature(.avx)) .vmovss else .movss, 2...4 => return if (self.hasFeature(.avx)) diff --git a/src/arch/x86_64/Encoding.zig b/src/arch/x86_64/Encoding.zig index b8ccc9efba..3235b29358 100644 --- a/src/arch/x86_64/Encoding.zig +++ b/src/arch/x86_64/Encoding.zig @@ -270,7 +270,7 @@ pub const Mnemonic = enum { divps, divss, maxps, maxss, minps, minss, - movaps, movss, movups, + movaps, movhlps, movss, movups, mulps, mulss, orps, pextrw, pinsrw, @@ -303,6 +303,8 @@ pub const Mnemonic = enum { // SSE3 movddup, movshdup, movsldup, // SSE4.1 + pextrb, pextrd, pextrq, + pinsrb, pinsrd, pinsrq, roundpd, roundps, roundsd, roundss, // AVX vaddpd, vaddps, vaddsd, vaddss, @@ -311,13 +313,14 @@ pub const Mnemonic = enum { vmaxpd, vmaxps, vmaxsd, vmaxss, vminpd, vminps, vminsd, vminss, vmovapd, vmovaps, - vmovddup, + vmovddup, vmovhlps, vmovsd, vmovshdup, vmovsldup, vmovss, vmovupd, vmovups, vmulpd, vmulps, vmulsd, vmulss, - vpextrw, vpinsrw, + vpextrb, vpextrd, vpextrq, vpextrw, + vpinsrb, vpinsrd, vpinsrq, vpinsrw, vpshufhw, vpshuflw, vpsrld, vpsrlq, vpsrlw, vpunpckhbw, vpunpckhdq, vpunpckhqdq, vpunpckhwd, @@ -359,7 +362,7 @@ pub const Op = enum { cl, r8, r16, r32, r64, rm8, rm16, rm32, rm64, - r32_m16, r64_m16, + r32_m8, r32_m16, r64_m16, m8, m16, m32, m64, m80, m128, m256, rel8, rel16, rel32, m, @@ -444,7 +447,7 @@ pub const Op = enum { pub fn immBitSize(op: Op) u64 { return switch (op) { .none, .o16, .o32, .o64, .moffs, .m, .sreg => unreachable, - .al, .cl, .r8, .rm8 => unreachable, + .al, .cl, .r8, .rm8, .r32_m8 => unreachable, .ax, .r16, .rm16 => unreachable, .eax, .r32, .rm32, .r32_m16 => unreachable, .rax, .r64, .rm64, .r64_m16 => unreachable, @@ -467,7 +470,7 @@ pub const Op = enum { .m8, .m16, .m32, .m64, .m80, .m128, .m256 => unreachable, .al, .cl, .r8, .rm8 => 8, .ax, .r16, .rm16 => 16, - .eax, .r32, .rm32, .r32_m16 => 32, + .eax, .r32, .rm32, .r32_m8, .r32_m16 => 32, .rax, .r64, .rm64, .r64_m16 => 64, .xmm, .xmm_m32, .xmm_m64, .xmm_m128 => 128, .ymm, .ymm_m256 => 256, @@ -480,7 +483,7 @@ pub const Op = enum { .unity, .imm8, .imm8s, .imm16, .imm16s, .imm32, .imm32s, .imm64 => unreachable, .rel8, .rel16, .rel32 => unreachable, .al, .cl, .r8, .ax, .r16, .eax, .r32, .rax, .r64, .xmm, .ymm => unreachable, - .m8, .rm8 => 8, + .m8, .rm8, .r32_m8 => 8, .m16, .rm16, .r32_m16, .r64_m16 => 16, .m32, .rm32, .xmm_m32 => 32, .m64, .rm64, .xmm_m64 => 64, @@ -509,7 +512,7 @@ pub const Op = enum { .al, .ax, .eax, .rax, .r8, .r16, .r32, .r64, .rm8, .rm16, .rm32, .rm64, - .r32_m16, .r64_m16, + .r32_m8, .r32_m16, .r64_m16, .xmm, .xmm_m32, 
.xmm_m64, .xmm_m128, .ymm, .ymm_m256, => true, @@ -535,7 +538,7 @@ pub const Op = enum { // zig fmt: off return switch (op) { .rm8, .rm16, .rm32, .rm64, - .r32_m16, .r64_m16, + .r32_m8, .r32_m16, .r64_m16, .m8, .m16, .m32, .m64, .m80, .m128, .m256, .m, .xmm_m32, .xmm_m64, .xmm_m128, @@ -559,7 +562,7 @@ pub const Op = enum { .al, .ax, .eax, .rax, .cl => .general_purpose, .r8, .r16, .r32, .r64 => .general_purpose, .rm8, .rm16, .rm32, .rm64 => .general_purpose, - .r32_m16, .r64_m16 => .general_purpose, + .r32_m8, .r32_m16, .r64_m16 => .general_purpose, .sreg => .segment, .xmm, .xmm_m32, .xmm_m64, .xmm_m128 => .floating_point, .ymm, .ymm_m256 => .floating_point, diff --git a/src/arch/x86_64/Lower.zig b/src/arch/x86_64/Lower.zig index 2cfa25ac84..5c079f4768 100644 --- a/src/arch/x86_64/Lower.zig +++ b/src/arch/x86_64/Lower.zig @@ -137,6 +137,7 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct { .minps, .minss, .movaps, + .movhlps, .movss, .movups, .mulps, @@ -149,6 +150,8 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct { .subps, .subss, .ucomiss, + .unpckhps, + .unpcklps, .xorps, .addpd, @@ -187,12 +190,20 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct { .subpd, .subsd, .ucomisd, + .unpckhpd, + .unpcklpd, .xorpd, .movddup, .movshdup, .movsldup, + .pextrb, + .pextrd, + .pextrq, + .pinsrb, + .pinsrd, + .pinsrq, .roundpd, .roundps, .roundsd, @@ -221,6 +232,7 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct { .vmovapd, .vmovaps, .vmovddup, + .vmovhlps, .vmovsd, .vmovshdup, .vmovsldup, @@ -231,7 +243,13 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct { .vmulps, .vmulsd, .vmulss, + .vpextrb, + .vpextrd, + .vpextrq, .vpextrw, + .vpinsrb, + .vpinsrd, + .vpinsrq, .vpinsrw, .vpshufhw, .vpshuflw, @@ -258,6 +276,10 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct { .vsubps, .vsubsd, .vsubss, + .vunpckhpd, + .vunpckhps, + .vunpcklpd, + .vunpcklps, .vcvtph2ps, .vcvtps2ph, diff --git a/src/arch/x86_64/Mir.zig b/src/arch/x86_64/Mir.zig index c0450406cf..442cfabebb 100644 --- a/src/arch/x86_64/Mir.zig +++ b/src/arch/x86_64/Mir.zig @@ -192,6 +192,8 @@ pub const Inst = struct { minss, /// Move aligned packed single-precision floating-point values movaps, + /// Move packed single-precision floating-point values high to low + movhlps, /// Move scalar single-precision floating-point value movss, /// Move unaligned packed single-precision floating-point values @@ -216,6 +218,10 @@ pub const Inst = struct { subss, /// Unordered compare scalar single-precision floating-point values ucomiss, + /// Unpack and interleave high packed single-precision floating-point values + unpckhps, + /// Unpack and interleave low packed single-precision floating-point values + unpcklps, /// Bitwise logical xor of packed single precision floating-point values xorps, @@ -291,6 +297,10 @@ pub const Inst = struct { subsd, /// Unordered compare scalar double-precision floating-point values ucomisd, + /// Unpack and interleave high packed double-precision floating-point values + unpckhpd, + /// Unpack and interleave low packed double-precision floating-point values + unpcklpd, /// Bitwise logical xor of packed double precision floating-point values xorpd, @@ -301,6 +311,18 @@ pub const Inst = struct { /// Replicate single floating-point values movsldup, + /// Extract Byte + pextrb, + /// Extract Doubleword + pextrd, + /// Extract Quadword + pextrq, + /// Insert Byte + pinsrb, + /// Insert Doubleword + pinsrd, + /// 
Insert Quadword + pinsrq, /// Round packed double-precision floating-point values roundpd, /// Round packed single-precision floating-point values @@ -354,6 +376,8 @@ pub const Inst = struct { vmovapd, /// Move aligned packed single-precision floating-point values vmovaps, + /// Move packed single-precision floating-point values high to low + vmovhlps, /// Replicate double floating-point values vmovddup, /// Move or merge scalar double-precision floating-point value @@ -376,8 +400,20 @@ pub const Inst = struct { vmulsd, /// Multiply scalar single-precision floating-point values vmulss, + /// Extract Byte + vpextrb, + /// Extract Doubleword + vpextrd, + /// Extract Quadword + vpextrq, /// Extract word vpextrw, + /// Insert Byte + vpinsrb, + /// Insert Doubleword + vpinsrd, + /// Insert Quadword + vpinsrq, /// Insert word vpinsrw, /// Shuffle packed high words @@ -430,6 +466,14 @@ pub const Inst = struct { vsubsd, /// Subtract scalar single-precision floating-point values vsubss, + /// Unpack and interleave high packed double-precision floating-point values + vunpckhpd, + /// Unpack and interleave high packed single-precision floating-point values + vunpckhps, + /// Unpack and interleave low packed double-precision floating-point values + vunpcklpd, + /// Unpack and interleave low packed single-precision floating-point values + vunpcklps, /// Convert 16-bit floating-point values to single-precision floating-point values vcvtph2ps, diff --git a/src/arch/x86_64/encodings.zig b/src/arch/x86_64/encodings.zig index c41f0ea4e7..2b9d530c1e 100644 --- a/src/arch/x86_64/encodings.zig +++ b/src/arch/x86_64/encodings.zig @@ -865,6 +865,8 @@ pub const table = [_]Entry{ .{ .movaps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x28 }, 0, .none, .sse }, .{ .movaps, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x0f, 0x29 }, 0, .none, .sse }, + .{ .movhlps, .rm, &.{ .xmm, .xmm }, &.{ 0x0f, 0x12 }, 0, .none, .sse }, + .{ .movss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x10 }, 0, .none, .sse }, .{ .movss, .mr, &.{ .xmm_m32, .xmm }, &.{ 0xf3, 0x0f, 0x11 }, 0, .none, .sse }, @@ -988,8 +990,16 @@ pub const table = [_]Entry{ .{ .movsldup, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0xf3, 0x0f, 0x12 }, 0, .none, .sse3 }, // SSE4.1 + .{ .pextrb, .mri, &.{ .r32_m8, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x14 }, 0, .none, .sse4_1 }, + .{ .pextrd, .mri, &.{ .rm32, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .none, .sse4_1 }, + .{ .pextrq, .mri, &.{ .rm64, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .long, .sse4_1 }, + .{ .pextrw, .mri, &.{ .r32_m16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .none, .sse4_1 }, + .{ .pinsrb, .rmi, &.{ .xmm, .r32_m8, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x20 }, 0, .none, .sse4_1 }, + .{ .pinsrd, .rmi, &.{ .xmm, .rm32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x22 }, 0, .none, .sse4_1 }, + .{ .pinsrq, .rmi, &.{ .xmm, .rm64, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x22 }, 0, .long, .sse4_1 }, + .{ .roundpd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x09 }, 0, .none, .sse4_1 }, .{ .roundps, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x08 }, 0, .none, .sse4_1 }, @@ -1062,6 +1072,8 @@ pub const table = [_]Entry{ .{ .vmovddup, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x12 }, 0, .vex_128_wig, .avx }, .{ .vmovddup, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0xf2, 0x0f, 0x12 }, 0, .vex_256_wig, .avx }, + .{ .vmovhlps, .rvm, &.{ .xmm, .xmm, .xmm }, &.{ 0x0f, 0x12 }, 0, .vex_128_wig, .avx }, + .{ .vmovsd, .rvm, &.{ .xmm, .xmm, .xmm }, &.{ 0xf2, 0x0f, 0x10 }, 0, .vex_lig_wig, .avx }, .{ .vmovsd, .rm, &.{ .xmm, .m64 }, 
&.{ 0xf2, 0x0f, 0x10 }, 0, .vex_lig_wig, .avx },
     .{ .vmovsd, .mvr, &.{ .xmm, .xmm, .xmm }, &.{ 0xf2, 0x0f, 0x11 }, 0, .vex_lig_wig, .avx },
@@ -1098,9 +1110,17 @@ pub const table = [_]Entry{
     .{ .vmulss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x59 }, 0, .vex_lig_wig, .avx },
 
+    .{ .vpextrb, .mri, &.{ .r32_m8, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x14 }, 0, .vex_128_w0, .avx },
+    .{ .vpextrd, .mri, &.{ .rm32, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .vex_128_w0, .avx },
+    .{ .vpextrq, .mri, &.{ .rm64, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .vex_128_w1, .avx },
+
     .{ .vpextrw, .rmi, &.{ .r32, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0xc5 }, 0, .vex_128_wig, .avx },
     .{ .vpextrw, .mri, &.{ .r32_m16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .vex_128_wig, .avx },
+
+    .{ .vpinsrb, .rmi, &.{ .xmm, .r32_m8, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x20 }, 0, .vex_128_w0, .avx },
+    .{ .vpinsrd, .rmi, &.{ .xmm, .rm32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x22 }, 0, .vex_128_w0, .avx },
+    .{ .vpinsrq, .rmi, &.{ .xmm, .rm64, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x22 }, 0, .vex_128_w1, .avx },
+
     .{ .vpinsrw, .rvmi, &.{ .xmm, .xmm, .r32_m16, .imm8 }, &.{ 0x66, 0x0f, 0xc4 }, 0, .vex_128_wig, .avx },
 
     .{ .vpsrlw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd1 }, 0, .vex_128_wig, .avx },
diff --git a/test/behavior/floatop.zig b/test/behavior/floatop.zig
index 3d46c267d3..242c8dabe5 100644
--- a/test/behavior/floatop.zig
+++ b/test/behavior/floatop.zig
@@ -8,6 +8,8 @@ const has_f80_rt = switch (builtin.cpu.arch) {
     .x86_64, .x86 => true,
     else => false,
 };
+const no_x86_64_hardware_f16_support = builtin.zig_backend == .stage2_x86_64 and
+    !std.Target.x86.featureSetHas(builtin.cpu.features, .f16c);
 
 const epsilon_16 = 0.001;
 const epsilon = 0.000001;
@@ -52,8 +54,7 @@ fn testFloatComparisons() !void {
 }
 
 test "different sized float comparisons" {
-    if (builtin.zig_backend == .stage2_x86_64 and
-        !comptime std.Target.x86.featureSetHas(builtin.cpu.features, .f16c)) return error.SkipZigTest; // TODO
+    if (no_x86_64_hardware_f16_support) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
@@ -152,7 +153,7 @@ fn testSqrtWithVectors() !void {
 }
 
 test "more @sqrt f16 tests" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
+    if (no_x86_64_hardware_f16_support) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
@@ -202,7 +203,7 @@ fn testSqrtLegacy(comptime T: type, x: T) !void {
 }
 
 test "@sin" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
+    if (no_x86_64_hardware_f16_support) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
@@ -241,7 +242,7 @@ fn testSinWithVectors() !void {
 }
 
 test "@cos" {
-    if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO
+    if (no_x86_64_hardware_f16_support) return error.SkipZigTest; // TODO
     if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO
     if
(builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO @@ -280,7 +281,7 @@ fn testCosWithVectors() !void { } test "@exp" { - if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO + if (no_x86_64_hardware_f16_support) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO @@ -318,7 +319,7 @@ fn testExpWithVectors() !void { } test "@exp2" { - if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO + if (no_x86_64_hardware_f16_support) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO @@ -403,7 +404,7 @@ test "@log with @vectors" { } test "@log2" { - if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO + if (no_x86_64_hardware_f16_support) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO @@ -445,7 +446,7 @@ fn testLog2WithVectors() !void { } test "@log10" { - if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO + if (no_x86_64_hardware_f16_support) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO @@ -881,7 +882,7 @@ fn testTruncLegacy(comptime T: type, x: T) !void { } test "negation f16" { - if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO + if (no_x86_64_hardware_f16_support) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO @@ -1040,7 +1041,6 @@ test "comptime_float zero divided by zero produces zero" { } test "nan negation f16" { - if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO diff --git a/test/behavior/muladd.zig b/test/behavior/muladd.zig index bfb94de270..199f117e7b 100644 --- a/test/behavior/muladd.zig +++ b/test/behavior/muladd.zig @@ -2,11 +2,11 @@ const std = @import("std"); const builtin = @import("builtin"); const expect = std.testing.expect; -const stage2_x86_64_without_hardware_fma_support = builtin.zig_backend == .stage2_x86_64 and +const no_x86_64_hardware_fma_support = builtin.zig_backend == .stage2_x86_64 and !std.Target.x86.featureSetHas(builtin.cpu.features, .fma); test "@mulAdd" { - if (stage2_x86_64_without_hardware_fma_support) return error.SkipZigTest; // TODO + if (no_x86_64_hardware_fma_support) return error.SkipZigTest; // TODO if 
(builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO @@ -120,7 +120,7 @@ fn vector32() !void { test "vector f32" { if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO - if (stage2_x86_64_without_hardware_fma_support) return error.SkipZigTest; // TODO + if (no_x86_64_hardware_fma_support) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO @@ -143,7 +143,7 @@ fn vector64() !void { test "vector f64" { if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO - if (stage2_x86_64_without_hardware_fma_support) return error.SkipZigTest; // TODO + if (no_x86_64_hardware_fma_support) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO -- cgit v1.2.3 From 6c14eb2863c7c00f809c5e447ceb8186b55f2eef Mon Sep 17 00:00:00 2001 From: Jacob Young Date: Mon, 8 May 2023 06:50:18 -0400 Subject: x86_64: optimize mir tag usage This moves all pseudo-instructions to a single `Mir.Inst.Tag` tag and prepares to start coalescing similar mnemonics. 239 tags left in use. --- src/arch/x86_64/CodeGen.zig | 403 +++++++++++++++--------- src/arch/x86_64/Emit.zig | 65 ++-- src/arch/x86_64/Encoding.zig | 2 +- src/arch/x86_64/Lower.zig | 713 +++++++++++++------------------------------ src/arch/x86_64/Mir.zig | 517 +++++++++++++++++++++---------- src/arch/x86_64/bits.zig | 3 - 6 files changed, 839 insertions(+), 864 deletions(-) (limited to 'src/arch/x86_64/Encoding.zig') diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig index 154b909a21..3ac05c95ac 100644 --- a/src/arch/x86_64/CodeGen.zig +++ b/src/arch/x86_64/CodeGen.zig @@ -973,14 +973,14 @@ fn addInst(self: *Self, inst: Mir.Inst) error{OutOfMemory}!Mir.Inst.Index { try self.mir_instructions.ensureUnusedCapacity(gpa, 1); const result_index = @intCast(Mir.Inst.Index, self.mir_instructions.len); self.mir_instructions.appendAssumeCapacity(inst); - switch (inst.tag) { - else => wip_mir_log.debug("{}", .{self.fmtWipMir(result_index)}), - .dbg_line, - .dbg_prologue_end, - .dbg_epilogue_begin, - .dead, - => {}, - } + if (inst.tag != .pseudo or switch (inst.ops) { + else => true, + .pseudo_dbg_prologue_end_none, + .pseudo_dbg_line_line_column, + .pseudo_dbg_epilogue_begin_none, + .pseudo_dead_none, + => false, + }) wip_mir_log.debug("{}", .{self.fmtWipMir(result_index)}); return result_index; } @@ -1003,35 +1003,57 @@ fn addExtraAssumeCapacity(self: *Self, extra: anytype) u32 { return result; } -fn asmSetccRegister(self: *Self, reg: Register, cc: bits.Condition) !void { +/// A `cc` of `.z_and_np` clobbers `reg2`! 
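+/// It is lowered to `cmovnz reg2, reg1` followed by `cmovnp reg1, reg2` (see
+/// `Lower.zig`), e.g. `asmCmovccRegisterRegister(.rax, .rcx, .z_and_np)`
+/// becomes `cmovnz rcx, rax; cmovnp rax, rcx`.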
+fn asmCmovccRegisterRegister(self: *Self, reg1: Register, reg2: Register, cc: bits.Condition) !void { _ = try self.addInst(.{ - .tag = .setcc, - .ops = .r_cc, - .data = .{ .r_cc = .{ - .r = reg, - .scratch = if (cc == .z_and_np or cc == .nz_or_p) - (try self.register_manager.allocReg(null, gp)).to8() - else - .none, - .cc = cc, + .tag = switch (cc) { + else => .cmov, + .z_and_np, .nz_or_p => .pseudo, + }, + .ops = switch (cc) { + else => .rr, + .z_and_np => .pseudo_cmov_z_and_np_rr, + .nz_or_p => .pseudo_cmov_nz_or_p_rr, + }, + .data = .{ .rr = .{ + .fixes = switch (cc) { + else => Mir.Inst.Fixes.fromCondition(cc), + .z_and_np, .nz_or_p => ._, + }, + .r1 = reg1, + .r2 = reg2, } }, }); } -fn asmSetccMemory(self: *Self, m: Memory, cc: bits.Condition) !void { +/// A `cc` of `.z_and_np` is not supported by this encoding! +fn asmCmovccRegisterMemory(self: *Self, reg: Register, m: Memory, cc: bits.Condition) !void { _ = try self.addInst(.{ - .tag = .setcc, - .ops = switch (m) { - .sib => .m_sib_cc, - .rip => .m_rip_cc, - else => unreachable, + .tag = switch (cc) { + else => .cmov, + .z_and_np => unreachable, + .nz_or_p => .pseudo, }, - .data = .{ .x_cc = .{ - .scratch = if (cc == .z_and_np or cc == .nz_or_p) - (try self.register_manager.allocReg(null, gp)).to8() - else - .none, - .cc = cc, + .ops = switch (cc) { + else => switch (m) { + .sib => .rm_sib, + .rip => .rm_rip, + else => unreachable, + }, + .z_and_np => unreachable, + .nz_or_p => switch (m) { + .sib => .pseudo_cmov_nz_or_p_rm_sib, + .rip => .pseudo_cmov_nz_or_p_rm_rip, + else => unreachable, + }, + }, + .data = .{ .rx = .{ + .fixes = switch (cc) { + else => Mir.Inst.Fixes.fromCondition(cc), + .z_and_np => unreachable, + .nz_or_p => ._, + }, + .r1 = reg, .payload = switch (m) { .sib => try self.addExtra(Mir.MemorySib.encode(m)), .rip => try self.addExtra(Mir.MemoryRip.encode(m)), @@ -1041,60 +1063,106 @@ fn asmSetccMemory(self: *Self, m: Memory, cc: bits.Condition) !void { }); } -/// A `cc` of `.z_and_np` clobbers `reg2`! 
-fn asmCmovccRegisterRegister(self: *Self, reg1: Register, reg2: Register, cc: bits.Condition) !void { +fn asmSetccRegister(self: *Self, reg: Register, cc: bits.Condition) !void { _ = try self.addInst(.{ - .tag = .cmovcc, - .ops = .rr_cc, - .data = .{ .rr_cc = .{ - .r1 = reg1, - .r2 = reg2, - .cc = cc, - } }, + .tag = switch (cc) { + else => .set, + .z_and_np, .nz_or_p => .pseudo, + }, + .ops = switch (cc) { + else => .r, + .z_and_np => .pseudo_set_z_and_np_r, + .nz_or_p => .pseudo_set_nz_or_p_r, + }, + .data = switch (cc) { + else => .{ .r = .{ + .fixes = Mir.Inst.Fixes.fromCondition(cc), + .r1 = reg, + } }, + .z_and_np, .nz_or_p => .{ .r_scratch = .{ + .r1 = reg, + .scratch_reg = (try self.register_manager.allocReg(null, gp)).to8(), + } }, + }, }); } -fn asmCmovccRegisterMemory(self: *Self, reg: Register, m: Memory, cc: bits.Condition) !void { - assert(cc != .z_and_np); // not supported +fn asmSetccMemory(self: *Self, m: Memory, cc: bits.Condition) !void { + const payload = switch (m) { + .sib => try self.addExtra(Mir.MemorySib.encode(m)), + .rip => try self.addExtra(Mir.MemoryRip.encode(m)), + else => unreachable, + }; _ = try self.addInst(.{ - .tag = .cmovcc, - .ops = switch (m) { - .sib => .rm_sib_cc, - .rip => .rm_rip_cc, - else => unreachable, + .tag = switch (cc) { + else => .set, + .z_and_np, .nz_or_p => .pseudo, }, - .data = .{ .rx_cc = .{ - .r = reg, - .cc = cc, - .payload = switch (m) { - .sib => try self.addExtra(Mir.MemorySib.encode(m)), - .rip => try self.addExtra(Mir.MemoryRip.encode(m)), + .ops = switch (cc) { + else => switch (m) { + .sib => .m_sib, + .rip => .m_rip, else => unreachable, }, - } }, + .z_and_np => switch (m) { + .sib => .pseudo_set_z_and_np_m_sib, + .rip => .pseudo_set_z_and_np_m_rip, + else => unreachable, + }, + .nz_or_p => switch (m) { + .sib => .pseudo_set_nz_or_p_m_sib, + .rip => .pseudo_set_nz_or_p_m_rip, + else => unreachable, + }, + }, + .data = switch (cc) { + else => .{ .x = .{ + .fixes = Mir.Inst.Fixes.fromCondition(cc), + .payload = payload, + } }, + .z_and_np, .nz_or_p => .{ .x_scratch = .{ + .scratch_reg = (try self.register_manager.allocReg(null, gp)).to8(), + .payload = payload, + } }, + }, }); } fn asmJmpReloc(self: *Self, target: Mir.Inst.Index) !Mir.Inst.Index { return self.addInst(.{ - .tag = .jmp_reloc, - .ops = undefined, - .data = .{ .inst = target }, + .tag = .jmp, + .ops = .inst, + .data = .{ .inst = .{ + .inst = target, + } }, }); } fn asmJccReloc(self: *Self, target: Mir.Inst.Index, cc: bits.Condition) !Mir.Inst.Index { return self.addInst(.{ - .tag = .jcc, - .ops = .inst_cc, - .data = .{ .inst_cc = .{ .inst = target, .cc = cc } }, + .tag = switch (cc) { + else => .j, + .z_and_np, .nz_or_p => .pseudo, + }, + .ops = switch (cc) { + else => .inst, + .z_and_np => .pseudo_j_z_and_np_inst, + .nz_or_p => .pseudo_j_nz_or_p_inst, + }, + .data = .{ .inst = .{ + .fixes = switch (cc) { + else => Mir.Inst.Fixes.fromCondition(cc), + .z_and_np, .nz_or_p => ._, + }, + .inst = target, + } }, }); } fn asmPlaceholder(self: *Self) !Mir.Inst.Index { return self.addInst(.{ - .tag = .dead, - .ops = undefined, + .tag = .pseudo, + .ops = .pseudo_dead_none, .data = undefined, }); } @@ -1107,11 +1175,19 @@ fn asmOpOnly(self: *Self, tag: Mir.Inst.Tag) !void { }); } +fn asmPseudo(self: *Self, ops: Mir.Inst.Ops) !void { + _ = try self.addInst(.{ + .tag = .pseudo, + .ops = ops, + .data = undefined, + }); +} + fn asmRegister(self: *Self, tag: Mir.Inst.Tag, reg: Register) !void { _ = try self.addInst(.{ .tag = tag, .ops = .r, - .data = .{ .r = reg }, + .data = 
.{ .r = .{ .r1 = reg } }, }); } @@ -1122,9 +1198,11 @@ fn asmImmediate(self: *Self, tag: Mir.Inst.Tag, imm: Immediate) !void { .signed => .i_s, .unsigned => .i_u, }, - .data = .{ .i = switch (imm) { - .signed => |s| @bitCast(u32, s), - .unsigned => |u| @intCast(u32, u), + .data = .{ .i = .{ + .i = switch (imm) { + .signed => |s| @bitCast(u32, s), + .unsigned => |u| @intCast(u32, u), + }, } }, }); } @@ -1147,14 +1225,14 @@ fn asmRegisterImmediate(self: *Self, tag: Mir.Inst.Tag, reg: Register, imm: Imme .ops = ops, .data = switch (ops) { .ri_s, .ri_u => .{ .ri = .{ - .r = reg, + .r1 = reg, .i = switch (imm) { .signed => |s| @bitCast(u32, s), .unsigned => |u| @intCast(u32, u), }, } }, .ri64 => .{ .rx = .{ - .r = reg, + .r1 = reg, .payload = try self.addExtra(Mir.Imm64.encode(imm.unsigned)), } }, else => unreachable, @@ -1249,10 +1327,12 @@ fn asmMemory(self: *Self, tag: Mir.Inst.Tag, m: Memory) !void { .rip => .m_rip, else => unreachable, }, - .data = .{ .payload = switch (m) { - .sib => try self.addExtra(Mir.MemorySib.encode(m)), - .rip => try self.addExtra(Mir.MemoryRip.encode(m)), - else => unreachable, + .data = .{ .x = .{ + .payload = switch (m) { + .sib => try self.addExtra(Mir.MemorySib.encode(m)), + .rip => try self.addExtra(Mir.MemoryRip.encode(m)), + else => unreachable, + }, } }, }); } @@ -1266,7 +1346,7 @@ fn asmRegisterMemory(self: *Self, tag: Mir.Inst.Tag, reg: Register, m: Memory) ! else => unreachable, }, .data = .{ .rx = .{ - .r = reg, + .r1 = reg, .payload = switch (m) { .sib => try self.addExtra(Mir.MemorySib.encode(m)), .rip => try self.addExtra(Mir.MemoryRip.encode(m)), @@ -1291,7 +1371,7 @@ fn asmRegisterMemoryImmediate( else => unreachable, }, .data = .{ .rix = .{ - .r = reg, + .r1 = reg, .i = @intCast(u8, imm.unsigned), .payload = switch (m) { .sib => try self.addExtra(Mir.MemorySib.encode(m)), @@ -1339,7 +1419,7 @@ fn asmMemoryRegister(self: *Self, tag: Mir.Inst.Tag, m: Memory, reg: Register) ! else => unreachable, }, .data = .{ .rx = .{ - .r = reg, + .r1 = reg, .payload = switch (m) { .sib => try self.addExtra(Mir.MemorySib.encode(m)), .rip => try self.addExtra(Mir.MemoryRip.encode(m)), @@ -1413,11 +1493,15 @@ fn asmMemoryRegisterImmediate( .rip => .mri_rip, else => unreachable, }, - .data = .{ .rix = .{ .r = reg, .i = @intCast(u8, imm.unsigned), .payload = switch (m) { - .sib => try self.addExtra(Mir.MemorySib.encode(m)), - .rip => try self.addExtra(Mir.MemoryRip.encode(m)), - else => unreachable, - } } }, + .data = .{ .rix = .{ + .r1 = reg, + .i = @intCast(u8, imm.unsigned), + .payload = switch (m) { + .sib => try self.addExtra(Mir.MemorySib.encode(m)), + .rip => try self.addExtra(Mir.MemoryRip.encode(m)), + else => unreachable, + }, + } }, }); } @@ -1450,7 +1534,7 @@ fn gen(self: *Self) InnerError!void { else => unreachable, } - try self.asmOpOnly(.dbg_prologue_end); + try self.asmPseudo(.pseudo_dbg_prologue_end_none); try self.genBody(self.air.getMainBody()); @@ -1462,11 +1546,11 @@ fn gen(self: *Self) InnerError!void { // } // Eliding the reloc will cause a miscompilation in this case. 
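+        // Point each pending exitlude jump at the epilogue that is emitted next.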
for (self.exitlude_jump_relocs.items) |jmp_reloc| { - self.mir_instructions.items(.data)[jmp_reloc].inst = + self.mir_instructions.items(.data)[jmp_reloc].inst.inst = @intCast(u32, self.mir_instructions.len); } - try self.asmOpOnly(.dbg_epilogue_begin); + try self.asmPseudo(.pseudo_dbg_epilogue_begin_none); const backpatch_stack_dealloc = try self.asmPlaceholder(); const backpatch_pop_callee_preserved_regs = try self.asmPlaceholder(); try self.asmRegister(.pop, .rbp); @@ -1480,46 +1564,54 @@ fn gen(self: *Self) InnerError!void { self.mir_instructions.set(backpatch_frame_align, .{ .tag = .@"and", .ops = .ri_s, - .data = .{ .ri = .{ .r = .rsp, .i = frame_layout.stack_mask } }, + .data = .{ .ri = .{ + .r1 = .rsp, + .i = frame_layout.stack_mask, + } }, }); } if (need_stack_adjust) { self.mir_instructions.set(backpatch_stack_alloc, .{ .tag = .sub, .ops = .ri_s, - .data = .{ .ri = .{ .r = .rsp, .i = frame_layout.stack_adjust } }, + .data = .{ .ri = .{ + .r1 = .rsp, + .i = frame_layout.stack_adjust, + } }, }); } if (need_frame_align or need_stack_adjust) { self.mir_instructions.set(backpatch_stack_dealloc, .{ .tag = .mov, .ops = .rr, - .data = .{ .rr = .{ .r1 = .rsp, .r2 = .rbp } }, + .data = .{ .rr = .{ + .r1 = .rsp, + .r2 = .rbp, + } }, }); } if (need_save_reg) { - const save_reg_list = frame_layout.save_reg_list.asInt(); self.mir_instructions.set(backpatch_push_callee_preserved_regs, .{ - .tag = .push_regs, - .ops = undefined, - .data = .{ .payload = save_reg_list }, + .tag = .pseudo, + .ops = .pseudo_push_reg_list, + .data = .{ .reg_list = frame_layout.save_reg_list }, }); self.mir_instructions.set(backpatch_pop_callee_preserved_regs, .{ - .tag = .pop_regs, - .ops = undefined, - .data = .{ .payload = save_reg_list }, + .tag = .pseudo, + .ops = .pseudo_pop_reg_list, + .data = .{ .reg_list = frame_layout.save_reg_list }, }); } } else { - try self.asmOpOnly(.dbg_prologue_end); + try self.asmPseudo(.pseudo_dbg_prologue_end_none); try self.genBody(self.air.getMainBody()); - try self.asmOpOnly(.dbg_epilogue_begin); + try self.asmPseudo(.pseudo_dbg_epilogue_begin_none); } // Drop them off at the rbrace. 
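+    // (Emit.zig lowers this pseudo instruction to a debug line-number entry.)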
_ = try self.addInst(.{ - .tag = .dbg_line, - .ops = undefined, + .tag = .pseudo, + .ops = .pseudo_dbg_line_line_column, .data = .{ .line_column = .{ .line = self.end_di_line, .column = self.end_di_column, @@ -2446,11 +2538,11 @@ fn airIntCast(self: *Self, inst: Air.Inst.Index) !void { .register => |dst_reg| { const min_abi_size = @min(dst_abi_size, src_abi_size); const tag: Mir.Inst.Tag = switch (signedness) { - .signed => .movsx, - .unsigned => if (min_abi_size > 2) .mov else .movzx, + .signed => if (min_abi_size >= 4) .movsxd else .movsx, + .unsigned => if (min_abi_size >= 4) .mov else .movzx, }; const dst_alias = switch (tag) { - .movsx => dst_reg.to64(), + .movsx, .movsxd => dst_reg.to64(), .mov, .movzx => if (min_abi_size > 4) dst_reg.to64() else dst_reg.to32(), else => unreachable, }; @@ -5247,7 +5339,7 @@ fn airStructFieldVal(self: *Self, inst: Air.Inst.Index) !void { const field_byte_size = @intCast(u32, field_ty.abiSize(self.target.*)); if (signedness == .signed and field_byte_size < 8) { try self.asmRegisterRegister( - .movsx, + if (field_byte_size >= 4) .movsxd else .movsx, dst_mcv.register, registerAlias(dst_mcv.register, field_byte_size), ); @@ -7194,10 +7286,10 @@ fn airCall(self: *Self, inst: Air.Inst.Index, modifier: std.builtin.CallModifier const atom_index = try self.owner.getSymbolIndex(self); const sym_index = try coff_file.getGlobalSymbol(decl_name, lib_name); _ = try self.addInst(.{ - .tag = .mov_linker, + .tag = .mov, .ops = .import_reloc, .data = .{ .rx = .{ - .r = .rax, + .r1 = .rax, .payload = try self.addExtra(Mir.Reloc{ .atom_index = atom_index, .sym_index = sym_index, @@ -7209,9 +7301,9 @@ fn airCall(self: *Self, inst: Air.Inst.Index, modifier: std.builtin.CallModifier const atom_index = try self.owner.getSymbolIndex(self); const sym_index = try macho_file.getGlobalSymbol(decl_name, lib_name); _ = try self.addInst(.{ - .tag = .call_extern, - .ops = undefined, - .data = .{ .relocation = .{ + .tag = .call, + .ops = .extern_fn_reloc, + .data = .{ .reloc = .{ .atom_index = atom_index, .sym_index = sym_index, } }, @@ -7489,8 +7581,8 @@ fn genTry( fn airDbgStmt(self: *Self, inst: Air.Inst.Index) !void { const dbg_stmt = self.air.instructions.items(.data)[inst].dbg_stmt; _ = try self.addInst(.{ - .tag = .dbg_line, - .ops = undefined, + .tag = .pseudo, + .ops = .pseudo_dbg_line_line_column, .data = .{ .line_column = .{ .line = dbg_stmt.line, .column = dbg_stmt.column, @@ -8021,14 +8113,14 @@ fn airSwitchBr(self: *Self, inst: Air.Inst.Index) !void { fn performReloc(self: *Self, reloc: Mir.Inst.Index) !void { const next_inst = @intCast(u32, self.mir_instructions.len); switch (self.mir_instructions.items(.tag)[reloc]) { - .jcc => { - self.mir_instructions.items(.data)[reloc].inst_cc.inst = next_inst; - }, - .jmp_reloc => { - self.mir_instructions.items(.data)[reloc].inst = next_inst; + .j, .jmp => {}, + .pseudo => switch (self.mir_instructions.items(.ops)[reloc]) { + .pseudo_j_z_and_np_inst, .pseudo_j_nz_or_p_inst => {}, + else => unreachable, }, else => unreachable, } + self.mir_instructions.items(.data)[reloc].inst.inst = next_inst; } fn airBr(self: *Self, inst: Air.Inst.Index) !void { @@ -8577,10 +8669,10 @@ fn genSetReg(self: *Self, dst_reg: Register, ty: Type, src_mcv: MCValue) InnerEr .load_direct => |sym_index| if (!ty.isRuntimeFloat()) { const atom_index = try self.owner.getSymbolIndex(self); _ = try self.addInst(.{ - .tag = .mov_linker, + .tag = .mov, .ops = .direct_reloc, .data = .{ .rx = .{ - .r = dst_reg.to64(), + .r1 = dst_reg.to64(), .payload = try 
self.addExtra(Mir.Reloc{ .atom_index = atom_index, .sym_index = sym_index, @@ -8618,8 +8710,8 @@ fn genSetReg(self: *Self, dst_reg: Register, ty: Type, src_mcv: MCValue) InnerEr const atom_index = try self.owner.getSymbolIndex(self); _ = try self.addInst(.{ .tag = switch (src_mcv) { - .lea_direct => .lea_linker, - .lea_got => .mov_linker, + .lea_direct => .lea, + .lea_got => .mov, else => unreachable, }, .ops = switch (src_mcv) { @@ -8628,7 +8720,7 @@ fn genSetReg(self: *Self, dst_reg: Register, ty: Type, src_mcv: MCValue) InnerEr else => unreachable, }, .data = .{ .rx = .{ - .r = dst_reg.to64(), + .r1 = dst_reg.to64(), .payload = try self.addExtra(Mir.Reloc{ .atom_index = atom_index, .sym_index = sym_index, @@ -8640,10 +8732,10 @@ fn genSetReg(self: *Self, dst_reg: Register, ty: Type, src_mcv: MCValue) InnerEr const atom_index = try self.owner.getSymbolIndex(self); if (self.bin_file.cast(link.File.MachO)) |_| { _ = try self.addInst(.{ - .tag = .lea_linker, + .tag = .lea, .ops = .tlv_reloc, .data = .{ .rx = .{ - .r = .rdi, + .r1 = .rdi, .payload = try self.addExtra(Mir.Reloc{ .atom_index = atom_index, .sym_index = sym_index, @@ -8847,9 +8939,9 @@ fn genInlineMemcpy(self: *Self, dst_ptr: MCValue, src_ptr: MCValue, len: MCValue try self.genSetReg(.rsi, Type.usize, src_ptr); try self.genSetReg(.rcx, Type.usize, len); _ = try self.addInst(.{ - .tag = .movs, - .ops = .string, - .data = .{ .string = .{ .repeat = .rep, .width = .b } }, + .tag = .mov, + .ops = .none, + .data = .{ .none = .{ .fixes = .@"rep _sb" } }, }); } @@ -8859,9 +8951,9 @@ fn genInlineMemset(self: *Self, dst_ptr: MCValue, value: MCValue, len: MCValue) try self.genSetReg(.al, Type.u8, value); try self.genSetReg(.rcx, Type.usize, len); _ = try self.addInst(.{ - .tag = .stos, - .ops = .string, - .data = .{ .string = .{ .repeat = .rep, .width = .b } }, + .tag = .sto, + .ops = .none, + .data = .{ .none = .{ .fixes = .@"rep _sb" } }, }); } @@ -9135,22 +9227,22 @@ fn airCmpxchg(self: *Self, inst: Air.Inst.Index) !void { defer if (ptr_lock) |lock| self.register_manager.unlockReg(lock); try self.spillEflagsIfOccupied(); - if (val_abi_size <= 8) { - _ = try self.addInst(.{ - .tag = .cmpxchg, - .ops = .lock_mr_sib, - .data = .{ .rx = .{ - .r = registerAlias(new_reg.?, val_abi_size), - .payload = try self.addExtra(Mir.MemorySib.encode(ptr_mem)), - } }, - }); - } else { - _ = try self.addInst(.{ - .tag = .cmpxchgb, - .ops = .lock_m_sib, - .data = .{ .payload = try self.addExtra(Mir.MemorySib.encode(ptr_mem)) }, - }); - } + _ = try self.addInst(if (val_abi_size <= 8) .{ + .tag = .cmpxchg, + .ops = .mr_sib, + .data = .{ .rx = .{ + .fixes = .@"lock _", + .r1 = registerAlias(new_reg.?, val_abi_size), + .payload = try self.addExtra(Mir.MemorySib.encode(ptr_mem)), + } }, + } else .{ + .tag = .cmpxchg, + .ops = .m_sib, + .data = .{ .x = .{ + .fixes = .@"lock _16b", + .payload = try self.addExtra(Mir.MemorySib.encode(ptr_mem)), + } }, + }); const result: MCValue = result: { if (self.liveness.isUnused(inst)) break :result .unreach; @@ -9252,13 +9344,14 @@ fn atomicOp( } _ = try self.addInst(.{ .tag = tag, - .ops = switch (tag) { - .mov, .xchg => .mr_sib, - .xadd, .add, .sub, .@"and", .@"or", .xor => .lock_mr_sib, - else => unreachable, - }, + .ops = .mr_sib, .data = .{ .rx = .{ - .r = registerAlias(dst_reg, val_abi_size), + .fixes = switch (tag) { + .mov, .xchg => ._, + .xadd, .add, .sub, .@"and", .@"or", .xor => .@"lock _", + else => unreachable, + }, + .r1 = registerAlias(dst_reg, val_abi_size), .payload = try 
self.addExtra(Mir.MemorySib.encode(ptr_mem)), } }, }); @@ -9330,9 +9423,10 @@ fn atomicOp( }; _ = try self.addInst(.{ .tag = .cmpxchg, - .ops = .lock_mr_sib, + .ops = .mr_sib, .data = .{ .rx = .{ - .r = registerAlias(tmp_reg, val_abi_size), + .fixes = .@"lock _", + .r1 = registerAlias(tmp_reg, val_abi_size), .payload = try self.addExtra(Mir.MemorySib.encode(ptr_mem)), } }, }); @@ -9397,9 +9491,14 @@ fn atomicOp( val_ty.fmt(self.bin_file.options.module.?), @tagName(op), }), }; - _ = try self.addInst(.{ .tag = .cmpxchgb, .ops = .lock_m_sib, .data = .{ - .payload = try self.addExtra(Mir.MemorySib.encode(ptr_mem)), - } }); + _ = try self.addInst(.{ + .tag = .cmpxchg, + .ops = .m_sib, + .data = .{ .x = .{ + .fixes = .@"lock _16b", + .payload = try self.addExtra(Mir.MemorySib.encode(ptr_mem)), + } }, + }); _ = try self.asmJccReloc(loop, .ne); if (unused) return .unreach; diff --git a/src/arch/x86_64/Emit.zig b/src/arch/x86_64/Emit.zig index 3574d52878..506092ff17 100644 --- a/src/arch/x86_64/Emit.zig +++ b/src/arch/x86_64/Emit.zig @@ -41,7 +41,7 @@ pub fn emitMir(emit: *Emit) Error!void { .offset = end_offset - 4, .length = @intCast(u5, end_offset - start_offset), }), - .@"extern" => |symbol| if (emit.bin_file.cast(link.File.MachO)) |macho_file| { + .linker_extern_fn => |symbol| if (emit.bin_file.cast(link.File.MachO)) |macho_file| { // Add relocation to the decl. const atom_index = macho_file.getAtomIndexForSymbol( .{ .sym_index = symbol.atom_index, .file = null }, @@ -129,36 +129,39 @@ pub fn emitMir(emit: *Emit) Error!void { const mir_inst = emit.lower.mir.instructions.get(mir_index); switch (mir_inst.tag) { else => unreachable, - .dead => {}, - .dbg_line => try emit.dbgAdvancePCAndLine( - mir_inst.data.line_column.line, - mir_inst.data.line_column.column, - ), - .dbg_prologue_end => { - switch (emit.debug_output) { - .dwarf => |dw| { - try dw.setPrologueEnd(); - log.debug("mirDbgPrologueEnd (line={d}, col={d})", .{ - emit.prev_di_line, emit.prev_di_column, - }); - try emit.dbgAdvancePCAndLine(emit.prev_di_line, emit.prev_di_column); - }, - .plan9 => {}, - .none => {}, - } - }, - .dbg_epilogue_begin => { - switch (emit.debug_output) { - .dwarf => |dw| { - try dw.setEpilogueBegin(); - log.debug("mirDbgEpilogueBegin (line={d}, col={d})", .{ - emit.prev_di_line, emit.prev_di_column, - }); - try emit.dbgAdvancePCAndLine(emit.prev_di_line, emit.prev_di_column); - }, - .plan9 => {}, - .none => {}, - } + .pseudo => switch (mir_inst.ops) { + else => unreachable, + .pseudo_dbg_prologue_end_none => { + switch (emit.debug_output) { + .dwarf => |dw| { + try dw.setPrologueEnd(); + log.debug("mirDbgPrologueEnd (line={d}, col={d})", .{ + emit.prev_di_line, emit.prev_di_column, + }); + try emit.dbgAdvancePCAndLine(emit.prev_di_line, emit.prev_di_column); + }, + .plan9 => {}, + .none => {}, + } + }, + .pseudo_dbg_line_line_column => try emit.dbgAdvancePCAndLine( + mir_inst.data.line_column.line, + mir_inst.data.line_column.column, + ), + .pseudo_dbg_epilogue_begin_none => { + switch (emit.debug_output) { + .dwarf => |dw| { + try dw.setEpilogueBegin(); + log.debug("mirDbgEpilogueBegin (line={d}, col={d})", .{ + emit.prev_di_line, emit.prev_di_column, + }); + try emit.dbgAdvancePCAndLine(emit.prev_di_line, emit.prev_di_column); + }, + .plan9 => {}, + .none => {}, + } + }, + .pseudo_dead_none => {}, }, } } diff --git a/src/arch/x86_64/Encoding.zig b/src/arch/x86_64/Encoding.zig index 3235b29358..b6b49e8939 100644 --- a/src/arch/x86_64/Encoding.zig +++ b/src/arch/x86_64/Encoding.zig @@ -705,7 +705,7 @@ fn 
estimateInstructionLength(prefix: Prefix, encoding: Encoding, ops: []const Op } const mnemonic_to_encodings_map = init: { - @setEvalBranchQuota(100_000); + @setEvalBranchQuota(20_000); const encodings = @import("encodings.zig"); var entries = encodings.table; std.sort.sort(encodings.Entry, &entries, {}, struct { diff --git a/src/arch/x86_64/Lower.zig b/src/arch/x86_64/Lower.zig index 5c079f4768..2d7fa4b4fd 100644 --- a/src/arch/x86_64/Lower.zig +++ b/src/arch/x86_64/Lower.zig @@ -35,7 +35,7 @@ pub const Reloc = struct { const Target = union(enum) { inst: Mir.Inst.Index, - @"extern": Mir.Reloc, + linker_extern_fn: Mir.Reloc, linker_got: Mir.Reloc, linker_direct: Mir.Reloc, linker_import: Mir.Reloc, @@ -59,280 +59,119 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct { const inst = lower.mir.instructions.get(index); switch (inst.tag) { - .adc, - .add, - .@"and", - .bsf, - .bsr, - .bswap, - .bt, - .btc, - .btr, - .bts, - .call, - .cbw, - .cwde, - .cdqe, - .cwd, - .cdq, - .cqo, - .cmp, - .cmpxchg, - .div, - .fisttp, - .fld, - .idiv, - .imul, - .int3, - .jmp, - .lea, - .lfence, - .lzcnt, - .mfence, - .mov, - .movbe, - .movd, - .movq, - .movzx, - .mul, - .neg, - .nop, - .not, - .@"or", - .pop, - .popcnt, - .push, - .rcl, - .rcr, - .ret, - .rol, - .ror, - .sal, - .sar, - .sbb, - .sfence, - .shl, - .shld, - .shr, - .shrd, - .sub, - .syscall, - .@"test", - .tzcnt, - .ud2, - .xadd, - .xchg, - .xor, - - .addps, - .addss, - .andnps, - .andps, - .cmpss, - .cvtsi2ss, - .divps, - .divss, - .maxps, - .maxss, - .minps, - .minss, - .movaps, - .movhlps, - .movss, - .movups, - .mulps, - .mulss, - .orps, - .pextrw, - .pinsrw, - .sqrtps, - .sqrtss, - .subps, - .subss, - .ucomiss, - .unpckhps, - .unpcklps, - .xorps, - - .addpd, - .addsd, - .andnpd, - .andpd, - .cmpsd, - .cvtsd2ss, - .cvtsi2sd, - .cvtss2sd, - .divpd, - .divsd, - .maxpd, - .maxsd, - .minpd, - .minsd, - .movsd, - .mulpd, - .mulsd, - .orpd, - .pshufhw, - .pshuflw, - .psrld, - .psrlq, - .psrlw, - .punpckhbw, - .punpckhdq, - .punpckhqdq, - .punpckhwd, - .punpcklbw, - .punpckldq, - .punpcklqdq, - .punpcklwd, - .sqrtpd, - .sqrtsd, - .subpd, - .subsd, - .ucomisd, - .unpckhpd, - .unpcklpd, - .xorpd, - - .movddup, - .movshdup, - .movsldup, - - .pextrb, - .pextrd, - .pextrq, - .pinsrb, - .pinsrd, - .pinsrq, - .roundpd, - .roundps, - .roundsd, - .roundss, - - .vaddpd, - .vaddps, - .vaddsd, - .vaddss, - .vcvtsd2ss, - .vcvtsi2sd, - .vcvtsi2ss, - .vcvtss2sd, - .vdivpd, - .vdivps, - .vdivsd, - .vdivss, - .vmaxpd, - .vmaxps, - .vmaxsd, - .vmaxss, - .vminpd, - .vminps, - .vminsd, - .vminss, - .vmovapd, - .vmovaps, - .vmovddup, - .vmovhlps, - .vmovsd, - .vmovshdup, - .vmovsldup, - .vmovss, - .vmovupd, - .vmovups, - .vmulpd, - .vmulps, - .vmulsd, - .vmulss, - .vpextrb, - .vpextrd, - .vpextrq, - .vpextrw, - .vpinsrb, - .vpinsrd, - .vpinsrq, - .vpinsrw, - .vpshufhw, - .vpshuflw, - .vpsrld, - .vpsrlq, - .vpsrlw, - .vpunpckhbw, - .vpunpckhdq, - .vpunpckhqdq, - .vpunpckhwd, - .vpunpcklbw, - .vpunpckldq, - .vpunpcklqdq, - .vpunpcklwd, - .vroundpd, - .vroundps, - .vroundsd, - .vroundss, - .vsqrtpd, - .vsqrtps, - .vsqrtsd, - .vsqrtss, - .vsubpd, - .vsubps, - .vsubsd, - .vsubss, - .vunpckhpd, - .vunpckhps, - .vunpcklpd, - .vunpcklps, - - .vcvtph2ps, - .vcvtps2ph, - - .vfmadd132pd, - .vfmadd213pd, - .vfmadd231pd, - .vfmadd132ps, - .vfmadd213ps, - .vfmadd231ps, - .vfmadd132sd, - .vfmadd213sd, - .vfmadd231sd, - .vfmadd132ss, - .vfmadd213ss, - .vfmadd231ss, - => try lower.mirGeneric(inst), - - .cmps, - .lods, - .movs, - .scas, - .stos, - => try 
lower.mirString(inst), - - .cmpxchgb => try lower.mirCmpxchgBytes(inst), - - .jmp_reloc => try lower.emitInstWithReloc(.none, .jmp, &.{ - .{ .imm = Immediate.s(0) }, - }, .{ .inst = inst.data.inst }), - - .call_extern => try lower.emitInstWithReloc(.none, .call, &.{ - .{ .imm = Immediate.s(0) }, - }, .{ .@"extern" = inst.data.relocation }), - - .lea_linker => try lower.mirLinker(.lea, inst), - .mov_linker => try lower.mirLinker(.mov, inst), - - .mov_moffs => try lower.mirMovMoffs(inst), - - .movsx => try lower.mirMovsx(inst), - .cmovcc => try lower.mirCmovcc(inst), - .setcc => try lower.mirSetcc(inst), - .jcc => try lower.mirJcc(index, inst), + else => try lower.generic(inst), + .pseudo => switch (inst.ops) { + .pseudo_cmov_z_and_np_rr => { + try lower.emit(.none, .cmovnz, &.{ + .{ .reg = inst.data.rr.r2 }, + .{ .reg = inst.data.rr.r1 }, + }); + try lower.emit(.none, .cmovnp, &.{ + .{ .reg = inst.data.rr.r1 }, + .{ .reg = inst.data.rr.r2 }, + }); + }, + .pseudo_cmov_nz_or_p_rr => { + try lower.emit(.none, .cmovnz, &.{ + .{ .reg = inst.data.rr.r1 }, + .{ .reg = inst.data.rr.r2 }, + }); + try lower.emit(.none, .cmovp, &.{ + .{ .reg = inst.data.rr.r1 }, + .{ .reg = inst.data.rr.r2 }, + }); + }, + .pseudo_cmov_nz_or_p_rm_sib, + .pseudo_cmov_nz_or_p_rm_rip, + => { + try lower.emit(.none, .cmovnz, &.{ + .{ .reg = inst.data.rx.r1 }, + .{ .mem = lower.mem(inst.ops, inst.data.rx.payload) }, + }); + try lower.emit(.none, .cmovp, &.{ + .{ .reg = inst.data.rx.r1 }, + .{ .mem = lower.mem(inst.ops, inst.data.rx.payload) }, + }); + }, + .pseudo_set_z_and_np_r => { + try lower.emit(.none, .setz, &.{ + .{ .reg = inst.data.r_scratch.r1 }, + }); + try lower.emit(.none, .setnp, &.{ + .{ .reg = inst.data.r_scratch.scratch_reg }, + }); + try lower.emit(.none, .@"and", &.{ + .{ .reg = inst.data.r_scratch.r1 }, + .{ .reg = inst.data.r_scratch.scratch_reg }, + }); + }, + .pseudo_set_z_and_np_m_sib, + .pseudo_set_z_and_np_m_rip, + => { + try lower.emit(.none, .setz, &.{ + .{ .mem = lower.mem(inst.ops, inst.data.x_scratch.payload) }, + }); + try lower.emit(.none, .setnp, &.{ + .{ .reg = inst.data.x_scratch.scratch_reg }, + }); + try lower.emit(.none, .@"and", &.{ + .{ .mem = lower.mem(inst.ops, inst.data.x_scratch.payload) }, + .{ .reg = inst.data.x_scratch.scratch_reg }, + }); + }, + .pseudo_set_nz_or_p_r => { + try lower.emit(.none, .setnz, &.{ + .{ .reg = inst.data.r_scratch.r1 }, + }); + try lower.emit(.none, .setp, &.{ + .{ .reg = inst.data.r_scratch.scratch_reg }, + }); + try lower.emit(.none, .@"or", &.{ + .{ .reg = inst.data.r_scratch.r1 }, + .{ .reg = inst.data.r_scratch.scratch_reg }, + }); + }, + .pseudo_set_nz_or_p_m_sib, + .pseudo_set_nz_or_p_m_rip, + => { + try lower.emit(.none, .setnz, &.{ + .{ .mem = lower.mem(inst.ops, inst.data.x_scratch.payload) }, + }); + try lower.emit(.none, .setp, &.{ + .{ .reg = inst.data.x_scratch.scratch_reg }, + }); + try lower.emit(.none, .@"or", &.{ + .{ .mem = lower.mem(inst.ops, inst.data.x_scratch.payload) }, + .{ .reg = inst.data.x_scratch.scratch_reg }, + }); + }, + .pseudo_j_z_and_np_inst => { + try lower.emit(.none, .jnz, &.{ + .{ .imm = lower.reloc(.{ .inst = index + 1 }) }, + }); + try lower.emit(.none, .jnp, &.{ + .{ .imm = lower.reloc(.{ .inst = inst.data.inst.inst }) }, + }); + }, + .pseudo_j_nz_or_p_inst => { + try lower.emit(.none, .jnz, &.{ + .{ .imm = lower.reloc(.{ .inst = inst.data.inst.inst }) }, + }); + try lower.emit(.none, .jp, &.{ + .{ .imm = lower.reloc(.{ .inst = inst.data.inst.inst }) }, + }); + }, - .push_regs => try 
lower.mirRegisterList(.push, inst), - .pop_regs => try lower.mirRegisterList(.pop, inst), + .pseudo_push_reg_list => try lower.pushPopRegList(.push, inst), + .pseudo_pop_reg_list => try lower.pushPopRegList(.pop, inst), - .dbg_line, - .dbg_prologue_end, - .dbg_epilogue_begin, - .dead, - => {}, + .pseudo_dbg_prologue_end_none, + .pseudo_dbg_line_line_column, + .pseudo_dbg_epilogue_begin_none, + .pseudo_dead_none, + => {}, + else => unreachable, + }, } return .{ @@ -348,15 +187,6 @@ pub fn fail(lower: *Lower, comptime format: []const u8, args: anytype) Error { return error.LowerFail; } -fn mnem_cc(comptime base: @Type(.EnumLiteral), cc: bits.Condition) Mnemonic { - return switch (cc) { - inline else => |c| if (@hasField(Mnemonic, @tagName(base) ++ @tagName(c))) - @field(Mnemonic, @tagName(base) ++ @tagName(c)) - else - unreachable, - }; -} - fn imm(lower: Lower, ops: Mir.Inst.Ops, i: u32) Immediate { return switch (ops) { .rri_s, @@ -364,8 +194,6 @@ fn imm(lower: Lower, ops: Mir.Inst.Ops, i: u32) Immediate { .i_s, .mi_sib_s, .mi_rip_s, - .lock_mi_sib_s, - .lock_mi_rip_s, => Immediate.s(@bitCast(i32, i)), .rrri, @@ -374,8 +202,6 @@ fn imm(lower: Lower, ops: Mir.Inst.Ops, i: u32) Immediate { .i_u, .mi_sib_u, .mi_rip_u, - .lock_mi_sib_u, - .lock_mi_rip_u, .rmi_sib, .rmi_rip, .mri_sib, @@ -395,10 +221,8 @@ fn imm(lower: Lower, ops: Mir.Inst.Ops, i: u32) Immediate { fn mem(lower: Lower, ops: Mir.Inst.Ops, payload: u32) Memory { return lower.mir.resolveFrameLoc(switch (ops) { .rm_sib, - .rm_sib_cc, .rmi_sib, .m_sib, - .m_sib_cc, .mi_sib_u, .mi_sib_s, .mr_sib, @@ -406,17 +230,15 @@ fn mem(lower: Lower, ops: Mir.Inst.Ops, payload: u32) Memory { .mri_sib, .rrm_sib, .rrmi_sib, - .lock_m_sib, - .lock_mi_sib_u, - .lock_mi_sib_s, - .lock_mr_sib, + + .pseudo_cmov_nz_or_p_rm_sib, + .pseudo_set_z_and_np_m_sib, + .pseudo_set_nz_or_p_m_sib, => lower.mir.extraData(Mir.MemorySib, payload).data.decode(), .rm_rip, - .rm_rip_cc, .rmi_rip, .m_rip, - .m_rip_cc, .mi_rip_u, .mi_rip_s, .mr_rip, @@ -424,66 +246,83 @@ fn mem(lower: Lower, ops: Mir.Inst.Ops, payload: u32) Memory { .mri_rip, .rrm_rip, .rrmi_rip, - .lock_m_rip, - .lock_mi_rip_u, - .lock_mi_rip_s, - .lock_mr_rip, + + .pseudo_cmov_nz_or_p_rm_rip, + .pseudo_set_z_and_np_m_rip, + .pseudo_set_nz_or_p_m_rip, => lower.mir.extraData(Mir.MemoryRip, payload).data.decode(), .rax_moffs, .moffs_rax, - .lock_moffs_rax, => lower.mir.extraData(Mir.MemoryMoffs, payload).data.decode(), else => unreachable, }); } -fn emitInst(lower: *Lower, prefix: Prefix, mnemonic: Mnemonic, ops: []const Operand) Error!void { - lower.result_insts[lower.result_insts_len] = try Instruction.new(prefix, mnemonic, ops); - lower.result_insts_len += 1; -} - -fn emitInstWithReloc( - lower: *Lower, - prefix: Prefix, - mnemonic: Mnemonic, - ops: []const Operand, - target: Reloc.Target, -) Error!void { +fn reloc(lower: *Lower, target: Reloc.Target) Immediate { lower.result_relocs[lower.result_relocs_len] = .{ .lowered_inst_index = lower.result_insts_len, .target = target, }; lower.result_relocs_len += 1; - try lower.emitInst(prefix, mnemonic, ops); + return Immediate.s(0); +} + +fn emit(lower: *Lower, prefix: Prefix, mnemonic: Mnemonic, ops: []const Operand) Error!void { + lower.result_insts[lower.result_insts_len] = try Instruction.new(prefix, mnemonic, ops); + lower.result_insts_len += 1; } -fn mirGeneric(lower: *Lower, inst: Mir.Inst) Error!void { - try lower.emitInst(switch (inst.ops) { - else => .none, - .lock_m_sib, - .lock_m_rip, - .lock_mi_sib_u, - .lock_mi_rip_u, - .lock_mi_sib_s, - 
.lock_mi_rip_s, - .lock_mr_sib, - .lock_mr_rip, - .lock_moffs_rax, - => .lock, - }, switch (inst.tag) { - inline else => |tag| if (@hasField(Mnemonic, @tagName(tag))) - @field(Mnemonic, @tagName(tag)) +fn generic(lower: *Lower, inst: Mir.Inst) Error!void { + const fixes = switch (inst.ops) { + .none => inst.data.none.fixes, + .inst => inst.data.inst.fixes, + .i_s, .i_u => inst.data.i.fixes, + .r => inst.data.r.fixes, + .rr => inst.data.rr.fixes, + .rrr => inst.data.rrr.fixes, + .rrri => inst.data.rrri.fixes, + .rri_s, .rri_u => inst.data.rri.fixes, + .ri_s, .ri_u => inst.data.ri.fixes, + .ri64, .rm_sib, .rm_rip, .mr_sib, .mr_rip => inst.data.rx.fixes, + .mi_sib_u, .mi_rip_u, .mi_sib_s, .mi_rip_s => ._, + .mrr_sib, .mrr_rip, .rrm_sib, .rrm_rip => inst.data.rrx.fixes, + .rmi_sib, .rmi_rip, .mri_sib, .mri_rip => inst.data.rix.fixes, + .rrmi_sib, .rrmi_rip => inst.data.rrix.fixes, + .m_sib, .m_rip, .rax_moffs, .moffs_rax => inst.data.x.fixes, + .extern_fn_reloc, .got_reloc, .direct_reloc, .import_reloc, .tlv_reloc => ._, + else => return lower.fail("TODO lower .{s}", .{@tagName(inst.ops)}), + }; + try lower.emit(switch (fixes) { + inline else => |tag| comptime if (std.mem.indexOfScalar(u8, @tagName(tag), ' ')) |space| + @field(Prefix, @tagName(tag)[0..space]) else - unreachable, + .none, + }, mnemonic: { + comptime var max_len = 0; + inline for (@typeInfo(Mnemonic).Enum.fields) |field| max_len = @max(field.name.len, max_len); + var buf: [max_len]u8 = undefined; + + const fixes_name = @tagName(fixes); + const pattern = fixes_name[if (std.mem.indexOfScalar(u8, fixes_name, ' ')) |i| i + 1 else 0..]; + const wildcard_i = std.mem.indexOfScalar(u8, pattern, '_').?; + const parts = .{ pattern[0..wildcard_i], @tagName(inst.tag), pattern[wildcard_i + 1 ..] }; + const err_msg = "unsupported mnemonic: "; + const mnemonic = std.fmt.bufPrint(&buf, "{s}{s}{s}", parts) catch + return lower.fail(err_msg ++ "'{s}{s}{s}'", parts); + break :mnemonic std.meta.stringToEnum(Mnemonic, mnemonic) orelse + return lower.fail(err_msg ++ "'{s}'", .{mnemonic}); }, switch (inst.ops) { .none => &.{}, + .inst => &.{ + .{ .imm = lower.reloc(.{ .inst = inst.data.inst.inst }) }, + }, .i_s, .i_u => &.{ - .{ .imm = lower.imm(inst.ops, inst.data.i) }, + .{ .imm = lower.imm(inst.ops, inst.data.i.i) }, }, .r => &.{ - .{ .reg = inst.data.r }, + .{ .reg = inst.data.r.r1 }, }, .rr => &.{ .{ .reg = inst.data.rr.r1 }, @@ -501,11 +340,11 @@ fn mirGeneric(lower: *Lower, inst: Mir.Inst) Error!void { .{ .imm = lower.imm(inst.ops, inst.data.rrri.i) }, }, .ri_s, .ri_u => &.{ - .{ .reg = inst.data.ri.r }, + .{ .reg = inst.data.ri.r1 }, .{ .imm = lower.imm(inst.ops, inst.data.ri.i) }, }, .ri64 => &.{ - .{ .reg = inst.data.rx.r }, + .{ .reg = inst.data.rx.r1 }, .{ .imm = lower.imm(inst.ops, inst.data.rx.payload) }, }, .rri_s, .rri_u => &.{ @@ -513,33 +352,25 @@ fn mirGeneric(lower: *Lower, inst: Mir.Inst) Error!void { .{ .reg = inst.data.rri.r2 }, .{ .imm = lower.imm(inst.ops, inst.data.rri.i) }, }, - .m_sib, .lock_m_sib, .m_rip, .lock_m_rip => &.{ - .{ .mem = lower.mem(inst.ops, inst.data.payload) }, + .m_sib, .m_rip => &.{ + .{ .mem = lower.mem(inst.ops, inst.data.x.payload) }, }, - .mi_sib_s, - .lock_mi_sib_s, - .mi_sib_u, - .lock_mi_sib_u, - .mi_rip_u, - .lock_mi_rip_u, - .mi_rip_s, - .lock_mi_rip_s, - => &.{ + .mi_sib_s, .mi_sib_u, .mi_rip_u, .mi_rip_s => &.{ .{ .mem = lower.mem(inst.ops, inst.data.ix.payload) }, .{ .imm = lower.imm(inst.ops, inst.data.ix.i) }, }, .rm_sib, .rm_rip => &.{ - .{ .reg = inst.data.rx.r }, + .{ .reg = 
inst.data.rx.r1 }, .{ .mem = lower.mem(inst.ops, inst.data.rx.payload) }, }, .rmi_sib, .rmi_rip => &.{ - .{ .reg = inst.data.rix.r }, + .{ .reg = inst.data.rix.r1 }, .{ .mem = lower.mem(inst.ops, inst.data.rix.payload) }, .{ .imm = lower.imm(inst.ops, inst.data.rix.i) }, }, - .mr_sib, .lock_mr_sib, .mr_rip, .lock_mr_rip => &.{ + .mr_sib, .mr_rip => &.{ .{ .mem = lower.mem(inst.ops, inst.data.rx.payload) }, - .{ .reg = inst.data.rx.r }, + .{ .reg = inst.data.rx.r1 }, }, .mrr_sib, .mrr_rip => &.{ .{ .mem = lower.mem(inst.ops, inst.data.rrx.payload) }, @@ -548,7 +379,7 @@ fn mirGeneric(lower: *Lower, inst: Mir.Inst) Error!void { }, .mri_sib, .mri_rip => &.{ .{ .mem = lower.mem(inst.ops, inst.data.rix.payload) }, - .{ .reg = inst.data.rix.r }, + .{ .reg = inst.data.rix.r1 }, .{ .imm = lower.imm(inst.ops, inst.data.rix.i) }, }, .rrm_sib, .rrm_rip => &.{ @@ -562,180 +393,46 @@ fn mirGeneric(lower: *Lower, inst: Mir.Inst) Error!void { .{ .mem = lower.mem(inst.ops, inst.data.rrix.payload) }, .{ .imm = lower.imm(inst.ops, inst.data.rrix.i) }, }, - else => return lower.fail("TODO lower {s} {s}", .{ @tagName(inst.tag), @tagName(inst.ops) }), - }); -} - -fn mirString(lower: *Lower, inst: Mir.Inst) Error!void { - switch (inst.ops) { - .string => try lower.emitInst(switch (inst.data.string.repeat) { - inline else => |repeat| @field(Prefix, @tagName(repeat)), - }, switch (inst.tag) { - inline .cmps, .lods, .movs, .scas, .stos => |tag| switch (inst.data.string.width) { - inline else => |width| @field(Mnemonic, @tagName(tag) ++ @tagName(width)), - }, - else => unreachable, - }, &.{}), - else => return lower.fail("TODO lower {s} {s}", .{ @tagName(inst.tag), @tagName(inst.ops) }), - } -} - -fn mirCmpxchgBytes(lower: *Lower, inst: Mir.Inst) Error!void { - const ops: [1]Operand = switch (inst.ops) { - .m_sib, .lock_m_sib, .m_rip, .lock_m_rip => .{ - .{ .mem = lower.mem(inst.ops, inst.data.payload) }, - }, - else => return lower.fail("TODO lower {s} {s}", .{ @tagName(inst.tag), @tagName(inst.ops) }), - }; - try lower.emitInst(switch (inst.ops) { - .m_sib, .m_rip => .none, - .lock_m_sib, .lock_m_rip => .lock, - else => unreachable, - }, switch (@divExact(ops[0].bitSize(), 8)) { - 8 => .cmpxchg8b, - 16 => .cmpxchg16b, - else => return lower.fail("invalid operand for {s}", .{@tagName(inst.tag)}), - }, &ops); -} - -fn mirMovMoffs(lower: *Lower, inst: Mir.Inst) Error!void { - try lower.emitInst(switch (inst.ops) { - .rax_moffs, .moffs_rax => .none, - .lock_moffs_rax => .lock, - else => return lower.fail("TODO lower {s} {s}", .{ @tagName(inst.tag), @tagName(inst.ops) }), - }, .mov, switch (inst.ops) { .rax_moffs => &.{ .{ .reg = .rax }, - .{ .mem = lower.mem(inst.ops, inst.data.payload) }, + .{ .mem = lower.mem(inst.ops, inst.data.x.payload) }, }, - .moffs_rax, .lock_moffs_rax => &.{ - .{ .mem = lower.mem(inst.ops, inst.data.payload) }, + .moffs_rax => &.{ + .{ .mem = lower.mem(inst.ops, inst.data.x.payload) }, .{ .reg = .rax }, }, - else => unreachable, - }); -} - -fn mirMovsx(lower: *Lower, inst: Mir.Inst) Error!void { - const ops: [2]Operand = switch (inst.ops) { - .rr => .{ - .{ .reg = inst.data.rr.r1 }, - .{ .reg = inst.data.rr.r2 }, - }, - .rm_sib, .rm_rip => .{ - .{ .reg = inst.data.rx.r }, - .{ .mem = lower.mem(inst.ops, inst.data.rx.payload) }, - }, - else => return lower.fail("TODO lower {s} {s}", .{ @tagName(inst.tag), @tagName(inst.ops) }), - }; - try lower.emitInst(.none, switch (ops[0].bitSize()) { - 32, 64 => switch (ops[1].bitSize()) { - 32 => .movsxd, - else => .movsx, - }, - else => .movsx, - }, 
&ops); -} - -fn mirCmovcc(lower: *Lower, inst: Mir.Inst) Error!void { - const data: struct { cc: bits.Condition, ops: [2]Operand } = switch (inst.ops) { - .rr_cc => .{ .cc = inst.data.rr_cc.cc, .ops = .{ - .{ .reg = inst.data.rr_cc.r1 }, - .{ .reg = inst.data.rr_cc.r2 }, - } }, - .rm_sib_cc, .rm_rip_cc => .{ .cc = inst.data.rx_cc.cc, .ops = .{ - .{ .reg = inst.data.rx_cc.r }, - .{ .mem = lower.mem(inst.ops, inst.data.rx_cc.payload) }, - } }, - else => return lower.fail("TODO lower {s} {s}", .{ @tagName(inst.tag), @tagName(inst.ops) }), - }; - switch (data.cc) { - else => |cc| try lower.emitInst(.none, mnem_cc(.cmov, cc), &data.ops), - .z_and_np => { - try lower.emitInst(.none, mnem_cc(.cmov, .nz), &.{ data.ops[1], data.ops[0] }); - try lower.emitInst(.none, mnem_cc(.cmov, .np), &data.ops); + .extern_fn_reloc => &.{ + .{ .imm = lower.reloc(.{ .linker_extern_fn = inst.data.reloc }) }, }, - .nz_or_p => { - try lower.emitInst(.none, mnem_cc(.cmov, .nz), &data.ops); - try lower.emitInst(.none, mnem_cc(.cmov, .p), &data.ops); + .got_reloc, .direct_reloc, .import_reloc, .tlv_reloc => ops: { + const reg = inst.data.rx.r1; + const extra = lower.mir.extraData(Mir.Reloc, inst.data.rx.payload).data; + _ = lower.reloc(switch (inst.ops) { + .got_reloc => .{ .linker_got = extra }, + .direct_reloc => .{ .linker_direct = extra }, + .import_reloc => .{ .linker_import = extra }, + .tlv_reloc => .{ .linker_tlv = extra }, + else => unreachable, + }); + break :ops &.{ + .{ .reg = reg }, + .{ .mem = Memory.rip(Memory.PtrSize.fromBitSize(reg.bitSize()), 0) }, + }; }, - } -} - -fn mirSetcc(lower: *Lower, inst: Mir.Inst) Error!void { - const data: struct { cc: bits.Condition, ops: [2]Operand } = switch (inst.ops) { - .r_cc => .{ .cc = inst.data.r_cc.cc, .ops = .{ - .{ .reg = inst.data.r_cc.r }, - .{ .reg = inst.data.r_cc.scratch }, - } }, - .m_sib_cc, .m_rip_cc => .{ .cc = inst.data.x_cc.cc, .ops = .{ - .{ .mem = lower.mem(inst.ops, inst.data.x_cc.payload) }, - .{ .reg = inst.data.x_cc.scratch }, - } }, else => return lower.fail("TODO lower {s} {s}", .{ @tagName(inst.tag), @tagName(inst.ops) }), - }; - switch (data.cc) { - else => |cc| try lower.emitInst(.none, mnem_cc(.set, cc), data.ops[0..1]), - .z_and_np => { - try lower.emitInst(.none, mnem_cc(.set, .z), data.ops[0..1]); - try lower.emitInst(.none, mnem_cc(.set, .np), data.ops[1..2]); - try lower.emitInst(.none, .@"and", data.ops[0..2]); - }, - .nz_or_p => { - try lower.emitInst(.none, mnem_cc(.set, .nz), data.ops[0..1]); - try lower.emitInst(.none, mnem_cc(.set, .p), data.ops[1..2]); - try lower.emitInst(.none, .@"or", data.ops[0..2]); - }, - } -} - -fn mirJcc(lower: *Lower, index: Mir.Inst.Index, inst: Mir.Inst) Error!void { - switch (inst.data.inst_cc.cc) { - else => |cc| try lower.emitInstWithReloc(.none, mnem_cc(.j, cc), &.{ - .{ .imm = Immediate.s(0) }, - }, .{ .inst = inst.data.inst_cc.inst }), - .z_and_np => { - try lower.emitInstWithReloc(.none, mnem_cc(.j, .nz), &.{ - .{ .imm = Immediate.s(0) }, - }, .{ .inst = index + 1 }); - try lower.emitInstWithReloc(.none, mnem_cc(.j, .np), &.{ - .{ .imm = Immediate.s(0) }, - }, .{ .inst = inst.data.inst_cc.inst }); - }, - .nz_or_p => { - try lower.emitInstWithReloc(.none, mnem_cc(.j, .nz), &.{ - .{ .imm = Immediate.s(0) }, - }, .{ .inst = inst.data.inst_cc.inst }); - try lower.emitInstWithReloc(.none, mnem_cc(.j, .p), &.{ - .{ .imm = Immediate.s(0) }, - }, .{ .inst = inst.data.inst_cc.inst }); - }, - } + }); } -fn mirRegisterList(lower: *Lower, comptime mnemonic: Mnemonic, inst: Mir.Inst) Error!void { - const 
reg_list = Mir.RegisterList.fromInt(inst.data.payload); +fn pushPopRegList(lower: *Lower, comptime mnemonic: Mnemonic, inst: Mir.Inst) Error!void { const callee_preserved_regs = abi.getCalleePreservedRegs(lower.target.*); - var it = reg_list.iterator(.{ .direction = switch (mnemonic) { + var it = inst.data.reg_list.iterator(.{ .direction = switch (mnemonic) { .push => .reverse, .pop => .forward, else => unreachable, } }); - while (it.next()) |i| try lower.emitInst(.none, mnemonic, &.{.{ .reg = callee_preserved_regs[i] }}); -} - -fn mirLinker(lower: *Lower, mnemonic: Mnemonic, inst: Mir.Inst) Error!void { - const reloc = lower.mir.extraData(Mir.Reloc, inst.data.rx.payload).data; - try lower.emitInstWithReloc(.none, mnemonic, &.{ - .{ .reg = inst.data.rx.r }, - .{ .mem = Memory.rip(Memory.PtrSize.fromBitSize(inst.data.rx.r.bitSize()), 0) }, - }, switch (inst.ops) { - .got_reloc => .{ .linker_got = reloc }, - .direct_reloc => .{ .linker_direct = reloc }, - .import_reloc => .{ .linker_import = reloc }, - .tlv_reloc => .{ .linker_tlv = reloc }, - else => unreachable, - }); + while (it.next()) |i| try lower.emit(.none, mnemonic, &.{.{ + .reg = callee_preserved_regs[i], + }}); } const abi = @import("abi.zig"); diff --git a/src/arch/x86_64/Mir.zig b/src/arch/x86_64/Mir.zig index 442cfabebb..951a0c5d4d 100644 --- a/src/arch/x86_64/Mir.zig +++ b/src/arch/x86_64/Mir.zig @@ -32,6 +32,210 @@ pub const Inst = struct { pub const Index = u32; + pub const Fixes = enum(u8) { + /// ___ + @"_", + + /// ___ Above + _a, + /// ___ Above Or Equal + _ae, + /// ___ Below + _b, + /// ___ Below Or Equal + _be, + /// ___ Carry + _c, + /// ___ Equal + _e, + /// ___ Greater + _g, + /// ___ Greater Or Equal + _ge, + /// ___ Less + _l, + /// ___ Less Or Equal + _le, + /// ___ Not Above + _na, + /// ___ Not Above Or Equal + _nae, + /// ___ Not Below + _nb, + /// ___ Not Below Or Equal + _nbe, + /// ___ Not Carry + _nc, + /// ___ Not Equal + _ne, + /// ___ Not Greater + _ng, + /// ___ Not Greater Or Equal + _nge, + /// ___ Not Less + _nl, + /// ___ Not Less Or Equal + _nle, + /// ___ Not Overflow + _no, + /// ___ Not Parity + _np, + /// ___ Not Sign + _ns, + /// ___ Not Zero + _nz, + /// ___ Overflow + _o, + /// ___ Parity + _p, + /// ___ Parity Even + _pe, + /// ___ Parity Odd + _po, + /// ___ Sign + _s, + /// ___ Zero + _z, + + /// ___ String + //_s, + /// ___ String Byte + _sb, + /// ___ String Word + _sw, + /// ___ String Doubleword + _sd, + /// ___ String Quadword + _sq, + + /// Repeat ___ String + @"rep _s", + /// Repeat ___ String Byte + @"rep _sb", + /// Repeat ___ String Word + @"rep _sw", + /// Repeat ___ String Doubleword + @"rep _sd", + /// Repeat ___ String Quadword + @"rep _sq", + + /// Repeat Equal ___ String + @"repe _s", + /// Repeat Equal ___ String Byte + @"repe _sb", + /// Repeat Equal ___ String Word + @"repe _sw", + /// Repeat Equal ___ String Doubleword + @"repe _sd", + /// Repeat Equal ___ String Quadword + @"repe _sq", + + /// Repeat Not Equal ___ String + @"repne _s", + /// Repeat Not Equal ___ String Byte + @"repne _sb", + /// Repeat Not Equal ___ String Word + @"repne _sw", + /// Repeat Not Equal ___ String Doubleword + @"repne _sd", + /// Repeat Not Equal ___ String Quadword + @"repne _sq", + + /// Repeat Not Zero ___ String + @"repnz _s", + /// Repeat Not Zero ___ String Byte + @"repnz _sb", + /// Repeat Not Zero ___ String Word + @"repnz _sw", + /// Repeat Not Zero ___ String Doubleword + @"repnz _sd", + /// Repeat Not Zero ___ String Quadword + @"repnz _sq", + + /// Repeat Zero ___ String + 
@"repz _s", + /// Repeat Zero ___ String Byte + @"repz _sb", + /// Repeat Zero ___ String Word + @"repz _sw", + /// Repeat Zero ___ String Doubleword + @"repz _sd", + /// Repeat Zero ___ String Quadword + @"repz _sq", + + /// Locked ___ + @"lock _", + /// ___ 8 Bytes + _8b, + /// Locked ___ 8 Bytes + @"lock _8b", + /// ___ 16 Bytes + _16b, + /// Locked ___ 16 Bytes + @"lock _16b", + + /// Packed ___ + p_, + /// Packed ___ Byte + p_b, + /// Packed ___ Word + p_w, + /// Packed ___ Doubleword + p_d, + /// Packed ___ Quadword + p_q, + /// Packed ___ Double Quadword + p_dq, + + /// ___ Scalar Single-Precision Values + _ss, + /// ___ Packed Single-Precision Values + _ps, + /// ___ Scalar Double-Precision Values + //_sd, + /// ___ Packed Double-Precision Values + _pd, + + /// VEX-Encoded ___ + v_, + /// VEX-Encoded Packed ___ + vp_, + /// VEX-Encoded Packed ___ Byte + vp_b, + /// VEX-Encoded Packed ___ Word + vp_w, + /// VEX-Encoded Packed ___ Doubleword + vp_d, + /// VEX-Encoded Packed ___ Quadword + vp_q, + /// VEX-Encoded Packed ___ Double Quadword + vp_dq, + /// VEX-Encoded ___ Scalar Single-Precision Values + v_ss, + /// VEX-Encoded ___ Packed Single-Precision Values + v_ps, + /// VEX-Encoded ___ Scalar Double-Precision Values + v_sd, + /// VEX-Encoded ___ Packed Double-Precision Values + v_pd, + + /// Mask ___ Byte + k_b, + /// Mask ___ Word + k_w, + /// Mask ___ Doubleword + k_d, + /// Mask ___ Quadword + k_q, + + pub fn fromCondition(cc: bits.Condition) Fixes { + return switch (cc) { + inline else => |cc_tag| @field(Fixes, "_" ++ @tagName(cc_tag)), + .z_and_np, .nz_or_p => unreachable, + }; + } + }; + pub const Tag = enum(u8) { /// Add with carry adc, @@ -57,22 +261,24 @@ pub const Inst = struct { call, /// Convert byte to word cbw, - /// Convert word to doubleword - cwde, - /// Convert doubleword to quadword - cdqe, - /// Convert word to doubleword - cwd, /// Convert doubleword to quadword cdq, /// Convert doubleword to quadword - cqo, + cdqe, + /// Conditional move + cmov, /// Logical compare + /// Compare string cmp, /// Compare and exchange - cmpxchg, /// Compare and exchange bytes - cmpxchgb, + cmpxchg, + /// Convert doubleword to quadword + cqo, + /// Convert word to doubleword + cwd, + /// Convert word to doubleword + cwde, /// Unsigned division div, /// Store integer with truncation @@ -85,10 +291,14 @@ pub const Inst = struct { imul, /// int3, + /// Conditional jump + j, /// Jump jmp, /// Load effective address lea, + /// Load string + lod, /// Load fence lfence, /// Count the number of leading zero bits @@ -96,6 +306,7 @@ pub const Inst = struct { /// Memory fence mfence, /// Move + /// Move data from string to string mov, /// Move data after swapping bytes movbe, @@ -105,6 +316,8 @@ pub const Inst = struct { movq, /// Move with sign extension movsx, + /// Move with sign extension + movsxd, /// Move with zero extension movzx, /// Multiply @@ -139,6 +352,10 @@ pub const Inst = struct { sar, /// Integer subtraction with borrow sbb, + /// Scan string + sca, + /// Set byte on condition + set, /// Store fence sfence, /// Logical shift left @@ -151,6 +368,8 @@ pub const Inst = struct { shrd, /// Subtract sub, + /// Store string + sto, /// Syscall syscall, /// Test condition @@ -505,57 +724,10 @@ pub const Inst = struct { /// Fused multiply-add of scalar single-precision floating-point values vfmadd231ss, - /// Compare string operands - cmps, - /// Load string - lods, - /// Move data from string to string - movs, - /// Scan string - scas, - /// Store string - stos, - - /// Conditional 
move - cmovcc, - /// Conditional jump - jcc, - /// Set byte on condition - setcc, - - /// Mov absolute to/from memory wrt segment register to/from rax - mov_moffs, - - /// Jump with relocation to another local MIR instruction - /// Uses `inst` payload. - jmp_reloc, - - /// Call to an extern symbol via linker relocation. - /// Uses `relocation` payload. - call_extern, - - /// Load effective address of a symbol not yet allocated in VM. - lea_linker, - /// Move address of a symbol not yet allocated in VM. - mov_linker, - - /// End of prologue - dbg_prologue_end, - /// Start of epilogue - dbg_epilogue_begin, - /// Update debug line - /// Uses `line_column` payload containing the line and column. - dbg_line, - /// Push registers - /// Uses `payload` payload containing `RegisterList.asInt` directly. - push_regs, - /// Pop registers - /// Uses `payload` payload containing `RegisterList.asInt` directly. - pop_regs, - - /// Tombstone - /// Emitter should skip this instruction. - dead, + /// A pseudo instruction that requires special lowering. + /// This should be the only tag in this enum that doesn't + /// directly correspond to one or more instruction mnemonics. + pseudo, }; pub const Ops = enum(u8) { @@ -579,12 +751,6 @@ pub const Inst = struct { /// Register, register, immediate (unsigned) operands. /// Uses `rri` payload. rri_u, - /// Register with condition code (CC). - /// Uses `r_cc` payload. - r_cc, - /// Register, register with condition code (CC). - /// Uses `rr_cc` payload. - rr_cc, /// Register, immediate (sign-extended) operands. /// Uses `ri` payload. ri_s, @@ -609,12 +775,6 @@ pub const Inst = struct { /// Register, memory (RIP) operands. /// Uses `rx` payload. rm_rip, - /// Register, memory (SIB) operands with condition code (CC). - /// Uses `rx_cc` payload. - rm_sib_cc, - /// Register, memory (RIP) operands with condition code (CC). - /// Uses `rx_cc` payload. - rm_rip_cc, /// Register, memory (SIB), immediate (byte) operands. /// Uses `rix` payload with extra data of type `MemorySib`. rmi_sib, @@ -634,17 +794,11 @@ pub const Inst = struct { /// Uses `rix` payload with extra data of type `MemoryRip`. rmi_rip, /// Single memory (SIB) operand. - /// Uses `payload` with extra data of type `MemorySib`. + /// Uses `x` with extra data of type `MemorySib`. m_sib, /// Single memory (RIP) operand. - /// Uses `payload` with extra data of type `MemoryRip`. + /// Uses `x` with extra data of type `MemoryRip`. m_rip, - /// Single memory (SIB) operand with condition code (CC). - /// Uses `x_cc` with extra data of type `MemorySib`. - m_sib_cc, - /// Single memory (RIP) operand with condition code (CC). - /// Uses `x_cc` with extra data of type `MemoryRip`. - m_rip_cc, /// Memory (SIB), immediate (unsigned) operands. /// Uses `ix` payload with extra data of type `MemorySib`. mi_sib_u, @@ -676,49 +830,17 @@ pub const Inst = struct { /// Uses `rix` payload with extra data of type `MemoryRip`. mri_rip, /// Rax, Memory moffs. - /// Uses `payload` with extra data of type `MemoryMoffs`. + /// Uses `x` with extra data of type `MemoryMoffs`. rax_moffs, /// Memory moffs, rax. - /// Uses `payload` with extra data of type `MemoryMoffs`. + /// Uses `x` with extra data of type `MemoryMoffs`. moffs_rax, - /// Single memory (SIB) operand with lock prefix. - /// Uses `payload` with extra data of type `MemorySib`. - lock_m_sib, - /// Single memory (RIP) operand with lock prefix. - /// Uses `payload` with extra data of type `MemoryRip`. 
- lock_m_rip, - /// Memory (SIB), immediate (unsigned) operands with lock prefix. - /// Uses `xi` payload with extra data of type `MemorySib`. - lock_mi_sib_u, - /// Memory (RIP), immediate (unsigned) operands with lock prefix. - /// Uses `xi` payload with extra data of type `MemoryRip`. - lock_mi_rip_u, - /// Memory (SIB), immediate (sign-extend) operands with lock prefix. - /// Uses `xi` payload with extra data of type `MemorySib`. - lock_mi_sib_s, - /// Memory (RIP), immediate (sign-extend) operands with lock prefix. - /// Uses `xi` payload with extra data of type `MemoryRip`. - lock_mi_rip_s, - /// Memory (SIB), register operands with lock prefix. - /// Uses `rx` payload with extra data of type `MemorySib`. - lock_mr_sib, - /// Memory (RIP), register operands with lock prefix. - /// Uses `rx` payload with extra data of type `MemoryRip`. - lock_mr_rip, - /// Memory moffs, rax with lock prefix. - /// Uses `payload` with extra data of type `MemoryMoffs`. - lock_moffs_rax, /// References another Mir instruction directly. /// Uses `inst` payload. inst, - /// References another Mir instruction directly with condition code (CC). - /// Uses `inst_cc` payload. - inst_cc, - /// String repeat and width - /// Uses `string` payload. - string, + /// Linker relocation - external function. /// Uses `reloc` payload. - reloc, + extern_fn_reloc, /// Linker relocation - GOT indirection. /// Uses `rx` payload with extra data of type `Reloc`. got_reloc, @@ -731,74 +853,125 @@ pub const Inst = struct { /// Linker relocation - threadlocal variable via GOT indirection. /// Uses `rx` payload with extra data of type `Reloc`. tlv_reloc, + + // Pseudo instructions: + + /// Conditional move if zero flag set and parity flag not set + /// Clobbers the source operand! + /// Uses `rr` payload. + pseudo_cmov_z_and_np_rr, + /// Conditional move if zero flag not set or parity flag set + /// Uses `rr` payload. + pseudo_cmov_nz_or_p_rr, + /// Conditional move if zero flag not set or parity flag set + /// Uses `rx` payload. + pseudo_cmov_nz_or_p_rm_sib, + /// Conditional move if zero flag not set or parity flag set + /// Uses `rx` payload. + pseudo_cmov_nz_or_p_rm_rip, + /// Set byte if zero flag set and parity flag not set + /// Requires a scratch register! + /// Uses `r_scratch` payload. + pseudo_set_z_and_np_r, + /// Set byte if zero flag set and parity flag not set + /// Requires a scratch register! + /// Uses `x_scratch` payload. + pseudo_set_z_and_np_m_sib, + /// Set byte if zero flag set and parity flag not set + /// Requires a scratch register! + /// Uses `x_scratch` payload. + pseudo_set_z_and_np_m_rip, + /// Set byte if zero flag not set or parity flag set + /// Requires a scratch register! + /// Uses `r_scratch` payload. + pseudo_set_nz_or_p_r, + /// Set byte if zero flag not set or parity flag set + /// Requires a scratch register! + /// Uses `x_scratch` payload. + pseudo_set_nz_or_p_m_sib, + /// Set byte if zero flag not set or parity flag set + /// Requires a scratch register! + /// Uses `x_scratch` payload. + pseudo_set_nz_or_p_m_rip, + /// Jump if zero flag set and parity flag not set + /// Uses `inst` payload. + pseudo_j_z_and_np_inst, + /// Jump if zero flag not set or parity flag set + /// Uses `inst` payload. + pseudo_j_nz_or_p_inst, + + /// Push registers + /// Uses `reg_list` payload. + pseudo_push_reg_list, + /// Pop registers + /// Uses `reg_list` payload. + pseudo_pop_reg_list, + + /// End of prologue + pseudo_dbg_prologue_end_none, + /// Update debug line + /// Uses `line_column` payload. 
+ pseudo_dbg_line_line_column, + /// Start of epilogue + pseudo_dbg_epilogue_begin_none, + + /// Tombstone + /// Emitter should skip this instruction. + pseudo_dead_none, }; pub const Data = union { + none: struct { + fixes: Fixes = ._, + }, /// References another Mir instruction. - inst: Index, - /// Another instruction with condition code (CC). - /// Used by `jcc`. - inst_cc: struct { - /// Another instruction. + inst: struct { + fixes: Fixes = ._, inst: Index, - /// A condition code for use with EFLAGS register. - cc: bits.Condition, }, /// A 32-bit immediate value. - i: u32, - r: Register, + i: struct { + fixes: Fixes = ._, + i: u32, + }, + r: struct { + fixes: Fixes = ._, + r1: Register, + }, rr: struct { + fixes: Fixes = ._, r1: Register, r2: Register, }, rrr: struct { + fixes: Fixes = ._, r1: Register, r2: Register, r3: Register, }, rrri: struct { + fixes: Fixes = ._, r1: Register, r2: Register, r3: Register, i: u8, }, rri: struct { + fixes: Fixes = ._, r1: Register, r2: Register, i: u32, }, - /// Condition code (CC), followed by custom payload found in extra. - x_cc: struct { - scratch: Register, - cc: bits.Condition, - payload: u32, - }, - /// Register with condition code (CC). - r_cc: struct { - r: Register, - scratch: Register, - cc: bits.Condition, - }, - /// Register, register with condition code (CC). - rr_cc: struct { - r1: Register, - r2: Register, - cc: bits.Condition, - }, /// Register, immediate. ri: struct { - r: Register, + fixes: Fixes = ._, + r1: Register, i: u32, }, /// Register, followed by custom payload found in extra. rx: struct { - r: Register, - payload: u32, - }, - /// Register with condition code (CC), followed by custom payload found in extra. - rx_cc: struct { - r: Register, - cc: bits.Condition, + fixes: Fixes = ._, + r1: Register, payload: u32, }, /// Immediate, followed by Custom payload found in extra. @@ -808,39 +981,54 @@ pub const Inst = struct { }, /// Register, register, followed by Custom payload found in extra. rrx: struct { + fixes: Fixes = ._, r1: Register, r2: Register, payload: u32, }, /// Register, byte immediate, followed by Custom payload found in extra. rix: struct { - r: Register, + fixes: Fixes = ._, + r1: Register, i: u8, payload: u32, }, /// Register, register, byte immediate, followed by Custom payload found in extra. rrix: struct { + fixes: Fixes = ._, r1: Register, r2: Register, i: u8, payload: u32, }, - /// String instruction prefix and width. - string: struct { - repeat: bits.StringRepeat, - width: bits.StringWidth, + /// Register, scratch register + r_scratch: struct { + fixes: Fixes = ._, + r1: Register, + scratch_reg: Register, + }, + /// Scratch register, followed by Custom payload found in extra. + x_scratch: struct { + fixes: Fixes = ._, + scratch_reg: Register, + payload: u32, + }, + /// Custom payload found in extra. + x: struct { + fixes: Fixes = ._, + payload: u32, }, /// Relocation for the linker where: /// * `atom_index` is the index of the source /// * `sym_index` is the index of the target - relocation: Reloc, + reloc: Reloc, /// Debug line and column position line_column: struct { line: u32, column: u32, }, - /// Index into `extra`. Meaning of what can be found there is context-dependent. - payload: u32, + /// Register list + reg_list: RegisterList, }; // Make sure we don't accidentally make instructions bigger than expected. @@ -852,6 +1040,7 @@ pub const Inst = struct { } }; +/// A linker symbol not yet allocated in VM. pub const Reloc = struct { /// Index of the containing atom. 
atom_index: u32, @@ -887,16 +1076,6 @@ pub const RegisterList = struct { return self.bitset.iterator(options); } - pub fn asInt(self: Self) u32 { - return self.bitset.mask; - } - - pub fn fromInt(mask: u32) Self { - return .{ - .bitset = BitSet{ .mask = @intCast(BitSet.MaskInt, mask) }, - }; - } - pub fn count(self: Self) u32 { return @intCast(u32, self.bitset.count()); } diff --git a/src/arch/x86_64/bits.zig b/src/arch/x86_64/bits.zig index b73a37d6cb..3343f280b9 100644 --- a/src/arch/x86_64/bits.zig +++ b/src/arch/x86_64/bits.zig @@ -6,9 +6,6 @@ const Allocator = std.mem.Allocator; const ArrayList = std.ArrayList; const DW = std.dwarf; -pub const StringRepeat = enum(u3) { none, rep, repe, repz, repne, repnz }; -pub const StringWidth = enum(u2) { b, w, d, q }; - /// EFLAGS condition codes pub const Condition = enum(u5) { /// above -- cgit v1.2.3 From c23e80e671686278ea2ea23d164a2c0839ca372c Mon Sep 17 00:00:00 2001 From: Jacob Young Date: Tue, 9 May 2023 03:15:27 -0400 Subject: x86_64: implement `@splat` --- src/arch/x86_64/CodeGen.zig | 208 +++++++++++++++++++++++++++++++++++++++++- src/arch/x86_64/Encoding.zig | 13 ++- src/arch/x86_64/Lower.zig | 2 + src/arch/x86_64/Mir.zig | 15 +++ src/arch/x86_64/encodings.zig | 41 ++++++++- 5 files changed, 270 insertions(+), 9 deletions(-) (limited to 'src/arch/x86_64/Encoding.zig') diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig index d1bc23b826..29232b5284 100644 --- a/src/arch/x86_64/CodeGen.zig +++ b/src/arch/x86_64/CodeGen.zig @@ -8561,7 +8561,8 @@ fn movMirTag(self: *Self, ty: Type, aligned: bool) !Mir.Inst.FixedTag { }, 32 => switch (ty.vectorLen()) { 1 => return if (self.hasFeature(.avx)) .{ .v_ss, .mov } else .{ ._ss, .mov }, - 2...4 => return if (self.hasFeature(.avx)) + 2 => return if (self.hasFeature(.avx)) .{ .v_sd, .mov } else .{ ._sd, .mov }, + 3...4 => return if (self.hasFeature(.avx)) if (aligned) .{ .v_ps, .mova } else .{ .v_ps, .movu } else if (aligned) .{ ._ps, .mova } else .{ ._ps, .movu }, 5...8 => if (self.hasFeature(.avx)) @@ -8577,6 +8578,14 @@ fn movMirTag(self: *Self, ty: Type, aligned: bool) !Mir.Inst.FixedTag { return if (aligned) .{ .v_ps, .mova } else .{ .v_ps, .movu }, else => {}, }, + 128 => switch (ty.vectorLen()) { + 1 => return if (self.hasFeature(.avx)) + if (aligned) .{ .v_ps, .mova } else .{ .v_ps, .movu } + else if (aligned) .{ ._ps, .mova } else .{ ._ps, .movu }, + 2 => if (self.hasFeature(.avx)) + return if (aligned) .{ .v_ps, .mova } else .{ .v_ps, .movu }, + else => {}, + }, else => {}, }, else => {}, @@ -9939,9 +9948,200 @@ fn airErrorName(self: *Self, inst: Air.Inst.Index) !void { fn airSplat(self: *Self, inst: Air.Inst.Index) !void { const ty_op = self.air.instructions.items(.data)[inst].ty_op; - _ = ty_op; - return self.fail("TODO implement airSplat for x86_64", .{}); - //return self.finishAir(inst, result, .{ ty_op.operand, .none, .none }); + const vector_ty = self.air.typeOfIndex(inst); + const dst_rc = regClassForType(vector_ty); + const scalar_ty = vector_ty.scalarType(); + + const src_mcv = try self.resolveInst(ty_op.operand); + const result: MCValue = result: { + switch (scalar_ty.zigTypeTag()) { + else => {}, + .Float => switch (scalar_ty.floatBits(self.target.*)) { + 32 => switch (vector_ty.vectorLen()) { + 1 => { + if (self.reuseOperand(inst, ty_op.operand, 0, src_mcv)) break :result src_mcv; + const dst_reg = try self.register_manager.allocReg(inst, dst_rc); + try self.genSetReg(dst_reg, scalar_ty, src_mcv); + break :result .{ .register = dst_reg }; + }, + 2...4 => { + 
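+ // With AVX, splat straight from memory via vbroadcastss when possible;
+ // otherwise shuffle the scalar register across all lanes
+ // (vshufps, or shufps without AVX, with imm8 = 0).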
if (self.hasFeature(.avx)) { + const dst_reg = try self.register_manager.allocReg(inst, dst_rc); + if (src_mcv.isMemory()) try self.asmRegisterMemory( + .{ .v_ss, .broadcast }, + dst_reg.to128(), + src_mcv.mem(.dword), + ) else { + const src_reg = if (src_mcv.isRegister()) + src_mcv.getReg().? + else + try self.copyToTmpRegister(scalar_ty, src_mcv); + try self.asmRegisterRegisterRegisterImmediate( + .{ .v_ps, .shuf }, + dst_reg.to128(), + src_reg.to128(), + src_reg.to128(), + Immediate.u(0), + ); + } + break :result .{ .register = dst_reg }; + } else { + const dst_mcv = if (src_mcv.isRegister() and + self.reuseOperand(inst, ty_op.operand, 0, src_mcv)) + src_mcv + else + try self.copyToRegisterWithInstTracking(inst, scalar_ty, src_mcv); + const dst_reg = dst_mcv.getReg().?; + try self.asmRegisterRegisterImmediate( + .{ ._ps, .shuf }, + dst_reg.to128(), + dst_reg.to128(), + Immediate.u(0), + ); + break :result dst_mcv; + } + }, + 5...8 => if (self.hasFeature(.avx)) { + const dst_reg = try self.register_manager.allocReg(inst, dst_rc); + if (src_mcv.isMemory()) try self.asmRegisterMemory( + .{ .v_ss, .broadcast }, + dst_reg.to256(), + src_mcv.mem(.dword), + ) else { + const src_reg = if (src_mcv.isRegister()) + src_mcv.getReg().? + else + try self.copyToTmpRegister(scalar_ty, src_mcv); + if (self.hasFeature(.avx2)) try self.asmRegisterRegister( + .{ .v_ss, .broadcast }, + dst_reg.to256(), + src_reg.to128(), + ) else { + try self.asmRegisterRegisterRegisterImmediate( + .{ .v_ps, .shuf }, + dst_reg.to128(), + src_reg.to128(), + src_reg.to128(), + Immediate.u(0), + ); + try self.asmRegisterRegisterRegisterImmediate( + .{ .v_f128, .insert }, + dst_reg.to256(), + dst_reg.to256(), + dst_reg.to128(), + Immediate.u(1), + ); + } + } + break :result .{ .register = dst_reg }; + }, + else => {}, + }, + 64 => switch (vector_ty.vectorLen()) { + 1 => { + if (self.reuseOperand(inst, ty_op.operand, 0, src_mcv)) break :result src_mcv; + const dst_reg = try self.register_manager.allocReg(inst, dst_rc); + try self.genSetReg(dst_reg, scalar_ty, src_mcv); + break :result .{ .register = dst_reg }; + }, + 2 => { + const dst_reg = try self.register_manager.allocReg(inst, dst_rc); + if (self.hasFeature(.sse3)) { + if (src_mcv.isMemory()) try self.asmRegisterMemory( + if (self.hasFeature(.avx)) .{ .v_, .movddup } else .{ ._, .movddup }, + dst_reg.to128(), + src_mcv.mem(.qword), + ) else try self.asmRegisterRegister( + if (self.hasFeature(.avx)) .{ .v_, .movddup } else .{ ._, .movddup }, + dst_reg.to128(), + (if (src_mcv.isRegister()) + src_mcv.getReg().? + else + try self.copyToTmpRegister(scalar_ty, src_mcv)).to128(), + ); + break :result .{ .register = dst_reg }; + } else try self.asmRegisterRegister( + .{ ._ps, .movlh }, + dst_reg.to128(), + (if (src_mcv.isRegister()) + src_mcv.getReg().? + else + try self.copyToTmpRegister(scalar_ty, src_mcv)).to128(), + ); + }, + 3...4 => if (self.hasFeature(.avx)) { + const dst_reg = try self.register_manager.allocReg(inst, dst_rc); + if (src_mcv.isMemory()) try self.asmRegisterMemory( + .{ .v_sd, .broadcast }, + dst_reg.to256(), + src_mcv.mem(.qword), + ) else { + const src_reg = if (src_mcv.isRegister()) + src_mcv.getReg().? 
+ else + try self.copyToTmpRegister(scalar_ty, src_mcv); + if (self.hasFeature(.avx2)) try self.asmRegisterRegister( + .{ .v_sd, .broadcast }, + dst_reg.to256(), + src_reg.to128(), + ) else { + try self.asmRegisterRegister( + .{ .v_, .movddup }, + dst_reg.to128(), + src_reg.to128(), + ); + try self.asmRegisterRegisterRegisterImmediate( + .{ .v_f128, .insert }, + dst_reg.to256(), + dst_reg.to256(), + dst_reg.to128(), + Immediate.u(1), + ); + } + } + break :result .{ .register = dst_reg }; + }, + else => {}, + }, + 128 => switch (vector_ty.vectorLen()) { + 1 => { + if (self.reuseOperand(inst, ty_op.operand, 0, src_mcv)) break :result src_mcv; + const dst_reg = try self.register_manager.allocReg(inst, dst_rc); + try self.genSetReg(dst_reg, scalar_ty, src_mcv); + break :result .{ .register = dst_reg }; + }, + 2 => if (self.hasFeature(.avx)) { + const dst_reg = try self.register_manager.allocReg(inst, dst_rc); + if (src_mcv.isMemory()) try self.asmRegisterMemory( + .{ .v_f128, .broadcast }, + dst_reg.to256(), + src_mcv.mem(.xword), + ) else { + const src_reg = if (src_mcv.isRegister()) + src_mcv.getReg().? + else + try self.copyToTmpRegister(scalar_ty, src_mcv); + try self.asmRegisterRegisterRegisterImmediate( + .{ .v_f128, .insert }, + dst_reg.to256(), + src_reg.to256(), + src_reg.to128(), + Immediate.u(1), + ); + } + break :result .{ .register = dst_reg }; + }, + else => {}, + }, + 16, 80 => {}, + else => unreachable, + }, + } + return self.fail("TODO implement airSplat for {}", .{ + vector_ty.fmt(self.bin_file.options.module.?), + }); + }; + return self.finishAir(inst, result, .{ ty_op.operand, .none, .none }); } fn airSelect(self: *Self, inst: Air.Inst.Index) !void { diff --git a/src/arch/x86_64/Encoding.zig b/src/arch/x86_64/Encoding.zig index b6b49e8939..073128b85e 100644 --- a/src/arch/x86_64/Encoding.zig +++ b/src/arch/x86_64/Encoding.zig @@ -270,10 +270,12 @@ pub const Mnemonic = enum { divps, divss, maxps, maxss, minps, minss, - movaps, movhlps, movss, movups, + movaps, movhlps, movlhps, + movss, movups, mulps, mulss, orps, pextrw, pinsrw, + shufps, sqrtps, sqrtss, subps, subss, ucomiss, @@ -296,6 +298,7 @@ pub const Mnemonic = enum { psrld, psrlq, psrlw, punpckhbw, punpckhdq, punpckhqdq, punpckhwd, punpcklbw, punpckldq, punpcklqdq, punpcklwd, + shufpd, sqrtpd, sqrtsd, subpd, subsd, ucomisd, @@ -303,17 +306,22 @@ pub const Mnemonic = enum { // SSE3 movddup, movshdup, movsldup, // SSE4.1 + extractps, + insertps, pextrb, pextrd, pextrq, pinsrb, pinsrd, pinsrq, roundpd, roundps, roundsd, roundss, // AVX vaddpd, vaddps, vaddsd, vaddss, + vbroadcastf128, vbroadcastsd, vbroadcastss, vcvtsd2ss, vcvtsi2sd, vcvtsi2ss, vcvtss2sd, vdivpd, vdivps, vdivsd, vdivss, + vextractf128, vextractps, + vinsertf128, vinsertps, vmaxpd, vmaxps, vmaxsd, vmaxss, vminpd, vminps, vminsd, vminss, vmovapd, vmovaps, - vmovddup, vmovhlps, + vmovddup, vmovhlps, vmovlhps, vmovsd, vmovshdup, vmovsldup, vmovss, @@ -326,6 +334,7 @@ pub const Mnemonic = enum { vpunpckhbw, vpunpckhdq, vpunpckhqdq, vpunpckhwd, vpunpcklbw, vpunpckldq, vpunpcklqdq, vpunpcklwd, vroundpd, vroundps, vroundsd, vroundss, + vshufpd, vshufps, vsqrtpd, vsqrtps, vsqrtsd, vsqrtss, vsubpd, vsubps, vsubsd, vsubss, // F16C diff --git a/src/arch/x86_64/Lower.zig b/src/arch/x86_64/Lower.zig index c32e7fc974..c893429912 100644 --- a/src/arch/x86_64/Lower.zig +++ b/src/arch/x86_64/Lower.zig @@ -300,6 +300,8 @@ fn generic(lower: *Lower, inst: Mir.Inst) Error!void { else .none, }, mnemonic: { + @setEvalBranchQuota(2_000); + comptime var max_len = 0; inline for 
(@typeInfo(Mnemonic).Enum.fields) |field| max_len = @max(field.name.len, max_len); var buf: [max_len]u8 = undefined; diff --git a/src/arch/x86_64/Mir.zig b/src/arch/x86_64/Mir.zig index 0a7b5597b3..18c2903045 100644 --- a/src/arch/x86_64/Mir.zig +++ b/src/arch/x86_64/Mir.zig @@ -256,6 +256,8 @@ pub const Inst = struct { v_sd, /// VEX-Encoded ___ Packed Double-Precision Values v_pd, + /// VEX-Encoded ___ 128-Bits Of Floating-Point Data + v_f128, /// Mask ___ Byte k_b, @@ -454,6 +456,8 @@ pub const Inst = struct { mova, /// Move packed single-precision floating-point values high to low movhl, + /// Move packed single-precision floating-point values low to high + movlh, /// Move unaligned packed single-precision floating-point values /// Move unaligned packed double-precision floating-point values movu, @@ -488,6 +492,9 @@ pub const Inst = struct { cvtsi2sd, /// Convert scalar single-precision floating-point value to scalar double-precision floating-point value cvtss2sd, + /// Packed interleave shuffle of quadruplets of single-precision floating-point values + /// Packed interleave shuffle of pairs of double-precision floating-point values + shuf, /// Shuffle packed high words shufh, /// Shuffle packed low words @@ -520,12 +527,20 @@ pub const Inst = struct { /// Replicate single floating-point values movsldup, + /// Extract packed floating-point values + extract, + /// Insert scalar single-precision floating-point value + /// Insert packed floating-point values + insert, /// Round packed single-precision floating-point values /// Round scalar single-precision floating-point value /// Round packed double-precision floating-point values /// Round scalar double-precision floating-point value round, + /// Load with broadcast floating-point data + broadcast, + /// Convert 16-bit floating-point values to single-precision floating-point values cvtph2ps, /// Convert single-precision floating-point values to 16-bit floating-point values diff --git a/src/arch/x86_64/encodings.zig b/src/arch/x86_64/encodings.zig index 2b9d530c1e..f56f31da7f 100644 --- a/src/arch/x86_64/encodings.zig +++ b/src/arch/x86_64/encodings.zig @@ -867,6 +867,8 @@ pub const table = [_]Entry{ .{ .movhlps, .rm, &.{ .xmm, .xmm }, &.{ 0x0f, 0x12 }, 0, .none, .sse }, + .{ .movlhps, .rm, &.{ .xmm, .xmm }, &.{ 0x0f, 0x16 }, 0, .none, .sse }, + .{ .movss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x10 }, 0, .none, .sse }, .{ .movss, .mr, &.{ .xmm_m32, .xmm }, &.{ 0xf3, 0x0f, 0x11 }, 0, .none, .sse }, @@ -879,14 +881,16 @@ pub const table = [_]Entry{ .{ .orps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x56 }, 0, .none, .sse }, - .{ .subps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x5c }, 0, .none, .sse }, - - .{ .subss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5c }, 0, .none, .sse }, + .{ .shufps, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x0f, 0xc6 }, 0, .none, .sse }, .{ .sqrtps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x51 }, 0, .none, .sse }, .{ .sqrtss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x51 }, 0, .none, .sse }, + .{ .subps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x5c }, 0, .none, .sse }, + + .{ .subss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5c }, 0, .none, .sse }, + .{ .ucomiss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0x0f, 0x2e }, 0, .none, .sse }, .{ .xorps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x57 }, 0, .none, .sse }, @@ -967,6 +971,8 @@ pub const table = [_]Entry{ .{ .punpckldq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x62 }, 0, .none, .sse2 }, .{ .punpcklqdq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x6c }, 0, .none, 
.sse2 }, + .{ .shufpd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0xc6 }, 0, .none, .sse2 }, + .{ .sqrtpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x51 }, 0, .none, .sse2 }, .{ .sqrtsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x51 }, 0, .none, .sse2 }, @@ -990,6 +996,10 @@ pub const table = [_]Entry{ .{ .movsldup, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0xf3, 0x0f, 0x12 }, 0, .none, .sse3 }, // SSE4.1 + .{ .extractps, .mri, &.{ .rm32, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x17 }, 0, .none, .sse4_1 }, + + .{ .insertps, .rmi, &.{ .xmm, .xmm_m32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x21 }, 0, .none, .sse4_1 }, + .{ .pextrb, .mri, &.{ .r32_m8, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x14 }, 0, .none, .sse4_1 }, .{ .pextrd, .mri, &.{ .rm32, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .none, .sse4_1 }, .{ .pextrq, .mri, &.{ .rm64, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .long, .sse4_1 }, @@ -1019,6 +1029,11 @@ pub const table = [_]Entry{ .{ .vaddss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x58 }, 0, .vex_lig_wig, .avx }, + .{ .vbroadcastss, .rm, &.{ .xmm, .m32 }, &.{ 0x66, 0x0f, 0x38, 0x18 }, 0, .vex_128_w0, .avx }, + .{ .vbroadcastss, .rm, &.{ .ymm, .m32 }, &.{ 0x66, 0x0f, 0x38, 0x18 }, 0, .vex_256_w0, .avx }, + .{ .vbroadcastsd, .rm, &.{ .ymm, .m64 }, &.{ 0x66, 0x0f, 0x38, 0x19 }, 0, .vex_256_w0, .avx }, + .{ .vbroadcastf128, .rm, &.{ .ymm, .m128 }, &.{ 0x66, 0x0f, 0x38, 0x1a }, 0, .vex_256_w0, .avx }, + .{ .vcvtsd2ss, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5a }, 0, .vex_lig_wig, .avx }, .{ .vcvtsi2sd, .rvm, &.{ .xmm, .xmm, .rm32 }, &.{ 0xf2, 0x0f, 0x2a }, 0, .vex_lig_w0, .avx }, @@ -1039,6 +1054,14 @@ pub const table = [_]Entry{ .{ .vdivss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5e }, 0, .vex_lig_wig, .avx }, + .{ .vextractf128, .mri, &.{ .xmm_m128, .ymm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x19 }, 0, .vex_256_w0, .avx }, + + .{ .vextractps, .mri, &.{ .rm32, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x17 }, 0, .vex_128_wig, .avx }, + + .{ .vinsertf128, .rvmi, &.{ .ymm, .ymm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x18 }, 0, .vex_256_w0, .avx }, + + .{ .vinsertps, .rvmi, &.{ .xmm, .xmm, .xmm_m32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x21 }, 0, .vex_128_wig, .avx }, + .{ .vmaxpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5f }, 0, .vex_128_wig, .avx }, .{ .vmaxpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x5f }, 0, .vex_256_wig, .avx }, @@ -1074,6 +1097,8 @@ pub const table = [_]Entry{ .{ .vmovhlps, .rvm, &.{ .xmm, .xmm, .xmm }, &.{ 0x0f, 0x12 }, 0, .vex_128_wig, .avx }, + .{ .vmovlhps, .rvm, &.{ .xmm, .xmm, .xmm }, &.{ 0x0f, 0x16 }, 0, .vex_128_wig, .avx }, + .{ .vmovsd, .rvm, &.{ .xmm, .xmm, .xmm }, &.{ 0xf2, 0x0f, 0x10 }, 0, .vex_lig_wig, .avx }, .{ .vmovsd, .rm, &.{ .xmm, .m64 }, &.{ 0xf2, 0x0f, 0x10 }, 0, .vex_lig_wig, .avx }, .{ .vmovsd, .mvr, &.{ .xmm, .xmm, .xmm }, &.{ 0xf2, 0x0f, 0x11 }, 0, .vex_lig_wig, .avx }, @@ -1150,6 +1175,12 @@ pub const table = [_]Entry{ .{ .vroundss, .rvmi, &.{ .xmm, .xmm, .xmm_m32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0a }, 0, .vex_lig_wig, .avx }, + .{ .vshufpd, .rvmi, &.{ .xmm, .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0xc6 }, 0, .vex_128_wig, .avx }, + .{ .vshufpd, .rvmi, &.{ .ymm, .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0xc6 }, 0, .vex_256_wig, .avx }, + + .{ .vshufps, .rvmi, &.{ .xmm, .xmm, .xmm_m128, .imm8 }, &.{ 0x0f, 0xc6 }, 0, .vex_128_wig, .avx }, + .{ .vshufps, .rvmi, &.{ .ymm, .ymm, .ymm_m256, .imm8 }, &.{ 0x0f, 0xc6 }, 0, .vex_256_wig, .avx }, + .{ .vsqrtpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 
0x0f, 0x51 }, 0, .vex_128_wig, .avx }, .{ .vsqrtpd, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x51 }, 0, .vex_256_wig, .avx }, @@ -1201,6 +1232,10 @@ pub const table = [_]Entry{ .{ .vfmadd231ss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0xb9 }, 0, .vex_lig_w0, .fma }, // AVX2 + .{ .vbroadcastss, .rm, &.{ .xmm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x18 }, 0, .vex_128_w0, .avx2 }, + .{ .vbroadcastss, .rm, &.{ .ymm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x18 }, 0, .vex_256_w0, .avx2 }, + .{ .vbroadcastsd, .rm, &.{ .ymm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x19 }, 0, .vex_256_w0, .avx2 }, + .{ .vpsrlw, .rvm, &.{ .ymm, .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd1 }, 0, .vex_256_wig, .avx2 }, .{ .vpsrlw, .vmi, &.{ .ymm, .ymm, .imm8 }, &.{ 0x66, 0x0f, 0x71 }, 2, .vex_256_wig, .avx2 }, .{ .vpsrld, .rvm, &.{ .ymm, .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd2 }, 0, .vex_256_wig, .avx2 }, -- cgit v1.2.3 From e08eab664861461b0adbe7984881f72b5a36a979 Mon Sep 17 00:00:00 2001 From: Jacob Young Date: Sat, 13 May 2023 14:06:26 -0400 Subject: x86_64: add missing encoding feature requirements --- src/arch/x86_64/Encoding.zig | 3 +++ src/arch/x86_64/encodings.zig | 18 +++++++++--------- 2 files changed, 12 insertions(+), 9 deletions(-) (limited to 'src/arch/x86_64/Encoding.zig') diff --git a/src/arch/x86_64/Encoding.zig b/src/arch/x86_64/Encoding.zig index 073128b85e..537a03fa2a 100644 --- a/src/arch/x86_64/Encoding.zig +++ b/src/arch/x86_64/Encoding.zig @@ -691,8 +691,11 @@ pub const Feature = enum { none, avx, avx2, + bmi, f16c, fma, + lzcnt, + popcnt, sse, sse2, sse3, diff --git a/src/arch/x86_64/encodings.zig b/src/arch/x86_64/encodings.zig index f56f31da7f..a7a50867c3 100644 --- a/src/arch/x86_64/encodings.zig +++ b/src/arch/x86_64/encodings.zig @@ -354,9 +354,9 @@ pub const table = [_]Entry{ .{ .lodsd, .np, &.{}, &.{ 0xad }, 0, .none, .none }, .{ .lodsq, .np, &.{}, &.{ 0xad }, 0, .long, .none }, - .{ .lzcnt, .rm, &.{ .r16, .rm16 }, &.{ 0xf3, 0x0f, 0xbd }, 0, .short, .none }, - .{ .lzcnt, .rm, &.{ .r32, .rm32 }, &.{ 0xf3, 0x0f, 0xbd }, 0, .none, .none }, - .{ .lzcnt, .rm, &.{ .r64, .rm64 }, &.{ 0xf3, 0x0f, 0xbd }, 0, .long, .none }, + .{ .lzcnt, .rm, &.{ .r16, .rm16 }, &.{ 0xf3, 0x0f, 0xbd }, 0, .short, .lzcnt }, + .{ .lzcnt, .rm, &.{ .r32, .rm32 }, &.{ 0xf3, 0x0f, 0xbd }, 0, .none, .lzcnt }, + .{ .lzcnt, .rm, &.{ .r64, .rm64 }, &.{ 0xf3, 0x0f, 0xbd }, 0, .long, .lzcnt }, .{ .mfence, .np, &.{}, &.{ 0x0f, 0xae, 0xf0 }, 0, .none, .none }, @@ -482,9 +482,9 @@ pub const table = [_]Entry{ .{ .pop, .m, &.{ .rm16 }, &.{ 0x8f }, 0, .short, .none }, .{ .pop, .m, &.{ .rm64 }, &.{ 0x8f }, 0, .none, .none }, - .{ .popcnt, .rm, &.{ .r16, .rm16 }, &.{ 0xf3, 0x0f, 0xb8 }, 0, .short, .none }, - .{ .popcnt, .rm, &.{ .r32, .rm32 }, &.{ 0xf3, 0x0f, 0xb8 }, 0, .none, .none }, - .{ .popcnt, .rm, &.{ .r64, .rm64 }, &.{ 0xf3, 0x0f, 0xb8 }, 0, .long, .none }, + .{ .popcnt, .rm, &.{ .r16, .rm16 }, &.{ 0xf3, 0x0f, 0xb8 }, 0, .short, .popcnt }, + .{ .popcnt, .rm, &.{ .r32, .rm32 }, &.{ 0xf3, 0x0f, 0xb8 }, 0, .none, .popcnt }, + .{ .popcnt, .rm, &.{ .r64, .rm64 }, &.{ 0xf3, 0x0f, 0xb8 }, 0, .long, .popcnt }, .{ .push, .o, &.{ .r16 }, &.{ 0x50 }, 0, .short, .none }, .{ .push, .o, &.{ .r64 }, &.{ 0x50 }, 0, .none, .none }, @@ -784,9 +784,9 @@ pub const table = [_]Entry{ .{ .@"test", .mr, &.{ .rm32, .r32 }, &.{ 0x85 }, 0, .none, .none }, .{ .@"test", .mr, &.{ .rm64, .r64 }, &.{ 0x85 }, 0, .long, .none }, - .{ .tzcnt, .rm, &.{ .r16, .rm16 }, &.{ 0xf3, 0x0f, 0xbc }, 0, .short, .none }, - .{ .tzcnt, .rm, &.{ .r32, .rm32 }, &.{ 0xf3, 0x0f, 0xbc }, 0, 
.none, .none }, - .{ .tzcnt, .rm, &.{ .r64, .rm64 }, &.{ 0xf3, 0x0f, 0xbc }, 0, .long, .none }, + .{ .tzcnt, .rm, &.{ .r16, .rm16 }, &.{ 0xf3, 0x0f, 0xbc }, 0, .short, .bmi }, + .{ .tzcnt, .rm, &.{ .r32, .rm32 }, &.{ 0xf3, 0x0f, 0xbc }, 0, .none, .bmi }, + .{ .tzcnt, .rm, &.{ .r64, .rm64 }, &.{ 0xf3, 0x0f, 0xbc }, 0, .long, .bmi }, .{ .ud2, .np, &.{}, &.{ 0x0f, 0x0b }, 0, .none, .none }, -- cgit v1.2.3 From b6d61028508c5b1e1961a124bc17d4d9bda9686f Mon Sep 17 00:00:00 2001 From: Jacob Young Date: Sat, 13 May 2023 18:06:16 -0400 Subject: x86_64: reimplement `@floatToInt` --- src/arch/x86_64/CodeGen.zig | 181 +++++++++++++++++++++--------------------- src/arch/x86_64/Encoding.zig | 159 ++++++++++++++++++++----------------- src/arch/x86_64/Mir.zig | 46 +++++++++-- src/arch/x86_64/bits.zig | 83 +++++++++---------- src/arch/x86_64/encodings.zig | 111 ++++++++++++++++++++++---- src/link/Dwarf.zig | 92 +++++++++++++++------ test/behavior/cast.zig | 1 - 7 files changed, 420 insertions(+), 253 deletions(-) (limited to 'src/arch/x86_64/Encoding.zig') diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig index e4f28e34cf..e5c6925596 100644 --- a/src/arch/x86_64/CodeGen.zig +++ b/src/arch/x86_64/CodeGen.zig @@ -2501,12 +2501,12 @@ fn airFptrunc(self: *Self, inst: Air.Inst.Index) !void { } } else if (src_bits == 64 and dst_bits == 32) { if (self.hasFeature(.avx)) if (src_mcv.isMemory()) try self.asmRegisterRegisterMemory( - .{ .v_, .cvtsd2ss }, + .{ .v_ss, .cvtsd2 }, dst_reg, dst_reg, src_mcv.mem(.qword), ) else try self.asmRegisterRegisterRegister( - .{ .v_, .cvtsd2ss }, + .{ .v_ss, .cvtsd2 }, dst_reg, dst_reg, (if (src_mcv.isRegister()) @@ -2514,11 +2514,11 @@ fn airFptrunc(self: *Self, inst: Air.Inst.Index) !void { else try self.copyToTmpRegister(src_ty, src_mcv)).to128(), ) else if (src_mcv.isMemory()) try self.asmRegisterMemory( - .{ ._, .cvtsd2ss }, + .{ ._ss, .cvtsd2 }, dst_reg, src_mcv.mem(.qword), ) else try self.asmRegisterRegister( - .{ ._, .cvtsd2ss }, + .{ ._ss, .cvtsd2 }, dst_reg, (if (src_mcv.isRegister()) src_mcv.getReg().? @@ -2552,22 +2552,22 @@ fn airFpext(self: *Self, inst: Air.Inst.Index) !void { src_mcv.getReg().? else try self.copyToTmpRegister(src_ty, src_mcv); - try self.asmRegisterRegister(.{ .v_, .cvtph2ps }, dst_reg, mat_src_reg.to128()); + try self.asmRegisterRegister(.{ .v_ps, .cvtph2 }, dst_reg, mat_src_reg.to128()); switch (dst_bits) { 32 => {}, - 64 => try self.asmRegisterRegisterRegister(.{ .v_, .cvtss2sd }, dst_reg, dst_reg, dst_reg), + 64 => try self.asmRegisterRegisterRegister(.{ .v_sd, .cvtss2 }, dst_reg, dst_reg, dst_reg), else => return self.fail("TODO implement airFpext from {} to {}", .{ src_ty.fmt(self.bin_file.options.module.?), dst_ty.fmt(self.bin_file.options.module.?), }), } } else if (src_bits == 32 and dst_bits == 64) { if (self.hasFeature(.avx)) if (src_mcv.isMemory()) try self.asmRegisterRegisterMemory( - .{ .v_, .cvtss2sd }, + .{ .v_sd, .cvtss2 }, dst_reg, dst_reg, src_mcv.mem(.dword), ) else try self.asmRegisterRegisterRegister( - .{ .v_, .cvtss2sd }, + .{ .v_sd, .cvtss2 }, dst_reg, dst_reg, (if (src_mcv.isRegister()) @@ -2575,11 +2575,11 @@ fn airFpext(self: *Self, inst: Air.Inst.Index) !void { else try self.copyToTmpRegister(src_ty, src_mcv)).to128(), ) else if (src_mcv.isMemory()) try self.asmRegisterMemory( - .{ ._, .cvtss2sd }, + .{ ._sd, .cvtss2 }, dst_reg, src_mcv.mem(.dword), ) else try self.asmRegisterRegister( - .{ ._, .cvtss2sd }, + .{ ._sd, .cvtss2 }, dst_reg, (if (src_mcv.isRegister()) src_mcv.getReg().? 
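The pair renames above (`.{ ._, .cvtsd2ss }` becoming `.{ ._ss, .cvtsd2 }`, and so on) lean on the `Fixes` wildcard scheme from the earlier lowering rewrite: during lowering, the instruction tag is spliced into the `_` of the fixes name to rebuild the full mnemonic. A minimal standalone sketch of that substitution — `composeMnemonic` is a hypothetical helper mirroring the logic in `generic`, not compiler code:

const std = @import("std");

const Fixes = enum { @"_", _ss, _sd, v_ss, v_sd };
const Tag = enum { add, cvtsd2, cvtsi2, mov };

fn composeMnemonic(buf: []u8, fixes: Fixes, tag: Tag) ![]u8 {
    const fixes_name = @tagName(fixes);
    // Strip an optional space-separated prefix such as "lock" or "rep".
    const pattern = fixes_name[if (std.mem.indexOfScalar(u8, fixes_name, ' ')) |i| i + 1 else 0..];
    // Splice the tag into the '_' wildcard: "_ss" + "cvtsd2" => "cvtsd2ss".
    const wildcard_i = std.mem.indexOfScalar(u8, pattern, '_').?;
    return std.fmt.bufPrint(buf, "{s}{s}{s}", .{
        pattern[0..wildcard_i], @tagName(tag), pattern[wildcard_i + 1 ..],
    });
}

test "fixes + tag => mnemonic" {
    var buf: [16]u8 = undefined;
    try std.testing.expectEqualStrings("cvtsd2ss", try composeMnemonic(&buf, ._ss, .cvtsd2));
    try std.testing.expectEqualStrings("vcvtsi2sd", try composeMnemonic(&buf, .v_sd, .cvtsi2));
    try std.testing.expectEqualStrings("mov", try composeMnemonic(&buf, .@"_", .mov));
}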
@@ -4789,7 +4789,6 @@ fn genRound(self: *Self, ty: Type, dst_reg: Register, src_mcv: MCValue, mode: u4 })) |tag| tag else return self.fail("TODO implement genRound for {}", .{ ty.fmt(self.bin_file.options.module.?), }); - const abi_size = @intCast(u32, ty.abiSize(self.target.*)); const dst_alias = registerAlias(dst_reg, abi_size); switch (mir_tag[0]) { @@ -4848,7 +4847,7 @@ fn airSqrt(self: *Self, inst: Air.Inst.Index) !void { src_mcv.getReg().? else try self.copyToTmpRegister(ty, src_mcv); - try self.asmRegisterRegister(.{ .v_, .cvtph2ps }, dst_reg, mat_src_reg.to128()); + try self.asmRegisterRegister(.{ .v_ps, .cvtph2 }, dst_reg, mat_src_reg.to128()); try self.asmRegisterRegisterRegister(.{ .v_ss, .sqrt }, dst_reg, dst_reg, dst_reg); try self.asmRegisterRegisterImmediate( .{ .v_, .cvtps2ph }, @@ -4868,7 +4867,7 @@ fn airSqrt(self: *Self, inst: Air.Inst.Index) !void { 16 => if (self.hasFeature(.f16c)) switch (ty.vectorLen()) { 1 => { try self.asmRegisterRegister( - .{ .v_, .cvtph2ps }, + .{ .v_ps, .cvtph2 }, dst_reg, (if (src_mcv.isRegister()) src_mcv.getReg().? @@ -4892,13 +4891,13 @@ fn airSqrt(self: *Self, inst: Air.Inst.Index) !void { 2...8 => { const wide_reg = registerAlias(dst_reg, abi_size * 2); if (src_mcv.isMemory()) try self.asmRegisterMemory( - .{ .v_, .cvtph2ps }, + .{ .v_ps, .cvtph2 }, wide_reg, src_mcv.mem(Memory.PtrSize.fromSize( @intCast(u32, @divExact(wide_reg.bitSize(), 16)), )), ) else try self.asmRegisterRegister( - .{ .v_, .cvtph2ps }, + .{ .v_ps, .cvtph2 }, wide_reg, (if (src_mcv.isRegister()) src_mcv.getReg().? @@ -6347,7 +6346,7 @@ fn genBinOp( else try self.copyToTmpRegister(rhs_ty, src_mcv)).to128(), ); - try self.asmRegisterRegister(.{ .v_, .cvtph2ps }, dst_reg, dst_reg); + try self.asmRegisterRegister(.{ .v_ps, .cvtph2 }, dst_reg, dst_reg); try self.asmRegisterRegister(.{ .v_, .movshdup }, tmp_reg, dst_reg); try self.asmRegisterRegisterRegister( switch (air_tag) { @@ -6424,7 +6423,7 @@ fn genBinOp( else try self.copyToTmpRegister(rhs_ty, src_mcv)).to128(), ); - try self.asmRegisterRegister(.{ .v_, .cvtph2ps }, dst_reg, dst_reg); + try self.asmRegisterRegister(.{ .v_ps, .cvtph2 }, dst_reg, dst_reg); try self.asmRegisterRegister(.{ .v_, .movshdup }, tmp_reg, dst_reg); try self.asmRegisterRegisterRegister( switch (air_tag) { @@ -6467,7 +6466,7 @@ fn genBinOp( else try self.copyToTmpRegister(rhs_ty, src_mcv)).to128(), ); - try self.asmRegisterRegister(.{ .v_, .cvtph2ps }, dst_reg, dst_reg); + try self.asmRegisterRegister(.{ .v_ps, .cvtph2 }, dst_reg, dst_reg); try self.asmRegisterRegisterRegister( .{ .v_ps, .movhl }, tmp_reg, @@ -6501,13 +6500,13 @@ fn genBinOp( const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg); defer self.register_manager.unlockReg(tmp_lock); - try self.asmRegisterRegister(.{ .v_, .cvtph2ps }, dst_reg, dst_reg); + try self.asmRegisterRegister(.{ .v_ps, .cvtph2 }, dst_reg, dst_reg); if (src_mcv.isMemory()) try self.asmRegisterMemory( - .{ .v_, .cvtph2ps }, + .{ .v_ps, .cvtph2 }, tmp_reg, src_mcv.mem(.qword), ) else try self.asmRegisterRegister( - .{ .v_, .cvtph2ps }, + .{ .v_ps, .cvtph2 }, tmp_reg, (if (src_mcv.isRegister()) src_mcv.getReg().? 
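The `cvtph2`/`cvtps2ph` hunks here follow the usual f16 legalization strategy: x86_64 has no native f16 arithmetic, so with F16C the value is widened to f32 (vcvtph2ps), the operation runs at single precision, and the result is narrowed back (vcvtps2ph). In Zig source terms the emitted sequence amounts to roughly the following — a hedged illustration assuming an F16C-capable target, with `addF16` as a made-up stand-in:

const std = @import("std");

// What the vcvtph2ps / vaddss / vcvtps2ph sequence computes.
fn addF16(a: f16, b: f16) f16 {
    const wa = @floatCast(f32, a); // vcvtph2ps
    const wb = @floatCast(f32, b);
    return @floatCast(f16, wa + wb); // vaddss, then vcvtps2ph
}

test "f16 add via f32 widening" {
    try std.testing.expectEqual(@as(f16, 3.0), addF16(1.5, 1.5));
}

The single-precision detour is sound because f32's 24-bit significand meets the 2p+2 bound (p = 11 for f16) under which double rounding of an addition is innocuous, so the narrowed result is the correctly rounded f16 sum.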
@@ -6541,13 +6540,13 @@ fn genBinOp( const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg); defer self.register_manager.unlockReg(tmp_lock); - try self.asmRegisterRegister(.{ .v_, .cvtph2ps }, dst_reg.to256(), dst_reg); + try self.asmRegisterRegister(.{ .v_ps, .cvtph2 }, dst_reg.to256(), dst_reg); if (src_mcv.isMemory()) try self.asmRegisterMemory( - .{ .v_, .cvtph2ps }, + .{ .v_ps, .cvtph2 }, tmp_reg, src_mcv.mem(.xword), ) else try self.asmRegisterRegister( - .{ .v_, .cvtph2ps }, + .{ .v_ps, .cvtph2 }, tmp_reg, (if (src_mcv.isRegister()) src_mcv.getReg().? @@ -7199,13 +7198,13 @@ fn genArgDbgInfo(self: Self, ty: Type, name: [:0]const u8, mcv: MCValue) !void { switch (self.debug_output) { .dwarf => |dw| { const loc: link.File.Dwarf.DeclState.DbgInfoLoc = switch (mcv) { - .register => |reg| .{ .register = reg.dwarfLocOp() }, + .register => |reg| .{ .register = reg.dwarfNum() }, // TODO use a frame index .load_frame => return, //.stack_offset => |off| .{ // .stack = .{ // // TODO handle -fomit-frame-pointer - // .fp_register = Register.rbp.dwarfLocOpDeref(), + // .fp_register = Register.rbp.dwarfNum(), // .offset = -off, // }, //}, @@ -7237,11 +7236,11 @@ fn genVarDbgInfo( switch (self.debug_output) { .dwarf => |dw| { const loc: link.File.Dwarf.DeclState.DbgInfoLoc = switch (mcv) { - .register => |reg| .{ .register = reg.dwarfLocOp() }, + .register => |reg| .{ .register = reg.dwarfNum() }, // TODO use a frame index .load_frame, .lea_frame => return, //=> |off| .{ .stack = .{ - // .fp_register = Register.rbp.dwarfLocOpDeref(), + // .fp_register = Register.rbp.dwarfNum(), // .offset = -off, //} }, .memory => |address| .{ .memory = address }, @@ -7595,7 +7594,7 @@ fn airCmp(self: *Self, inst: Air.Inst.Index, op: math.CompareOperator) !void { else try self.copyToTmpRegister(ty, src_mcv)).to128(), ); - try self.asmRegisterRegister(.{ .v_, .cvtph2ps }, tmp1_reg, tmp1_reg); + try self.asmRegisterRegister(.{ .v_ps, .cvtph2 }, tmp1_reg, tmp1_reg); try self.asmRegisterRegister(.{ .v_, .movshdup }, tmp2_reg, tmp1_reg); try self.genBinOpMir(.{ ._ss, .ucomi }, ty, tmp1_mcv, tmp2_mcv); } else return self.fail("TODO implement airCmp for {}", .{ @@ -8862,14 +8861,14 @@ fn genSetReg(self: *Self, dst_reg: Register, ty: Type, src_mcv: MCValue) InnerEr } }, .register => |src_reg| if (dst_reg.id() != src_reg.id()) try self.asmRegisterRegister( - if ((dst_reg.class() == .floating_point) == (src_reg.class() == .floating_point)) + if ((dst_reg.class() == .sse) == (src_reg.class() == .sse)) switch (ty.zigTypeTag()) { else => .{ ._, .mov }, .Float, .Vector => .{ ._ps, .mova }, } else switch (abi_size) { 2 => return try self.asmRegisterRegisterImmediate( - if (dst_reg.class() == .floating_point) .{ .p_w, .insr } else .{ .p_w, .extr }, + if (dst_reg.class() == .sse) .{ .p_w, .insr } else .{ .p_w, .extr }, registerAlias(dst_reg, 4), registerAlias(src_reg, 4), Immediate.u(0), @@ -9222,7 +9221,7 @@ fn genInlineMemcpyRegisterRegister( try self.asmMemoryRegister( switch (src_reg.class()) { .general_purpose, .segment => .{ ._, .mov }, - .floating_point => .{ ._ss, .mov }, + .sse => .{ ._ss, .mov }, }, Memory.sib(Memory.PtrSize.fromSize(abi_size), .{ .base = dst_reg, .disp = -offset }), registerAlias(src_reg, abi_size), @@ -9388,10 +9387,10 @@ fn airIntToFloat(self: *Self, inst: Air.Inst.Index) !void { }); const src_mcv = try self.resolveInst(ty_op.operand); - const src_reg = switch (src_mcv) { - .register => |reg| reg, - else => try self.copyToTmpRegister(src_ty, src_mcv), - }; + const src_reg = if 
(src_mcv.isRegister()) + src_mcv.getReg().? + else + try self.copyToTmpRegister(src_ty, src_mcv); const src_lock = self.register_manager.lockRegAssumeUnused(src_reg); defer self.register_manager.unlockReg(src_lock); @@ -9402,23 +9401,23 @@ fn airIntToFloat(self: *Self, inst: Air.Inst.Index) !void { const dst_lock = self.register_manager.lockRegAssumeUnused(dst_reg); defer self.register_manager.unlockReg(dst_lock); - try self.asmRegisterRegister(switch (dst_ty.floatBits(self.target.*)) { - 32 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse)) - .{ ._, .cvtsi2ss } - else - return self.fail("TODO implement airIntToFloat from {} to {} without sse", .{ - src_ty.fmt(self.bin_file.options.module.?), dst_ty.fmt(self.bin_file.options.module.?), - }), - 64 => if (Target.x86.featureSetHas(self.target.cpu.features, .sse2)) - .{ ._, .cvtsi2sd } - else - return self.fail("TODO implement airIntToFloat from {} to {} without sse2", .{ - src_ty.fmt(self.bin_file.options.module.?), dst_ty.fmt(self.bin_file.options.module.?), - }), - else => return self.fail("TODO implement airIntToFloat from {} to {}", .{ - src_ty.fmt(self.bin_file.options.module.?), dst_ty.fmt(self.bin_file.options.module.?), - }), - }, dst_reg.to128(), registerAlias(src_reg, src_size)); + const mir_tag = if (@as(?Mir.Inst.FixedTag, switch (dst_ty.zigTypeTag()) { + .Float => switch (dst_ty.floatBits(self.target.*)) { + 32 => if (self.hasFeature(.avx)) .{ .v_ss, .cvtsi2 } else .{ ._ss, .cvtsi2 }, + 64 => if (self.hasFeature(.avx)) .{ .v_sd, .cvtsi2 } else .{ ._sd, .cvtsi2 }, + 16, 80, 128 => null, + else => unreachable, + }, + else => null, + })) |tag| tag else return self.fail("TODO implement airIntToFloat from {} to {}", .{ + src_ty.fmt(self.bin_file.options.module.?), dst_ty.fmt(self.bin_file.options.module.?), + }); + const dst_alias = dst_reg.to128(); + const src_alias = registerAlias(src_reg, src_size); + switch (mir_tag[0]) { + .v_ss, .v_sd => try self.asmRegisterRegisterRegister(mir_tag, dst_alias, dst_alias, src_alias), + else => try self.asmRegisterRegister(mir_tag, dst_alias, src_alias), + } return self.finishAir(inst, dst_mcv, .{ ty_op.operand, .none, .none }); } @@ -9428,46 +9427,50 @@ fn airFloatToInt(self: *Self, inst: Air.Inst.Index) !void { const src_ty = self.air.typeOf(ty_op.operand); const dst_ty = self.air.typeOfIndex(inst); - const operand = try self.resolveInst(ty_op.operand); - const src_abi_size = @intCast(u32, src_ty.abiSize(self.target.*)); - const dst_abi_size = @intCast(u32, dst_ty.abiSize(self.target.*)); + const dst_bits = @intCast(u32, dst_ty.bitSize(self.target.*)); + const dst_signedness = + if (dst_ty.isAbiInt()) dst_ty.intInfo(self.target.*).signedness else .unsigned; - switch (src_abi_size) { - 4, 8 => {}, - else => |size| return self.fail("TODO load ST(0) with abiSize={}", .{size}), - } - if (dst_abi_size > 8) { - return self.fail("TODO convert float with abiSize={}", .{dst_abi_size}); - } + const dst_size = std.math.divCeil(u32, @max(switch (dst_signedness) { + .signed => dst_bits, + .unsigned => dst_bits + 1, + }, 32), 8) catch unreachable; + if (dst_size > 8) return self.fail("TODO implement airFloatToInt from {} to {}", .{ + src_ty.fmt(self.bin_file.options.module.?), dst_ty.fmt(self.bin_file.options.module.?), + }); - // move float src to ST(0) - const frame_addr: FrameAddr = switch (operand) { - .load_frame => |frame_addr| frame_addr, - else => frame_addr: { - const frame_index = try self.allocFrameIndex(FrameAlloc.initType(src_ty, self.target.*)); - try self.genSetMem(.{ .frame = 
frame_index }, 0, src_ty, operand); - break :frame_addr .{ .index = frame_index }; - }, - }; - try self.asmMemory( - .{ .f_, .ld }, - Memory.sib(Memory.PtrSize.fromSize(src_abi_size), .{ - .base = .{ .frame = frame_addr.index }, - .disp = frame_addr.off, - }), - ); + const src_mcv = try self.resolveInst(ty_op.operand); + const src_reg = if (src_mcv.isRegister()) + src_mcv.getReg().? + else + try self.copyToTmpRegister(src_ty, src_mcv); + const src_lock = self.register_manager.lockRegAssumeUnused(src_reg); + defer self.register_manager.unlockReg(src_lock); - // convert - const stack_dst = try self.allocRegOrMem(inst, false); - try self.asmMemory( - .{ .f_p, .istt }, - Memory.sib(Memory.PtrSize.fromSize(dst_abi_size), .{ - .base = .{ .frame = stack_dst.load_frame.index }, - .disp = stack_dst.load_frame.off, + const dst_reg = try self.register_manager.allocReg(inst, regClassForType(dst_ty)); + const dst_mcv = MCValue{ .register = dst_reg }; + const dst_lock = self.register_manager.lockRegAssumeUnused(dst_reg); + defer self.register_manager.unlockReg(dst_lock); + + try self.asmRegisterRegister( + if (@as(?Mir.Inst.FixedTag, switch (src_ty.zigTypeTag()) { + .Float => switch (src_ty.floatBits(self.target.*)) { + 32 => if (self.hasFeature(.avx)) .{ .v_, .cvttss2si } else .{ ._, .cvttss2si }, + 64 => if (self.hasFeature(.avx)) .{ .v_, .cvttsd2si } else .{ ._, .cvttsd2si }, + 16, 80, 128 => null, + else => unreachable, + }, + else => null, + })) |tag| tag else return self.fail("TODO implement airFloatToInt from {} to {}", .{ + src_ty.fmt(self.bin_file.options.module.?), dst_ty.fmt(self.bin_file.options.module.?), }), + registerAlias(dst_reg, dst_size), + src_reg.to128(), ); - return self.finishAir(inst, stack_dst, .{ ty_op.operand, .none, .none }); + if (dst_bits < dst_size * 8) try self.truncateRegister(dst_ty, dst_reg); + + return self.finishAir(inst, dst_mcv, .{ ty_op.operand, .none, .none }); } fn airCmpxchg(self: *Self, inst: Air.Inst.Index) !void { @@ -10997,13 +11000,13 @@ fn registerAlias(reg: Register, size_bytes: u32) Register { reg.to64() else unreachable, - .floating_point => if (size_bytes <= 16) + .segment, .x87, .mmx => unreachable, + .sse => if (size_bytes <= 16) reg.to128() else if (size_bytes <= 32) reg.to256() else unreachable, - .segment => unreachable, }; } diff --git a/src/arch/x86_64/Encoding.zig b/src/arch/x86_64/Encoding.zig index 537a03fa2a..66a249a3f2 100644 --- a/src/arch/x86_64/Encoding.zig +++ b/src/arch/x86_64/Encoding.zig @@ -233,7 +233,6 @@ pub const Mnemonic = enum { cmpxchg, cmpxchg8b, cmpxchg16b, cqo, cwd, cwde, div, - fisttp, fld, idiv, imul, int3, ja, jae, jb, jbe, jc, jrcxz, je, jg, jge, jl, jle, jna, jnae, jnb, jnbe, jnc, jne, jng, jnge, jnl, jnle, jno, jnp, jns, jnz, jo, jp, jpe, jpo, js, jz, @@ -259,6 +258,8 @@ pub const Mnemonic = enum { @"test", tzcnt, ud2, xadd, xchg, xor, + // X87 + fisttp, fld, // MMX movd, // SSE @@ -266,7 +267,7 @@ pub const Mnemonic = enum { andps, andnps, cmpss, - cvtsi2ss, + cvtpi2ps, cvtps2pi, cvtsi2ss, cvtss2si, cvttps2pi, cvttss2si, divps, divss, maxps, maxss, minps, minss, @@ -285,7 +286,9 @@ pub const Mnemonic = enum { andpd, andnpd, //cmpsd, - cvtsd2ss, cvtsi2sd, cvtss2sd, + cvtdq2pd, cvtdq2ps, cvtpd2dq, cvtpd2pi, cvtpd2ps, cvtpi2pd, + cvtps2dq, cvtps2pd, cvtsd2si, cvtsd2ss, cvtsi2sd, cvtss2sd, + cvttpd2dq, cvttpd2pi, cvttps2dq, cvttsd2si, divpd, divsd, maxpd, maxsd, minpd, minsd, @@ -314,7 +317,10 @@ pub const Mnemonic = enum { // AVX vaddpd, vaddps, vaddsd, vaddss, vbroadcastf128, vbroadcastsd, vbroadcastss, - vcvtsd2ss, 
vcvtsi2sd, vcvtsi2ss, vcvtss2sd, + vcvtdq2pd, vcvtdq2ps, vcvtpd2dq, vcvtpd2ps, + vcvtps2dq, vcvtps2pd, vcvtsd2si, vcvtsd2ss, + vcvtsi2sd, vcvtsi2ss, vcvtss2sd, vcvtss2si, + vcvttpd2dq, vcvttps2dq, vcvttsd2si, vcvttss2si, vdivpd, vdivps, vdivsd, vdivss, vextractf128, vextractps, vinsertf128, vinsertps, @@ -377,80 +383,84 @@ pub const Op = enum { m, moffs, sreg, + st, mm, mm_m64, xmm, xmm_m32, xmm_m64, xmm_m128, ymm, ymm_m256, // zig fmt: on pub fn fromOperand(operand: Instruction.Operand) Op { - switch (operand) { - .none => return .none, - - .reg => |reg| { - switch (reg.class()) { - .segment => return .sreg, - .floating_point => return switch (reg.bitSize()) { - 128 => .xmm, - 256 => .ymm, + return switch (operand) { + .none => .none, + + .reg => |reg| switch (reg.class()) { + .general_purpose => if (reg.to64() == .rax) + switch (reg) { + .al => .al, + .ax => .ax, + .eax => .eax, + .rax => .rax, else => unreachable, - }, - .general_purpose => { - if (reg.to64() == .rax) return switch (reg) { - .al => .al, - .ax => .ax, - .eax => .eax, - .rax => .rax, - else => unreachable, - }; - if (reg == .cl) return .cl; - return switch (reg.bitSize()) { - 8 => .r8, - 16 => .r16, - 32 => .r32, - 64 => .r64, - else => unreachable, - }; - }, - } + } + else if (reg == .cl) + .cl + else switch (reg.bitSize()) { + 8 => .r8, + 16 => .r16, + 32 => .r32, + 64 => .r64, + else => unreachable, + }, + .segment => .sreg, + .x87 => .st, + .mmx => .mm, + .sse => switch (reg.bitSize()) { + 128 => .xmm, + 256 => .ymm, + else => unreachable, + }, }, .mem => |mem| switch (mem) { - .moffs => return .moffs, - .sib, .rip => { - const bit_size = mem.bitSize(); - return switch (bit_size) { - 8 => .m8, - 16 => .m16, - 32 => .m32, - 64 => .m64, - 80 => .m80, - 128 => .m128, - 256 => .m256, - else => unreachable, - }; + .moffs => .moffs, + .sib, .rip => switch (mem.bitSize()) { + 8 => .m8, + 16 => .m16, + 32 => .m32, + 64 => .m64, + 80 => .m80, + 128 => .m128, + 256 => .m256, + else => unreachable, }, }, - .imm => |imm| { - switch (imm) { - .signed => |x| { - if (x == 1) return .unity; - if (math.cast(i8, x)) |_| return .imm8s; - if (math.cast(i16, x)) |_| return .imm16s; - return .imm32s; - }, - .unsigned => |x| { - if (x == 1) return .unity; - if (math.cast(i8, x)) |_| return .imm8s; - if (math.cast(u8, x)) |_| return .imm8; - if (math.cast(i16, x)) |_| return .imm16s; - if (math.cast(u16, x)) |_| return .imm16; - if (math.cast(i32, x)) |_| return .imm32s; - if (math.cast(u32, x)) |_| return .imm32; - return .imm64; - }, - } + .imm => |imm| switch (imm) { + .signed => |x| if (x == 1) + .unity + else if (math.cast(i8, x)) |_| + .imm8s + else if (math.cast(i16, x)) |_| + .imm16s + else + .imm32s, + .unsigned => |x| if (x == 1) + .unity + else if (math.cast(i8, x)) |_| + .imm8s + else if (math.cast(u8, x)) |_| + .imm8 + else if (math.cast(i16, x)) |_| + .imm16s + else if (math.cast(u16, x)) |_| + .imm16 + else if (math.cast(i32, x)) |_| + .imm32s + else if (math.cast(u32, x)) |_| + .imm32 + else + .imm64, }, - } + }; } pub fn immBitSize(op: Op) u64 { @@ -460,6 +470,7 @@ pub const Op = enum { .ax, .r16, .rm16 => unreachable, .eax, .r32, .rm32, .r32_m16 => unreachable, .rax, .r64, .rm64, .r64_m16 => unreachable, + .st, .mm, .mm_m64 => unreachable, .xmm, .xmm_m32, .xmm_m64, .xmm_m128 => unreachable, .ymm, .ymm_m256 => unreachable, .m8, .m16, .m32, .m64, .m80, .m128, .m256 => unreachable, @@ -480,7 +491,8 @@ pub const Op = enum { .al, .cl, .r8, .rm8 => 8, .ax, .r16, .rm16 => 16, .eax, .r32, .rm32, .r32_m8, .r32_m16 => 32, - .rax, 
.r64, .rm64, .r64_m16 => 64, + .rax, .r64, .rm64, .r64_m16, .mm, .mm_m64 => 64, + .st => 80, .xmm, .xmm_m32, .xmm_m64, .xmm_m128 => 128, .ymm, .ymm_m256 => 256, }; @@ -491,11 +503,11 @@ pub const Op = enum { .none, .o16, .o32, .o64, .moffs, .m, .sreg => unreachable, .unity, .imm8, .imm8s, .imm16, .imm16s, .imm32, .imm32s, .imm64 => unreachable, .rel8, .rel16, .rel32 => unreachable, - .al, .cl, .r8, .ax, .r16, .eax, .r32, .rax, .r64, .xmm, .ymm => unreachable, + .al, .cl, .r8, .ax, .r16, .eax, .r32, .rax, .r64, .st, .mm, .xmm, .ymm => unreachable, .m8, .rm8, .r32_m8 => 8, .m16, .rm16, .r32_m16, .r64_m16 => 16, .m32, .rm32, .xmm_m32 => 32, - .m64, .rm64, .xmm_m64 => 64, + .m64, .rm64, .mm_m64, .xmm_m64 => 64, .m80 => 80, .m128, .xmm_m128 => 128, .m256, .ymm_m256 => 256, @@ -522,6 +534,7 @@ pub const Op = enum { .r8, .r16, .r32, .r64, .rm8, .rm16, .rm32, .rm64, .r32_m8, .r32_m16, .r64_m16, + .st, .mm, .mm_m64, .xmm, .xmm_m32, .xmm_m64, .xmm_m128, .ymm, .ymm_m256, => true, @@ -550,6 +563,7 @@ pub const Op = enum { .r32_m8, .r32_m16, .r64_m16, .m8, .m16, .m32, .m64, .m80, .m128, .m256, .m, + .mm_m64, .xmm_m32, .xmm_m64, .xmm_m128, .ymm_m256, => true, @@ -573,8 +587,10 @@ pub const Op = enum { .rm8, .rm16, .rm32, .rm64 => .general_purpose, .r32_m8, .r32_m16, .r64_m16 => .general_purpose, .sreg => .segment, - .xmm, .xmm_m32, .xmm_m64, .xmm_m128 => .floating_point, - .ymm, .ymm_m256 => .floating_point, + .st => .x87, + .mm, .mm_m64 => .mmx, + .xmm, .xmm_m32, .xmm_m64, .xmm_m128 => .sse, + .ymm, .ymm_m256 => .sse, }; } @@ -695,6 +711,7 @@ pub const Feature = enum { f16c, fma, lzcnt, + movbe, popcnt, sse, sse2, @@ -717,7 +734,7 @@ fn estimateInstructionLength(prefix: Prefix, encoding: Encoding, ops: []const Op } const mnemonic_to_encodings_map = init: { - @setEvalBranchQuota(20_000); + @setEvalBranchQuota(25_000); const encodings = @import("encodings.zig"); var entries = encodings.table; std.sort.sort(encodings.Entry, &entries, {}, struct { diff --git a/src/arch/x86_64/Mir.zig b/src/arch/x86_64/Mir.zig index f26bf97e82..ef8bbe07b3 100644 --- a/src/arch/x86_64/Mir.zig +++ b/src/arch/x86_64/Mir.zig @@ -439,8 +439,21 @@ pub const Inst = struct { /// Bitwise logical and not of packed single-precision floating-point values /// Bitwise logical and not of packed double-precision floating-point values andn, + /// Convert packed doubleword integers to packed single-precision floating-point values + /// Convert packed doubleword integers to packed double-precision floating-point values + cvtpi2, + /// Convert packed single-precision floating-point values to packed doubleword integers + cvtps2pi, /// Convert doubleword integer to scalar single-precision floating-point value - cvtsi2ss, + /// Convert doubleword integer to scalar double-precision floating-point value + cvtsi2, + /// Convert scalar single-precision floating-point value to doubleword integer + cvtss2si, + /// Convert with truncation packed single-precision floating-point values to packed doubleword integers + cvttps2pi, + /// Convert with truncation scalar single-precision floating-point value to doubleword integer + cvttss2si, + /// Maximum of packed single-precision floating-point values /// Maximum of scalar single-precision floating-point values /// Maximum of packed double-precision floating-point values @@ -486,12 +499,33 @@ pub const Inst = struct { /// Unpack and interleave low packed double-precision floating-point values unpckl, + /// Convert packed doubleword integers to packed single-precision floating-point values + /// Convert packed 
doubleword integers to packed double-precision floating-point values + cvtdq2, + /// Convert packed double-precision floating-point values to packed doubleword integers + cvtpd2dq, + /// Convert packed double-precision floating-point values to packed doubleword integers + cvtpd2pi, + /// Convert packed double-precision floating-point values to packed single-precision floating-point values + cvtpd2, + /// Convert packed single-precision floating-point values to packed doubleword integers + cvtps2dq, + /// Convert packed single-precision floating-point values to packed double-precision floating-point values + cvtps2, + /// Convert scalar double-precision floating-point value to doubleword integer + cvtsd2si, /// Convert scalar double-precision floating-point value to scalar single-precision floating-point value - cvtsd2ss, - /// Convert doubleword integer to scalar double-precision floating-point value - cvtsi2sd, + cvtsd2, /// Convert scalar single-precision floating-point value to scalar double-precision floating-point value - cvtss2sd, + cvtss2, + /// Convert with truncation packed double-precision floating-point values to packed doubleword integers + cvttpd2dq, + /// Convert with truncation packed double-precision floating-point values to packed doubleword integers + cvttpd2pi, + /// Convert with truncation packed single-precision floating-point values to packed doubleword integers + cvttps2dq, + /// Convert with truncation scalar double-precision floating-point value to doubleword integer + cvttsd2si, /// Packed interleave shuffle of quadruplets of single-precision floating-point values /// Packed interleave shuffle of pairs of double-precision floating-point values shuf, @@ -542,7 +576,7 @@ pub const Inst = struct { broadcast, /// Convert 16-bit floating-point values to single-precision floating-point values - cvtph2ps, + cvtph2, /// Convert single-precision floating-point values to 16-bit floating-point values cvtps2ph, diff --git a/src/arch/x86_64/bits.zig b/src/arch/x86_64/bits.zig index 3343f280b9..923ba31266 100644 --- a/src/arch/x86_64/bits.zig +++ b/src/arch/x86_64/bits.zig @@ -175,15 +175,21 @@ pub const Register = enum(u7) { xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, + mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, + + st0, st1, st2, st3, st4, st5, st6, st7, + es, cs, ss, ds, fs, gs, none, // zig fmt: on - pub const Class = enum(u2) { + pub const Class = enum { general_purpose, - floating_point, segment, + x87, + mmx, + sse, }; pub fn class(reg: Register) Class { @@ -195,8 +201,10 @@ pub const Register = enum(u7) { @enumToInt(Register.al) ... @enumToInt(Register.r15b) => .general_purpose, @enumToInt(Register.ah) ... @enumToInt(Register.bh) => .general_purpose, - @enumToInt(Register.ymm0) ... @enumToInt(Register.ymm15) => .floating_point, - @enumToInt(Register.xmm0) ... @enumToInt(Register.xmm15) => .floating_point, + @enumToInt(Register.ymm0) ... @enumToInt(Register.ymm15) => .sse, + @enumToInt(Register.xmm0) ... @enumToInt(Register.xmm15) => .sse, + @enumToInt(Register.mm0) ... @enumToInt(Register.mm7) => .mmx, + @enumToInt(Register.st0) ... @enumToInt(Register.st7) => .x87, @enumToInt(Register.es) ... @enumToInt(Register.gs) => .segment, @@ -216,8 +224,10 @@ pub const Register = enum(u7) { @enumToInt(Register.ymm0) ... @enumToInt(Register.ymm15) => @enumToInt(Register.ymm0) - 16, @enumToInt(Register.xmm0) ... @enumToInt(Register.xmm15) => @enumToInt(Register.xmm0) - 16, + @enumToInt(Register.mm0) ... 
@enumToInt(Register.mm7) => @enumToInt(Register.mm0) - 32, + @enumToInt(Register.st0) ... @enumToInt(Register.st7) => @enumToInt(Register.st0) - 40, - @enumToInt(Register.es) ... @enumToInt(Register.gs) => @enumToInt(Register.es) - 32, + @enumToInt(Register.es) ... @enumToInt(Register.gs) => @enumToInt(Register.es) - 48, else => unreachable, // zig fmt: on @@ -236,6 +246,8 @@ pub const Register = enum(u7) { @enumToInt(Register.ymm0) ... @enumToInt(Register.ymm15) => 256, @enumToInt(Register.xmm0) ... @enumToInt(Register.xmm15) => 128, + @enumToInt(Register.mm0) ... @enumToInt(Register.mm7) => 64, + @enumToInt(Register.st0) ... @enumToInt(Register.st7) => 80, @enumToInt(Register.es) ... @enumToInt(Register.gs) => 16, @@ -271,6 +283,8 @@ pub const Register = enum(u7) { @enumToInt(Register.ymm0) ... @enumToInt(Register.ymm15) => @enumToInt(Register.ymm0), @enumToInt(Register.xmm0) ... @enumToInt(Register.xmm15) => @enumToInt(Register.xmm0), + @enumToInt(Register.mm0) ... @enumToInt(Register.mm7) => @enumToInt(Register.mm0), + @enumToInt(Register.st0) ... @enumToInt(Register.st7) => @enumToInt(Register.st0), @enumToInt(Register.es) ... @enumToInt(Register.gs) => @enumToInt(Register.es), @@ -326,8 +340,8 @@ pub const Register = enum(u7) { return @intToEnum(Register, @enumToInt(reg) - reg.gpBase() + @enumToInt(Register.al)); } - fn fpBase(reg: Register) u7 { - assert(reg.class() == .floating_point); + fn sseBase(reg: Register) u7 { + assert(reg.class() == .sse); return switch (@enumToInt(reg)) { @enumToInt(Register.ymm0)...@enumToInt(Register.ymm15) => @enumToInt(Register.ymm0), @enumToInt(Register.xmm0)...@enumToInt(Register.xmm15) => @enumToInt(Register.xmm0), @@ -336,49 +350,24 @@ pub const Register = enum(u7) { } pub fn to256(reg: Register) Register { - return @intToEnum(Register, @enumToInt(reg) - reg.fpBase() + @enumToInt(Register.ymm0)); + return @intToEnum(Register, @enumToInt(reg) - reg.sseBase() + @enumToInt(Register.ymm0)); } pub fn to128(reg: Register) Register { - return @intToEnum(Register, @enumToInt(reg) - reg.fpBase() + @enumToInt(Register.xmm0)); - } - - pub fn dwarfLocOp(reg: Register) u8 { - return switch (reg.class()) { - .general_purpose => switch (reg.to64()) { - .rax => DW.OP.reg0, - .rdx => DW.OP.reg1, - .rcx => DW.OP.reg2, - .rbx => DW.OP.reg3, - .rsi => DW.OP.reg4, - .rdi => DW.OP.reg5, - .rbp => DW.OP.reg6, - .rsp => DW.OP.reg7, - else => @intCast(u8, @enumToInt(reg) - reg.gpBase()) + DW.OP.reg0, - }, - .floating_point => @intCast(u8, @enumToInt(reg) - reg.fpBase()) + DW.OP.reg17, - else => unreachable, - }; + return @intToEnum(Register, @enumToInt(reg) - reg.sseBase() + @enumToInt(Register.xmm0)); } - /// DWARF encodings that push a value onto the DWARF stack that is either - /// the contents of a register or the result of adding the contents a given - /// register to a given signed offset. 
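// A minimal, self-contained sketch of the idea behind the finer register
// classes above (x87, mmx, sse replacing the old catch-all floating_point):
// codegen can dispatch on class instead of on ad-hoc bit-size checks. The
// names below are invented for illustration and are not the compiler's API:
const std = @import("std");

const Class = enum { general_purpose, segment, x87, mmx, sse };

fn moveMnemonic(class: Class) []const u8 {
    return switch (class) {
        .general_purpose, .segment => "mov", // plain GPR/segment move
        .x87 => "fld", // x87 data moves go through the register stack
        .mmx => "movq", // 64-bit MMX move
        .sse => "movdqa", // aligned 128-bit SSE move
    };
}

test "per-class move selection (sketch)" {
    try std.testing.expectEqualStrings("movdqa", moveMnemonic(.sse));
    try std.testing.expectEqualStrings("fld", moveMnemonic(.x87));
}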
- pub fn dwarfLocOpDeref(reg: Register) u8 { + /// DWARF register encoding + pub fn dwarfNum(reg: Register) u6 { return switch (reg.class()) { - .general_purpose => switch (reg.to64()) { - .rax => DW.OP.breg0, - .rdx => DW.OP.breg1, - .rcx => DW.OP.breg2, - .rbx => DW.OP.breg3, - .rsi => DW.OP.breg4, - .rdi => DW.OP.breg5, - .rbp => DW.OP.breg6, - .rsp => DW.OP.breg7, - else => @intCast(u8, @enumToInt(reg) - reg.gpBase()) + DW.OP.breg0, - }, - .floating_point => @intCast(u8, @enumToInt(reg) - reg.fpBase()) + DW.OP.breg17, - else => unreachable, + .general_purpose => if (reg.isExtended()) + reg.enc() + else + @truncate(u3, @as(u24, 0o54673120) >> @as(u5, reg.enc()) * 3), + .sse => 17 + @as(u6, reg.enc()), + .x87 => 33 + @as(u6, reg.enc()), + .mmx => 41 + @as(u6, reg.enc()), + .segment => 50 + @as(u6, reg.enc()), }; } }; @@ -392,6 +381,8 @@ test "Register id - different classes" { try expect(Register.ymm0.id() == 0b10000); try expect(Register.ymm0.id() != Register.rax.id()); try expect(Register.xmm0.id() == Register.ymm0.id()); + try expect(Register.xmm0.id() != Register.mm0.id()); + try expect(Register.mm0.id() != Register.st0.id()); try expect(Register.es.id() == 0b100000); } @@ -407,7 +398,9 @@ test "Register enc - different classes" { test "Register classes" { try expect(Register.r11.class() == .general_purpose); - try expect(Register.ymm11.class() == .floating_point); + try expect(Register.ymm11.class() == .sse); + try expect(Register.mm3.class() == .mmx); + try expect(Register.st3.class() == .x87); try expect(Register.fs.class() == .segment); } diff --git a/src/arch/x86_64/encodings.zig b/src/arch/x86_64/encodings.zig index a7a50867c3..3383315bd6 100644 --- a/src/arch/x86_64/encodings.zig +++ b/src/arch/x86_64/encodings.zig @@ -272,14 +272,6 @@ pub const table = [_]Entry{ .{ .div, .m, &.{ .rm32 }, &.{ 0xf7 }, 6, .none, .none }, .{ .div, .m, &.{ .rm64 }, &.{ 0xf7 }, 6, .long, .none }, - .{ .fisttp, .m, &.{ .m16 }, &.{ 0xdf }, 1, .none, .x87 }, - .{ .fisttp, .m, &.{ .m32 }, &.{ 0xdb }, 1, .none, .x87 }, - .{ .fisttp, .m, &.{ .m64 }, &.{ 0xdd }, 1, .none, .x87 }, - - .{ .fld, .m, &.{ .m32 }, &.{ 0xd9 }, 0, .none, .x87 }, - .{ .fld, .m, &.{ .m64 }, &.{ 0xdd }, 0, .none, .x87 }, - .{ .fld, .m, &.{ .m80 }, &.{ 0xdb }, 5, .none, .x87 }, - .{ .idiv, .m, &.{ .rm8 }, &.{ 0xf6 }, 7, .none, .none }, .{ .idiv, .m, &.{ .rm8 }, &.{ 0xf6 }, 7, .rex, .none }, .{ .idiv, .m, &.{ .rm16 }, &.{ 0xf7 }, 7, .short, .none }, @@ -395,12 +387,12 @@ pub const table = [_]Entry{ .{ .mov, .mi, &.{ .rm32, .imm32 }, &.{ 0xc7 }, 0, .none, .none }, .{ .mov, .mi, &.{ .rm64, .imm32s }, &.{ 0xc7 }, 0, .long, .none }, - .{ .movbe, .rm, &.{ .r16, .m16 }, &.{ 0x0f, 0x38, 0xf0 }, 0, .short, .none }, - .{ .movbe, .rm, &.{ .r32, .m32 }, &.{ 0x0f, 0x38, 0xf0 }, 0, .none, .none }, - .{ .movbe, .rm, &.{ .r64, .m64 }, &.{ 0x0f, 0x38, 0xf0 }, 0, .long, .none }, - .{ .movbe, .mr, &.{ .m16, .r16 }, &.{ 0x0f, 0x38, 0xf1 }, 0, .short, .none }, - .{ .movbe, .mr, &.{ .m32, .r32 }, &.{ 0x0f, 0x38, 0xf1 }, 0, .none, .none }, - .{ .movbe, .mr, &.{ .m64, .r64 }, &.{ 0x0f, 0x38, 0xf1 }, 0, .long, .none }, + .{ .movbe, .rm, &.{ .r16, .m16 }, &.{ 0x0f, 0x38, 0xf0 }, 0, .short, .movbe }, + .{ .movbe, .rm, &.{ .r32, .m32 }, &.{ 0x0f, 0x38, 0xf0 }, 0, .none, .movbe }, + .{ .movbe, .rm, &.{ .r64, .m64 }, &.{ 0x0f, 0x38, 0xf0 }, 0, .long, .movbe }, + .{ .movbe, .mr, &.{ .m16, .r16 }, &.{ 0x0f, 0x38, 0xf1 }, 0, .short, .movbe }, + .{ .movbe, .mr, &.{ .m32, .r32 }, &.{ 0x0f, 0x38, 0xf1 }, 0, .none, .movbe }, + .{ .movbe, .mr, &.{ .m64, .r64 }, &.{ 0x0f, 
0x38, 0xf1 }, 0, .long, .movbe }, .{ .movs, .np, &.{ .m8, .m8 }, &.{ 0xa4 }, 0, .none, .none }, .{ .movs, .np, &.{ .m16, .m16 }, &.{ 0xa5 }, 0, .short, .none }, @@ -836,6 +828,15 @@ pub const table = [_]Entry{ .{ .xor, .rm, &.{ .r32, .rm32 }, &.{ 0x33 }, 0, .none, .none }, .{ .xor, .rm, &.{ .r64, .rm64 }, &.{ 0x33 }, 0, .long, .none }, + // X87 + .{ .fisttp, .m, &.{ .m16 }, &.{ 0xdf }, 1, .none, .x87 }, + .{ .fisttp, .m, &.{ .m32 }, &.{ 0xdb }, 1, .none, .x87 }, + .{ .fisttp, .m, &.{ .m64 }, &.{ 0xdd }, 1, .none, .x87 }, + + .{ .fld, .m, &.{ .m32 }, &.{ 0xd9 }, 0, .none, .x87 }, + .{ .fld, .m, &.{ .m64 }, &.{ 0xdd }, 0, .none, .x87 }, + .{ .fld, .m, &.{ .m80 }, &.{ 0xdb }, 5, .none, .x87 }, + // SSE .{ .addps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x58 }, 0, .none, .sse }, @@ -847,9 +848,21 @@ pub const table = [_]Entry{ .{ .cmpss, .rmi, &.{ .xmm, .xmm_m32, .imm8 }, &.{ 0xf3, 0x0f, 0xc2 }, 0, .none, .sse }, + .{ .cvtpi2ps, .rm, &.{ .xmm, .mm_m64 }, &.{ 0x0f, 0x2a }, 0, .none, .sse }, + + .{ .cvtps2pi, .rm, &.{ .mm, .xmm_m64 }, &.{ 0x0f, 0x2d }, 0, .none, .sse }, + .{ .cvtsi2ss, .rm, &.{ .xmm, .rm32 }, &.{ 0xf3, 0x0f, 0x2a }, 0, .none, .sse }, .{ .cvtsi2ss, .rm, &.{ .xmm, .rm64 }, &.{ 0xf3, 0x0f, 0x2a }, 0, .long, .sse }, + .{ .cvtss2si, .rm, &.{ .r32, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x2d }, 0, .none, .sse }, + .{ .cvtss2si, .rm, &.{ .r64, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x2d }, 0, .long, .sse }, + + .{ .cvttps2pi, .rm, &.{ .mm, .xmm_m64 }, &.{ 0x0f, 0x2c }, 0, .none, .sse }, + + .{ .cvttss2si, .rm, &.{ .r32, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x2c }, 0, .none, .sse }, + .{ .cvttss2si, .rm, &.{ .r64, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x2c }, 0, .long, .sse }, + .{ .divps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x5e }, 0, .none, .sse }, .{ .divss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5e }, 0, .none, .sse }, @@ -906,6 +919,25 @@ pub const table = [_]Entry{ .{ .cmpsd, .rmi, &.{ .xmm, .xmm_m64, .imm8 }, &.{ 0xf2, 0x0f, 0xc2 }, 0, .none, .sse2 }, + .{ .cvtdq2pd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf3, 0x0f, 0xe6 }, 0, .none, .sse2 }, + + .{ .cvtdq2ps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x5b }, 0, .none, .sse2 }, + + .{ .cvtpd2dq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0xf2, 0x0f, 0xe6 }, 0, .none, .sse2 }, + + .{ .cvtpd2pi, .rm, &.{ .mm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x2d }, 0, .none, .sse2 }, + + .{ .cvtpd2ps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5a }, 0, .none, .sse2 }, + + .{ .cvtpi2pd, .rm, &.{ .xmm, .mm_m64 }, &.{ 0x66, 0x0f, 0x2a }, 0, .none, .sse2 }, + + .{ .cvtps2dq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5b }, 0, .none, .sse2 }, + + .{ .cvtps2pd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x0f, 0x5a }, 0, .none, .sse2 }, + + .{ .cvtsd2si, .rm, &.{ .r32, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x2d }, 0, .none, .sse2 }, + .{ .cvtsd2si, .rm, &.{ .r64, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x2d }, 0, .long, .sse2 }, + .{ .cvtsd2ss, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5a }, 0, .none, .sse2 }, .{ .cvtsi2sd, .rm, &.{ .xmm, .rm32 }, &.{ 0xf2, 0x0f, 0x2a }, 0, .none, .sse2 }, @@ -913,6 +945,15 @@ pub const table = [_]Entry{ .{ .cvtss2sd, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5a }, 0, .none, .sse2 }, + .{ .cvttpd2dq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xe6 }, 0, .none, .sse2 }, + + .{ .cvttpd2pi, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x2c }, 0, .none, .sse2 }, + + .{ .cvttps2dq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0xf3, 0x0f, 0x5b }, 0, .none, .sse2 }, + + .{ .cvttsd2si, .rm, &.{ .r32, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x2c }, 0, .none, .sse2 }, + .{ .cvttsd2si, .rm, &.{ .r64, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x2c }, 
0, .long, .sse2 }, + .{ .divpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5e }, 0, .none, .sse2 }, .{ .divsd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5e }, 0, .none, .sse2 }, @@ -1034,15 +1075,51 @@ pub const table = [_]Entry{ .{ .vbroadcastsd, .rm, &.{ .ymm, .m64 }, &.{ 0x66, 0x0f, 0x38, 0x19 }, 0, .vex_256_w0, .avx }, .{ .vbroadcastf128, .rm, &.{ .ymm, .m128 }, &.{ 0x66, 0x0f, 0x38, 0x1a }, 0, .vex_256_w0, .avx }, + .{ .vcvtdq2pd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf3, 0x0f, 0xe6 }, 0, .vex_128_wig, .avx }, + .{ .vcvtdq2pd, .rm, &.{ .ymm, .xmm_m128 }, &.{ 0xf3, 0x0f, 0xe6 }, 0, .vex_256_wig, .avx }, + + .{ .vcvtdq2ps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x5b }, 0, .vex_128_wig, .avx }, + .{ .vcvtdq2ps, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x0f, 0x5b }, 0, .vex_256_wig, .avx }, + + .{ .vcvtpd2dq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0xf2, 0x0f, 0xe6 }, 0, .vex_128_wig, .avx }, + .{ .vcvtpd2dq, .rm, &.{ .xmm, .ymm_m256 }, &.{ 0xf2, 0x0f, 0xe6 }, 0, .vex_256_wig, .avx }, + + .{ .vcvtpd2ps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5a }, 0, .vex_128_wig, .avx }, + .{ .vcvtpd2ps, .rm, &.{ .xmm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x5a }, 0, .vex_256_wig, .avx }, + + .{ .vcvtps2dq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5b }, 0, .vex_128_wig, .avx }, + .{ .vcvtps2dq, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x5b }, 0, .vex_256_wig, .avx }, + + .{ .vcvtps2pd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x0f, 0x5a }, 0, .vex_128_wig, .avx }, + .{ .vcvtps2pd, .rm, &.{ .ymm, .xmm_m128 }, &.{ 0x0f, 0x5a }, 0, .vex_256_wig, .avx }, + + .{ .vcvtsd2si, .rm, &.{ .r32, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x2d }, 0, .vex_lig_w0, .sse2 }, + .{ .vcvtsd2si, .rm, &.{ .r64, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x2d }, 0, .vex_lig_w1, .sse2 }, + .{ .vcvtsd2ss, .rvm, &.{ .xmm, .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x5a }, 0, .vex_lig_wig, .avx }, .{ .vcvtsi2sd, .rvm, &.{ .xmm, .xmm, .rm32 }, &.{ 0xf2, 0x0f, 0x2a }, 0, .vex_lig_w0, .avx }, .{ .vcvtsi2sd, .rvm, &.{ .xmm, .xmm, .rm64 }, &.{ 0xf2, 0x0f, 0x2a }, 0, .vex_lig_w1, .avx }, - .{ .vcvtsi2ss, .rvm, &.{ .xmm, .xmm, .rm32 }, &.{ 0xf2, 0x0f, 0x2a }, 0, .vex_lig_w0, .avx }, - .{ .vcvtsi2ss, .rvm, &.{ .xmm, .xmm, .rm64 }, &.{ 0xf2, 0x0f, 0x2a }, 0, .vex_lig_w1, .avx }, + .{ .vcvtsi2ss, .rvm, &.{ .xmm, .xmm, .rm32 }, &.{ 0xf3, 0x0f, 0x2a }, 0, .vex_lig_w0, .avx }, + .{ .vcvtsi2ss, .rvm, &.{ .xmm, .xmm, .rm64 }, &.{ 0xf3, 0x0f, 0x2a }, 0, .vex_lig_w1, .avx }, + + .{ .vcvtss2sd, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5a }, 0, .vex_lig_wig, .avx }, + + .{ .vcvtss2si, .rm, &.{ .r32, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x2d }, 0, .vex_lig_w0, .avx }, + .{ .vcvtss2si, .rm, &.{ .r64, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x2d }, 0, .vex_lig_w1, .avx }, + + .{ .vcvttpd2dq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xe6 }, 0, .vex_128_wig, .avx }, + .{ .vcvttpd2dq, .rm, &.{ .xmm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xe6 }, 0, .vex_256_wig, .avx }, + + .{ .vcvttps2dq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0xf3, 0x0f, 0x5b }, 0, .vex_128_wig, .avx }, + .{ .vcvttps2dq, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0xf3, 0x0f, 0x5b }, 0, .vex_256_wig, .avx }, + + .{ .vcvttsd2si, .rm, &.{ .r32, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x2c }, 0, .vex_lig_w0, .sse2 }, + .{ .vcvttsd2si, .rm, &.{ .r64, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x2c }, 0, .vex_lig_w1, .sse2 }, - .{ .vcvtss2sd, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf2, 0x0f, 0x5a }, 0, .vex_lig_wig, .avx }, + .{ .vcvttss2si, .rm, &.{ .r32, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x2c }, 0, .vex_lig_w0, .avx }, + .{ .vcvttss2si, .rm, &.{ .r64, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x2c }, 0, .vex_lig_w1, .avx }, .{ 
.vdivpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x5e }, 0, .vex_128_wig, .avx }, .{ .vdivpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x5e }, 0, .vex_256_wig, .avx }, diff --git a/src/link/Dwarf.zig b/src/link/Dwarf.zig index c134f60316..1a064049fc 100644 --- a/src/link/Dwarf.zig +++ b/src/link/Dwarf.zig @@ -608,23 +608,44 @@ pub const DeclState = struct { switch (loc) { .register => |reg| { - try dbg_info.ensureUnusedCapacity(3); + try dbg_info.ensureUnusedCapacity(4); dbg_info.appendAssumeCapacity(@enumToInt(AbbrevKind.parameter)); - dbg_info.appendSliceAssumeCapacity(&[2]u8{ // DW.AT.location, DW.FORM.exprloc - 1, // ULEB128 dwarf expression length - reg, - }); + // DW.AT.location, DW.FORM.exprloc + var expr_len = std.io.countingWriter(std.io.null_writer); + if (reg < 32) { + expr_len.writer().writeByte(DW.OP.reg0 + reg) catch unreachable; + } else { + expr_len.writer().writeByte(DW.OP.regx) catch unreachable; + leb128.writeULEB128(expr_len.writer(), reg) catch unreachable; + } + leb128.writeULEB128(dbg_info.writer(), expr_len.bytes_written) catch unreachable; + if (reg < 32) { + dbg_info.appendAssumeCapacity(DW.OP.reg0 + reg); + } else { + dbg_info.appendAssumeCapacity(DW.OP.regx); + leb128.writeULEB128(dbg_info.writer(), reg) catch unreachable; + } }, .stack => |info| { - try dbg_info.ensureUnusedCapacity(8); + try dbg_info.ensureUnusedCapacity(9); dbg_info.appendAssumeCapacity(@enumToInt(AbbrevKind.parameter)); - const fixup = dbg_info.items.len; - dbg_info.appendSliceAssumeCapacity(&[2]u8{ // DW.AT.location, DW.FORM.exprloc - 1, // we will backpatch it after we encode the displacement in LEB128 - info.fp_register, // frame pointer - }); + // DW.AT.location, DW.FORM.exprloc + var expr_len = std.io.countingWriter(std.io.null_writer); + if (info.fp_register < 32) { + expr_len.writer().writeByte(DW.OP.breg0 + info.fp_register) catch unreachable; + } else { + expr_len.writer().writeByte(DW.OP.bregx) catch unreachable; + leb128.writeULEB128(expr_len.writer(), info.fp_register) catch unreachable; + } + leb128.writeILEB128(expr_len.writer(), info.offset) catch unreachable; + leb128.writeULEB128(dbg_info.writer(), expr_len.bytes_written) catch unreachable; + if (info.fp_register < 32) { + dbg_info.appendAssumeCapacity(DW.OP.breg0 + info.fp_register); + } else { + dbg_info.appendAssumeCapacity(DW.OP.bregx); + leb128.writeULEB128(dbg_info.writer(), info.fp_register) catch unreachable; + } leb128.writeILEB128(dbg_info.writer(), info.offset) catch unreachable; - dbg_info.items[fixup] += @intCast(u8, dbg_info.items.len - fixup - 2); }, .wasm_local => |value| { const leb_size = link.File.Wasm.getULEB128Size(value); @@ -670,22 +691,45 @@ pub const DeclState = struct { switch (loc) { .register => |reg| { - try dbg_info.ensureUnusedCapacity(2); - dbg_info.appendSliceAssumeCapacity(&[2]u8{ // DW.AT.location, DW.FORM.exprloc - 1, // ULEB128 dwarf expression length - reg, - }); + try dbg_info.ensureUnusedCapacity(4); + dbg_info.appendAssumeCapacity(@enumToInt(AbbrevKind.parameter)); + // DW.AT.location, DW.FORM.exprloc + var expr_len = std.io.countingWriter(std.io.null_writer); + if (reg < 32) { + expr_len.writer().writeByte(DW.OP.reg0 + reg) catch unreachable; + } else { + expr_len.writer().writeByte(DW.OP.regx) catch unreachable; + leb128.writeULEB128(expr_len.writer(), reg) catch unreachable; + } + leb128.writeULEB128(dbg_info.writer(), expr_len.bytes_written) catch unreachable; + if (reg < 32) { + dbg_info.appendAssumeCapacity(DW.OP.reg0 + reg); + } else { + 
dbg_info.appendAssumeCapacity(DW.OP.regx); + leb128.writeULEB128(dbg_info.writer(), reg) catch unreachable; + } }, .stack => |info| { - try dbg_info.ensureUnusedCapacity(7); - const fixup = dbg_info.items.len; - dbg_info.appendSliceAssumeCapacity(&[2]u8{ // DW.AT.location, DW.FORM.exprloc - 1, // we will backpatch it after we encode the displacement in LEB128 - info.fp_register, - }); + try dbg_info.ensureUnusedCapacity(9); + dbg_info.appendAssumeCapacity(@enumToInt(AbbrevKind.parameter)); + // DW.AT.location, DW.FORM.exprloc + var expr_len = std.io.countingWriter(std.io.null_writer); + if (info.fp_register < 32) { + expr_len.writer().writeByte(DW.OP.breg0 + info.fp_register) catch unreachable; + } else { + expr_len.writer().writeByte(DW.OP.bregx) catch unreachable; + leb128.writeULEB128(expr_len.writer(), info.fp_register) catch unreachable; + } + leb128.writeILEB128(expr_len.writer(), info.offset) catch unreachable; + leb128.writeULEB128(dbg_info.writer(), expr_len.bytes_written) catch unreachable; + if (info.fp_register < 32) { + dbg_info.appendAssumeCapacity(DW.OP.breg0 + info.fp_register); + } else { + dbg_info.appendAssumeCapacity(DW.OP.bregx); + leb128.writeULEB128(dbg_info.writer(), info.fp_register) catch unreachable; + } leb128.writeILEB128(dbg_info.writer(), info.offset) catch unreachable; - dbg_info.items[fixup] += @intCast(u8, dbg_info.items.len - fixup - 2); }, .wasm_local => |value| { diff --git a/test/behavior/cast.zig b/test/behavior/cast.zig index 20f84184a0..d6717032ff 100644 --- a/test/behavior/cast.zig +++ b/test/behavior/cast.zig @@ -153,7 +153,6 @@ test "@intToFloat(f80)" { test "@floatToInt" { if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO - if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest; -- cgit v1.2.3 From 6c6d8d67cfe14c50684c04a579c1e62bf287e8cb Mon Sep 17 00:00:00 2001 From: Jacob Young Date: Sun, 14 May 2023 05:12:46 -0400 Subject: x86_64: redo movement, float negation, and `@fabs` --- src/arch/x86_64/CodeGen.zig | 357 ++++++++++++++++++++++++----------- src/arch/x86_64/Encoding.zig | 18 +- src/arch/x86_64/Mir.zig | 12 ++ src/arch/x86_64/encodings.zig | 75 +++++++- src/type.zig | 12 +- test/behavior/floatop.zig | 1 - test/behavior/math.zig | 1 - test/behavior/translate_c_macros.zig | 1 - 8 files changed, 359 insertions(+), 118 deletions(-) (limited to 'src/arch/x86_64/Encoding.zig') diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig index e5c6925596..80f537e046 100644 --- a/src/arch/x86_64/CodeGen.zig +++ b/src/arch/x86_64/CodeGen.zig @@ -4681,61 +4681,136 @@ fn airBitReverse(self: *Self, inst: Air.Inst.Index) !void { } fn airFloatSign(self: *Self, inst: Air.Inst.Index) !void { + const tag = self.air.instructions.items(.tag)[inst]; const un_op = self.air.instructions.items(.data)[inst].un_op; const ty = self.air.typeOf(un_op); - const ty_bits = ty.floatBits(self.target.*); + const abi_size: u32 = switch (ty.abiSize(self.target.*)) { + 1...16 => 16, + 17...32 => 32, + else => return self.fail("TODO implement airFloatSign for {}", .{ + ty.fmt(self.bin_file.options.module.?), + }), + }; + const scalar_bits = ty.scalarType().floatBits(self.target.*); + + const src_mcv = try 
self.resolveInst(un_op); + const src_lock = if (src_mcv.getReg()) |reg| self.register_manager.lockReg(reg) else null; + defer if (src_lock) |lock| self.register_manager.unlockReg(lock); + + const dst_mcv: MCValue = if (src_mcv.isRegister() and self.reuseOperand(inst, un_op, 0, src_mcv)) + src_mcv + else if (self.hasFeature(.avx)) + .{ .register = try self.register_manager.allocReg(inst, sse) } + else + try self.copyToRegisterWithInstTracking(inst, ty, src_mcv); + const dst_reg = dst_mcv.getReg().?; + const dst_lock = self.register_manager.lockReg(dst_reg); + defer if (dst_lock) |lock| self.register_manager.unlockReg(lock); var arena = std.heap.ArenaAllocator.init(self.gpa); defer arena.deinit(); - const ExpectedContents = union { - f16: Value.Payload.Float_16, - f32: Value.Payload.Float_32, - f64: Value.Payload.Float_64, - f80: Value.Payload.Float_80, - f128: Value.Payload.Float_128, + const ExpectedContents = struct { + scalar: union { + i64: Value.Payload.I64, + big: struct { + limbs: [ + @max( + std.math.big.int.Managed.default_capacity, + std.math.big.int.calcTwosCompLimbCount(128), + ) + ]std.math.big.Limb, + pl: Value.Payload.BigInt, + }, + }, + repeated: Value.Payload.SubValue, }; var stack align(@alignOf(ExpectedContents)) = std.heap.stackFallback(@sizeOf(ExpectedContents), arena.allocator()); + var int_pl = Type.Payload.Bits{ + .base = .{ .tag = .int_signed }, + .data = scalar_bits, + }; var vec_pl = Type.Payload.Array{ .base = .{ .tag = .vector }, .data = .{ - .len = @divExact(128, ty_bits), - .elem_type = ty, + .len = @divExact(abi_size * 8, scalar_bits), + .elem_type = Type.initPayload(&int_pl.base), }, }; const vec_ty = Type.initPayload(&vec_pl.base); - - var sign_pl = Value.Payload.SubValue{ - .base = .{ .tag = .repeated }, - .data = try Value.floatToValue(-0.0, stack.get(), ty, self.target.*), + const sign_val = switch (tag) { + .neg => try vec_ty.minInt(stack.get(), self.target.*), + .fabs => try vec_ty.maxInt(stack.get(), self.target.*), + else => unreachable, }; - const sign_val = Value.initPayload(&sign_pl.base); const sign_mcv = try self.genTypedValue(.{ .ty = vec_ty, .val = sign_val }); - - const src_mcv = try self.resolveInst(un_op); - const dst_mcv = if (src_mcv.isRegister() and self.reuseOperand(inst, un_op, 0, src_mcv)) - src_mcv + const sign_mem = if (sign_mcv.isMemory()) + sign_mcv.mem(Memory.PtrSize.fromSize(abi_size)) else - try self.copyToRegisterWithInstTracking(inst, ty, src_mcv); - const dst_lock = self.register_manager.lockReg(dst_mcv.register); - defer if (dst_lock) |lock| self.register_manager.unlockReg(lock); + Memory.sib(Memory.PtrSize.fromSize(abi_size), .{ + .base = .{ .reg = try self.copyToTmpRegister(Type.usize, sign_mcv.address()) }, + }); - const tag = self.air.instructions.items(.tag)[inst]; - try self.genBinOpMir(switch (ty_bits) { - // No point using an extra prefix byte for *pd which performs the same operation. 
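// A minimal sketch of why the integer min/max values built above are the
// right bit patterns: minInt of a same-width signed integer has only the
// sign bit set (the xor mask for negation), and maxInt has every bit except
// the sign bit set (the and mask for @fabs). Plain Zig, no compiler
// internals assumed:
const std = @import("std");

test "float sign masks from integer min/max (sketch)" {
    const neg_mask = @bitCast(u64, @as(i64, std.math.minInt(i64))); // 0x8000...0
    const abs_mask = @bitCast(u64, @as(i64, std.math.maxInt(i64))); // 0x7fff...f
    try std.testing.expectEqual(@as(u64, 1) << 63, neg_mask);
    try std.testing.expectEqual(~neg_mask, abs_mask);

    const bits = @bitCast(u64, @as(f64, -1.5));
    try std.testing.expectEqual(@as(f64, 1.5), @bitCast(f64, bits & abs_mask)); // fabs
    try std.testing.expectEqual(@as(f64, 1.5), @bitCast(f64, bits ^ neg_mask)); // neg
}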
- 16, 32, 64, 128 => switch (tag) { - .neg => .{ ._ps, .xor }, - .fabs => .{ ._ps, .andn }, + if (self.hasFeature(.avx)) try self.asmRegisterRegisterMemory( + switch (scalar_bits) { + 16, 128 => if (abi_size <= 16 or self.hasFeature(.avx2)) switch (tag) { + .neg => .{ .vp_, .xor }, + .fabs => .{ .vp_, .@"and" }, + else => unreachable, + } else switch (tag) { + .neg => .{ .v_ps, .xor }, + .fabs => .{ .v_ps, .@"and" }, + else => unreachable, + }, + 32 => switch (tag) { + .neg => .{ .v_ps, .xor }, + .fabs => .{ .v_ps, .@"and" }, + else => unreachable, + }, + 64 => switch (tag) { + .neg => .{ .v_pd, .xor }, + .fabs => .{ .v_pd, .@"and" }, + else => unreachable, + }, + 80 => return self.fail("TODO implement airFloatSign for {}", .{ + ty.fmt(self.bin_file.options.module.?), + }), else => unreachable, }, - 80 => return self.fail("TODO implement airFloatSign for {}", .{ - ty.fmt(self.bin_file.options.module.?), - }), - else => unreachable, - }, vec_ty, dst_mcv, sign_mcv); + registerAlias(dst_reg, abi_size), + registerAlias(if (src_mcv.isRegister()) + src_mcv.getReg().? + else + try self.copyToTmpRegister(ty, src_mcv), abi_size), + sign_mem, + ) else try self.asmRegisterMemory( + switch (scalar_bits) { + 16, 128 => switch (tag) { + .neg => .{ .p_, .xor }, + .fabs => .{ .p_, .@"and" }, + else => unreachable, + }, + 32 => switch (tag) { + .neg => .{ ._ps, .xor }, + .fabs => .{ ._ps, .@"and" }, + else => unreachable, + }, + 64 => switch (tag) { + .neg => .{ ._pd, .xor }, + .fabs => .{ ._pd, .@"and" }, + else => unreachable, + }, + 80 => return self.fail("TODO implement airFloatSign for {}", .{ + ty.fmt(self.bin_file.options.module.?), + }), + else => unreachable, + }, + registerAlias(dst_reg, abi_size), + sign_mem, + ); return self.finishAir(inst, dst_mcv, .{ un_op, .none, .none }); } @@ -8593,7 +8668,6 @@ const MoveStrategy = union(enum) { const InsertExtract = struct { insert: Mir.Inst.FixedTag, extract: Mir.Inst.FixedTag, - imm: Immediate, }; }; fn moveStrategy(self: *Self, ty: Type, aligned: bool) !MoveStrategy { @@ -8603,17 +8677,15 @@ fn moveStrategy(self: *Self, ty: Type, aligned: bool) !MoveStrategy { 16 => return if (self.hasFeature(.avx)) .{ .vex_insert_extract = .{ .insert = .{ .vp_w, .insr }, .extract = .{ .vp_w, .extr }, - .imm = Immediate.u(0), } } else .{ .insert_extract = .{ .insert = .{ .p_w, .insr }, .extract = .{ .p_w, .extr }, - .imm = Immediate.u(0), } }, 32 => return .{ .move = if (self.hasFeature(.avx)) .{ .v_ss, .mov } else .{ ._ss, .mov } }, 64 => return .{ .move = if (self.hasFeature(.avx)) .{ .v_sd, .mov } else .{ ._sd, .mov } }, 128 => return .{ .move = if (self.hasFeature(.avx)) - if (aligned) .{ .v_ps, .mova } else .{ .v_ps, .movu } - else if (aligned) .{ ._ps, .mova } else .{ ._ps, .movu } }, + if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu } + else if (aligned) .{ ._, .movdqa } else .{ ._, .movdqu } }, else => {}, }, .Vector => switch (ty.childType().zigTypeTag()) { @@ -8622,101 +8694,120 @@ fn moveStrategy(self: *Self, ty: Type, aligned: bool) !MoveStrategy { 1 => if (self.hasFeature(.avx)) return .{ .vex_insert_extract = .{ .insert = .{ .vp_b, .insr }, .extract = .{ .vp_b, .extr }, - .imm = Immediate.u(0), } } else if (self.hasFeature(.sse4_2)) return .{ .insert_extract = .{ .insert = .{ .p_b, .insr }, .extract = .{ .p_b, .extr }, - .imm = Immediate.u(0), } }, 2 => return if (self.hasFeature(.avx)) .{ .vex_insert_extract = .{ .insert = .{ .vp_w, .insr }, .extract = .{ .vp_w, .extr }, - .imm = Immediate.u(0), } } else .{ .insert_extract = .{ .insert = .{ .p_w, 
.insr }, .extract = .{ .p_w, .extr }, - .imm = Immediate.u(0), } }, 3...4 => return .{ .move = if (self.hasFeature(.avx)) - .{ .v_ss, .mov } + .{ .v_d, .mov } else - .{ ._ss, .mov } }, + .{ ._d, .mov } }, 5...8 => return .{ .move = if (self.hasFeature(.avx)) - .{ .v_sd, .mov } + .{ .v_q, .mov } else - .{ ._sd, .mov } }, + .{ ._q, .mov } }, + 9...16 => return .{ .move = if (self.hasFeature(.avx)) + if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu } + else if (aligned) .{ ._, .movdqa } else .{ ._, .movdqu } }, + 17...32 => if (self.hasFeature(.avx)) + return .{ .move = if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu } }, else => {}, }, 16 => switch (ty.vectorLen()) { 1 => return if (self.hasFeature(.avx)) .{ .vex_insert_extract = .{ .insert = .{ .vp_w, .insr }, .extract = .{ .vp_w, .extr }, - .imm = Immediate.u(0), } } else .{ .insert_extract = .{ .insert = .{ .p_w, .insr }, .extract = .{ .p_w, .extr }, - .imm = Immediate.u(0), } }, 2 => return .{ .move = if (self.hasFeature(.avx)) - .{ .v_ss, .mov } + .{ .v_d, .mov } else - .{ ._ss, .mov } }, + .{ ._d, .mov } }, 3...4 => return .{ .move = if (self.hasFeature(.avx)) - .{ .v_sd, .mov } + .{ .v_q, .mov } else - .{ ._sd, .mov } }, + .{ ._q, .mov } }, 5...8 => return .{ .move = if (self.hasFeature(.avx)) - .{ .v_ps, .mov } - else - .{ ._ps, .mov } }, + if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu } + else if (aligned) .{ ._, .movdqa } else .{ ._, .movdqu } }, + 9...16 => if (self.hasFeature(.avx)) + return .{ .move = if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu } }, else => {}, }, 32 => switch (ty.vectorLen()) { 1 => return .{ .move = if (self.hasFeature(.avx)) - .{ .v_ss, .mov } + .{ .v_d, .mov } else - .{ ._ss, .mov } }, + .{ ._d, .mov } }, 2 => return .{ .move = if (self.hasFeature(.avx)) - .{ .v_sd, .mov } + .{ .v_q, .mov } else - .{ ._sd, .mov } }, + .{ ._q, .mov } }, 3...4 => return .{ .move = if (self.hasFeature(.avx)) - if (aligned) .{ .v_ps, .mova } else .{ .v_ps, .movu } - else if (aligned) .{ ._ps, .mova } else .{ ._ps, .movu } }, + if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu } + else if (aligned) .{ ._, .movdqa } else .{ ._, .movdqu } }, 5...8 => if (self.hasFeature(.avx)) - return .{ .move = if (aligned) .{ .v_ps, .mova } else .{ .v_ps, .movu } }, + return .{ .move = if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu } }, else => {}, }, 64 => switch (ty.vectorLen()) { 1 => return .{ .move = if (self.hasFeature(.avx)) - .{ .v_sd, .mov } + .{ .v_q, .mov } else - .{ ._sd, .mov } }, + .{ ._q, .mov } }, 2 => return .{ .move = if (self.hasFeature(.avx)) - if (aligned) .{ .v_ps, .mova } else .{ .v_ps, .movu } - else if (aligned) .{ ._ps, .mova } else .{ ._ps, .movu } }, + if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu } + else if (aligned) .{ ._, .movdqa } else .{ ._, .movdqu } }, 3...4 => if (self.hasFeature(.avx)) - return .{ .move = if (aligned) .{ .v_ps, .mova } else .{ .v_ps, .movu } }, + return .{ .move = if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu } }, + else => {}, + }, + 128 => switch (ty.vectorLen()) { + 1 => return .{ .move = if (self.hasFeature(.avx)) + if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu } + else if (aligned) .{ ._, .movdqa } else .{ ._, .movdqu } }, + 2 => if (self.hasFeature(.avx)) + return .{ .move = if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu } }, + else => {}, + }, + 256 => switch (ty.vectorLen()) { + 1 => if (self.hasFeature(.avx)) + return .{ .move = if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu } }, else => {}, }, else => {}, }, .Float => switch 
(ty.childType().floatBits(self.target.*)) { 16 => switch (ty.vectorLen()) { - 1 => {}, + 1 => return if (self.hasFeature(.avx)) .{ .vex_insert_extract = .{ + .insert = .{ .vp_w, .insr }, + .extract = .{ .vp_w, .extr }, + } } else .{ .insert_extract = .{ + .insert = .{ .p_w, .insr }, + .extract = .{ .p_w, .extr }, + } }, 2 => return .{ .move = if (self.hasFeature(.avx)) - .{ .v_ss, .mov } + .{ .v_d, .mov } else - .{ ._ss, .mov } }, + .{ ._d, .mov } }, 3...4 => return .{ .move = if (self.hasFeature(.avx)) - .{ .v_sd, .mov } + .{ .v_q, .mov } else - .{ ._sd, .mov } }, + .{ ._q, .mov } }, 5...8 => return .{ .move = if (self.hasFeature(.avx)) - if (aligned) .{ .v_ps, .mova } else .{ .v_ps, .movu } - else if (aligned) .{ ._ps, .mova } else .{ ._ps, .movu } }, + if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu } + else if (aligned) .{ ._, .movdqa } else .{ ._, .movdqu } }, 9...16 => if (self.hasFeature(.avx)) - return .{ .move = if (aligned) .{ .v_ps, .mova } else .{ .v_ps, .movu } }, + return .{ .move = if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu } }, else => {}, }, 32 => switch (ty.vectorLen()) { @@ -8741,18 +8832,18 @@ fn moveStrategy(self: *Self, ty: Type, aligned: bool) !MoveStrategy { else .{ ._sd, .mov } }, 2 => return .{ .move = if (self.hasFeature(.avx)) - if (aligned) .{ .v_ps, .mova } else .{ .v_ps, .movu } - else if (aligned) .{ ._ps, .mova } else .{ ._ps, .movu } }, + if (aligned) .{ .v_pd, .mova } else .{ .v_pd, .movu } + else if (aligned) .{ ._pd, .mova } else .{ ._pd, .movu } }, 3...4 => if (self.hasFeature(.avx)) - return .{ .move = if (aligned) .{ .v_ps, .mova } else .{ .v_ps, .movu } }, + return .{ .move = if (aligned) .{ .v_pd, .mova } else .{ .v_pd, .movu } }, else => {}, }, 128 => switch (ty.vectorLen()) { 1 => return .{ .move = if (self.hasFeature(.avx)) - if (aligned) .{ .v_ps, .mova } else .{ .v_ps, .movu } - else if (aligned) .{ ._ps, .mova } else .{ ._ps, .movu } }, + if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu } + else if (aligned) .{ ._, .movdqa } else .{ ._, .movdqu } }, 2 => if (self.hasFeature(.avx)) - return .{ .move = if (aligned) .{ .v_ps, .mova } else .{ .v_ps, .movu } }, + return .{ .move = if (aligned) .{ .v_, .movdqa } else .{ .v_, .movdqu } }, else => {}, }, else => {}, @@ -8860,29 +8951,69 @@ fn genSetReg(self: *Self, dst_reg: Register, ty: Type, src_mcv: MCValue) InnerEr ); } }, - .register => |src_reg| if (dst_reg.id() != src_reg.id()) try self.asmRegisterRegister( - if ((dst_reg.class() == .sse) == (src_reg.class() == .sse)) - switch (ty.zigTypeTag()) { - else => .{ ._, .mov }, - .Float, .Vector => .{ ._ps, .mova }, - } - else switch (abi_size) { - 2 => return try self.asmRegisterRegisterImmediate( - if (dst_reg.class() == .sse) .{ .p_w, .insr } else .{ .p_w, .extr }, - registerAlias(dst_reg, 4), - registerAlias(src_reg, 4), - Immediate.u(0), + .register => |src_reg| if (dst_reg.id() != src_reg.id()) switch (dst_reg.class()) { + .general_purpose => switch (src_reg.class()) { + .general_purpose => try self.asmRegisterRegister( + .{ ._, .mov }, + registerAlias(dst_reg, abi_size), + registerAlias(src_reg, abi_size), ), - 4 => .{ ._d, .mov }, - 8 => .{ ._q, .mov }, - else => return self.fail( - "unsupported register copy from {s} to {s}", - .{ @tagName(src_reg), @tagName(dst_reg) }, + .segment => try self.asmRegisterRegister( + .{ ._, .mov }, + registerAlias(dst_reg, abi_size), + src_reg, ), + .sse => try self.asmRegisterRegister( + switch (abi_size) { + 1...4 => if (self.hasFeature(.avx)) .{ .v_d, .mov } else .{ ._d, .mov }, + 5...8 => 
if (self.hasFeature(.avx)) .{ .v_q, .mov } else .{ ._q, .mov }, + else => unreachable, + }, + registerAlias(dst_reg, @max(abi_size, 4)), + src_reg.to128(), + ), + .x87, .mmx => unreachable, }, - registerAlias(dst_reg, abi_size), - registerAlias(src_reg, abi_size), - ), + .segment => try self.asmRegisterRegister( + .{ ._, .mov }, + dst_reg, + switch (src_reg.class()) { + .general_purpose, .segment => registerAlias(src_reg, abi_size), + .sse => try self.copyToTmpRegister(ty, src_mcv), + .x87, .mmx => unreachable, + }, + ), + .sse => switch (src_reg.class()) { + .general_purpose => try self.asmRegisterRegister( + switch (abi_size) { + 1...4 => if (self.hasFeature(.avx)) .{ .v_d, .mov } else .{ ._d, .mov }, + 5...8 => if (self.hasFeature(.avx)) .{ .v_q, .mov } else .{ ._q, .mov }, + else => unreachable, + }, + dst_reg.to128(), + registerAlias(src_reg, @max(abi_size, 4)), + ), + .segment => try self.genSetReg( + dst_reg, + ty, + .{ .register = try self.copyToTmpRegister(ty, src_mcv) }, + ), + .sse => try self.asmRegisterRegister( + switch (ty.scalarType().zigTypeTag()) { + else => if (self.hasFeature(.avx)) .{ .v_, .movdqa } else .{ ._, .movdqa }, + .Float => switch (ty.floatBits(self.target.*)) { + else => if (self.hasFeature(.avx)) .{ .v_, .movdqa } else .{ ._, .movdqa }, + 32 => if (self.hasFeature(.avx)) .{ .v_ps, .mova } else .{ ._ps, .mova }, + 64 => if (self.hasFeature(.avx)) .{ .v_pd, .mova } else .{ ._pd, .mova }, + }, + }, + registerAlias(dst_reg, abi_size), + registerAlias(src_reg, abi_size), + ), + .x87, .mmx => unreachable, + }, + .x87, .mmx => unreachable, + }, .register_offset, .indirect, .load_frame, @@ -8918,14 +9049,14 @@ fn genSetReg(self: *Self, dst_reg: Register, ty: Type, src_mcv: MCValue) InnerEr ie.insert, dst_alias, src_mem, - ie.imm, + Immediate.u(0), ), .vex_insert_extract => |ie| try self.asmRegisterRegisterMemoryImmediate( ie.insert, dst_alias, dst_alias, src_mem, - ie.imm, + Immediate.u(0), ), } }, @@ -8947,14 +9078,14 @@ fn genSetReg(self: *Self, dst_reg: Register, ty: Type, src_mcv: MCValue) InnerEr ie.insert, dst_alias, src_mem, - ie.imm, + Immediate.u(0), ), .vex_insert_extract => |ie| try self.asmRegisterRegisterMemoryImmediate( ie.insert, dst_alias, dst_alias, src_mem, - ie.imm, + Immediate.u(0), ), } }, @@ -8994,14 +9125,14 @@ fn genSetReg(self: *Self, dst_reg: Register, ty: Type, src_mcv: MCValue) InnerEr ie.insert, dst_alias, src_mem, - ie.imm, + Immediate.u(0), ), .vex_insert_extract => |ie| try self.asmRegisterRegisterMemoryImmediate( ie.insert, dst_alias, dst_alias, src_mem, - ie.imm, + Immediate.u(0), ), } }, @@ -9129,7 +9260,7 @@ fn genSetMem(self: *Self, base: Memory.Base, disp: i32, ty: Type, src_mcv: MCVal ie.extract, dst_mem, src_alias, - ie.imm, + Immediate.u(0), ), } }, @@ -10499,7 +10630,7 @@ fn airUnionInit(self: *Self, inst: Air.Inst.Index) !void { if (self.reuseOperand(inst, extra.init, 0, src_mcv)) break :result src_mcv; const dst_mcv = try self.allocRegOrMem(inst, true); - try self.genCopy(src_ty, dst_mcv, src_mcv); + try self.genCopy(union_ty, dst_mcv, src_mcv); break :result dst_mcv; } @@ -11000,7 +11131,15 @@ fn registerAlias(reg: Register, size_bytes: u32) Register { reg.to64() else unreachable, - .segment, .x87, .mmx => unreachable, + .segment => if (size_bytes <= 2) + reg + else + unreachable, + .x87 => unreachable, + .mmx => if (size_bytes <= 8) + reg + else + unreachable, .sse => if (size_bytes <= 16) reg.to128() else if (size_bytes <= 32) diff --git a/src/arch/x86_64/Encoding.zig b/src/arch/x86_64/Encoding.zig index 
66a249a3f2..4014947673 100644 --- a/src/arch/x86_64/Encoding.zig +++ b/src/arch/x86_64/Encoding.zig @@ -261,7 +261,8 @@ pub const Mnemonic = enum { // X87 fisttp, fld, // MMX - movd, + movd, movq, + pand, pandn, por, pxor, // SSE addps, addss, andps, @@ -293,7 +294,8 @@ pub const Mnemonic = enum { maxpd, maxsd, minpd, minsd, movapd, - movq, //movd, movsd, + movdqa, movdqu, + //movsd, movupd, mulpd, mulsd, orpd, @@ -316,6 +318,7 @@ pub const Mnemonic = enum { roundpd, roundps, roundsd, roundss, // AVX vaddpd, vaddps, vaddsd, vaddss, + vandnpd, vandnps, vandpd, vandps, vbroadcastf128, vbroadcastsd, vbroadcastss, vcvtdq2pd, vcvtdq2ps, vcvtpd2dq, vcvtpd2ps, vcvtps2dq, vcvtps2pd, vcvtsd2si, vcvtsd2ss, @@ -327,22 +330,31 @@ pub const Mnemonic = enum { vmaxpd, vmaxps, vmaxsd, vmaxss, vminpd, vminps, vminsd, vminss, vmovapd, vmovaps, - vmovddup, vmovhlps, vmovlhps, + vmovd, + vmovddup, + vmovdqa, vmovdqu, + vmovhlps, vmovlhps, + vmovq, vmovsd, vmovshdup, vmovsldup, vmovss, vmovupd, vmovups, vmulpd, vmulps, vmulsd, vmulss, + vorpd, vorps, + vpand, vpandn, vpextrb, vpextrd, vpextrq, vpextrw, vpinsrb, vpinsrd, vpinsrq, vpinsrw, + vpor, vpshufhw, vpshuflw, vpsrld, vpsrlq, vpsrlw, vpunpckhbw, vpunpckhdq, vpunpckhqdq, vpunpckhwd, vpunpcklbw, vpunpckldq, vpunpcklqdq, vpunpcklwd, + vpxor, vroundpd, vroundps, vroundsd, vroundss, vshufpd, vshufps, vsqrtpd, vsqrtps, vsqrtsd, vsqrtss, vsubpd, vsubps, vsubsd, vsubss, + vxorpd, vxorps, // F16C vcvtph2ps, vcvtps2ph, // FMA diff --git a/src/arch/x86_64/Mir.zig b/src/arch/x86_64/Mir.zig index ef8bbe07b3..4d1f59e454 100644 --- a/src/arch/x86_64/Mir.zig +++ b/src/arch/x86_64/Mir.zig @@ -236,6 +236,14 @@ pub const Inst = struct { /// VEX-Encoded ___ v_, + /// VEX-Encoded ___ Byte + v_b, + /// VEX-Encoded ___ Word + v_w, + /// VEX-Encoded ___ Doubleword + v_d, + /// VEX-Encoded ___ QuadWord + v_q, /// VEX-Encoded Packed ___ vp_, /// VEX-Encoded Packed ___ Byte @@ -526,6 +534,10 @@ pub const Inst = struct { cvttps2dq, /// Convert with truncation scalar double-precision floating-point value to doubleword integer cvttsd2si, + /// Move aligned packed integer values + movdqa, + /// Move unaligned packed integer values + movdqu, /// Packed interleave shuffle of quadruplets of single-precision floating-point values /// Packed interleave shuffle of pairs of double-precision floating-point values shuf, diff --git a/src/arch/x86_64/encodings.zig b/src/arch/x86_64/encodings.zig index 3383315bd6..3e57be61ea 100644 --- a/src/arch/x86_64/encodings.zig +++ b/src/arch/x86_64/encodings.zig @@ -970,11 +970,16 @@ pub const table = [_]Entry{ .{ .movapd, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x66, 0x0f, 0x29 }, 0, .none, .sse2 }, .{ .movd, .rm, &.{ .xmm, .rm32 }, &.{ 0x66, 0x0f, 0x6e }, 0, .none, .sse2 }, - .{ .movd, .mr, &.{ .rm32, .xmm }, &.{ 0x66, 0x0f, 0x7e }, 0, .none, .sse2 }, - .{ .movq, .rm, &.{ .xmm, .rm64 }, &.{ 0x66, 0x0f, 0x6e }, 0, .long, .sse2 }, + .{ .movd, .mr, &.{ .rm32, .xmm }, &.{ 0x66, 0x0f, 0x7e }, 0, .none, .sse2 }, .{ .movq, .mr, &.{ .rm64, .xmm }, &.{ 0x66, 0x0f, 0x7e }, 0, .long, .sse2 }, + .{ .movdqa, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x6f }, 0, .none, .sse2 }, + .{ .movdqa, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x66, 0x0f, 0x7f }, 0, .none, .sse2 }, + + .{ .movdqu, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0xf3, 0x0f, 0x6f }, 0, .none, .sse2 }, + .{ .movdqu, .mr, &.{ .xmm_m128, .xmm }, &.{ 0xf3, 0x0f, 0x7f }, 0, .none, .sse2 }, + .{ .movq, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf3, 0x0f, 0x7e }, 0, .none, .sse2 }, .{ .movq, .mr, &.{ .xmm_m64, .xmm }, &.{ 0x66, 0x0f, 0xd6 }, 0, 
.none, .sse2 }, @@ -987,10 +992,16 @@ pub const table = [_]Entry{ .{ .orpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x56 }, 0, .none, .sse2 }, + .{ .pand, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xdb }, 0, .none, .sse2 }, + + .{ .pandn, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xdf }, 0, .none, .sse2 }, + .{ .pextrw, .rmi, &.{ .r32, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0xc5 }, 0, .none, .sse2 }, .{ .pinsrw, .rmi, &.{ .xmm, .r32_m16, .imm8 }, &.{ 0x66, 0x0f, 0xc4 }, 0, .none, .sse2 }, + .{ .por, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xeb }, 0, .none, .sse2 }, + .{ .pshufhw, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0xf3, 0x0f, 0x70 }, 0, .none, .sse2 }, .{ .pshuflw, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0xf2, 0x0f, 0x70 }, 0, .none, .sse2 }, @@ -1012,6 +1023,8 @@ pub const table = [_]Entry{ .{ .punpckldq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x62 }, 0, .none, .sse2 }, .{ .punpcklqdq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x6c }, 0, .none, .sse2 }, + .{ .pxor, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xef }, 0, .none, .sse2 }, + .{ .shufpd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0xc6 }, 0, .none, .sse2 }, .{ .sqrtpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x51 }, 0, .none, .sse2 }, @@ -1070,6 +1083,18 @@ pub const table = [_]Entry{ .{ .vaddss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x58 }, 0, .vex_lig_wig, .avx }, + .{ .vandnpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x55 }, 0, .vex_128_wig, .avx }, + .{ .vandnpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x55 }, 0, .vex_256_wig, .avx }, + + .{ .vandnps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x0f, 0x55 }, 0, .vex_128_wig, .avx }, + .{ .vandnps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x0f, 0x55 }, 0, .vex_256_wig, .avx }, + + .{ .vandpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x54 }, 0, .vex_128_wig, .avx }, + .{ .vandpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x54 }, 0, .vex_256_wig, .avx }, + + .{ .vandps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x0f, 0x54 }, 0, .vex_128_wig, .avx }, + .{ .vandps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x0f, 0x54 }, 0, .vex_256_wig, .avx }, + .{ .vbroadcastss, .rm, &.{ .xmm, .m32 }, &.{ 0x66, 0x0f, 0x38, 0x18 }, 0, .vex_128_w0, .avx }, .{ .vbroadcastss, .rm, &.{ .ymm, .m32 }, &.{ 0x66, 0x0f, 0x38, 0x18 }, 0, .vex_256_w0, .avx }, .{ .vbroadcastsd, .rm, &.{ .ymm, .m64 }, &.{ 0x66, 0x0f, 0x38, 0x19 }, 0, .vex_256_w0, .avx }, @@ -1169,13 +1194,31 @@ pub const table = [_]Entry{ .{ .vmovaps, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x0f, 0x28 }, 0, .vex_256_wig, .avx }, .{ .vmovaps, .mr, &.{ .ymm_m256, .ymm }, &.{ 0x0f, 0x29 }, 0, .vex_256_wig, .avx }, + .{ .vmovd, .rm, &.{ .xmm, .rm32 }, &.{ 0x66, 0x0f, 0x6e }, 0, .vex_128_w0, .avx }, + .{ .vmovq, .rm, &.{ .xmm, .rm64 }, &.{ 0x66, 0x0f, 0x6e }, 0, .vex_128_w1, .avx }, + .{ .vmovd, .mr, &.{ .rm32, .xmm }, &.{ 0x66, 0x0f, 0x7e }, 0, .vex_128_w0, .avx }, + .{ .vmovq, .mr, &.{ .rm64, .xmm }, &.{ 0x66, 0x0f, 0x7e }, 0, .vex_128_w1, .avx }, + .{ .vmovddup, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x12 }, 0, .vex_128_wig, .avx }, .{ .vmovddup, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0xf2, 0x0f, 0x12 }, 0, .vex_256_wig, .avx }, + .{ .vmovdqa, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x6f }, 0, .vex_128_wig, .avx }, + .{ .vmovdqa, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x66, 0x0f, 0x7f }, 0, .vex_128_wig, .avx }, + .{ .vmovdqa, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x6f }, 0, .vex_256_wig, .avx }, + .{ .vmovdqa, .mr, &.{ .ymm_m256, .ymm }, &.{ 0x66, 0x0f, 0x7f }, 0, .vex_256_wig, .avx 
}, + + .{ .vmovdqu, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0xf3, 0x0f, 0x6f }, 0, .vex_128_wig, .avx }, + .{ .vmovdqu, .mr, &.{ .xmm_m128, .xmm }, &.{ 0xf3, 0x0f, 0x7f }, 0, .vex_128_wig, .avx }, + .{ .vmovdqu, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0xf3, 0x0f, 0x6f }, 0, .vex_256_wig, .avx }, + .{ .vmovdqu, .mr, &.{ .ymm_m256, .ymm }, &.{ 0xf3, 0x0f, 0x7f }, 0, .vex_256_wig, .avx }, + .{ .vmovhlps, .rvm, &.{ .xmm, .xmm, .xmm }, &.{ 0x0f, 0x12 }, 0, .vex_128_wig, .avx }, .{ .vmovlhps, .rvm, &.{ .xmm, .xmm, .xmm }, &.{ 0x0f, 0x16 }, 0, .vex_128_wig, .avx }, + .{ .vmovq, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf3, 0x0f, 0x7e }, 0, .vex_128_wig, .avx }, + .{ .vmovq, .mr, &.{ .xmm_m64, .xmm }, &.{ 0x66, 0x0f, 0xd6 }, 0, .vex_128_wig, .avx }, + .{ .vmovsd, .rvm, &.{ .xmm, .xmm, .xmm }, &.{ 0xf2, 0x0f, 0x10 }, 0, .vex_lig_wig, .avx }, .{ .vmovsd, .rm, &.{ .xmm, .m64 }, &.{ 0xf2, 0x0f, 0x10 }, 0, .vex_lig_wig, .avx }, .{ .vmovsd, .mvr, &.{ .xmm, .xmm, .xmm }, &.{ 0xf2, 0x0f, 0x11 }, 0, .vex_lig_wig, .avx }, @@ -1212,6 +1255,16 @@ pub const table = [_]Entry{ .{ .vmulss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x59 }, 0, .vex_lig_wig, .avx }, + .{ .vorpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x56 }, 0, .vex_128_wig, .avx }, + .{ .vorpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x56 }, 0, .vex_256_wig, .avx }, + + .{ .vorps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x0f, 0x56 }, 0, .vex_128_wig, .avx }, + .{ .vorps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x0f, 0x56 }, 0, .vex_256_wig, .avx }, + + .{ .vpand, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xdb }, 0, .vex_128_wig, .avx }, + + .{ .vpandn, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xdf }, 0, .vex_128_wig, .avx }, + .{ .vpextrb, .mri, &.{ .r32_m8, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x14 }, 0, .vex_128_w0, .avx }, .{ .vpextrd, .mri, &.{ .rm32, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .vex_128_w0, .avx }, .{ .vpextrq, .mri, &.{ .rm64, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .vex_128_w1, .avx }, @@ -1225,6 +1278,8 @@ pub const table = [_]Entry{ .{ .vpinsrw, .rvmi, &.{ .xmm, .xmm, .r32_m16, .imm8 }, &.{ 0x66, 0x0f, 0xc4 }, 0, .vex_128_wig, .avx }, + .{ .vpor, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xeb }, 0, .vex_128_wig, .avx }, + .{ .vpsrlw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd1 }, 0, .vex_128_wig, .avx }, .{ .vpsrlw, .vmi, &.{ .xmm, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x71 }, 2, .vex_128_wig, .avx }, .{ .vpsrld, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd2 }, 0, .vex_128_wig, .avx }, @@ -1242,6 +1297,8 @@ pub const table = [_]Entry{ .{ .vpunpckldq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x62 }, 0, .vex_128_wig, .avx }, .{ .vpunpcklqdq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x6c }, 0, .vex_128_wig, .avx }, + .{ .vpxor, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xef }, 0, .vex_128_wig, .avx }, + .{ .vroundpd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x09 }, 0, .vex_128_wig, .avx }, .{ .vroundpd, .rmi, &.{ .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x09 }, 0, .vex_256_wig, .avx }, @@ -1278,6 +1335,12 @@ pub const table = [_]Entry{ .{ .vsubss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x5c }, 0, .vex_lig_wig, .avx }, + .{ .vxorpd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x57 }, 0, .vex_128_wig, .avx }, + .{ .vxorpd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x57 }, 0, .vex_256_wig, .avx }, + + .{ .vxorps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x0f, 0x57 }, 0, .vex_128_wig, .avx }, + 
.{ .vxorps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x0f, 0x57 }, 0, .vex_256_wig, .avx }, + // F16C .{ .vcvtph2ps, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0x66, 0x0f, 0x38, 0x13 }, 0, .vex_128_w0, .f16c }, .{ .vcvtph2ps, .rm, &.{ .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x13 }, 0, .vex_256_w0, .f16c },
@@ -1313,6 +1376,12 @@ pub const table = [_]Entry{ .{ .vbroadcastss, .rm, &.{ .ymm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x18 }, 0, .vex_256_w0, .avx2 }, .{ .vbroadcastsd, .rm, &.{ .ymm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x19 }, 0, .vex_256_w0, .avx2 }, + .{ .vpand, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xdb }, 0, .vex_256_wig, .avx2 }, + + .{ .vpandn, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xdf }, 0, .vex_256_wig, .avx2 }, + + .{ .vpor, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xeb }, 0, .vex_256_wig, .avx2 }, + .{ .vpsrlw, .rvm, &.{ .ymm, .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd1 }, 0, .vex_256_wig, .avx2 }, .{ .vpsrlw, .vmi, &.{ .ymm, .ymm, .imm8 }, &.{ 0x66, 0x0f, 0x71 }, 2, .vex_256_wig, .avx2 }, .{ .vpsrld, .rvm, &.{ .ymm, .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd2 }, 0, .vex_256_wig, .avx2 },
@@ -1329,5 +1398,7 @@ pub const table = [_]Entry{ .{ .vpunpcklwd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x61 }, 0, .vex_256_wig, .avx2 }, .{ .vpunpckldq, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x62 }, 0, .vex_256_wig, .avx2 }, .{ .vpunpcklqdq, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x6c }, 0, .vex_256_wig, .avx2 }, + + .{ .vpxor, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xef }, 0, .vex_256_wig, .avx2 }, }; // zig fmt: on
diff --git a/src/type.zig b/src/type.zig index 6122afda62..bcbb9e2ea2 100644 --- a/src/type.zig +++ b/src/type.zig
@@ -5433,8 +5433,18 @@ pub const Type = extern union { } } + // Works for integers and vectors of integers. + pub fn maxInt(ty: Type, arena: Allocator, target: Target) !Value { + const scalar = try maxIntScalar(ty.scalarType(), arena, target); + if (ty.zigTypeTag() == .Vector and scalar.tag() != .the_only_possible_value) { + return Value.Tag.repeated.create(arena, scalar); + } else { + return scalar; + } + } + /// Asserts that self.zigTypeTag() == .Int.
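// A minimal usage sketch of the wrapper above (hypothetical caller; `arena`,
// `target`, and a `vec_ty` describing @Vector(4, u8) are assumed to be in
// scope, not taken from this patch): the vector case folds to a single
// repeated Value rather than materializing one Value per element.
//   const vec_max = try vec_ty.maxInt(arena, target); // Value.Tag.repeated wrapping the u8 max, 0xff
//   const elem_max = try vec_ty.scalarType().maxIntScalar(arena, target); // the plain scalar 0xff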
- pub fn maxInt(self: Type, arena: Allocator, target: Target) !Value { + pub fn maxIntScalar(self: Type, arena: Allocator, target: Target) !Value { assert(self.zigTypeTag() == .Int); const info = self.intInfo(target); diff --git a/test/behavior/floatop.zig b/test/behavior/floatop.zig index 9d17b05865..a3fd5b69e8 100644 --- a/test/behavior/floatop.zig +++ b/test/behavior/floatop.zig @@ -532,7 +532,6 @@ fn testFabs() !void { test "@fabs with vectors" { if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO - if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest; diff --git a/test/behavior/math.zig b/test/behavior/math.zig index 991521b62c..7a563c1727 100644 --- a/test/behavior/math.zig +++ b/test/behavior/math.zig @@ -1612,7 +1612,6 @@ test "absFloat" { if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO - if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_spirv64) return error.SkipZigTest; diff --git a/test/behavior/translate_c_macros.zig b/test/behavior/translate_c_macros.zig index aa08e8c9aa..b3d1a688fe 100644 --- a/test/behavior/translate_c_macros.zig +++ b/test/behavior/translate_c_macros.zig @@ -65,7 +65,6 @@ test "cast negative integer to pointer" { test "casting to union with a macro" { if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO - if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO -- cgit v1.2.3 From bd771bec49fbb7845ad2635c0dd13aa971a81fee Mon Sep 17 00:00:00 2001 From: Jacob Young Date: Mon, 15 May 2023 00:26:30 -0400 Subject: x86_64: implement integer vector add/sub --- src/arch/x86_64/CodeGen.zig | 80 ++++++++++++++++++++++++++++++++++++++++--- src/arch/x86_64/Encoding.zig | 6 +++- src/arch/x86_64/Mir.zig | 11 ++++++ src/arch/x86_64/encodings.zig | 69 +++++++++++++++++++++++++++++++++++++ 4 files changed, 160 insertions(+), 6 deletions(-) (limited to 'src/arch/x86_64/Encoding.zig') diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig index 9d5f877e14..b791ec5ecc 100644 --- a/src/arch/x86_64/CodeGen.zig +++ b/src/arch/x86_64/CodeGen.zig @@ -6520,6 +6520,57 @@ fn genBinOp( }, .Vector => switch (lhs_ty.childType().zigTypeTag()) { else => null, + .Int => switch (lhs_ty.childType().intInfo(self.target.*).bits) { + 8 => switch (lhs_ty.vectorLen()) { + 1...16 => switch (air_tag) { + .add, + .addwrap, + => if (self.hasFeature(.avx)) .{ .vp_b, .add } else .{ .p_b, .add }, + .sub, + .subwrap, + => if (self.hasFeature(.avx)) .{ .vp_b, .sub } else .{ .p_b, .sub }, + else => null, + }, + else => null, + }, + 16 => switch (lhs_ty.vectorLen()) { + 1...8 => switch (air_tag) { + .add, + .addwrap, + => if (self.hasFeature(.avx)) .{ .vp_w, .add } else .{ .p_w, .add }, + .sub, + .subwrap, + => if (self.hasFeature(.avx)) .{ .vp_w, 
.sub } else .{ .p_w, .sub }, + else => null, + }, + else => null, + }, + 32 => switch (lhs_ty.vectorLen()) { + 1...4 => switch (air_tag) { + .add, + .addwrap, + => if (self.hasFeature(.avx)) .{ .vp_d, .add } else .{ .p_d, .add }, + .sub, + .subwrap, + => if (self.hasFeature(.avx)) .{ .vp_d, .sub } else .{ .p_d, .sub }, + else => null, + }, + else => null, + }, + 64 => switch (lhs_ty.vectorLen()) { + 1...2 => switch (air_tag) { + .add, + .addwrap, + => if (self.hasFeature(.avx)) .{ .vp_q, .add } else .{ .p_q, .add }, + .sub, + .subwrap, + => if (self.hasFeature(.avx)) .{ .vp_q, .sub } else .{ .p_q, .sub }, + else => null, + }, + else => null, + }, + else => null, + }, .Float => switch (lhs_ty.childType().floatBits(self.target.*)) { 16 => if (self.hasFeature(.f16c)) switch (lhs_ty.vectorLen()) { 1 => { @@ -6812,7 +6863,7 @@ fn genBinOp( ); } switch (air_tag) { - .add, .sub, .mul, .div_float, .div_exact => {}, + .add, .addwrap, .sub, .subwrap, .mul, .mulwrap, .div_float, .div_exact => {}, .div_trunc, .div_floor => try self.genRound( lhs_ty, dst_reg, @@ -9043,14 +9094,33 @@ fn genSetReg(self: *Self, dst_reg: Register, ty: Type, src_mcv: MCValue) InnerEr .{ .register = try self.copyToTmpRegister(ty, src_mcv) }, ), .sse => try self.asmRegisterRegister( - switch (ty.scalarType().zigTypeTag()) { - else => if (self.hasFeature(.avx)) .{ .v_, .movdqa } else .{ ._, .movdqa }, + if (@as(?Mir.Inst.FixedTag, switch (ty.scalarType().zigTypeTag()) { + else => switch (abi_size) { + 1...4 => if (self.hasFeature(.avx)) .{ .v_d, .mov } else .{ ._d, .mov }, + 5...8 => if (self.hasFeature(.avx)) .{ .v_q, .mov } else .{ ._q, .mov }, + 9...16 => if (self.hasFeature(.avx)) .{ .v_, .movdqa } else .{ ._, .movdqa }, + 17...32 => if (self.hasFeature(.avx)) .{ .v_, .movdqa } else null, + else => null, + }, .Float => switch (ty.floatBits(self.target.*)) { - else => if (self.hasFeature(.avx)) .{ .v_, .movdqa } else .{ ._, .movdqa }, + 16, 128 => switch (abi_size) { + 2...4 => if (self.hasFeature(.avx)) .{ .v_d, .mov } else .{ ._d, .mov }, + 5...8 => if (self.hasFeature(.avx)) .{ .v_q, .mov } else .{ ._q, .mov }, + 9...16 => if (self.hasFeature(.avx)) + .{ .v_, .movdqa } + else + .{ ._, .movdqa }, + 17...32 => if (self.hasFeature(.avx)) .{ .v_, .movdqa } else null, + else => null, + }, 32 => if (self.hasFeature(.avx)) .{ .v_ps, .mova } else .{ ._ps, .mova }, 64 => if (self.hasFeature(.avx)) .{ .v_pd, .mova } else .{ ._pd, .mova }, + 80 => null, + else => unreachable, }, - }, + })) |tag| tag else return self.fail("TODO implement genSetReg for {}", .{ + ty.fmt(self.bin_file.options.module.?), + }), registerAlias(dst_reg, abi_size), registerAlias(src_reg, abi_size), ), diff --git a/src/arch/x86_64/Encoding.zig b/src/arch/x86_64/Encoding.zig index 4014947673..c8919d062d 100644 --- a/src/arch/x86_64/Encoding.zig +++ b/src/arch/x86_64/Encoding.zig @@ -262,7 +262,9 @@ pub const Mnemonic = enum { fisttp, fld, // MMX movd, movq, + paddb, paddd, paddq, paddsb, paddsw, paddusb, paddusw, paddw, pand, pandn, por, pxor, + psubb, psubd, psubq, psubsb, psubsw, psubusb, psubusw, psubw, // SSE addps, addss, andps, @@ -341,12 +343,14 @@ pub const Mnemonic = enum { vmovupd, vmovups, vmulpd, vmulps, vmulsd, vmulss, vorpd, vorps, + vpaddb, vpaddd, vpaddq, vpaddsb, vpaddsw, vpaddusb, vpaddusw, vpaddw, vpand, vpandn, vpextrb, vpextrd, vpextrq, vpextrw, vpinsrb, vpinsrd, vpinsrq, vpinsrw, vpor, vpshufhw, vpshuflw, vpsrld, vpsrlq, vpsrlw, + vpsubb, vpsubd, vpsubq, vpsubsb, vpsubsw, vpsubusb, vpsubusw, vpsubw, vpunpckhbw, vpunpckhdq, vpunpckhqdq, 
vpunpckhwd, vpunpcklbw, vpunpckldq, vpunpcklqdq, vpunpcklwd, vpxor, @@ -746,7 +750,7 @@ fn estimateInstructionLength(prefix: Prefix, encoding: Encoding, ops: []const Op } const mnemonic_to_encodings_map = init: { - @setEvalBranchQuota(25_000); + @setEvalBranchQuota(30_000); const encodings = @import("encodings.zig"); var entries = encodings.table; std.sort.sort(encodings.Entry, &entries, {}, struct { diff --git a/src/arch/x86_64/Mir.zig b/src/arch/x86_64/Mir.zig index 4d1f59e454..58eab29958 100644 --- a/src/arch/x86_64/Mir.zig +++ b/src/arch/x86_64/Mir.zig @@ -288,6 +288,7 @@ pub const Inst = struct { /// Add with carry adc, /// Add + /// Add packed integers /// Add packed single-precision floating-point values /// Add scalar single-precision floating-point values /// Add packed double-precision floating-point values @@ -420,6 +421,7 @@ pub const Inst = struct { /// Double precision shift right sh, /// Subtract + /// Subtract packed integers /// Subtract packed single-precision floating-point values /// Subtract scalar single-precision floating-point values /// Subtract packed double-precision floating-point values @@ -444,9 +446,18 @@ pub const Inst = struct { /// Bitwise logical xor of packed double-precision floating-point values xor, + /// Add packed signed integers with signed saturation + adds, + /// Add packed unsigned integers with unsigned saturation + addus, /// Bitwise logical and not of packed single-precision floating-point values /// Bitwise logical and not of packed double-precision floating-point values andn, + /// Subtract packed signed integers with signed saturation + subs, + /// Subtract packed unsigned integers with unsigned saturation + subus, + /// Convert packed doubleword integers to packed single-precision floating-point values /// Convert packed doubleword integers to packed double-precision floating-point values cvtpi2, diff --git a/src/arch/x86_64/encodings.zig b/src/arch/x86_64/encodings.zig index 3e57be61ea..820fd715ba 100644 --- a/src/arch/x86_64/encodings.zig +++ b/src/arch/x86_64/encodings.zig @@ -992,6 +992,17 @@ pub const table = [_]Entry{ .{ .orpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x56 }, 0, .none, .sse2 }, + .{ .paddb, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xfc }, 0, .none, .sse2 }, + .{ .paddw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xfd }, 0, .none, .sse2 }, + .{ .paddd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xfe }, 0, .none, .sse2 }, + .{ .paddq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd4 }, 0, .none, .sse2 }, + + .{ .paddsb, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xec }, 0, .none, .sse2 }, + .{ .paddsw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xed }, 0, .none, .sse2 }, + + .{ .paddusb, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xdc }, 0, .none, .sse2 }, + .{ .paddusw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xdd }, 0, .none, .sse2 }, + .{ .pand, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xdb }, 0, .none, .sse2 }, .{ .pandn, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xdf }, 0, .none, .sse2 }, @@ -1013,6 +1024,18 @@ pub const table = [_]Entry{ .{ .psrlq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd3 }, 0, .none, .sse2 }, .{ .psrlq, .mi, &.{ .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x73 }, 2, .none, .sse2 }, + .{ .psubb, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xf8 }, 0, .none, .sse2 }, + .{ .psubw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xf9 }, 0, .none, .sse2 }, + .{ .psubd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xfa }, 0, .none, .sse2 }, + + .{ .psubsb, .rm, &.{ .xmm, .xmm_m128 }, &.{ 
0x66, 0x0f, 0xe8 }, 0, .none, .sse2 }, + .{ .psubsw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xe9 }, 0, .none, .sse2 }, + + .{ .psubq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xfb }, 0, .none, .sse2 }, + + .{ .psubusb, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd8 }, 0, .none, .sse2 }, + .{ .psubusw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd9 }, 0, .none, .sse2 }, + .{ .punpckhbw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x68 }, 0, .none, .sse2 }, .{ .punpckhwd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x69 }, 0, .none, .sse2 }, .{ .punpckhdq, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x6a }, 0, .none, .sse2 }, @@ -1261,6 +1284,17 @@ pub const table = [_]Entry{ .{ .vorps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x0f, 0x56 }, 0, .vex_128_wig, .avx }, .{ .vorps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x0f, 0x56 }, 0, .vex_256_wig, .avx }, + .{ .vpaddb, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xfc }, 0, .vex_128_wig, .avx }, + .{ .vpaddw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xfd }, 0, .vex_128_wig, .avx }, + .{ .vpaddd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xfe }, 0, .vex_128_wig, .avx }, + .{ .vpaddq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd4 }, 0, .vex_128_wig, .avx }, + + .{ .vpaddsb, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xec }, 0, .vex_128_wig, .avx }, + .{ .vpaddsw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xed }, 0, .vex_128_wig, .avx }, + + .{ .vpaddusb, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xdc }, 0, .vex_128_wig, .avx }, + .{ .vpaddusw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xdd }, 0, .vex_128_wig, .avx }, + .{ .vpand, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xdb }, 0, .vex_128_wig, .avx }, .{ .vpandn, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xdf }, 0, .vex_128_wig, .avx }, @@ -1287,6 +1321,18 @@ pub const table = [_]Entry{ .{ .vpsrlq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd3 }, 0, .vex_128_wig, .avx }, .{ .vpsrlq, .vmi, &.{ .xmm, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x73 }, 2, .vex_128_wig, .avx }, + .{ .vpsubb, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xf8 }, 0, .vex_128_wig, .avx }, + .{ .vpsubw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xf9 }, 0, .vex_128_wig, .avx }, + .{ .vpsubd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xfa }, 0, .vex_128_wig, .avx }, + + .{ .vpsubsb, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xe8 }, 0, .vex_128_wig, .avx }, + .{ .vpsubsw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xe9 }, 0, .vex_128_wig, .avx }, + + .{ .vpsubq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xfb }, 0, .vex_128_wig, .avx }, + + .{ .vpsubusb, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd8 }, 0, .vex_128_wig, .avx }, + .{ .vpsubusw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd9 }, 0, .vex_128_wig, .avx }, + .{ .vpunpckhbw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x68 }, 0, .vex_128_wig, .avx }, .{ .vpunpckhwd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x69 }, 0, .vex_128_wig, .avx }, .{ .vpunpckhdq, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x6a }, 0, .vex_128_wig, .avx }, @@ -1376,6 +1422,17 @@ pub const table = [_]Entry{ .{ .vbroadcastss, .rm, &.{ .ymm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x18 }, 0, .vex_256_w0, .avx2 }, .{ .vbroadcastsd, .rm, &.{ .ymm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x19 }, 0, .vex_256_w0, .avx2 }, + .{ .vpaddb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xfc }, 0, .vex_256_wig, .avx2 }, + .{ .vpaddw, 
.rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xfd }, 0, .vex_256_wig, .avx2 }, + .{ .vpaddd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xfe }, 0, .vex_256_wig, .avx2 }, + .{ .vpaddq, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xd4 }, 0, .vex_256_wig, .avx2 }, + + .{ .vpaddsb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xec }, 0, .vex_256_wig, .avx2 }, + .{ .vpaddsw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xed }, 0, .vex_256_wig, .avx2 }, + + .{ .vpaddusb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xdc }, 0, .vex_256_wig, .avx2 }, + .{ .vpaddusw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xdd }, 0, .vex_256_wig, .avx2 }, + .{ .vpand, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xdb }, 0, .vex_256_wig, .avx2 }, .{ .vpandn, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xdf }, 0, .vex_256_wig, .avx2 }, @@ -1389,6 +1446,18 @@ pub const table = [_]Entry{ .{ .vpsrlq, .rvm, &.{ .ymm, .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd3 }, 0, .vex_256_wig, .avx2 }, .{ .vpsrlq, .vmi, &.{ .ymm, .ymm, .imm8 }, &.{ 0x66, 0x0f, 0x73 }, 2, .vex_256_wig, .avx2 }, + .{ .vpsubb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xf8 }, 0, .vex_256_wig, .avx2 }, + .{ .vpsubw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xf9 }, 0, .vex_256_wig, .avx2 }, + .{ .vpsubd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xfa }, 0, .vex_256_wig, .avx2 }, + + .{ .vpsubsb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xe8 }, 0, .vex_256_wig, .avx2 }, + .{ .vpsubsw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xe9 }, 0, .vex_256_wig, .avx2 }, + + .{ .vpsubq, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xfb }, 0, .vex_256_wig, .avx2 }, + + .{ .vpsubusb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xd8 }, 0, .vex_256_wig, .avx2 }, + .{ .vpsubusw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xd9 }, 0, .vex_256_wig, .avx2 }, + .{ .vpunpckhbw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x68 }, 0, .vex_256_wig, .avx2 }, .{ .vpunpckhwd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x69 }, 0, .vex_256_wig, .avx2 }, .{ .vpunpckhdq, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x6a }, 0, .vex_256_wig, .avx2 }, -- cgit v1.2.3 From f39ff6cc68ab7a0d8ef349d4d930118890c19b01 Mon Sep 17 00:00:00 2001 From: Jacob Young Date: Mon, 15 May 2023 01:15:37 -0400 Subject: x86_64: implement integer vector mul --- src/arch/x86_64/CodeGen.zig | 59 +++++++++++++++++++++++++++++++++++++++++-- src/arch/x86_64/Encoding.zig | 3 +++ src/arch/x86_64/Mir.zig | 4 +++ src/arch/x86_64/encodings.zig | 24 +++++++++++++++--- test/behavior/vector.zig | 3 ++- 5 files changed, 87 insertions(+), 6 deletions(-) (limited to 'src/arch/x86_64/Encoding.zig') diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig index b791ec5ecc..c5af53b2cf 100644 --- a/src/arch/x86_64/CodeGen.zig +++ b/src/arch/x86_64/CodeGen.zig @@ -2800,8 +2800,10 @@ fn airMulDivBinOp(self: *Self, inst: Air.Inst.Index) !void { const result = result: { const tag = self.air.instructions.items(.tag)[inst]; const dst_ty = self.air.typeOfIndex(inst); - if (dst_ty.zigTypeTag() == .Float) - break :result try self.genBinOp(inst, tag, bin_op.lhs, bin_op.rhs); + switch (dst_ty.zigTypeTag()) { + .Float, .Vector => break :result try self.genBinOp(inst, tag, bin_op.lhs, bin_op.rhs), + else => {}, + } const dst_info = dst_ty.intInfo(self.target.*); var src_pl = Type.Payload.Bits{ .base = .{ .tag = switch (dst_info.signedness) { @@ -6531,6 +6533,15 @@ fn genBinOp( => if 
(self.hasFeature(.avx)) .{ .vp_b, .sub } else .{ .p_b, .sub }, else => null, }, + 17...32 => switch (air_tag) { + .add, + .addwrap, + => if (self.hasFeature(.avx2)) .{ .vp_b, .add } else null, + .sub, + .subwrap, + => if (self.hasFeature(.avx2)) .{ .vp_b, .sub } else null, + else => null, + }, else => null, }, 16 => switch (lhs_ty.vectorLen()) { @@ -6541,6 +6552,21 @@ fn genBinOp( .sub, .subwrap, => if (self.hasFeature(.avx)) .{ .vp_w, .sub } else .{ .p_w, .sub }, + .mul, + .mulwrap, + => if (self.hasFeature(.avx)) .{ .vp_w, .mull } else .{ .p_d, .mull }, + else => null, + }, + 9...16 => switch (air_tag) { + .add, + .addwrap, + => if (self.hasFeature(.avx2)) .{ .vp_w, .add } else null, + .sub, + .subwrap, + => if (self.hasFeature(.avx2)) .{ .vp_w, .sub } else null, + .mul, + .mulwrap, + => if (self.hasFeature(.avx2)) .{ .vp_w, .mull } else null, else => null, }, else => null, @@ -6553,6 +6579,26 @@ fn genBinOp( .sub, .subwrap, => if (self.hasFeature(.avx)) .{ .vp_d, .sub } else .{ .p_d, .sub }, + .mul, + .mulwrap, + => if (self.hasFeature(.avx)) + .{ .vp_d, .mull } + else if (self.hasFeature(.sse4_1)) + .{ .p_d, .mull } + else + null, + else => null, + }, + 5...8 => switch (air_tag) { + .add, + .addwrap, + => if (self.hasFeature(.avx2)) .{ .vp_d, .add } else null, + .sub, + .subwrap, + => if (self.hasFeature(.avx2)) .{ .vp_d, .sub } else null, + .mul, + .mulwrap, + => if (self.hasFeature(.avx2)) .{ .vp_d, .mull } else null, else => null, }, else => null, @@ -6567,6 +6613,15 @@ fn genBinOp( => if (self.hasFeature(.avx)) .{ .vp_q, .sub } else .{ .p_q, .sub }, else => null, }, + 3...4 => switch (air_tag) { + .add, + .addwrap, + => if (self.hasFeature(.avx2)) .{ .vp_q, .add } else null, + .sub, + .subwrap, + => if (self.hasFeature(.avx2)) .{ .vp_q, .sub } else null, + else => null, + }, else => null, }, else => null, diff --git a/src/arch/x86_64/Encoding.zig b/src/arch/x86_64/Encoding.zig index c8919d062d..7b029cdb4f 100644 --- a/src/arch/x86_64/Encoding.zig +++ b/src/arch/x86_64/Encoding.zig @@ -264,6 +264,7 @@ pub const Mnemonic = enum { movd, movq, paddb, paddd, paddq, paddsb, paddsw, paddusb, paddusw, paddw, pand, pandn, por, pxor, + pmulhw, pmullw, psubb, psubd, psubq, psubsb, psubsw, psubusb, psubusw, psubw, // SSE addps, addss, @@ -317,6 +318,7 @@ pub const Mnemonic = enum { insertps, pextrb, pextrd, pextrq, pinsrb, pinsrd, pinsrq, + pmulld, roundpd, roundps, roundsd, roundss, // AVX vaddpd, vaddps, vaddsd, vaddss, @@ -347,6 +349,7 @@ pub const Mnemonic = enum { vpand, vpandn, vpextrb, vpextrd, vpextrq, vpextrw, vpinsrb, vpinsrd, vpinsrq, vpinsrw, + vpmulhw, vpmulld, vpmullw, vpor, vpshufhw, vpshuflw, vpsrld, vpsrlq, vpsrlw, diff --git a/src/arch/x86_64/Mir.zig b/src/arch/x86_64/Mir.zig index 58eab29958..a18792e6aa 100644 --- a/src/arch/x86_64/Mir.zig +++ b/src/arch/x86_64/Mir.zig @@ -453,6 +453,10 @@ pub const Inst = struct { /// Bitwise logical and not of packed single-precision floating-point values /// Bitwise logical and not of packed double-precision floating-point values andn, + /// Multiply packed signed integers and store low result + mull, + /// Multiply packed signed integers and store high result + mulh, /// Subtract packed signed integers with signed saturation subs, /// Subtract packed unsigned integers with unsigned saturation diff --git a/src/arch/x86_64/encodings.zig b/src/arch/x86_64/encodings.zig index 820fd715ba..86a79596cd 100644 --- a/src/arch/x86_64/encodings.zig +++ b/src/arch/x86_64/encodings.zig @@ -1011,6 +1011,10 @@ pub const table = [_]Entry{ .{ .pinsrw, 
.rmi, &.{ .xmm, .r32_m16, .imm8 }, &.{ 0x66, 0x0f, 0xc4 }, 0, .none, .sse2 }, + .{ .pmulhw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xe5 }, 0, .none, .sse2 }, + + .{ .pmullw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd5 }, 0, .none, .sse2 }, + .{ .por, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xeb }, 0, .none, .sse2 }, .{ .pshufhw, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0xf3, 0x0f, 0x70 }, 0, .none, .sse2 },
@@ -1087,6 +1091,8 @@ pub const table = [_]Entry{ .{ .pinsrd, .rmi, &.{ .xmm, .rm32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x22 }, 0, .none, .sse4_1 }, .{ .pinsrq, .rmi, &.{ .xmm, .rm64, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x22 }, 0, .long, .sse4_1 }, + .{ .pmulld, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x40 }, 0, .none, .sse4_1 }, + .{ .roundpd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x09 }, 0, .none, .sse4_1 }, .{ .roundps, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x08 }, 0, .none, .sse4_1 },
@@ -1312,6 +1318,12 @@ pub const table = [_]Entry{ .{ .vpinsrw, .rvmi, &.{ .xmm, .xmm, .r32_m16, .imm8 }, &.{ 0x66, 0x0f, 0xc4 }, 0, .vex_128_wig, .avx }, + .{ .vpmulhw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xe5 }, 0, .vex_128_wig, .avx }, + + .{ .vpmulld, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x40 }, 0, .vex_128_wig, .avx }, + + .{ .vpmullw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd5 }, 0, .vex_128_wig, .avx }, + .{ .vpor, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xeb }, 0, .vex_128_wig, .avx }, .{ .vpsrlw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd1 }, 0, .vex_128_wig, .avx },
@@ -1418,9 +1430,9 @@ pub const table = [_]Entry{ .{ .vfmadd231ss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0x66, 0x0f, 0x38, 0xb9 }, 0, .vex_lig_w0, .fma }, // AVX2 - .{ .vbroadcastss, .rm, &.{ .xmm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x18 }, 0, .vex_128_w0, .avx2 }, - .{ .vbroadcastss, .rm, &.{ .ymm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x18 }, 0, .vex_256_w0, .avx2 }, - .{ .vbroadcastsd, .rm, &.{ .ymm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x19 }, 0, .vex_256_w0, .avx2 }, + .{ .vbroadcastss, .rm, &.{ .xmm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x18 }, 0, .vex_128_w0, .avx2 }, + .{ .vbroadcastss, .rm, &.{ .ymm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x18 }, 0, .vex_256_w0, .avx2 }, + .{ .vbroadcastsd, .rm, &.{ .ymm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x19 }, 0, .vex_256_w0, .avx2 }, .{ .vpaddb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xfc }, 0, .vex_256_wig, .avx2 }, .{ .vpaddw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xfd }, 0, .vex_256_wig, .avx2 },
@@ -1437,6 +1449,12 @@ pub const table = [_]Entry{ .{ .vpandn, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xdf }, 0, .vex_256_wig, .avx2 }, + .{ .vpmulhw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xe5 }, 0, .vex_256_wig, .avx2 }, + + .{ .vpmulld, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x40 }, 0, .vex_256_wig, .avx2 }, + + .{ .vpmullw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xd5 }, 0, .vex_256_wig, .avx2 }, + .{ .vpor, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xeb }, 0, .vex_256_wig, .avx2 }, .{ .vpsrlw, .rvm, &.{ .ymm, .ymm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd1 }, 0, .vex_256_wig, .avx2 },
diff --git a/test/behavior/vector.zig b/test/behavior/vector.zig index 87ccdfb567..5d217a5ce0 100644 --- a/test/behavior/vector.zig +++ b/test/behavior/vector.zig
@@ -26,7 +26,8 @@ test "implicit cast vector to array - bool" { test "vector wrap operators" { if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO - if
(builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO + if (builtin.zig_backend == .stage2_x86_64 and + !comptime std.Target.x86.featureSetHas(builtin.cpu.features, .sse4_1)) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO
-- cgit v1.2.3 From cea9ac772a518ff249d47fc2cb7b2776c786ac07 Mon Sep 17 00:00:00 2001 From: Jacob Young Date: Mon, 15 May 2023 02:55:41 -0400 Subject: x86_64: implement integer vector min/max --- src/arch/x86_64/CodeGen.zig | 100 ++++++++++++++++++++++++++++++++++++++++++ src/arch/x86_64/Encoding.zig | 4 ++ src/arch/x86_64/Mir.zig | 8 ++++ src/arch/x86_64/encodings.zig | 58 ++++++++++++++++++++++++ 4 files changed, 170 insertions(+) (limited to 'src/arch/x86_64/Encoding.zig')
diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig index ed2c596f8f..2cd5721258 100644 --- a/src/arch/x86_64/CodeGen.zig +++ b/src/arch/x86_64/CodeGen.zig
@@ -6534,6 +6534,34 @@ fn genBinOp( .bit_and => if (self.hasFeature(.avx)) .{ .vp_, .@"and" } else .{ .p_, .@"and" }, .bit_or => if (self.hasFeature(.avx)) .{ .vp_, .@"or" } else .{ .p_, .@"or" }, .xor => if (self.hasFeature(.avx)) .{ .vp_, .xor } else .{ .p_, .xor }, + .min => switch (lhs_ty.childType().intInfo(self.target.*).signedness) { + .signed => if (self.hasFeature(.avx)) + .{ .vp_b, .mins } + else if (self.hasFeature(.sse4_1)) + .{ .p_b, .mins } + else + null, + .unsigned => if (self.hasFeature(.avx)) + .{ .vp_b, .minu } + else if (self.hasFeature(.sse4_1)) + .{ .p_b, .minu } + else + null, + }, + .max => switch (lhs_ty.childType().intInfo(self.target.*).signedness) { + .signed => if (self.hasFeature(.avx)) + .{ .vp_b, .maxs } + else if (self.hasFeature(.sse4_1)) + .{ .p_b, .maxs } + else + null, + .unsigned => if (self.hasFeature(.avx)) + .{ .vp_b, .maxu } + else if (self.hasFeature(.sse4_1)) + .{ .p_b, .maxu } + else + null, + }, else => null, }, 17...32 => switch (air_tag) {
@@ -6546,6 +6574,14 @@ .bit_and => if (self.hasFeature(.avx2)) .{ .vp_, .@"and" } else null, .bit_or => if (self.hasFeature(.avx2)) .{ .vp_, .@"or" } else null, .xor => if (self.hasFeature(.avx2)) .{ .vp_, .xor } else null, + .min => switch (lhs_ty.childType().intInfo(self.target.*).signedness) { + .signed => if (self.hasFeature(.avx2)) .{ .vp_b, .mins } else null, + .unsigned => if (self.hasFeature(.avx2)) .{ .vp_b, .minu } else null, + }, + .max => switch (lhs_ty.childType().intInfo(self.target.*).signedness) { + .signed => if (self.hasFeature(.avx2)) .{ .vp_b, .maxs } else null, + .unsigned => if (self.hasFeature(.avx2)) .{ .vp_b, .maxu } else null, + }, else => null, }, else => null,
@@ -6564,6 +6600,26 @@ .bit_and => if (self.hasFeature(.avx)) .{ .vp_, .@"and" } else .{ .p_, .@"and" }, .bit_or => if (self.hasFeature(.avx)) .{ .vp_, .@"or" } else .{ .p_, .@"or" }, .xor => if (self.hasFeature(.avx)) .{ .vp_, .xor } else .{ .p_, .xor }, + .min => switch (lhs_ty.childType().intInfo(self.target.*).signedness) { + .signed => if (self.hasFeature(.avx)) + .{ .vp_w, .mins } + else + .{ .p_w, .mins }, + .unsigned => if (self.hasFeature(.avx)) + .{ .vp_w, .minu } + else + .{ .p_w, .minu }, + }, + .max => switch (lhs_ty.childType().intInfo(self.target.*).signedness) { + .signed => if (self.hasFeature(.avx)) + .{ .vp_w, .maxs } + else + .{ .p_w, .maxs }, + .unsigned => if (self.hasFeature(.avx)) + .{ .vp_w, .maxu } + else + .{ .p_w, .maxu }, + }, else => null, }, 9...16 => switch (air_tag) {
@@ -6579,6 +6635,14 @@ .bit_and => if (self.hasFeature(.avx2)) .{ .vp_, .@"and" } else null, .bit_or => if (self.hasFeature(.avx2)) .{ .vp_, .@"or" } else null, .xor => if (self.hasFeature(.avx2)) .{ .vp_, .xor } else null, + .min => switch (lhs_ty.childType().intInfo(self.target.*).signedness) { + .signed => if (self.hasFeature(.avx2)) .{ .vp_w, .mins } else null, + .unsigned => if (self.hasFeature(.avx2)) .{ .vp_w, .minu } else null, + }, + .max => switch (lhs_ty.childType().intInfo(self.target.*).signedness) { + .signed => if (self.hasFeature(.avx2)) .{ .vp_w, .maxs } else null, + .unsigned => if (self.hasFeature(.avx2)) .{ .vp_w, .maxu } else null, + }, else => null, }, else => null,
@@ -6602,6 +6666,34 @@ .bit_and => if (self.hasFeature(.avx)) .{ .vp_, .@"and" } else .{ .p_, .@"and" }, .bit_or => if (self.hasFeature(.avx)) .{ .vp_, .@"or" } else .{ .p_, .@"or" }, .xor => if (self.hasFeature(.avx)) .{ .vp_, .xor } else .{ .p_, .xor }, + .min => switch (lhs_ty.childType().intInfo(self.target.*).signedness) { + .signed => if (self.hasFeature(.avx)) + .{ .vp_d, .mins } + else if (self.hasFeature(.sse4_1)) + .{ .p_d, .mins } + else + null, + .unsigned => if (self.hasFeature(.avx)) + .{ .vp_d, .minu } + else if (self.hasFeature(.sse4_1)) + .{ .p_d, .minu } + else + null, + }, + .max => switch (lhs_ty.childType().intInfo(self.target.*).signedness) { + .signed => if (self.hasFeature(.avx)) + .{ .vp_d, .maxs } + else if (self.hasFeature(.sse4_1)) + .{ .p_d, .maxs } + else + null, + .unsigned => if (self.hasFeature(.avx)) + .{ .vp_d, .maxu } + else if (self.hasFeature(.sse4_1)) + .{ .p_d, .maxu } + else + null, + }, else => null, }, 5...8 => switch (air_tag) {
@@ -6617,6 +6709,14 @@ .bit_and => if (self.hasFeature(.avx2)) .{ .vp_, .@"and" } else null, .bit_or => if (self.hasFeature(.avx2)) .{ .vp_, .@"or" } else null, .xor => if (self.hasFeature(.avx2)) .{ .vp_, .xor } else null, + .min => switch (lhs_ty.childType().intInfo(self.target.*).signedness) { + .signed => if (self.hasFeature(.avx2)) .{ .vp_d, .mins } else null, + .unsigned => if (self.hasFeature(.avx2)) .{ .vp_d, .minu } else null, + }, + .max => switch (lhs_ty.childType().intInfo(self.target.*).signedness) { + .signed => if (self.hasFeature(.avx2)) .{ .vp_d, .maxs } else null, + .unsigned => if (self.hasFeature(.avx2)) .{ .vp_d, .maxu } else null, + }, else => null, }, else => null,
diff --git a/src/arch/x86_64/Encoding.zig b/src/arch/x86_64/Encoding.zig index 7b029cdb4f..52d010880e 100644 --- a/src/arch/x86_64/Encoding.zig +++ b/src/arch/x86_64/Encoding.zig
@@ -280,6 +280,7 @@ pub const Mnemonic = enum { mulps, mulss, orps, pextrw, pinsrw, + pmaxsw, pmaxub, pminsw, pminub, shufps, sqrtps, sqrtss, subps, subss,
@@ -318,6 +319,7 @@ pub const Mnemonic = enum { insertps, pextrb, pextrd, pextrq, pinsrb, pinsrd, pinsrq, + pmaxsb, pmaxsd, pmaxud, pmaxuw, pminsb, pminsd, pminud, pminuw, pmulld, roundpd, roundps, roundsd, roundss,
@@ -349,6 +351,8 @@ pub const Mnemonic = enum { vpand, vpandn, vpextrb, vpextrd, vpextrq, vpextrw, vpinsrb, vpinsrd, vpinsrq, vpinsrw, + vpmaxsb, vpmaxsd, vpmaxsw, vpmaxub, vpmaxud, vpmaxuw, + vpminsb, vpminsd, vpminsw, vpminub, vpminud, vpminuw, vpmulhw, vpmulld, vpmullw, vpor, vpshufhw, vpshuflw,
diff --git a/src/arch/x86_64/Mir.zig b/src/arch/x86_64/Mir.zig index a18792e6aa..4483de858e 100644 --- a/src/arch/x86_64/Mir.zig +++ b/src/arch/x86_64/Mir.zig @@ -453,6 +453,14 @@
pub const Inst = struct { /// Bitwise logical and not of packed single-precision floating-point values /// Bitwise logical and not of packed double-precision floating-point values andn, + /// Maximum of packed signed integers + maxs, + /// Maximum of packed unsigned integers + maxu, + /// Minimum of packed signed integers + mins, + /// Minimum of packed unsigned integers + minu, /// Multiply packed signed integers and store low result mull, /// Multiply packed signed integers and store high result diff --git a/src/arch/x86_64/encodings.zig b/src/arch/x86_64/encodings.zig index 86a79596cd..c326f4230a 100644 --- a/src/arch/x86_64/encodings.zig +++ b/src/arch/x86_64/encodings.zig @@ -1011,6 +1011,14 @@ pub const table = [_]Entry{ .{ .pinsrw, .rmi, &.{ .xmm, .r32_m16, .imm8 }, &.{ 0x66, 0x0f, 0xc4 }, 0, .none, .sse2 }, + .{ .pmaxsw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xee }, 0, .none, .sse2 }, + + .{ .pmaxub, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xde }, 0, .none, .sse2 }, + + .{ .pminsw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xea }, 0, .none, .sse2 }, + + .{ .pminub, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xda }, 0, .none, .sse2 }, + .{ .pmulhw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xe5 }, 0, .none, .sse2 }, .{ .pmullw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd5 }, 0, .none, .sse2 }, @@ -1091,6 +1099,20 @@ pub const table = [_]Entry{ .{ .pinsrd, .rmi, &.{ .xmm, .rm32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x22 }, 0, .none, .sse4_1 }, .{ .pinsrq, .rmi, &.{ .xmm, .rm64, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x22 }, 0, .long, .sse4_1 }, + .{ .pmaxsb, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x3c }, 0, .none, .sse4_1 }, + .{ .pmaxsd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x3d }, 0, .none, .sse4_1 }, + + .{ .pmaxuw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x3e }, 0, .none, .sse4_1 }, + + .{ .pmaxud, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x3f }, 0, .none, .sse4_1 }, + + .{ .pminsb, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x38 }, 0, .none, .sse4_1 }, + .{ .pminsd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x39 }, 0, .none, .sse4_1 }, + + .{ .pminuw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x3a }, 0, .none, .sse4_1 }, + + .{ .pminud, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x3b }, 0, .none, .sse4_1 }, + .{ .pmulld, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x40 }, 0, .none, .sse4_1 }, .{ .roundpd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x09 }, 0, .none, .sse4_1 }, @@ -1318,6 +1340,24 @@ pub const table = [_]Entry{ .{ .vpinsrw, .rvmi, &.{ .xmm, .xmm, .r32_m16, .imm8 }, &.{ 0x66, 0x0f, 0xc4 }, 0, .vex_128_wig, .avx }, + .{ .vpmaxsb, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x3c }, 0, .vex_128_wig, .avx }, + .{ .vpmaxsw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xee }, 0, .vex_128_wig, .avx }, + .{ .vpmaxsd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x3d }, 0, .vex_128_wig, .avx }, + + .{ .vpmaxub, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xde }, 0, .vex_128_wig, .avx }, + .{ .vpmaxuw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x3e }, 0, .vex_128_wig, .avx }, + + .{ .vpmaxud, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x3f }, 0, .vex_128_wig, .avx }, + + .{ .vpminsb, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x38 }, 0, .vex_128_wig, .avx }, + .{ .vpminsw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xea }, 0, .vex_128_wig, .avx }, + .{ .vpminsd, .rvm, &.{ .xmm, .xmm, 
.xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x39 }, 0, .vex_128_wig, .avx }, + + .{ .vpminub, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xda }, 0, .vex_128_wig, .avx }, + .{ .vpminuw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x3a }, 0, .vex_128_wig, .avx }, + + .{ .vpminud, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x3b }, 0, .vex_128_wig, .avx }, + .{ .vpmulhw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xe5 }, 0, .vex_128_wig, .avx }, .{ .vpmulld, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x40 }, 0, .vex_128_wig, .avx },
@@ -1449,6 +1489,24 @@ pub const table = [_]Entry{ .{ .vpandn, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xdf }, 0, .vex_256_wig, .avx2 }, + .{ .vpmaxsb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x3c }, 0, .vex_256_wig, .avx2 }, + .{ .vpmaxsw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xee }, 0, .vex_256_wig, .avx2 }, + .{ .vpmaxsd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x3d }, 0, .vex_256_wig, .avx2 }, + + .{ .vpmaxub, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xde }, 0, .vex_256_wig, .avx2 }, + .{ .vpmaxuw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x3e }, 0, .vex_256_wig, .avx2 }, + + .{ .vpmaxud, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x3f }, 0, .vex_256_wig, .avx2 }, + + .{ .vpminsb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x38 }, 0, .vex_256_wig, .avx2 }, + .{ .vpminsw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xea }, 0, .vex_256_wig, .avx2 }, + .{ .vpminsd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x39 }, 0, .vex_256_wig, .avx2 }, + + .{ .vpminub, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xda }, 0, .vex_256_wig, .avx2 }, + .{ .vpminuw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x3a }, 0, .vex_256_wig, .avx2 }, + + .{ .vpminud, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x3b }, 0, .vex_256_wig, .avx2 }, + .{ .vpmulhw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xe5 }, 0, .vex_256_wig, .avx2 }, .{ .vpmulld, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x40 }, 0, .vex_256_wig, .avx2 },
-- cgit v1.2.3 From 403c2d91bed456085eb685a9f89996c4635ce4b9 Mon Sep 17 00:00:00 2001 From: Jacob Young Date: Mon, 15 May 2023 20:17:06 -0400 Subject: x86_64: fix float min/max behavior --- src/arch/x86_64/CodeGen.zig | 302 ++++++++++++++++++++++++++++++++++++-- src/arch/x86_64/Encoding.zig | 33 +++-- src/arch/x86_64/Lower.zig | 7 + src/arch/x86_64/Mir.zig | 20 +++ src/arch/x86_64/encoder.zig | 31 ++-- src/arch/x86_64/encodings.zig | 34 +++++ test/behavior/maximum_minimum.zig | 6 +- 7 files changed, 393 insertions(+), 40 deletions(-) (limited to 'src/arch/x86_64/Encoding.zig')
diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig index 2cd5721258..7ea0db516b 100644 --- a/src/arch/x86_64/CodeGen.zig +++ b/src/arch/x86_64/CodeGen.zig
@@ -1271,6 +1271,27 @@ fn asmRegisterRegisterRegister( }); } +fn asmRegisterRegisterRegisterRegister( + self: *Self, + tag: Mir.Inst.FixedTag, + reg1: Register, + reg2: Register, + reg3: Register, + reg4: Register, +) !void { + _ = try self.addInst(.{ + .tag = tag[1], + .ops = .rrrr, + .data = .{ .rrrr = .{ + .fixes = tag[0], + .r1 = reg1, + .r2 = reg2, + .r3 = reg3, + .r4 = reg4, + } }, + }); +} + fn asmRegisterRegisterRegisterImmediate( + self: *Self, wait
self.resolveInst(lhs_air); - const rhs_mcv = try self.resolveInst(rhs_air); const lhs_ty = self.air.typeOf(lhs_air); const rhs_ty = self.air.typeOf(rhs_air); const abi_size = @intCast(u32, lhs_ty.abiSize(self.target.*)); + const maybe_mask_reg = switch (air_tag) { + else => null, + .max, .min => if (lhs_ty.scalarType().isRuntimeFloat()) registerAlias( + if (!self.hasFeature(.avx) and self.hasFeature(.sse4_1)) mask: { + try self.register_manager.getReg(.xmm0, null); + break :mask .xmm0; + } else try self.register_manager.allocReg(null, sse), + abi_size, + ) else null, + }; + const mask_lock = + if (maybe_mask_reg) |mask_reg| self.register_manager.lockRegAssumeUnused(mask_reg) else null; + defer if (mask_lock) |lock| self.register_manager.unlockReg(lock); + + const lhs_mcv = try self.resolveInst(lhs_air); + const rhs_mcv = try self.resolveInst(rhs_air); switch (lhs_mcv) { .immediate => |imm| switch (imm) { 0 => switch (air_tag) { @@ -6300,7 +6335,16 @@ fn genBinOp( }; defer if (dst_lock) |lock| self.register_manager.unlockReg(lock); - const src_mcv = if (flipped) lhs_mcv else rhs_mcv; + const unmat_src_mcv = if (flipped) lhs_mcv else rhs_mcv; + const src_mcv: MCValue = if (maybe_mask_reg) |mask_reg| + if (self.hasFeature(.avx) and unmat_src_mcv.isRegister() and maybe_inst != null and + self.liveness.operandDies(maybe_inst.?, if (flipped) 0 else 1)) unmat_src_mcv else src: { + try self.genSetReg(mask_reg, rhs_ty, unmat_src_mcv); + break :src .{ .register = mask_reg }; + } + else + unmat_src_mcv; + if (!vec_op) { switch (air_tag) { .add, @@ -7009,18 +7053,26 @@ fn genBinOp( })) |tag| tag else return self.fail("TODO implement genBinOp for {s} {}", .{ @tagName(air_tag), lhs_ty.fmt(self.bin_file.options.module.?), }); + + const lhs_copy_reg = if (maybe_mask_reg) |_| registerAlias( + if (copied_to_dst) try self.copyToTmpRegister(lhs_ty, dst_mcv) else lhs_mcv.getReg().?, + abi_size, + ) else null; + const lhs_copy_lock = if (lhs_copy_reg) |reg| self.register_manager.lockReg(reg) else null; + defer if (lhs_copy_lock) |lock| self.register_manager.unlockReg(lock); + if (self.hasFeature(.avx)) { - const src1_alias = + const lhs_reg = if (copied_to_dst) dst_reg else registerAlias(lhs_mcv.getReg().?, abi_size); if (src_mcv.isMemory()) try self.asmRegisterRegisterMemory( mir_tag, dst_reg, - src1_alias, + lhs_reg, src_mcv.mem(Memory.PtrSize.fromSize(abi_size)), ) else try self.asmRegisterRegisterRegister( mir_tag, dst_reg, - src1_alias, + lhs_reg, registerAlias(if (src_mcv.isRegister()) src_mcv.getReg().? 
else @@ -7041,9 +7093,10 @@ fn genBinOp( try self.copyToTmpRegister(rhs_ty, src_mcv), abi_size), ); } + switch (air_tag) { .add, .addwrap, .sub, .subwrap, .mul, .mulwrap, .div_float, .div_exact => {}, - .div_trunc, .div_floor => try self.genRound( + .div_trunc, .div_floor => if (self.hasFeature(.sse4_1)) try self.genRound( lhs_ty, dst_reg, .{ .register = dst_reg }, @@ -7052,11 +7105,240 @@ fn genBinOp( .div_floor => 0b1_0_01, else => unreachable, }, - ), + ) else return self.fail("TODO implement genBinOp for {s} {} without sse4_1 feature", .{ + @tagName(air_tag), lhs_ty.fmt(self.bin_file.options.module.?), + }), .bit_and, .bit_or, .xor => {}, - .max, .min => {}, // TODO: unordered select + .max, .min => if (maybe_mask_reg) |mask_reg| if (self.hasFeature(.avx)) { + const rhs_copy_reg = registerAlias(src_mcv.getReg().?, abi_size); + + try self.asmRegisterRegisterRegisterImmediate( + if (@as(?Mir.Inst.FixedTag, switch (lhs_ty.zigTypeTag()) { + .Float => switch (lhs_ty.floatBits(self.target.*)) { + 32 => .{ .v_ss, .cmp }, + 64 => .{ .v_sd, .cmp }, + 16, 80, 128 => null, + else => unreachable, + }, + .Vector => switch (lhs_ty.childType().zigTypeTag()) { + .Float => switch (lhs_ty.childType().floatBits(self.target.*)) { + 32 => switch (lhs_ty.vectorLen()) { + 1 => .{ .v_ss, .cmp }, + 2...8 => .{ .v_ps, .cmp }, + else => null, + }, + 64 => switch (lhs_ty.vectorLen()) { + 1 => .{ .v_sd, .cmp }, + 2...4 => .{ .v_pd, .cmp }, + else => null, + }, + 16, 80, 128 => null, + else => unreachable, + }, + else => unreachable, + }, + else => unreachable, + })) |tag| tag else return self.fail("TODO implement genBinOp for {s} {}", .{ + @tagName(air_tag), lhs_ty.fmt(self.bin_file.options.module.?), + }), + mask_reg, + rhs_copy_reg, + rhs_copy_reg, + Immediate.u(3), // unord + ); + try self.asmRegisterRegisterRegisterRegister( + if (@as(?Mir.Inst.FixedTag, switch (lhs_ty.zigTypeTag()) { + .Float => switch (lhs_ty.floatBits(self.target.*)) { + 32 => .{ .v_ps, .blendv }, + 64 => .{ .v_pd, .blendv }, + 16, 80, 128 => null, + else => unreachable, + }, + .Vector => switch (lhs_ty.childType().zigTypeTag()) { + .Float => switch (lhs_ty.childType().floatBits(self.target.*)) { + 32 => switch (lhs_ty.vectorLen()) { + 1...8 => .{ .v_ps, .blendv }, + else => null, + }, + 64 => switch (lhs_ty.vectorLen()) { + 1...4 => .{ .v_pd, .blendv }, + else => null, + }, + 16, 80, 128 => null, + else => unreachable, + }, + else => unreachable, + }, + else => unreachable, + })) |tag| tag else return self.fail("TODO implement genBinOp for {s} {}", .{ + @tagName(air_tag), lhs_ty.fmt(self.bin_file.options.module.?), + }), + dst_reg, + dst_reg, + lhs_copy_reg.?, + mask_reg, + ); + } else { + const has_blend = self.hasFeature(.sse4_1); + try self.asmRegisterRegisterImmediate( + if (@as(?Mir.Inst.FixedTag, switch (lhs_ty.zigTypeTag()) { + .Float => switch (lhs_ty.floatBits(self.target.*)) { + 32 => .{ ._ss, .cmp }, + 64 => .{ ._sd, .cmp }, + 16, 80, 128 => null, + else => unreachable, + }, + .Vector => switch (lhs_ty.childType().zigTypeTag()) { + .Float => switch (lhs_ty.childType().floatBits(self.target.*)) { + 32 => switch (lhs_ty.vectorLen()) { + 1 => .{ ._ss, .cmp }, + 2...4 => .{ ._ps, .cmp }, + else => null, + }, + 64 => switch (lhs_ty.vectorLen()) { + 1 => .{ ._sd, .cmp }, + 2 => .{ ._pd, .cmp }, + else => null, + }, + 16, 80, 128 => null, + else => unreachable, + }, + else => unreachable, + }, + else => unreachable, + })) |tag| tag else return self.fail("TODO implement genBinOp for {s} {}", .{ + @tagName(air_tag), 
lhs_ty.fmt(self.bin_file.options.module.?), + }), + mask_reg, + mask_reg, + Immediate.u(if (has_blend) 3 else 7), // unord, ord + ); + if (has_blend) try self.asmRegisterRegisterRegister( + if (@as(?Mir.Inst.FixedTag, switch (lhs_ty.zigTypeTag()) { + .Float => switch (lhs_ty.floatBits(self.target.*)) { + 32 => .{ ._ps, .blendv }, + 64 => .{ ._pd, .blendv }, + 16, 80, 128 => null, + else => unreachable, + }, + .Vector => switch (lhs_ty.childType().zigTypeTag()) { + .Float => switch (lhs_ty.childType().floatBits(self.target.*)) { + 32 => switch (lhs_ty.vectorLen()) { + 1...4 => .{ ._ps, .blendv }, + else => null, + }, + 64 => switch (lhs_ty.vectorLen()) { + 1...2 => .{ ._pd, .blendv }, + else => null, + }, + 16, 80, 128 => null, + else => unreachable, + }, + else => unreachable, + }, + else => unreachable, + })) |tag| tag else return self.fail("TODO implement genBinOp for {s} {}", .{ + @tagName(air_tag), lhs_ty.fmt(self.bin_file.options.module.?), + }), + dst_reg, + lhs_copy_reg.?, + mask_reg, + ) else { + try self.asmRegisterRegister( + if (@as(?Mir.Inst.FixedTag, switch (lhs_ty.zigTypeTag()) { + .Float => switch (lhs_ty.floatBits(self.target.*)) { + 32 => .{ ._ps, .@"and" }, + 64 => .{ ._pd, .@"and" }, + 16, 80, 128 => null, + else => unreachable, + }, + .Vector => switch (lhs_ty.childType().zigTypeTag()) { + .Float => switch (lhs_ty.childType().floatBits(self.target.*)) { + 32 => switch (lhs_ty.vectorLen()) { + 1...4 => .{ ._ps, .@"and" }, + else => null, + }, + 64 => switch (lhs_ty.vectorLen()) { + 1...2 => .{ ._pd, .@"and" }, + else => null, + }, + 16, 80, 128 => null, + else => unreachable, + }, + else => unreachable, + }, + else => unreachable, + })) |tag| tag else return self.fail("TODO implement genBinOp for {s} {}", .{ + @tagName(air_tag), lhs_ty.fmt(self.bin_file.options.module.?), + }), + dst_reg, + mask_reg, + ); + try self.asmRegisterRegister( + if (@as(?Mir.Inst.FixedTag, switch (lhs_ty.zigTypeTag()) { + .Float => switch (lhs_ty.floatBits(self.target.*)) { + 32 => .{ ._ps, .andn }, + 64 => .{ ._pd, .andn }, + 16, 80, 128 => null, + else => unreachable, + }, + .Vector => switch (lhs_ty.childType().zigTypeTag()) { + .Float => switch (lhs_ty.childType().floatBits(self.target.*)) { + 32 => switch (lhs_ty.vectorLen()) { + 1...4 => .{ ._ps, .andn }, + else => null, + }, + 64 => switch (lhs_ty.vectorLen()) { + 1...2 => .{ ._pd, .andn }, + else => null, + }, + 16, 80, 128 => null, + else => unreachable, + }, + else => unreachable, + }, + else => unreachable, + })) |tag| tag else return self.fail("TODO implement genBinOp for {s} {}", .{ + @tagName(air_tag), lhs_ty.fmt(self.bin_file.options.module.?), + }), + mask_reg, + lhs_copy_reg.?, + ); + try self.asmRegisterRegister( + if (@as(?Mir.Inst.FixedTag, switch (lhs_ty.zigTypeTag()) { + .Float => switch (lhs_ty.floatBits(self.target.*)) { + 32 => .{ ._ps, .@"or" }, + 64 => .{ ._pd, .@"or" }, + 16, 80, 128 => null, + else => unreachable, + }, + .Vector => switch (lhs_ty.childType().zigTypeTag()) { + .Float => switch (lhs_ty.childType().floatBits(self.target.*)) { + 32 => switch (lhs_ty.vectorLen()) { + 1...4 => .{ ._ps, .@"or" }, + else => null, + }, + 64 => switch (lhs_ty.vectorLen()) { + 1...2 => .{ ._pd, .@"or" }, + else => null, + }, + 16, 80, 128 => null, + else => unreachable, + }, + else => unreachable, + }, + else => unreachable, + })) |tag| tag else return self.fail("TODO implement genBinOp for {s} {}", .{ + @tagName(air_tag), lhs_ty.fmt(self.bin_file.options.module.?), + }), + dst_reg, + mask_reg, + ); + } + }, else => 
unreachable, } + return dst_mcv; } @@ -9282,7 +9564,7 @@ fn genSetReg(self: *Self, dst_reg: Register, ty: Type, src_mcv: MCValue) InnerEr 17...32 => if (self.hasFeature(.avx)) .{ .v_, .movdqa } else null, else => null, }, - .Float => switch (ty.floatBits(self.target.*)) { + .Float => switch (ty.scalarType().floatBits(self.target.*)) { 16, 128 => switch (abi_size) { 2...4 => if (self.hasFeature(.avx)) .{ .v_d, .mov } else .{ ._d, .mov }, 5...8 => if (self.hasFeature(.avx)) .{ .v_q, .mov } else .{ ._q, .mov }, diff --git a/src/arch/x86_64/Encoding.zig b/src/arch/x86_64/Encoding.zig index 52d010880e..0aaf12013d 100644 --- a/src/arch/x86_64/Encoding.zig +++ b/src/arch/x86_64/Encoding.zig @@ -178,7 +178,7 @@ pub fn format( try writer.print("+{s} ", .{tag}); }, .m, .mi, .m1, .mc, .vmi => try writer.print("/{d} ", .{encoding.modRmExt()}), - .mr, .rm, .rmi, .mri, .mrc, .rvm, .rvmi, .mvr => try writer.writeAll("/r "), + .mr, .rm, .rmi, .mri, .mrc, .rm0, .rvm, .rvmr, .rvmi, .mvr => try writer.writeAll("/r "), } switch (encoding.data.op_en) { @@ -202,7 +202,8 @@ pub fn format( }; try writer.print("{s} ", .{tag}); }, - .np, .fd, .td, .o, .m, .m1, .mc, .mr, .rm, .mrc, .rvm, .mvr => {}, + .rvmr => try writer.writeAll("/is4 "), + .np, .fd, .td, .o, .m, .m1, .mc, .mr, .rm, .mrc, .rm0, .rvm, .mvr => {}, } try writer.print("{s} ", .{@tagName(encoding.mnemonic)}); @@ -270,7 +271,7 @@ pub const Mnemonic = enum { addps, addss, andps, andnps, - cmpss, + cmpps, cmpss, cvtpi2ps, cvtps2pi, cvtsi2ss, cvtss2si, cvttps2pi, cvttss2si, divps, divss, maxps, maxss, @@ -290,7 +291,7 @@ pub const Mnemonic = enum { addpd, addsd, andpd, andnpd, - //cmpsd, + cmppd, //cmpsd, cvtdq2pd, cvtdq2ps, cvtpd2dq, cvtpd2pi, cvtpd2ps, cvtpi2pd, cvtps2dq, cvtps2pd, cvtsd2si, cvtsd2ss, cvtsi2sd, cvtss2sd, cvttpd2dq, cvttpd2pi, cvttps2dq, cvttsd2si, @@ -315,6 +316,7 @@ pub const Mnemonic = enum { // SSE3 movddup, movshdup, movsldup, // SSE4.1 + blendpd, blendps, blendvpd, blendvps, extractps, insertps, pextrb, pextrd, pextrq, @@ -325,7 +327,9 @@ pub const Mnemonic = enum { // AVX vaddpd, vaddps, vaddsd, vaddss, vandnpd, vandnps, vandpd, vandps, + vblendpd, vblendps, vblendvpd, vblendvps, vbroadcastf128, vbroadcastsd, vbroadcastss, + vcmppd, vcmpps, vcmpsd, vcmpss, vcvtdq2pd, vcvtdq2ps, vcvtpd2dq, vcvtpd2ps, vcvtps2dq, vcvtps2pd, vcvtsd2si, vcvtsd2ss, vcvtsi2sd, vcvtsi2ss, vcvtss2sd, vcvtss2si, @@ -385,7 +389,7 @@ pub const OpEn = enum { fd, td, m1, mc, mi, mr, rm, rmi, mri, mrc, - vmi, rvm, rvmi, mvr, + rm0, vmi, rvm, rvmr, rvmi, mvr, // zig fmt: on }; @@ -407,7 +411,7 @@ pub const Op = enum { moffs, sreg, st, mm, mm_m64, - xmm, xmm_m32, xmm_m64, xmm_m128, + xmm0, xmm, xmm_m32, xmm_m64, xmm_m128, ymm, ymm_m256, // zig fmt: on @@ -436,7 +440,9 @@ pub const Op = enum { .segment => .sreg, .x87 => .st, .mmx => .mm, - .sse => switch (reg.bitSize()) { + .sse => if (reg == .xmm0) + .xmm0 + else switch (reg.bitSize()) { 128 => .xmm, 256 => .ymm, else => unreachable, @@ -494,7 +500,7 @@ pub const Op = enum { .eax, .r32, .rm32, .r32_m16 => unreachable, .rax, .r64, .rm64, .r64_m16 => unreachable, .st, .mm, .mm_m64 => unreachable, - .xmm, .xmm_m32, .xmm_m64, .xmm_m128 => unreachable, + .xmm0, .xmm, .xmm_m32, .xmm_m64, .xmm_m128 => unreachable, .ymm, .ymm_m256 => unreachable, .m8, .m16, .m32, .m64, .m80, .m128, .m256 => unreachable, .unity => 1, @@ -516,7 +522,7 @@ pub const Op = enum { .eax, .r32, .rm32, .r32_m8, .r32_m16 => 32, .rax, .r64, .rm64, .r64_m16, .mm, .mm_m64 => 64, .st => 80, - .xmm, .xmm_m32, .xmm_m64, .xmm_m128 => 128, + .xmm0, .xmm, 
.xmm_m32, .xmm_m64, .xmm_m128 => 128, .ymm, .ymm_m256 => 256, }; } @@ -526,7 +532,8 @@ pub const Op = enum { .none, .o16, .o32, .o64, .moffs, .m, .sreg => unreachable, .unity, .imm8, .imm8s, .imm16, .imm16s, .imm32, .imm32s, .imm64 => unreachable, .rel8, .rel16, .rel32 => unreachable, - .al, .cl, .r8, .ax, .r16, .eax, .r32, .rax, .r64, .st, .mm, .xmm, .ymm => unreachable, + .al, .cl, .r8, .ax, .r16, .eax, .r32, .rax, .r64 => unreachable, + .st, .mm, .xmm0, .xmm, .ymm => unreachable, .m8, .rm8, .r32_m8 => 8, .m16, .rm16, .r32_m16, .r64_m16 => 16, .m32, .rm32, .xmm_m32 => 32, @@ -558,7 +565,7 @@ pub const Op = enum { .rm8, .rm16, .rm32, .rm64, .r32_m8, .r32_m16, .r64_m16, .st, .mm, .mm_m64, - .xmm, .xmm_m32, .xmm_m64, .xmm_m128, + .xmm0, .xmm, .xmm_m32, .xmm_m64, .xmm_m128, .ymm, .ymm_m256, => true, else => false, @@ -612,7 +619,7 @@ pub const Op = enum { .sreg => .segment, .st => .x87, .mm, .mm_m64 => .mmx, - .xmm, .xmm_m32, .xmm_m64, .xmm_m128 => .sse, + .xmm0, .xmm, .xmm_m32, .xmm_m64, .xmm_m128 => .sse, .ymm, .ymm_m256 => .sse, }; } @@ -629,7 +636,7 @@ pub const Op = enum { else => { if (op.isRegister() and target.isRegister()) { return switch (target) { - .cl, .al, .ax, .eax, .rax => op == target, + .cl, .al, .ax, .eax, .rax, .xmm0 => op == target, else => op.class() == target.class() and op.regBitSize() == target.regBitSize(), }; } diff --git a/src/arch/x86_64/Lower.zig b/src/arch/x86_64/Lower.zig index 65d2b64398..d77ddf3050 100644 --- a/src/arch/x86_64/Lower.zig +++ b/src/arch/x86_64/Lower.zig @@ -377,6 +377,7 @@ fn generic(lower: *Lower, inst: Mir.Inst) Error!void { .r => inst.data.r.fixes, .rr => inst.data.rr.fixes, .rrr => inst.data.rrr.fixes, + .rrrr => inst.data.rrrr.fixes, .rrri => inst.data.rrri.fixes, .rri_s, .rri_u => inst.data.rri.fixes, .ri_s, .ri_u => inst.data.ri.fixes, @@ -430,6 +431,12 @@ fn generic(lower: *Lower, inst: Mir.Inst) Error!void { .{ .reg = inst.data.rrr.r2 }, .{ .reg = inst.data.rrr.r3 }, }, + .rrrr => &.{ + .{ .reg = inst.data.rrrr.r1 }, + .{ .reg = inst.data.rrrr.r2 }, + .{ .reg = inst.data.rrrr.r3 }, + .{ .reg = inst.data.rrrr.r4 }, + }, .rrri => &.{ .{ .reg = inst.data.rrri.r1 }, .{ .reg = inst.data.rrri.r2 }, diff --git a/src/arch/x86_64/Mir.zig b/src/arch/x86_64/Mir.zig index 4483de858e..9f59a2afba 100644 --- a/src/arch/x86_64/Mir.zig +++ b/src/arch/x86_64/Mir.zig @@ -596,6 +596,16 @@ pub const Inst = struct { /// Replicate single floating-point values movsldup, + /// Blend packed single-precision floating-point values + /// Blend scalar single-precision floating-point values + /// Blend packed double-precision floating-point values + /// Blend scalar double-precision floating-point values + blend, + /// Variable blend packed single-precision floating-point values + /// Variable blend scalar single-precision floating-point values + /// Variable blend packed double-precision floating-point values + /// Variable blend scalar double-precision floating-point values + blendv, /// Extract packed floating-point values extract, /// Insert scalar single-precision floating-point value @@ -651,6 +661,9 @@ pub const Inst = struct { /// Register, register, register operands. /// Uses `rrr` payload. rrr, + /// Register, register, register, register operands. + /// Uses `rrrr` payload. + rrrr, /// Register, register, register, immediate (byte) operands. /// Uses `rrri` payload. 
rrri, @@ -870,6 +883,13 @@ pub const Inst = struct { r2: Register, r3: Register, }, + rrrr: struct { + fixes: Fixes = ._, + r1: Register, + r2: Register, + r3: Register, + r4: Register, + }, rrri: struct { fixes: Fixes = ._, r1: Register, diff --git a/src/arch/x86_64/encoder.zig b/src/arch/x86_64/encoder.zig index 0ce875240d..5f9a2f49b3 100644 --- a/src/arch/x86_64/encoder.zig +++ b/src/arch/x86_64/encoder.zig @@ -226,8 +226,8 @@ pub const Instruction = struct { else => { const mem_op = switch (data.op_en) { .m, .mi, .m1, .mc, .mr, .mri, .mrc, .mvr => inst.ops[0], - .rm, .rmi, .vmi => inst.ops[1], - .rvm, .rvmi => inst.ops[2], + .rm, .rmi, .rm0, .vmi => inst.ops[1], + .rvm, .rvmr, .rvmi => inst.ops[2], else => unreachable, }; switch (mem_op) { @@ -235,7 +235,7 @@ pub const Instruction = struct { const rm = switch (data.op_en) { .m, .mi, .m1, .mc, .vmi => enc.modRmExt(), .mr, .mri, .mrc => inst.ops[1].reg.lowEnc(), - .rm, .rmi, .rvm, .rvmi => inst.ops[0].reg.lowEnc(), + .rm, .rmi, .rm0, .rvm, .rvmr, .rvmi => inst.ops[0].reg.lowEnc(), .mvr => inst.ops[2].reg.lowEnc(), else => unreachable, }; @@ -245,7 +245,7 @@ pub const Instruction = struct { const op = switch (data.op_en) { .m, .mi, .m1, .mc, .vmi => .none, .mr, .mri, .mrc => inst.ops[1], - .rm, .rmi, .rvm, .rvmi => inst.ops[0], + .rm, .rmi, .rm0, .rvm, .rvmr, .rvmi => inst.ops[0], .mvr => inst.ops[2], else => unreachable, }; @@ -257,6 +257,7 @@ pub const Instruction = struct { switch (data.op_en) { .mi => try encodeImm(inst.ops[1].imm, data.ops[1], encoder), .rmi, .mri, .vmi => try encodeImm(inst.ops[2].imm, data.ops[2], encoder), + .rvmr => try encoder.imm8(@as(u8, inst.ops[3].reg.enc()) << 4), .rvmi => try encodeImm(inst.ops[3].imm, data.ops[3], encoder), else => {}, } @@ -298,7 +299,7 @@ pub const Instruction = struct { .i, .zi, .o, .oi, .d, .np => null, .fd => inst.ops[1].mem.base().reg, .td => inst.ops[0].mem.base().reg, - .rm, .rmi => if (inst.ops[1].isSegmentRegister()) + .rm, .rmi, .rm0 => if (inst.ops[1].isSegmentRegister()) switch (inst.ops[1]) { .reg => |reg| reg, .mem => |mem| mem.base().reg, @@ -314,7 +315,7 @@ pub const Instruction = struct { } else null, - .vmi, .rvm, .rvmi, .mvr => unreachable, + .vmi, .rvm, .rvmr, .rvmi, .mvr => unreachable, }; if (segment_override) |seg| { legacy.setSegmentOverride(seg); @@ -333,23 +334,23 @@ pub const Instruction = struct { switch (op_en) { .np, .i, .zi, .fd, .td, .d => {}, .o, .oi => rex.b = inst.ops[0].reg.isExtended(), - .m, .mi, .m1, .mc, .mr, .rm, .rmi, .mri, .mrc => { + .m, .mi, .m1, .mc, .mr, .rm, .rmi, .mri, .mrc, .rm0 => { const r_op = switch (op_en) { - .rm, .rmi => inst.ops[0], + .rm, .rmi, .rm0 => inst.ops[0], .mr, .mri, .mrc => inst.ops[1], else => .none, }; rex.r = r_op.isBaseExtended(); const b_x_op = switch (op_en) { - .rm, .rmi => inst.ops[1], + .rm, .rmi, .rm0 => inst.ops[1], .m, .mi, .m1, .mc, .mr, .mri, .mrc => inst.ops[0], else => unreachable, }; rex.b = b_x_op.isBaseExtended(); rex.x = b_x_op.isIndexExtended(); }, - .vmi, .rvm, .rvmi, .mvr => unreachable, + .vmi, .rvm, .rvmr, .rvmi, .mvr => unreachable, } try encoder.rex(rex); @@ -367,9 +368,9 @@ pub const Instruction = struct { switch (op_en) { .np, .i, .zi, .fd, .td, .d => {}, .o, .oi => vex.b = inst.ops[0].reg.isExtended(), - .m, .mi, .m1, .mc, .mr, .rm, .rmi, .mri, .mrc, .vmi, .rvm, .rvmi, .mvr => { + .m, .mi, .m1, .mc, .mr, .rm, .rmi, .mri, .mrc, .rm0, .vmi, .rvm, .rvmr, .rvmi, .mvr => { const r_op = switch (op_en) { - .rm, .rmi, .rvm, .rvmi => inst.ops[0], + .rm, .rmi, .rm0, .rvm, .rvmr, .rvmi => inst.ops[0], 
.mr, .mri, .mrc => inst.ops[1], .mvr => inst.ops[2], .m, .mi, .m1, .mc, .vmi => .none, @@ -378,9 +379,9 @@ pub const Instruction = struct { vex.r = r_op.isBaseExtended(); const b_x_op = switch (op_en) { - .rm, .rmi, .vmi => inst.ops[1], + .rm, .rmi, .rm0, .vmi => inst.ops[1], .m, .mi, .m1, .mc, .mr, .mri, .mrc, .mvr => inst.ops[0], - .rvm, .rvmi => inst.ops[2], + .rvm, .rvmr, .rvmi => inst.ops[2], else => unreachable, }; vex.b = b_x_op.isBaseExtended(); @@ -408,7 +409,7 @@ pub const Instruction = struct { switch (op_en) { else => {}, .vmi => vex.v = inst.ops[0].reg, - .rvm, .rvmi => vex.v = inst.ops[1].reg, + .rvm, .rvmr, .rvmi => vex.v = inst.ops[1].reg, } try encoder.vex(vex); diff --git a/src/arch/x86_64/encodings.zig b/src/arch/x86_64/encodings.zig index c326f4230a..e087f6dfc7 100644 --- a/src/arch/x86_64/encodings.zig +++ b/src/arch/x86_64/encodings.zig @@ -846,6 +846,8 @@ pub const table = [_]Entry{ .{ .andps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x54 }, 0, .none, .sse }, + .{ .cmpps, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x0f, 0xc2 }, 0, .none, .sse }, + .{ .cmpss, .rmi, &.{ .xmm, .xmm_m32, .imm8 }, &.{ 0xf3, 0x0f, 0xc2 }, 0, .none, .sse }, .{ .cvtpi2ps, .rm, &.{ .xmm, .mm_m64 }, &.{ 0x0f, 0x2a }, 0, .none, .sse }, @@ -917,6 +919,8 @@ pub const table = [_]Entry{ .{ .andpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x54 }, 0, .none, .sse2 }, + .{ .cmppd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0xc2 }, 0, .none, .sse2 }, + .{ .cmpsd, .rmi, &.{ .xmm, .xmm_m64, .imm8 }, &.{ 0xf2, 0x0f, 0xc2 }, 0, .none, .sse2 }, .{ .cvtdq2pd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf3, 0x0f, 0xe6 }, 0, .none, .sse2 }, @@ -1085,6 +1089,14 @@ pub const table = [_]Entry{ .{ .movsldup, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0xf3, 0x0f, 0x12 }, 0, .none, .sse3 }, // SSE4.1 + .{ .blendpd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0d }, 0, .none, .sse4_1 }, + + .{ .blendps, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0c }, 0, .none, .sse4_1 }, + + .{ .blendvpd, .rm0, &.{ .xmm, .xmm_m128, .xmm0 }, &.{ 0x66, 0x0f, 0x38, 0x15 }, 0, .none, .sse4_1 }, + + .{ .blendvps, .rm0, &.{ .xmm, .xmm_m128, .xmm0 }, &.{ 0x66, 0x0f, 0x38, 0x14 }, 0, .none, .sse4_1 }, + .{ .extractps, .mri, &.{ .rm32, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x17 }, 0, .none, .sse4_1 }, .{ .insertps, .rmi, &.{ .xmm, .xmm_m32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x21 }, 0, .none, .sse4_1 }, @@ -1146,11 +1158,33 @@ pub const table = [_]Entry{ .{ .vandps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x0f, 0x54 }, 0, .vex_128_wig, .avx }, .{ .vandps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x0f, 0x54 }, 0, .vex_256_wig, .avx }, + .{ .vblendpd, .rvmi, &.{ .xmm, .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0d }, 0, .vex_128_wig, .avx }, + .{ .vblendpd, .rvmi, &.{ .ymm, .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0d }, 0, .vex_256_wig, .avx }, + + .{ .vblendps, .rvmi, &.{ .xmm, .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0c }, 0, .vex_128_wig, .avx }, + .{ .vblendps, .rvmi, &.{ .ymm, .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x0c }, 0, .vex_256_wig, .avx }, + + .{ .vblendvpd, .rvmr, &.{ .xmm, .xmm, .xmm_m128, .xmm }, &.{ 0x66, 0x0f, 0x3a, 0x4b }, 0, .vex_128_w0, .avx }, + .{ .vblendvpd, .rvmr, &.{ .ymm, .ymm, .ymm_m256, .ymm }, &.{ 0x66, 0x0f, 0x3a, 0x4b }, 0, .vex_256_w0, .avx }, + + .{ .vblendvps, .rvmr, &.{ .xmm, .xmm, .xmm_m128, .xmm }, &.{ 0x66, 0x0f, 0x3a, 0x4a }, 0, .vex_128_w0, .avx }, + .{ .vblendvps, .rvmr, &.{ .ymm, .ymm, .ymm_m256, .ymm }, &.{ 0x66, 0x0f, 0x3a, 0x4a }, 0, .vex_256_w0, .avx }, + .{ 
.vbroadcastss, .rm, &.{ .xmm, .m32 }, &.{ 0x66, 0x0f, 0x38, 0x18 }, 0, .vex_128_w0, .avx }, .{ .vbroadcastss, .rm, &.{ .ymm, .m32 }, &.{ 0x66, 0x0f, 0x38, 0x18 }, 0, .vex_256_w0, .avx }, .{ .vbroadcastsd, .rm, &.{ .ymm, .m64 }, &.{ 0x66, 0x0f, 0x38, 0x19 }, 0, .vex_256_w0, .avx }, .{ .vbroadcastf128, .rm, &.{ .ymm, .m128 }, &.{ 0x66, 0x0f, 0x38, 0x1a }, 0, .vex_256_w0, .avx }, + .{ .vcmppd, .rvmi, &.{ .xmm, .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0xc2 }, 0, .vex_128_wig, .avx }, + .{ .vcmppd, .rvmi, &.{ .ymm, .ymm, .ymm_m256, .imm8 }, &.{ 0x66, 0x0f, 0xc2 }, 0, .vex_256_wig, .avx }, + + .{ .vcmpps, .rvmi, &.{ .xmm, .xmm, .xmm_m128, .imm8 }, &.{ 0x0f, 0xc2 }, 0, .vex_128_wig, .avx }, + .{ .vcmpps, .rvmi, &.{ .ymm, .ymm, .ymm_m256, .imm8 }, &.{ 0x0f, 0xc2 }, 0, .vex_256_wig, .avx }, + + .{ .vcmpsd, .rvmi, &.{ .xmm, .xmm, .xmm_m64, .imm8 }, &.{ 0xf2, 0x0f, 0xc2 }, 0, .vex_lig_wig, .avx }, + + .{ .vcmpss, .rvmi, &.{ .xmm, .xmm, .xmm_m32, .imm8 }, &.{ 0xf3, 0x0f, 0xc2 }, 0, .vex_lig_wig, .avx }, + .{ .vcvtdq2pd, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf3, 0x0f, 0xe6 }, 0, .vex_128_wig, .avx }, .{ .vcvtdq2pd, .rm, &.{ .ymm, .xmm_m128 }, &.{ 0xf3, 0x0f, 0xe6 }, 0, .vex_256_wig, .avx }, diff --git a/test/behavior/maximum_minimum.zig b/test/behavior/maximum_minimum.zig index ecfe596760..db6cad221f 100644 --- a/test/behavior/maximum_minimum.zig +++ b/test/behavior/maximum_minimum.zig @@ -24,7 +24,8 @@ test "@max" { test "@max on vectors" { if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO - if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO + if (builtin.zig_backend == .stage2_x86_64 and + !comptime std.Target.x86.featureSetHas(builtin.cpu.features, .sse4_1)) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO @@ -72,7 +73,8 @@ test "@min" { test "@min for vectors" { if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; // TODO - if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; // TODO + if (builtin.zig_backend == .stage2_x86_64 and + !comptime std.Target.x86.featureSetHas(builtin.cpu.features, .sse4_1)) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; // TODO if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO -- cgit v1.2.3 From 35da95fe8765874a1ccffb0d7bfd523b14f44a4a Mon Sep 17 00:00:00 2001 From: Jacob Young Date: Wed, 17 May 2023 00:23:11 -0400 Subject: x86_64: implement integer vector `@truncate` --- src/arch/x86_64/CodeGen.zig | 124 +++++++++++++++++++++++++++++++++++------- src/arch/x86_64/Encoding.zig | 3 + src/arch/x86_64/Mir.zig | 8 +++ src/arch/x86_64/encodings.zig | 21 +++++++ test/behavior/truncate.zig | 1 - 5 files changed, 136 insertions(+), 21 deletions(-) (limited to 'src/arch/x86_64/Encoding.zig') diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig index bdcbed2629..a258f732f0 100644 --- a/src/arch/x86_64/CodeGen.zig +++ b/src/arch/x86_64/CodeGen.zig @@ -2709,28 +2709,112 @@ fn airTrunc(self: *Self, inst: Air.Inst.Index) !void { const ty_op = self.air.instructions.items(.data)[inst].ty_op; const dst_ty = self.air.typeOfIndex(inst); - const dst_abi_size = dst_ty.abiSize(self.target.*); - if (dst_abi_size > 8) { - return 
self.fail("TODO implement trunc for abi sizes larger than 8", .{}); - } + const dst_abi_size = @intCast(u32, dst_ty.abiSize(self.target.*)); + const src_ty = self.air.typeOf(ty_op.operand); + const src_abi_size = @intCast(u32, src_ty.abiSize(self.target.*)); - const src_mcv = try self.resolveInst(ty_op.operand); - const src_lock = switch (src_mcv) { - .register => |reg| self.register_manager.lockRegAssumeUnused(reg), - else => null, - }; - defer if (src_lock) |lock| self.register_manager.unlockReg(lock); + const result = result: { + const src_mcv = try self.resolveInst(ty_op.operand); + const src_lock = + if (src_mcv.getReg()) |reg| self.register_manager.lockRegAssumeUnused(reg) else null; + defer if (src_lock) |lock| self.register_manager.unlockReg(lock); - const dst_mcv = if (src_mcv.isRegister() and self.reuseOperand(inst, ty_op.operand, 0, src_mcv)) - src_mcv - else - try self.copyToRegisterWithInstTracking(inst, dst_ty, src_mcv); + const dst_mcv = if (src_mcv.isRegister() and self.reuseOperand(inst, ty_op.operand, 0, src_mcv)) + src_mcv + else + try self.copyToRegisterWithInstTracking(inst, dst_ty, src_mcv); + + if (dst_ty.zigTypeTag() == .Vector) { + assert(src_ty.zigTypeTag() == .Vector and dst_ty.vectorLen() == src_ty.vectorLen()); + const dst_info = dst_ty.childType().intInfo(self.target.*); + const src_info = src_ty.childType().intInfo(self.target.*); + const mir_tag = if (@as(?Mir.Inst.FixedTag, switch (dst_info.bits) { + 8 => switch (src_info.bits) { + 16 => switch (dst_ty.vectorLen()) { + 1...8 => if (self.hasFeature(.avx)) .{ .vp_b, .ackusw } else .{ .p_b, .ackusw }, + 9...16 => if (self.hasFeature(.avx2)) .{ .vp_b, .ackusw } else null, + else => null, + }, + else => null, + }, + 16 => switch (src_info.bits) { + 32 => switch (dst_ty.vectorLen()) { + 1...4 => if (self.hasFeature(.avx)) + .{ .vp_w, .ackusd } + else if (self.hasFeature(.sse4_1)) + .{ .p_w, .ackusd } + else + null, + 5...8 => if (self.hasFeature(.avx2)) .{ .vp_w, .ackusd } else null, + else => null, + }, + else => null, + }, + else => null, + })) |tag| tag else return self.fail("TODO implement airTrunc for {}", .{ + dst_ty.fmt(self.bin_file.options.module.?), + }); - // when truncating a `u16` to `u5`, for example, those top 3 bits in the result - // have to be removed. this only happens if the dst if not a power-of-two size. 
- if (self.regExtraBits(dst_ty) > 0) try self.truncateRegister(dst_ty, dst_mcv.register.to64()); + var mask_pl = Value.Payload.U64{ + .base = .{ .tag = .int_u64 }, + .data = @as(u64, math.maxInt(u64)) >> @intCast(u6, 64 - dst_info.bits), + }; + const mask_val = Value.initPayload(&mask_pl.base); - return self.finishAir(inst, dst_mcv, .{ ty_op.operand, .none, .none }); + var splat_pl = Value.Payload.SubValue{ + .base = .{ .tag = .repeated }, + .data = mask_val, + }; + const splat_val = Value.initPayload(&splat_pl.base); + + var full_pl = Type.Payload.Array{ + .base = .{ .tag = .vector }, + .data = .{ + .len = @divExact(@as(u64, if (src_abi_size > 16) 256 else 128), src_info.bits), + .elem_type = src_ty.childType(), + }, + }; + const full_ty = Type.initPayload(&full_pl.base); + const full_abi_size = @intCast(u32, full_ty.abiSize(self.target.*)); + + const splat_mcv = try self.genTypedValue(.{ .ty = full_ty, .val = splat_val }); + const splat_addr_mcv: MCValue = switch (splat_mcv) { + .memory, .indirect, .load_frame => splat_mcv.address(), + else => .{ .register = try self.copyToTmpRegister(Type.usize, splat_mcv.address()) }, + }; + + const dst_reg = registerAlias(dst_mcv.getReg().?, src_abi_size); + if (self.hasFeature(.avx)) { + try self.asmRegisterRegisterMemory( + .{ .vp_, .@"and" }, + dst_reg, + dst_reg, + splat_addr_mcv.deref().mem(Memory.PtrSize.fromSize(full_abi_size)), + ); + try self.asmRegisterRegisterRegister(mir_tag, dst_reg, dst_reg, dst_reg); + } else { + try self.asmRegisterMemory( + .{ .p_, .@"and" }, + dst_reg, + splat_addr_mcv.deref().mem(Memory.PtrSize.fromSize(full_abi_size)), + ); + try self.asmRegisterRegister(mir_tag, dst_reg, dst_reg); + } + break :result dst_mcv; + } + + if (dst_abi_size > 8) { + return self.fail("TODO implement trunc for abi sizes larger than 8", .{}); + } + + // when truncating a `u16` to `u5`, for example, those top 3 bits in the result + // have to be removed. this only happens if the dst is not a power-of-two size.
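+ // for the `u5` example that means keeping only the low 5 bits (mask 0x1f); + // truncateRegister below performs that normalization on dst_mcv's register.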
+ if (self.regExtraBits(dst_ty) > 0) + try self.truncateRegister(dst_ty, dst_mcv.register.to64()); + + break :result dst_mcv; + }; + return self.finishAir(inst, result, .{ ty_op.operand, .none, .none }); } fn airBoolToInt(self: *Self, inst: Air.Inst.Index) !void { @@ -11081,8 +11165,8 @@ fn airSelect(self: *Self, inst: Air.Inst.Index) !void { } fn airShuffle(self: *Self, inst: Air.Inst.Index) !void { - const ty_op = self.air.instructions.items(.data)[inst].ty_op; - _ = ty_op; + const ty_pl = self.air.instructions.items(.data)[inst].ty_pl; + _ = ty_pl; return self.fail("TODO implement airShuffle for x86_64", .{}); //return self.finishAir(inst, result, .{ ty_op.operand, .none, .none }); } diff --git a/src/arch/x86_64/Encoding.zig b/src/arch/x86_64/Encoding.zig index 0aaf12013d..6ed0aeeff4 100644 --- a/src/arch/x86_64/Encoding.zig +++ b/src/arch/x86_64/Encoding.zig @@ -263,6 +263,7 @@ pub const Mnemonic = enum { fisttp, fld, // MMX movd, movq, + packssdw, packsswb, packuswb, paddb, paddd, paddq, paddsb, paddsw, paddusb, paddusw, paddw, pand, pandn, por, pxor, pmulhw, pmullw, @@ -319,6 +320,7 @@ pub const Mnemonic = enum { blendpd, blendps, blendvpd, blendvps, extractps, insertps, + packusdw, pextrb, pextrd, pextrq, pinsrb, pinsrd, pinsrq, pmaxsb, pmaxsd, pmaxud, pmaxuw, pminsb, pminsd, pminud, pminuw, @@ -351,6 +353,7 @@ pub const Mnemonic = enum { vmovupd, vmovups, vmulpd, vmulps, vmulsd, vmulss, vorpd, vorps, + vpackssdw, vpacksswb, vpackusdw, vpackuswb, vpaddb, vpaddd, vpaddq, vpaddsb, vpaddsw, vpaddusb, vpaddusw, vpaddw, vpand, vpandn, vpextrb, vpextrd, vpextrq, vpextrw, diff --git a/src/arch/x86_64/Mir.zig b/src/arch/x86_64/Mir.zig index 9f59a2afba..96b7742929 100644 --- a/src/arch/x86_64/Mir.zig +++ b/src/arch/x86_64/Mir.zig @@ -446,6 +446,12 @@ pub const Inst = struct { /// Bitwise logical xor of packed double-precision floating-point values xor, + /// Pack with signed saturation + ackssw, + /// Pack with signed saturation + ackssd, + /// Pack with unsigned saturation + ackusw, /// Add packed signed integers with signed saturation adds, /// Add packed unsigned integers with unsigned saturation @@ -596,6 +602,8 @@ pub const Inst = struct { /// Replicate single floating-point values movsldup, + /// Pack with unsigned saturation + ackusd, /// Blend packed single-precision floating-point values /// Blend scalar single-precision floating-point values /// Blend packed double-precision floating-point values diff --git a/src/arch/x86_64/encodings.zig b/src/arch/x86_64/encodings.zig index e087f6dfc7..a0cd1af0a7 100644 --- a/src/arch/x86_64/encodings.zig +++ b/src/arch/x86_64/encodings.zig @@ -996,6 +996,11 @@ pub const table = [_]Entry{ .{ .orpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x56 }, 0, .none, .sse2 }, + .{ .packsswb, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x63 }, 0, .none, .sse2 }, + .{ .packssdw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x6b }, 0, .none, .sse2 }, + + .{ .packuswb, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x67 }, 0, .none, .sse2 }, + .{ .paddb, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xfc }, 0, .none, .sse2 }, .{ .paddw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xfd }, 0, .none, .sse2 }, .{ .paddd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xfe }, 0, .none, .sse2 }, @@ -1101,6 +1106,8 @@ pub const table = [_]Entry{ .{ .insertps, .rmi, &.{ .xmm, .xmm_m32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x21 }, 0, .none, .sse4_1 }, + .{ .packusdw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x2b }, 0, .none, .sse4_1 }, + .{ .pextrb, .mri, &.{ .r32_m8, 
.xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x14 }, 0, .none, .sse4_1 }, .{ .pextrd, .mri, &.{ .rm32, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .none, .sse4_1 }, .{ .pextrq, .mri, &.{ .rm64, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .long, .sse4_1 }, @@ -1346,6 +1353,13 @@ pub const table = [_]Entry{ .{ .vorps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x0f, 0x56 }, 0, .vex_128_wig, .avx }, .{ .vorps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x0f, 0x56 }, 0, .vex_256_wig, .avx }, + .{ .vpacksswb, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x63 }, 0, .vex_128_wig, .avx }, + .{ .vpackssdw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x6b }, 0, .vex_128_wig, .avx }, + + .{ .vpackusdw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x2b }, 0, .vex_128_wig, .avx }, + + .{ .vpackuswb, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x67 }, 0, .vex_128_wig, .avx }, + .{ .vpaddb, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xfc }, 0, .vex_128_wig, .avx }, .{ .vpaddw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xfd }, 0, .vex_128_wig, .avx }, .{ .vpaddd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xfe }, 0, .vex_128_wig, .avx }, @@ -1508,6 +1522,13 @@ pub const table = [_]Entry{ .{ .vbroadcastss, .rm, &.{ .ymm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x18 }, 0, .vex_256_w0, .avx2 }, .{ .vbroadcastsd, .rm, &.{ .ymm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x19 }, 0, .vex_256_w0, .avx2 }, + .{ .vpacksswb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x63 }, 0, .vex_256_wig, .avx2 }, + .{ .vpackssdw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x6b }, 0, .vex_256_wig, .avx2 }, + + .{ .vpackusdw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x2b }, 0, .vex_256_wig, .avx2 }, + + .{ .vpackuswb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x67 }, 0, .vex_256_wig, .avx2 }, + .{ .vpaddb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xfc }, 0, .vex_256_wig, .avx2 }, .{ .vpaddw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xfd }, 0, .vex_256_wig, .avx2 }, .{ .vpaddd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xfe }, 0, .vex_256_wig, .avx2 }, diff --git a/test/behavior/truncate.zig b/test/behavior/truncate.zig index 1db2f0280f..d3091487b4 100644 --- a/test/behavior/truncate.zig +++ b/test/behavior/truncate.zig @@ -61,7 +61,6 @@ test "truncate on comptime integer" { test "truncate on vectors" { if (builtin.zig_backend == .stage2_wasm) return error.SkipZigTest; - if (builtin.zig_backend == .stage2_x86_64) return error.SkipZigTest; if (builtin.zig_backend == .stage2_aarch64) return error.SkipZigTest; if (builtin.zig_backend == .stage2_arm) return error.SkipZigTest; if (builtin.zig_backend == .stage2_sparc64) return error.SkipZigTest; // TODO -- cgit v1.2.3 From 3db3cf77904e664d589287602c14168a7a63f125 Mon Sep 17 00:00:00 2001 From: Ali Chraghi Date: Tue, 23 May 2023 15:33:12 +0330 Subject: std.sort: add pdqsort and heapsort --- lib/std/compress/deflate/huffman_code.zig | 4 +- lib/std/compress/zstandard/decode/fse.zig | 2 +- lib/std/compress/zstandard/decode/huffman.zig | 2 +- lib/std/comptime_string_map.zig | 2 +- lib/std/debug.zig | 2 +- lib/std/enums.zig | 2 +- lib/std/http/Headers.zig | 2 +- lib/std/mem.zig | 28 + lib/std/meta.zig | 2 +- lib/std/multi_array_list.zig | 7 +- lib/std/net.zig | 2 +- lib/std/sort.zig | 1471 ++++--------------------- lib/std/sort/block.zig | 1066 ++++++++++++++++++ lib/std/sort/pdq.zig | 331 ++++++ src/Compilation.zig | 2 +- src/Package.zig | 2 +- src/RangeSet.zig | 2 +- src/Sema.zig | 2 +- 
src/arch/x86_64/CodeGen.zig | 2 +- src/arch/x86_64/Encoding.zig | 2 +- src/codegen/c/type.zig | 2 +- src/link/Coff.zig | 2 +- src/link/MachO/Object.zig | 8 +- src/link/MachO/UnwindInfo.zig | 2 +- src/link/MachO/dyld_info/Rebase.zig | 2 +- src/link/MachO/dyld_info/bind.zig | 2 +- src/link/MachO/zld.zig | 4 +- src/link/Wasm.zig | 10 +- src/objcopy.zig | 4 +- test/src/Cases.zig | 2 +- tools/gen_stubs.zig | 2 +- tools/generate_JSONTestSuite.zig | 2 +- tools/process_headers.zig | 2 +- tools/update-linux-headers.zig | 2 +- tools/update_clang_options.zig | 2 +- tools/update_cpu_features.zig | 8 +- tools/update_spirv_features.zig | 2 +- 37 files changed, 1702 insertions(+), 1291 deletions(-) create mode 100644 lib/std/sort/block.zig create mode 100644 lib/std/sort/pdq.zig (limited to 'src/arch/x86_64/Encoding.zig') diff --git a/lib/std/compress/deflate/huffman_code.zig b/lib/std/compress/deflate/huffman_code.zig index 4827feb245..689ac1441a 100644 --- a/lib/std/compress/deflate/huffman_code.zig +++ b/lib/std/compress/deflate/huffman_code.zig @@ -93,7 +93,7 @@ pub const HuffmanEncoder = struct { return; } self.lfs = list; - sort.sort(LiteralNode, self.lfs, {}, byFreq); + mem.sort(LiteralNode, self.lfs, {}, byFreq); // Get the number of literals for each bit count var bit_count = self.bitCounts(list, max_bits); @@ -270,7 +270,7 @@ pub const HuffmanEncoder = struct { var chunk = list[list.len - @intCast(u32, bits) ..]; self.lns = chunk; - sort.sort(LiteralNode, self.lns, {}, byLiteral); + mem.sort(LiteralNode, self.lns, {}, byLiteral); for (chunk) |node| { self.codes[node.literal] = HuffCode{ diff --git a/lib/std/compress/zstandard/decode/fse.zig b/lib/std/compress/zstandard/decode/fse.zig index 741fd81ccc..232af39ccf 100644 --- a/lib/std/compress/zstandard/decode/fse.zig +++ b/lib/std/compress/zstandard/decode/fse.zig @@ -107,7 +107,7 @@ fn buildFseTable(values: []const u16, entries: []Table.Fse) !void { position &= entries.len - 1; } } - std.sort.sort(u16, temp_states[0..probability], {}, std.sort.asc(u16)); + std.mem.sort(u16, temp_states[0..probability], {}, std.sort.asc(u16)); for (0..probability) |i| { entries[temp_states[i]] = if (i < double_state_count) Table.Fse{ .symbol = @intCast(u8, symbol), diff --git a/lib/std/compress/zstandard/decode/huffman.zig b/lib/std/compress/zstandard/decode/huffman.zig index 2914198268..f5e977d0da 100644 --- a/lib/std/compress/zstandard/decode/huffman.zig +++ b/lib/std/compress/zstandard/decode/huffman.zig @@ -124,7 +124,7 @@ fn assignSymbols(weight_sorted_prefixed_symbols: []LiteralsSection.HuffmanTree.P }; } - std.sort.sort( + std.mem.sort( LiteralsSection.HuffmanTree.PrefixedSymbol, weight_sorted_prefixed_symbols, weights, diff --git a/lib/std/comptime_string_map.zig b/lib/std/comptime_string_map.zig index 7620ec7af8..e6859c32c1 100644 --- a/lib/std/comptime_string_map.zig +++ b/lib/std/comptime_string_map.zig @@ -28,7 +28,7 @@ pub fn ComptimeStringMap(comptime V: type, comptime kvs_list: anytype) type { sorted_kvs[i] = .{ .key = kv.@"0", .value = {} }; } } - std.sort.sort(KV, &sorted_kvs, {}, lenAsc); + mem.sort(KV, &sorted_kvs, {}, lenAsc); const min_len = sorted_kvs[0].key.len; const max_len = sorted_kvs[sorted_kvs.len - 1].key.len; var len_indexes: [max_len + 1]usize = undefined; diff --git a/lib/std/debug.zig b/lib/std/debug.zig index ecc1a9f0cf..005c2b5404 100644 --- a/lib/std/debug.zig +++ b/lib/std/debug.zig @@ -1211,7 +1211,7 @@ fn readMachODebugInfo(allocator: mem.Allocator, macho_file: File) !ModuleDebugIn // Even though lld emits symbols in 
ascending order, this debug code // should work for programs linked in any valid way. // This sort is so that we can binary search later. - std.sort.sort(MachoSymbol, symbols, {}, MachoSymbol.addressLessThan); + mem.sort(MachoSymbol, symbols, {}, MachoSymbol.addressLessThan); return ModuleDebugInfo{ .base_address = undefined, diff --git a/lib/std/enums.zig b/lib/std/enums.zig index aa6edd60b1..757c616b9b 100644 --- a/lib/std/enums.zig +++ b/lib/std/enums.zig @@ -1314,7 +1314,7 @@ pub fn EnumIndexer(comptime E: type) type { } }; } - std.sort.sort(EnumField, &fields, {}, ascByValue); + std.mem.sort(EnumField, &fields, {}, ascByValue); const min = fields[0].value; const max = fields[fields.len - 1].value; const fields_len = fields.len; diff --git a/lib/std/http/Headers.zig b/lib/std/http/Headers.zig index 429df9368a..fb7a9360d8 100644 --- a/lib/std/http/Headers.zig +++ b/lib/std/http/Headers.zig @@ -191,7 +191,7 @@ pub const Headers = struct { /// Sorts the headers in lexicographical order. pub fn sort(headers: *Headers) void { - std.sort.sort(Field, headers.list.items, {}, Field.lessThan); + std.mem.sort(Field, headers.list.items, {}, Field.lessThan); headers.rebuildIndex(); } diff --git a/lib/std/mem.zig b/lib/std/mem.zig index 8cb2c00a3a..311c97c254 100644 --- a/lib/std/mem.zig +++ b/lib/std/mem.zig @@ -566,6 +566,34 @@ test "zeroInit" { }, nested_baz); } +pub fn sort( + comptime T: type, + items: []T, + context: anytype, + comptime lessThanFn: fn (@TypeOf(context), lhs: T, rhs: T) bool, +) void { + std.sort.block(T, items, context, lessThanFn); +} + +pub fn sortUnstable( + comptime T: type, + items: []T, + context: anytype, + comptime lessThanFn: fn (@TypeOf(context), lhs: T, rhs: T) bool, +) void { + std.sort.pdq(T, items, context, lessThanFn); +} + +/// TODO: currently this just calls `insertionSortContext`. The block sort implementation +/// in this file needs to be adapted to use the sort context. +pub fn sortContext(a: usize, b: usize, context: anytype) void { + std.sort.insertionContext(a, b, context); +} + +pub fn sortUnstableContext(a: usize, b: usize, context: anytype) void { + std.sort.pdqContext(a, b, context); +} + /// Compares two slices of numbers lexicographically. O(n). pub fn order(comptime T: type, lhs: []const T, rhs: []const T) math.Order { const n = math.min(lhs.len, rhs.len); diff --git a/lib/std/meta.zig b/lib/std/meta.zig index 8adba2439a..d0b07b934f 100644 --- a/lib/std/meta.zig +++ b/lib/std/meta.zig @@ -985,7 +985,7 @@ pub fn declList(comptime Namespace: type, comptime Decl: type) []const *const De for (decls, 0..) |decl, i| { array[i] = &@field(Namespace, decl.name); } - std.sort.sort(*const Decl, &array, {}, S.declNameLessThan); + mem.sort(*const Decl, &array, {}, S.declNameLessThan); return &array; } } diff --git a/lib/std/multi_array_list.zig b/lib/std/multi_array_list.zig index 322471bedf..44e226be33 100644 --- a/lib/std/multi_array_list.zig +++ b/lib/std/multi_array_list.zig @@ -160,7 +160,7 @@ pub fn MultiArrayList(comptime T: type) type { return lhs.alignment > rhs.alignment; } }; - std.sort.sort(Data, &data, {}, Sort.lessThan); + mem.sort(Data, &data, {}, Sort.lessThan); var sizes_bytes: [fields.len]usize = undefined; var field_indexes: [fields.len]usize = undefined; for (data, 0..) 
|elem, i| { @@ -488,10 +488,7 @@ pub fn MultiArrayList(comptime T: type) type { } }; - std.sort.sortContext(self.len, SortContext{ - .sub_ctx = ctx, - .slice = self.slice(), - }); + mem.sortContext(0, self.len, SortContext{ .sub_ctx = ctx, .slice = self.slice() }); } fn capacityInBytes(capacity: usize) usize { diff --git a/lib/std/net.zig b/lib/std/net.zig index 57e50a7349..7629ecc8f7 100644 --- a/lib/std/net.zig +++ b/lib/std/net.zig @@ -1082,7 +1082,7 @@ fn linuxLookupName( key |= (MAXADDRS - @intCast(i32, i)) << DAS_ORDER_SHIFT; addr.sortkey = key; } - std.sort.sort(LookupAddr, addrs.items, {}, addrCmpLessThan); + mem.sort(LookupAddr, addrs.items, {}, addrCmpLessThan); } const Policy = struct { diff --git a/lib/std/sort.zig b/lib/std/sort.zig index 3e219b8566..bf2bf40f89 100644 --- a/lib/std/sort.zig +++ b/lib/std/sort.zig @@ -4,1241 +4,152 @@ const testing = std.testing; const mem = std.mem; const math = std.math; -pub fn binarySearch( - comptime T: type, - key: anytype, - items: []const T, - context: anytype, - comptime compareFn: fn (context: @TypeOf(context), key: @TypeOf(key), mid_item: T) math.Order, -) ?usize { - var left: usize = 0; - var right: usize = items.len; - - while (left < right) { - // Avoid overflowing in the midpoint calculation - const mid = left + (right - left) / 2; - // Compare the key with the midpoint element - switch (compareFn(context, key, items[mid])) { - .eq => return mid, - .gt => left = mid + 1, - .lt => right = mid, - } - } - - return null; -} - -test "binarySearch" { - const S = struct { - fn order_u32(context: void, lhs: u32, rhs: u32) math.Order { - _ = context; - return math.order(lhs, rhs); - } - fn order_i32(context: void, lhs: i32, rhs: i32) math.Order { - _ = context; - return math.order(lhs, rhs); - } - }; - try testing.expectEqual( - @as(?usize, null), - binarySearch(u32, @as(u32, 1), &[_]u32{}, {}, S.order_u32), - ); - try testing.expectEqual( - @as(?usize, 0), - binarySearch(u32, @as(u32, 1), &[_]u32{1}, {}, S.order_u32), - ); - try testing.expectEqual( - @as(?usize, null), - binarySearch(u32, @as(u32, 1), &[_]u32{0}, {}, S.order_u32), - ); - try testing.expectEqual( - @as(?usize, null), - binarySearch(u32, @as(u32, 0), &[_]u32{1}, {}, S.order_u32), - ); - try testing.expectEqual( - @as(?usize, 4), - binarySearch(u32, @as(u32, 5), &[_]u32{ 1, 2, 3, 4, 5 }, {}, S.order_u32), - ); - try testing.expectEqual( - @as(?usize, 0), - binarySearch(u32, @as(u32, 2), &[_]u32{ 2, 4, 8, 16, 32, 64 }, {}, S.order_u32), - ); - try testing.expectEqual( - @as(?usize, 1), - binarySearch(i32, @as(i32, -4), &[_]i32{ -7, -4, 0, 9, 10 }, {}, S.order_i32), - ); - try testing.expectEqual( - @as(?usize, 3), - binarySearch(i32, @as(i32, 98), &[_]i32{ -100, -25, 2, 98, 99, 100 }, {}, S.order_i32), - ); - const R = struct { - b: i32, - e: i32, - - fn r(b: i32, e: i32) @This() { - return @This(){ .b = b, .e = e }; - } - - fn order(context: void, key: i32, mid_item: @This()) math.Order { - _ = context; - - if (key < mid_item.b) { - return .lt; - } - - if (key > mid_item.e) { - return .gt; - } - - return .eq; - } - }; - try testing.expectEqual( - @as(?usize, null), - binarySearch(R, @as(i32, -45), &[_]R{ R.r(-100, -50), R.r(-40, -20), R.r(-10, 20), R.r(30, 40) }, {}, R.order), - ); - try testing.expectEqual( - @as(?usize, 2), - binarySearch(R, @as(i32, 10), &[_]R{ R.r(-100, -50), R.r(-40, -20), R.r(-10, 20), R.r(30, 40) }, {}, R.order), - ); - try testing.expectEqual( - @as(?usize, 1), - binarySearch(R, @as(i32, -20), &[_]R{ R.r(-100, -50), R.r(-40, -20), R.r(-10, 20), 
R.r(30, 40) }, {}, R.order), - ); -} +pub const block = @import("sort/block.zig").block; +pub const pdq = @import("sort/pdq.zig").pdq; +pub const pdqContext = @import("sort/pdq.zig").pdqContext; /// Stable in-place sort. O(n) best case, O(pow(n, 2)) worst case. /// O(1) memory (no allocator required). /// Sorts in ascending order with respect to the given `lessThan` function. -/// This can be expressed in terms of `insertionSortContext` but the glue -/// code is slightly longer than the direct implementation. -pub fn insertionSort( +pub fn insertion( comptime T: type, items: []T, context: anytype, - comptime lessThan: fn (context: @TypeOf(context), lhs: T, rhs: T) bool, + comptime lessThanFn: fn (@TypeOf(context), lhs: T, rhs: T) bool, ) void { - var i: usize = 1; - while (i < items.len) : (i += 1) { - const x = items[i]; - var j: usize = i; - while (j > 0 and lessThan(context, x, items[j - 1])) : (j -= 1) { - items[j] = items[j - 1]; + const Context = struct { + items: []T, + sub_ctx: @TypeOf(context), + + pub fn lessThan(ctx: @This(), a: usize, b: usize) bool { + return lessThanFn(ctx.sub_ctx, ctx.items[a], ctx.items[b]); } - items[j] = x; - } + + pub fn swap(ctx: @This(), a: usize, b: usize) void { + return mem.swap(T, &ctx.items[a], &ctx.items[b]); + } + }; + insertionContext(0, items.len, Context{ .items = items, .sub_ctx = context }); } /// Stable in-place sort. O(n) best case, O(pow(n, 2)) worst case. /// O(1) memory (no allocator required). -/// Sorts in ascending order with respect to the given `context.lessThan` function. -pub fn insertionSortContext(len: usize, context: anytype) void { - var i: usize = 1; - while (i < len) : (i += 1) { - var j: usize = i; - while (j > 0 and context.lessThan(j, j - 1)) : (j -= 1) { +/// Sorts in ascending order with respect to the given `lessThan` function. 
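+/// The `context` must have methods `swap(a, b)` and `lessThan(a, b) bool`, +/// which take indexes into the half-open range `[a, b)` as parameters.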
+pub fn insertionContext(a: usize, b: usize, context: anytype) void { + var i = a + 1; + while (i < b) : (i += 1) { + var j = i; + while (j > a and context.lessThan(j, j - 1)) : (j -= 1) { context.swap(j, j - 1); } } } -const Range = struct { - start: usize, - end: usize, - - fn init(start: usize, end: usize) Range { - return Range{ - .start = start, - .end = end, - }; - } - - fn length(self: Range) usize { - return self.end - self.start; - } -}; - -const Iterator = struct { - size: usize, - power_of_two: usize, - numerator: usize, - decimal: usize, - denominator: usize, - decimal_step: usize, - numerator_step: usize, - - fn init(size2: usize, min_level: usize) Iterator { - const power_of_two = math.floorPowerOfTwo(usize, size2); - const denominator = power_of_two / min_level; - return Iterator{ - .numerator = 0, - .decimal = 0, - .size = size2, - .power_of_two = power_of_two, - .denominator = denominator, - .decimal_step = size2 / denominator, - .numerator_step = size2 % denominator, - }; - } - - fn begin(self: *Iterator) void { - self.numerator = 0; - self.decimal = 0; - } - - fn nextRange(self: *Iterator) Range { - const start = self.decimal; - - self.decimal += self.decimal_step; - self.numerator += self.numerator_step; - if (self.numerator >= self.denominator) { - self.numerator -= self.denominator; - self.decimal += 1; - } - - return Range{ - .start = start, - .end = self.decimal, - }; - } - - fn finished(self: *Iterator) bool { - return self.decimal >= self.size; - } - - fn nextLevel(self: *Iterator) bool { - self.decimal_step += self.decimal_step; - self.numerator_step += self.numerator_step; - if (self.numerator_step >= self.denominator) { - self.numerator_step -= self.denominator; - self.decimal_step += 1; - } - - return (self.decimal_step < self.size); - } - - fn length(self: *Iterator) usize { - return self.decimal_step; - } -}; - -const Pull = struct { - from: usize, - to: usize, - count: usize, - range: Range, -}; - -/// Stable in-place sort. O(n) best case, O(n*log(n)) worst case and average case. +/// Unstable in-place sort. O(n*log(n)) best case, worst case and average case. /// O(1) memory (no allocator required). /// Sorts in ascending order with respect to the given `lessThan` function. -/// Currently implemented as block sort. 
-pub fn sort( +pub fn heap( comptime T: type, items: []T, context: anytype, - comptime lessThan: fn (context: @TypeOf(context), lhs: T, rhs: T) bool, + comptime lessThanFn: fn (@TypeOf(context), lhs: T, rhs: T) bool, ) void { + const Context = struct { + items: []T, + sub_ctx: @TypeOf(context), - // Implementation ported from https://github.com/BonzaiThePenguin/WikiSort/blob/master/WikiSort.c - var cache: [512]T = undefined; - - if (items.len < 4) { - if (items.len == 3) { - // hard coded insertion sort - if (lessThan(context, items[1], items[0])) mem.swap(T, &items[0], &items[1]); - if (lessThan(context, items[2], items[1])) { - mem.swap(T, &items[1], &items[2]); - if (lessThan(context, items[1], items[0])) mem.swap(T, &items[0], &items[1]); - } - } else if (items.len == 2) { - if (lessThan(context, items[1], items[0])) mem.swap(T, &items[0], &items[1]); + pub fn lessThan(ctx: @This(), a: usize, b: usize) bool { + return lessThanFn(ctx.sub_ctx, ctx.items[a], ctx.items[b]); } - return; - } - - // sort groups of 4-8 items at a time using an unstable sorting network, - // but keep track of the original item orders to force it to be stable - // http://pages.ripco.net/~jgamble/nw.html - var iterator = Iterator.init(items.len, 4); - while (!iterator.finished()) { - var order = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7 }; - const range = iterator.nextRange(); - - const sliced_items = items[range.start..]; - switch (range.length()) { - 8 => { - swap(T, sliced_items, context, lessThan, &order, 0, 1); - swap(T, sliced_items, context, lessThan, &order, 2, 3); - swap(T, sliced_items, context, lessThan, &order, 4, 5); - swap(T, sliced_items, context, lessThan, &order, 6, 7); - swap(T, sliced_items, context, lessThan, &order, 0, 2); - swap(T, sliced_items, context, lessThan, &order, 1, 3); - swap(T, sliced_items, context, lessThan, &order, 4, 6); - swap(T, sliced_items, context, lessThan, &order, 5, 7); - swap(T, sliced_items, context, lessThan, &order, 1, 2); - swap(T, sliced_items, context, lessThan, &order, 5, 6); - swap(T, sliced_items, context, lessThan, &order, 0, 4); - swap(T, sliced_items, context, lessThan, &order, 3, 7); - swap(T, sliced_items, context, lessThan, &order, 1, 5); - swap(T, sliced_items, context, lessThan, &order, 2, 6); - swap(T, sliced_items, context, lessThan, &order, 1, 4); - swap(T, sliced_items, context, lessThan, &order, 3, 6); - swap(T, sliced_items, context, lessThan, &order, 2, 4); - swap(T, sliced_items, context, lessThan, &order, 3, 5); - swap(T, sliced_items, context, lessThan, &order, 3, 4); - }, - 7 => { - swap(T, sliced_items, context, lessThan, &order, 1, 2); - swap(T, sliced_items, context, lessThan, &order, 3, 4); - swap(T, sliced_items, context, lessThan, &order, 5, 6); - swap(T, sliced_items, context, lessThan, &order, 0, 2); - swap(T, sliced_items, context, lessThan, &order, 3, 5); - swap(T, sliced_items, context, lessThan, &order, 4, 6); - swap(T, sliced_items, context, lessThan, &order, 0, 1); - swap(T, sliced_items, context, lessThan, &order, 4, 5); - swap(T, sliced_items, context, lessThan, &order, 2, 6); - swap(T, sliced_items, context, lessThan, &order, 0, 4); - swap(T, sliced_items, context, lessThan, &order, 1, 5); - swap(T, sliced_items, context, lessThan, &order, 0, 3); - swap(T, sliced_items, context, lessThan, &order, 2, 5); - swap(T, sliced_items, context, lessThan, &order, 1, 3); - swap(T, sliced_items, context, lessThan, &order, 2, 4); - swap(T, sliced_items, context, lessThan, &order, 2, 3); - }, - 6 => { - swap(T, sliced_items, context, lessThan, 
&order, 1, 2); - swap(T, sliced_items, context, lessThan, &order, 4, 5); - swap(T, sliced_items, context, lessThan, &order, 0, 2); - swap(T, sliced_items, context, lessThan, &order, 3, 5); - swap(T, sliced_items, context, lessThan, &order, 0, 1); - swap(T, sliced_items, context, lessThan, &order, 3, 4); - swap(T, sliced_items, context, lessThan, &order, 2, 5); - swap(T, sliced_items, context, lessThan, &order, 0, 3); - swap(T, sliced_items, context, lessThan, &order, 1, 4); - swap(T, sliced_items, context, lessThan, &order, 2, 4); - swap(T, sliced_items, context, lessThan, &order, 1, 3); - swap(T, sliced_items, context, lessThan, &order, 2, 3); - }, - 5 => { - swap(T, sliced_items, context, lessThan, &order, 0, 1); - swap(T, sliced_items, context, lessThan, &order, 3, 4); - swap(T, sliced_items, context, lessThan, &order, 2, 4); - swap(T, sliced_items, context, lessThan, &order, 2, 3); - swap(T, sliced_items, context, lessThan, &order, 1, 4); - swap(T, sliced_items, context, lessThan, &order, 0, 3); - swap(T, sliced_items, context, lessThan, &order, 0, 2); - swap(T, sliced_items, context, lessThan, &order, 1, 3); - swap(T, sliced_items, context, lessThan, &order, 1, 2); - }, - 4 => { - swap(T, sliced_items, context, lessThan, &order, 0, 1); - swap(T, sliced_items, context, lessThan, &order, 2, 3); - swap(T, sliced_items, context, lessThan, &order, 0, 2); - swap(T, sliced_items, context, lessThan, &order, 1, 3); - swap(T, sliced_items, context, lessThan, &order, 1, 2); - }, - else => {}, - } - } - if (items.len < 8) return; - - // then merge sort the higher levels, which can be 8-15, 16-31, 32-63, 64-127, etc. - while (true) { - // if every A and B block will fit into the cache, use a special branch - // specifically for merging with the cache - // (we use < rather than <= since the block size might be one more than - // iterator.length()) - if (iterator.length() < cache.len) { - // if four subarrays fit into the cache, it's faster to merge both - // pairs of subarrays into the cache, - // then merge the two merged subarrays from the cache back into the original array - if ((iterator.length() + 1) * 4 <= cache.len and iterator.length() * 4 <= items.len) { - iterator.begin(); - while (!iterator.finished()) { - // merge A1 and B1 into the cache - var A1 = iterator.nextRange(); - var B1 = iterator.nextRange(); - var A2 = iterator.nextRange(); - var B2 = iterator.nextRange(); - - if (lessThan(context, items[B1.end - 1], items[A1.start])) { - // the two ranges are in reverse order, so copy them in reverse order into the cache - const a1_items = items[A1.start..A1.end]; - @memcpy(cache[B1.length()..][0..a1_items.len], a1_items); - const b1_items = items[B1.start..B1.end]; - @memcpy(cache[0..b1_items.len], b1_items); - } else if (lessThan(context, items[B1.start], items[A1.end - 1])) { - // these two ranges weren't already in order, so merge them into the cache - mergeInto(T, items, A1, B1, context, lessThan, cache[0..]); - } else { - // if A1, B1, A2, and B2 are all in order, skip doing anything else - if (!lessThan(context, items[B2.start], items[A2.end - 1]) and !lessThan(context, items[A2.start], items[B1.end - 1])) continue; - - // copy A1 and B1 into the cache in the same order - const a1_items = items[A1.start..A1.end]; - @memcpy(cache[0..a1_items.len], a1_items); - const b1_items = items[B1.start..B1.end]; - @memcpy(cache[A1.length()..][0..b1_items.len], b1_items); - } - A1 = Range.init(A1.start, B1.end); - - // merge A2 and B2 into the cache - if (lessThan(context, items[B2.end - 1], 
items[A2.start])) { - // the two ranges are in reverse order, so copy them in reverse order into the cache - const a2_items = items[A2.start..A2.end]; - @memcpy(cache[A1.length() + B2.length() ..][0..a2_items.len], a2_items); - const b2_items = items[B2.start..B2.end]; - @memcpy(cache[A1.length()..][0..b2_items.len], b2_items); - } else if (lessThan(context, items[B2.start], items[A2.end - 1])) { - // these two ranges weren't already in order, so merge them into the cache - mergeInto(T, items, A2, B2, context, lessThan, cache[A1.length()..]); - } else { - // copy A2 and B2 into the cache in the same order - const a2_items = items[A2.start..A2.end]; - @memcpy(cache[A1.length()..][0..a2_items.len], a2_items); - const b2_items = items[B2.start..B2.end]; - @memcpy(cache[A1.length() + A2.length() ..][0..b2_items.len], b2_items); - } - A2 = Range.init(A2.start, B2.end); - - // merge A1 and A2 from the cache into the items - const A3 = Range.init(0, A1.length()); - const B3 = Range.init(A1.length(), A1.length() + A2.length()); - - if (lessThan(context, cache[B3.end - 1], cache[A3.start])) { - // the two ranges are in reverse order, so copy them in reverse order into the items - const a3_items = cache[A3.start..A3.end]; - @memcpy(items[A1.start + A2.length() ..][0..a3_items.len], a3_items); - const b3_items = cache[B3.start..B3.end]; - @memcpy(items[A1.start..][0..b3_items.len], b3_items); - } else if (lessThan(context, cache[B3.start], cache[A3.end - 1])) { - // these two ranges weren't already in order, so merge them back into the items - mergeInto(T, cache[0..], A3, B3, context, lessThan, items[A1.start..]); - } else { - // copy A3 and B3 into the items in the same order - const a3_items = cache[A3.start..A3.end]; - @memcpy(items[A1.start..][0..a3_items.len], a3_items); - const b3_items = cache[B3.start..B3.end]; - @memcpy(items[A1.start + A1.length() ..][0..b3_items.len], b3_items); - } - } - - // we merged two levels at the same time, so we're done with this level already - // (iterator.nextLevel() is called again at the bottom of this outer merge loop) - _ = iterator.nextLevel(); - } else { - iterator.begin(); - while (!iterator.finished()) { - var A = iterator.nextRange(); - var B = iterator.nextRange(); - - if (lessThan(context, items[B.end - 1], items[A.start])) { - // the two ranges are in reverse order, so a simple rotation should fix it - mem.rotate(T, items[A.start..B.end], A.length()); - } else if (lessThan(context, items[B.start], items[A.end - 1])) { - // these two ranges weren't already in order, so we'll need to merge them! - const a_items = items[A.start..A.end]; - @memcpy(cache[0..a_items.len], a_items); - mergeExternal(T, items, A, B, context, lessThan, cache[0..]); - } - } - } - } else { - // this is where the in-place merge logic starts! - // 1. pull out two internal buffers each containing √A unique values - // 1a. adjust block_size and buffer_size if we couldn't find enough unique values - // 2. loop over the A and B subarrays within this level of the merge sort - // 3. break A and B into blocks of size 'block_size' - // 4. "tag" each of the A blocks with values from the first internal buffer - // 5. roll the A blocks through the B blocks and drop/rotate them where they belong - // 6. merge each A block with any B values that follow, using the cache or the second internal buffer - // 7. sort the second internal buffer if it exists - // 8. 
redistribute the two internal buffers back into the items - var block_size: usize = math.sqrt(iterator.length()); - var buffer_size = iterator.length() / block_size + 1; - - // as an optimization, we really only need to pull out the internal buffers once for each level of merges - // after that we can reuse the same buffers over and over, then redistribute it when we're finished with this level - var A: Range = undefined; - var B: Range = undefined; - var index: usize = 0; - var last: usize = 0; - var count: usize = 0; - var find: usize = 0; - var start: usize = 0; - var pull_index: usize = 0; - var pull = [_]Pull{ - Pull{ - .from = 0, - .to = 0, - .count = 0, - .range = Range.init(0, 0), - }, - Pull{ - .from = 0, - .to = 0, - .count = 0, - .range = Range.init(0, 0), - }, - }; - - var buffer1 = Range.init(0, 0); - var buffer2 = Range.init(0, 0); - - // find two internal buffers of size 'buffer_size' each - find = buffer_size + buffer_size; - var find_separately = false; - - if (block_size <= cache.len) { - // if every A block fits into the cache then we won't need the second internal buffer, - // so we really only need to find 'buffer_size' unique values - find = buffer_size; - } else if (find > iterator.length()) { - // we can't fit both buffers into the same A or B subarray, so find two buffers separately - find = buffer_size; - find_separately = true; - } - - // we need to find either a single contiguous space containing 2√A unique values (which will be split up into two buffers of size √A each), - // or we need to find one buffer of < 2√A unique values, and a second buffer of √A unique values, - // OR if we couldn't find that many unique values, we need the largest possible buffer we can get - - // in the case where it couldn't find a single buffer of at least √A unique values, - // all of the Merge steps must be replaced by a different merge algorithm (MergeInPlace) - iterator.begin(); - while (!iterator.finished()) { - A = iterator.nextRange(); - B = iterator.nextRange(); - - // just store information about where the values will be pulled from and to, - // as well as how many values there are, to create the two internal buffers - - // check A for the number of unique values we need to fill an internal buffer - // these values will be pulled out to the start of A - last = A.start; - count = 1; - while (count < find) : ({ - last = index; - count += 1; - }) { - index = findLastForward(T, items, items[last], Range.init(last + 1, A.end), context, lessThan, find - count); - if (index == A.end) break; - } - index = last; - - if (count >= buffer_size) { - // keep track of the range within the items where we'll need to "pull out" these values to create the internal buffer - pull[pull_index] = Pull{ - .range = Range.init(A.start, B.end), - .count = count, - .from = index, - .to = A.start, - }; - pull_index = 1; - - if (count == buffer_size + buffer_size) { - // we were able to find a single contiguous section containing 2√A unique values, - // so this section can be used to contain both of the internal buffers we'll need - buffer1 = Range.init(A.start, A.start + buffer_size); - buffer2 = Range.init(A.start + buffer_size, A.start + count); - break; - } else if (find == buffer_size + buffer_size) { - // we found a buffer that contains at least √A unique values, but did not contain the full 2√A unique values, - // so we still need to find a second separate buffer of at least √A unique values - buffer1 = Range.init(A.start, A.start + count); - find = buffer_size; - } else if (block_size <= 
cache.len) { - // we found the first and only internal buffer that we need, so we're done! - buffer1 = Range.init(A.start, A.start + count); - break; - } else if (find_separately) { - // found one buffer, but now find the other one - buffer1 = Range.init(A.start, A.start + count); - find_separately = false; - } else { - // we found a second buffer in an 'A' subarray containing √A unique values, so we're done! - buffer2 = Range.init(A.start, A.start + count); - break; - } - } else if (pull_index == 0 and count > buffer1.length()) { - // keep track of the largest buffer we were able to find - buffer1 = Range.init(A.start, A.start + count); - pull[pull_index] = Pull{ - .range = Range.init(A.start, B.end), - .count = count, - .from = index, - .to = A.start, - }; - } - - // check B for the number of unique values we need to fill an internal buffer - // these values will be pulled out to the end of B - last = B.end - 1; - count = 1; - while (count < find) : ({ - last = index - 1; - count += 1; - }) { - index = findFirstBackward(T, items, items[last], Range.init(B.start, last), context, lessThan, find - count); - if (index == B.start) break; - } - index = last; - if (count >= buffer_size) { - // keep track of the range within the items where we'll need to "pull out" these values to create the internal buffe - pull[pull_index] = Pull{ - .range = Range.init(A.start, B.end), - .count = count, - .from = index, - .to = B.end, - }; - pull_index = 1; - - if (count == buffer_size + buffer_size) { - // we were able to find a single contiguous section containing 2√A unique values, - // so this section can be used to contain both of the internal buffers we'll need - buffer1 = Range.init(B.end - count, B.end - buffer_size); - buffer2 = Range.init(B.end - buffer_size, B.end); - break; - } else if (find == buffer_size + buffer_size) { - // we found a buffer that contains at least √A unique values, but did not contain the full 2√A unique values, - // so we still need to find a second separate buffer of at least √A unique values - buffer1 = Range.init(B.end - count, B.end); - find = buffer_size; - } else if (block_size <= cache.len) { - // we found the first and only internal buffer that we need, so we're done! - buffer1 = Range.init(B.end - count, B.end); - break; - } else if (find_separately) { - // found one buffer, but now find the other one - buffer1 = Range.init(B.end - count, B.end); - find_separately = false; - } else { - // buffer2 will be pulled out from a 'B' subarray, so if the first buffer was pulled out from the corresponding 'A' subarray, - // we need to adjust the end point for that A subarray so it knows to stop redistributing its values before reaching buffer2 - if (pull[0].range.start == A.start) pull[0].range.end -= pull[1].count; - - // we found a second buffer in an 'B' subarray containing √A unique values, so we're done! 
- buffer2 = Range.init(B.end - count, B.end); - break; - } - } else if (pull_index == 0 and count > buffer1.length()) { - // keep track of the largest buffer we were able to find - buffer1 = Range.init(B.end - count, B.end); - pull[pull_index] = Pull{ - .range = Range.init(A.start, B.end), - .count = count, - .from = index, - .to = B.end, - }; - } - } - - // pull out the two ranges so we can use them as internal buffers - pull_index = 0; - while (pull_index < 2) : (pull_index += 1) { - const length = pull[pull_index].count; - - if (pull[pull_index].to < pull[pull_index].from) { - // we're pulling the values out to the left, which means the start of an A subarray - index = pull[pull_index].from; - count = 1; - while (count < length) : (count += 1) { - index = findFirstBackward(T, items, items[index - 1], Range.init(pull[pull_index].to, pull[pull_index].from - (count - 1)), context, lessThan, length - count); - const range = Range.init(index + 1, pull[pull_index].from + 1); - mem.rotate(T, items[range.start..range.end], range.length() - count); - pull[pull_index].from = index + count; - } - } else if (pull[pull_index].to > pull[pull_index].from) { - // we're pulling values out to the right, which means the end of a B subarray - index = pull[pull_index].from + 1; - count = 1; - while (count < length) : (count += 1) { - index = findLastForward(T, items, items[index], Range.init(index, pull[pull_index].to), context, lessThan, length - count); - const range = Range.init(pull[pull_index].from, index - 1); - mem.rotate(T, items[range.start..range.end], count); - pull[pull_index].from = index - 1 - count; - } - } - } - - // adjust block_size and buffer_size based on the values we were able to pull out - buffer_size = buffer1.length(); - block_size = iterator.length() / buffer_size + 1; - - // the first buffer NEEDS to be large enough to tag each of the evenly sized A blocks, - // so this was originally here to test the math for adjusting block_size above - // assert((iterator.length() + 1)/block_size <= buffer_size); - - // now that the two internal buffers have been created, it's time to merge each A+B combination at this level of the merge sort! - iterator.begin(); - while (!iterator.finished()) { - A = iterator.nextRange(); - B = iterator.nextRange(); - - // remove any parts of A or B that are being used by the internal buffers - start = A.start; - if (start == pull[0].range.start) { - if (pull[0].from > pull[0].to) { - A.start += pull[0].count; - - // if the internal buffer takes up the entire A or B subarray, then there's nothing to merge - // this only happens for very small subarrays, like √4 = 2, 2 * (2 internal buffers) = 4, - // which also only happens when cache.len is small or 0 since it'd otherwise use MergeExternal - if (A.length() == 0) continue; - } else if (pull[0].from < pull[0].to) { - B.end -= pull[0].count; - if (B.length() == 0) continue; - } - } - if (start == pull[1].range.start) { - if (pull[1].from > pull[1].to) { - A.start += pull[1].count; - if (A.length() == 0) continue; - } else if (pull[1].from < pull[1].to) { - B.end -= pull[1].count; - if (B.length() == 0) continue; - } - } - - if (lessThan(context, items[B.end - 1], items[A.start])) { - // the two ranges are in reverse order, so a simple rotation should fix it - mem.rotate(T, items[A.start..B.end], A.length()); - } else if (lessThan(context, items[A.end], items[A.end - 1])) { - // these two ranges weren't already in order, so we'll need to merge them! 
- var findA: usize = undefined; - - // break the remainder of A into blocks. firstA is the uneven-sized first A block - var blockA = Range.init(A.start, A.end); - var firstA = Range.init(A.start, A.start + blockA.length() % block_size); - - // swap the first value of each A block with the value in buffer1 - var indexA = buffer1.start; - index = firstA.end; - while (index < blockA.end) : ({ - indexA += 1; - index += block_size; - }) { - mem.swap(T, &items[indexA], &items[index]); - } - - // start rolling the A blocks through the B blocks! - // whenever we leave an A block behind, we'll need to merge the previous A block with any B blocks that follow it, so track that information as well - var lastA = firstA; - var lastB = Range.init(0, 0); - var blockB = Range.init(B.start, B.start + math.min(block_size, B.length())); - blockA.start += firstA.length(); - indexA = buffer1.start; - - // if the first unevenly sized A block fits into the cache, copy it there for when we go to Merge it - // otherwise, if the second buffer is available, block swap the contents into that - if (lastA.length() <= cache.len) { - const last_a_items = items[lastA.start..lastA.end]; - @memcpy(cache[0..last_a_items.len], last_a_items); - } else if (buffer2.length() > 0) { - blockSwap(T, items, lastA.start, buffer2.start, lastA.length()); - } - - if (blockA.length() > 0) { - while (true) { - // if there's a previous B block and the first value of the minimum A block is <= the last value of the previous B block, - // then drop that minimum A block behind. or if there are no B blocks left then keep dropping the remaining A blocks. - if ((lastB.length() > 0 and !lessThan(context, items[lastB.end - 1], items[indexA])) or blockB.length() == 0) { - // figure out where to split the previous B block, and rotate it at the split - const B_split = binaryFirst(T, items, items[indexA], lastB, context, lessThan); - const B_remaining = lastB.end - B_split; - - // swap the minimum A block to the beginning of the rolling A blocks - var minA = blockA.start; - findA = minA + block_size; - while (findA < blockA.end) : (findA += block_size) { - if (lessThan(context, items[findA], items[minA])) { - minA = findA; - } - } - blockSwap(T, items, blockA.start, minA, block_size); - - // swap the first item of the previous A block back with its original value, which is stored in buffer1 - mem.swap(T, &items[blockA.start], &items[indexA]); - indexA += 1; - - // locally merge the previous A block with the B values that follow it - // if lastA fits into the external cache we'll use that (with MergeExternal), - // or if the second internal buffer exists we'll use that (with MergeInternal), - // or failing that we'll use a strictly in-place merge algorithm (MergeInPlace) - - if (lastA.length() <= cache.len) { - mergeExternal(T, items, lastA, Range.init(lastA.end, B_split), context, lessThan, cache[0..]); - } else if (buffer2.length() > 0) { - mergeInternal(T, items, lastA, Range.init(lastA.end, B_split), context, lessThan, buffer2); - } else { - mergeInPlace(T, items, lastA, Range.init(lastA.end, B_split), context, lessThan); - } - - if (buffer2.length() > 0 or block_size <= cache.len) { - // copy the previous A block into the cache or buffer2, since that's where we need it to be when we go to merge it anyway - if (block_size <= cache.len) { - @memcpy(cache[0..block_size], items[blockA.start..][0..block_size]); - } else { - blockSwap(T, items, blockA.start, buffer2.start, block_size); - } - - // this is equivalent to rotating, but faster - // the area 
normally taken up by the A block is either the contents of buffer2, or data we don't need anymore since we memcopied it - // either way, we don't need to retain the order of those items, so instead of rotating we can just block swap B to where it belongs - blockSwap(T, items, B_split, blockA.start + block_size - B_remaining, B_remaining); - } else { - // we are unable to use the 'buffer2' trick to speed up the rotation operation since buffer2 doesn't exist, so perform a normal rotation - mem.rotate(T, items[B_split .. blockA.start + block_size], blockA.start - B_split); - } - - // update the range for the remaining A blocks, and the range remaining from the B block after it was split - lastA = Range.init(blockA.start - B_remaining, blockA.start - B_remaining + block_size); - lastB = Range.init(lastA.end, lastA.end + B_remaining); - - // if there are no more A blocks remaining, this step is finished! - blockA.start += block_size; - if (blockA.length() == 0) break; - } else if (blockB.length() < block_size) { - // move the last B block, which is unevenly sized, to before the remaining A blocks, by using a rotation - // the cache is disabled here since it might contain the contents of the previous A block - mem.rotate(T, items[blockA.start..blockB.end], blockB.start - blockA.start); - - lastB = Range.init(blockA.start, blockA.start + blockB.length()); - blockA.start += blockB.length(); - blockA.end += blockB.length(); - blockB.end = blockB.start; - } else { - // roll the leftmost A block to the end by swapping it with the next B block - blockSwap(T, items, blockA.start, blockB.start, block_size); - lastB = Range.init(blockA.start, blockA.start + block_size); - - blockA.start += block_size; - blockA.end += block_size; - blockB.start += block_size; - - if (blockB.end > B.end - block_size) { - blockB.end = B.end; - } else { - blockB.end += block_size; - } - } - } - } - - // merge the last A block with the remaining B values - if (lastA.length() <= cache.len) { - mergeExternal(T, items, lastA, Range.init(lastA.end, B.end), context, lessThan, cache[0..]); - } else if (buffer2.length() > 0) { - mergeInternal(T, items, lastA, Range.init(lastA.end, B.end), context, lessThan, buffer2); - } else { - mergeInPlace(T, items, lastA, Range.init(lastA.end, B.end), context, lessThan); - } - } - } - - // when we're finished with this merge step we should have the one - // or two internal buffers left over, where the second buffer is all jumbled up - // insertion sort the second buffer, then redistribute the buffers - // back into the items using the opposite process used for creating the buffer - - // while an unstable sort like quicksort could be applied here, in benchmarks - // it was consistently slightly slower than a simple insertion sort, - // even for tens of millions of items. 
this may be because insertion - // sort is quite fast when the data is already somewhat sorted, like it is here - insertionSort(T, items[buffer2.start..buffer2.end], context, lessThan); - - pull_index = 0; - while (pull_index < 2) : (pull_index += 1) { - var unique = pull[pull_index].count * 2; - if (pull[pull_index].from > pull[pull_index].to) { - // the values were pulled out to the left, so redistribute them back to the right - var buffer = Range.init(pull[pull_index].range.start, pull[pull_index].range.start + pull[pull_index].count); - while (buffer.length() > 0) { - index = findFirstForward(T, items, items[buffer.start], Range.init(buffer.end, pull[pull_index].range.end), context, lessThan, unique); - const amount = index - buffer.end; - mem.rotate(T, items[buffer.start..index], buffer.length()); - buffer.start += (amount + 1); - buffer.end += amount; - unique -= 2; - } - } else if (pull[pull_index].from < pull[pull_index].to) { - // the values were pulled out to the right, so redistribute them back to the left - var buffer = Range.init(pull[pull_index].range.end - pull[pull_index].count, pull[pull_index].range.end); - while (buffer.length() > 0) { - index = findLastBackward(T, items, items[buffer.end - 1], Range.init(pull[pull_index].range.start, buffer.start), context, lessThan, unique); - const amount = buffer.start - index; - mem.rotate(T, items[index..buffer.end], amount); - buffer.start -= amount; - buffer.end -= (amount + 1); - unique -= 2; - } - } - } + pub fn swap(ctx: @This(), a: usize, b: usize) void { + return mem.swap(T, &ctx.items[a], &ctx.items[b]); } - - // double the size of each A and B subarray that will be merged in the next level - if (!iterator.nextLevel()) break; - } -} - -/// TODO currently this just calls `insertionSortContext`. The block sort implementation -/// in this file needs to be adapted to use the sort context. -pub fn sortContext(len: usize, context: anytype) void { - return insertionSortContext(len, context); -} - -// merge operation without a buffer -fn mergeInPlace( - comptime T: type, - items: []T, - A_arg: Range, - B_arg: Range, - context: anytype, - comptime lessThan: fn (@TypeOf(context), T, T) bool, -) void { - if (A_arg.length() == 0 or B_arg.length() == 0) return; - - // this just repeatedly binary searches into B and rotates A into position. - // the paper suggests using the 'rotation-based Hwang and Lin algorithm' here, - // but I decided to stick with this because it had better situational performance - // - // (Hwang and Lin is designed for merging subarrays of very different sizes, - // but WikiSort almost always uses subarrays that are roughly the same size) - // - // normally this is incredibly suboptimal, but this function is only called - // when none of the A or B blocks in any subarray contained 2√A unique values, - // which places a hard limit on the number of times this will ACTUALLY need - // to binary search and rotate. - // - // according to my analysis the worst case is √A rotations performed on √A items - // once the constant factors are removed, which ends up being O(n) - // - // again, this is NOT a general-purpose solution – it only works well in this case! 
- // kind of like how the O(n^2) insertion sort is used in some places - - var A = A_arg; - var B = B_arg; - - while (true) { - // find the first place in B where the first item in A needs to be inserted - const mid = binaryFirst(T, items, items[A.start], B, context, lessThan); - - // rotate A into place - const amount = mid - A.end; - mem.rotate(T, items[A.start..mid], A.length()); - if (B.end == mid) break; - - // calculate the new A and B ranges - B.start = mid; - A = Range.init(A.start + amount, B.start); - A.start = binaryLast(T, items, items[A.start], A, context, lessThan); - if (A.length() == 0) break; - } -} - -// merge operation using an internal buffer -fn mergeInternal( - comptime T: type, - items: []T, - A: Range, - B: Range, - context: anytype, - comptime lessThan: fn (@TypeOf(context), T, T) bool, - buffer: Range, -) void { - // whenever we find a value to add to the final array, swap it with the value that's already in that spot - // when this algorithm is finished, 'buffer' will contain its original contents, but in a different order - var A_count: usize = 0; - var B_count: usize = 0; - var insert: usize = 0; - - if (B.length() > 0 and A.length() > 0) { - while (true) { - if (!lessThan(context, items[B.start + B_count], items[buffer.start + A_count])) { - mem.swap(T, &items[A.start + insert], &items[buffer.start + A_count]); - A_count += 1; - insert += 1; - if (A_count >= A.length()) break; - } else { - mem.swap(T, &items[A.start + insert], &items[B.start + B_count]); - B_count += 1; - insert += 1; - if (B_count >= B.length()) break; - } - } - } - - // swap the remainder of A into the final array - blockSwap(T, items, buffer.start + A_count, A.start + insert, A.length() - A_count); -} - -fn blockSwap(comptime T: type, items: []T, start1: usize, start2: usize, block_size: usize) void { - var index: usize = 0; - while (index < block_size) : (index += 1) { - mem.swap(T, &items[start1 + index], &items[start2 + index]); - } -} - -// combine a linear search with a binary search to reduce the number of comparisons in situations -// where have some idea as to how many unique values there are and where the next value might be -fn findFirstForward( - comptime T: type, - items: []T, - value: T, - range: Range, - context: anytype, - comptime lessThan: fn (@TypeOf(context), T, T) bool, - unique: usize, -) usize { - if (range.length() == 0) return range.start; - const skip = math.max(range.length() / unique, @as(usize, 1)); - - var index = range.start + skip; - while (lessThan(context, items[index - 1], value)) : (index += skip) { - if (index >= range.end - skip) { - return binaryFirst(T, items, value, Range.init(index, range.end), context, lessThan); - } - } - - return binaryFirst(T, items, value, Range.init(index - skip, index), context, lessThan); -} - -fn findFirstBackward( - comptime T: type, - items: []T, - value: T, - range: Range, - context: anytype, - comptime lessThan: fn (@TypeOf(context), T, T) bool, - unique: usize, -) usize { - if (range.length() == 0) return range.start; - const skip = math.max(range.length() / unique, @as(usize, 1)); - - var index = range.end - skip; - while (index > range.start and !lessThan(context, items[index - 1], value)) : (index -= skip) { - if (index < range.start + skip) { - return binaryFirst(T, items, value, Range.init(range.start, index), context, lessThan); - } - } - - return binaryFirst(T, items, value, Range.init(index, index + skip), context, lessThan); -} - -fn findLastForward( - comptime T: type, - items: []T, - value: T, - range: Range, 
- context: anytype, - comptime lessThan: fn (@TypeOf(context), T, T) bool, - unique: usize, -) usize { - if (range.length() == 0) return range.start; - const skip = math.max(range.length() / unique, @as(usize, 1)); - - var index = range.start + skip; - while (!lessThan(context, value, items[index - 1])) : (index += skip) { - if (index >= range.end - skip) { - return binaryLast(T, items, value, Range.init(index, range.end), context, lessThan); - } - } - - return binaryLast(T, items, value, Range.init(index - skip, index), context, lessThan); -} - -fn findLastBackward( - comptime T: type, - items: []T, - value: T, - range: Range, - context: anytype, - comptime lessThan: fn (@TypeOf(context), T, T) bool, - unique: usize, -) usize { - if (range.length() == 0) return range.start; - const skip = math.max(range.length() / unique, @as(usize, 1)); - - var index = range.end - skip; - while (index > range.start and lessThan(context, value, items[index - 1])) : (index -= skip) { - if (index < range.start + skip) { - return binaryLast(T, items, value, Range.init(range.start, index), context, lessThan); - } - } - - return binaryLast(T, items, value, Range.init(index, index + skip), context, lessThan); + }; + heapContext(0, items.len, Context{ .items = items, .sub_ctx = context }); } -fn binaryFirst( - comptime T: type, - items: []T, - value: T, - range: Range, - context: anytype, - comptime lessThan: fn (@TypeOf(context), T, T) bool, -) usize { - var curr = range.start; - var size = range.length(); - if (range.start >= range.end) return range.end; - while (size > 0) { - const offset = size % 2; - - size /= 2; - const mid_item = items[curr + size]; - if (lessThan(context, mid_item, value)) { - curr += size + offset; - } +/// Unstable in-place sort. O(n*log(n)) best case, worst case and average case. +/// O(1) memory (no allocator required). +/// Sorts in ascending order with respect to the given `lessThan` function. +pub fn heapContext(a: usize, b: usize, context: anytype) void { + // build the heap in linear time. + var i = b / 2; + while (i > a) : (i -= 1) { + siftDown(i - 1, b, context); } - return curr; -} - -fn binaryLast( - comptime T: type, - items: []T, - value: T, - range: Range, - context: anytype, - comptime lessThan: fn (@TypeOf(context), T, T) bool, -) usize { - var curr = range.start; - var size = range.length(); - if (range.start >= range.end) return range.end; - while (size > 0) { - const offset = size % 2; - size /= 2; - const mid_item = items[curr + size]; - if (!lessThan(context, value, mid_item)) { - curr += size + offset; - } + // pop maximal elements from the heap. 
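+    // (each pass swaps the current maximum at index `a` with the last element of the
+    // shrinking heap, then sifts the new root back down, so the sorted suffix grows
+    // from the right one element per iteration)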
+ i = b; + while (i > a) : (i -= 1) { + context.swap(a, i - 1); + siftDown(a, i - 1, context); } - return curr; } -fn mergeInto( - comptime T: type, - from: []T, - A: Range, - B: Range, - context: anytype, - comptime lessThan: fn (@TypeOf(context), T, T) bool, - into: []T, -) void { - var A_index: usize = A.start; - var B_index: usize = B.start; - const A_last = A.end; - const B_last = B.end; - var insert_index: usize = 0; - +fn siftDown(root: usize, n: usize, context: anytype) void { + var node = root; while (true) { - if (!lessThan(context, from[B_index], from[A_index])) { - into[insert_index] = from[A_index]; - A_index += 1; - insert_index += 1; - if (A_index == A_last) { - // copy the remainder of B into the final array - const from_b = from[B_index..B_last]; - @memcpy(into[insert_index..][0..from_b.len], from_b); - break; - } - } else { - into[insert_index] = from[B_index]; - B_index += 1; - insert_index += 1; - if (B_index == B_last) { - // copy the remainder of A into the final array - const from_a = from[A_index..A_last]; - @memcpy(into[insert_index..][0..from_a.len], from_a); - break; - } - } - } -} - -fn mergeExternal( - comptime T: type, - items: []T, - A: Range, - B: Range, - context: anytype, - comptime lessThan: fn (@TypeOf(context), T, T) bool, - cache: []T, -) void { - // A fits into the cache, so use that instead of the internal buffer - var A_index: usize = 0; - var B_index: usize = B.start; - var insert_index: usize = A.start; - const A_last = A.length(); - const B_last = B.end; + var child = 2 * node + 1; + if (child >= n) break; - if (B.length() > 0 and A.length() > 0) { - while (true) { - if (!lessThan(context, items[B_index], cache[A_index])) { - items[insert_index] = cache[A_index]; - A_index += 1; - insert_index += 1; - if (A_index == A_last) break; - } else { - items[insert_index] = items[B_index]; - B_index += 1; - insert_index += 1; - if (B_index == B_last) break; - } + // choose the greater child. + if (child + 1 < n and context.lessThan(child, child + 1)) { + child += 1; } - } - // copy the remainder of A into the final array - const cache_a = cache[A_index..A_last]; - @memcpy(items[insert_index..][0..cache_a.len], cache_a); -} + // stop if the invariant holds at `node`. + if (!context.lessThan(node, child)) break; -fn swap( - comptime T: type, - items: []T, - context: anytype, - comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool, - order: *[8]u8, - x: usize, - y: usize, -) void { - if (lessThan(context, items[y], items[x]) or ((order.*)[x] > (order.*)[y] and !lessThan(context, items[x], items[y]))) { - mem.swap(T, &items[x], &items[y]); - mem.swap(u8, &(order.*)[x], &(order.*)[y]); + // swap `node` with the greater child, + // move one step down, and continue sifting. + context.swap(node, child); + node = child; } } -/// Use to generate a comparator function for a given type. e.g. `sort(u8, slice, {}, comptime asc(u8))`. +/// Use to generate a comparator function for a given type. e.g. `sort(u8, slice, {}, asc(u8))`. pub fn asc(comptime T: type) fn (void, T, T) bool { - const impl = struct { - fn inner(context: void, a: T, b: T) bool { - _ = context; + return struct { + pub fn inner(_: void, a: T, b: T) bool { return a < b; } - }; - - return impl.inner; + }.inner; } -/// Use to generate a comparator function for a given type. e.g. `sort(u8, slice, {}, comptime desc(u8))`. +/// Use to generate a comparator function for a given type. e.g. `sort(u8, slice, {}, desc(u8))`. 
pub fn desc(comptime T: type) fn (void, T, T) bool { - const impl = struct { - fn inner(context: void, a: T, b: T) bool { - _ = context; + return struct { + pub fn inner(_: void, a: T, b: T) bool { return a > b; } - }; - - return impl.inner; + }.inner; } +const asc_u8 = asc(u8); +const asc_i32 = asc(i32); +const desc_u8 = desc(u8); +const desc_i32 = desc(i32); + +const sort_funcs = &[_]fn (comptime type, anytype, anytype, comptime anytype) void{ + block, + pdq, + insertion, + heap, +}; + +const IdAndValue = struct { + id: usize, + value: i32, + + fn lessThan(context: void, a: IdAndValue, b: IdAndValue) bool { + _ = context; + return a.value < b.value; + } +}; + test "stable sort" { - try testStableSort(); - comptime try testStableSort(); -} -fn testStableSort() !void { - var expected = [_]IdAndValue{ + const expected = [_]IdAndValue{ IdAndValue{ .id = 0, .value = 0 }, IdAndValue{ .id = 1, .value = 0 }, IdAndValue{ .id = 2, .value = 0 }, @@ -1249,6 +160,7 @@ fn testStableSort() !void { IdAndValue{ .id = 1, .value = 2 }, IdAndValue{ .id = 2, .value = 2 }, }; + var cases = [_][9]IdAndValue{ [_]IdAndValue{ IdAndValue{ .id = 0, .value = 0 }, @@ -1273,26 +185,15 @@ fn testStableSort() !void { IdAndValue{ .id = 2, .value = 0 }, }, }; + for (&cases) |*case| { - insertionSort(IdAndValue, (case.*)[0..], {}, cmpByValue); + block(IdAndValue, (case.*)[0..], {}, IdAndValue.lessThan); for (case.*, 0..) |item, i| { try testing.expect(item.id == expected[i].id); try testing.expect(item.value == expected[i].value); } } } -const IdAndValue = struct { - id: usize, - value: i32, -}; -fn cmpByValue(context: void, a: IdAndValue, b: IdAndValue) bool { - return asc_i32(context, a.value, b.value); -} - -const asc_u8 = asc(u8); -const asc_i32 = asc(i32); -const desc_u8 = desc(u8); -const desc_i32 = desc(i32); test "sort" { const u8cases = [_][]const []const u8{ @@ -1322,14 +223,6 @@ test "sort" { }, }; - for (u8cases) |case| { - var buf: [8]u8 = undefined; - const slice = buf[0..case[0].len]; - @memcpy(slice, case[0]); - sort(u8, slice, {}, asc_u8); - try testing.expect(mem.eql(u8, slice, case[1])); - } - const i32cases = [_][]const []const i32{ &[_][]const i32{ &[_]i32{}, @@ -1357,12 +250,22 @@ test "sort" { }, }; - for (i32cases) |case| { - var buf: [8]i32 = undefined; - const slice = buf[0..case[0].len]; - @memcpy(slice, case[0]); - sort(i32, slice, {}, asc_i32); - try testing.expect(mem.eql(i32, slice, case[1])); + inline for (sort_funcs) |sortFn| { + for (u8cases) |case| { + var buf: [8]u8 = undefined; + const slice = buf[0..case[0].len]; + @memcpy(slice, case[0]); + sortFn(u8, slice, {}, asc_u8); + try testing.expect(mem.eql(u8, slice, case[1])); + } + + for (i32cases) |case| { + var buf: [8]i32 = undefined; + const slice = buf[0..case[0].len]; + @memcpy(slice, case[0]); + sortFn(i32, slice, {}, asc_i32); + try testing.expect(mem.eql(i32, slice, case[1])); + } } } @@ -1394,53 +297,139 @@ test "sort descending" { }, }; - for (rev_cases) |case| { - var buf: [8]i32 = undefined; - const slice = buf[0..case[0].len]; - @memcpy(slice, case[0]); - sort(i32, slice, {}, desc_i32); - try testing.expect(mem.eql(i32, slice, case[1])); + inline for (sort_funcs) |sortFn| { + for (rev_cases) |case| { + var buf: [8]i32 = undefined; + const slice = buf[0..case[0].len]; + @memcpy(slice, case[0]); + sortFn(i32, slice, {}, desc_i32); + try testing.expect(mem.eql(i32, slice, case[1])); + } } } -test "another sort case" { - var arr = [_]i32{ 5, 3, 1, 2, 4 }; - sort(i32, arr[0..], {}, asc_i32); - - try testing.expect(mem.eql(i32, 
&arr, &[_]i32{ 1, 2, 3, 4, 5 })); -} - test "sort fuzz testing" { var prng = std.rand.DefaultPrng.init(0x12345678); const random = prng.random(); const test_case_count = 10; - var i: usize = 0; - while (i < test_case_count) : (i += 1) { - try fuzzTest(random); + + inline for (sort_funcs) |sortFn| { + var i: usize = 0; + while (i < test_case_count) : (i += 1) { + const array_size = random.intRangeLessThan(usize, 0, 1000); + var array = try testing.allocator.alloc(i32, array_size); + defer testing.allocator.free(array); + // populate with random data + for (array) |*item| { + item.* = random.intRangeLessThan(i32, 0, 100); + } + sortFn(i32, array, {}, asc_i32); + try testing.expect(isSorted(i32, array, {}, asc_i32)); + } } } -var fixed_buffer_mem: [100 * 1024]u8 = undefined; +pub fn binarySearch( + comptime T: type, + key: anytype, + items: []const T, + context: anytype, + comptime compareFn: fn (context: @TypeOf(context), key: @TypeOf(key), mid_item: T) math.Order, +) ?usize { + var left: usize = 0; + var right: usize = items.len; -fn fuzzTest(rng: std.rand.Random) !void { - const array_size = rng.intRangeLessThan(usize, 0, 1000); - var array = try testing.allocator.alloc(IdAndValue, array_size); - defer testing.allocator.free(array); - // populate with random data - for (array, 0..) |*item, index| { - item.id = index; - item.value = rng.intRangeLessThan(i32, 0, 100); + while (left < right) { + // Avoid overflowing in the midpoint calculation + const mid = left + (right - left) / 2; + // Compare the key with the midpoint element + switch (compareFn(context, key, items[mid])) { + .eq => return mid, + .gt => left = mid + 1, + .lt => right = mid, + } } - sort(IdAndValue, array, {}, cmpByValue); - var index: usize = 1; - while (index < array.len) : (index += 1) { - if (array[index].value == array[index - 1].value) { - try testing.expect(array[index].id > array[index - 1].id); - } else { - try testing.expect(array[index].value > array[index - 1].value); + return null; +} + +test "binarySearch" { + const S = struct { + fn order_u32(context: void, lhs: u32, rhs: u32) math.Order { + _ = context; + return math.order(lhs, rhs); } - } + fn order_i32(context: void, lhs: i32, rhs: i32) math.Order { + _ = context; + return math.order(lhs, rhs); + } + }; + try testing.expectEqual( + @as(?usize, null), + binarySearch(u32, @as(u32, 1), &[_]u32{}, {}, S.order_u32), + ); + try testing.expectEqual( + @as(?usize, 0), + binarySearch(u32, @as(u32, 1), &[_]u32{1}, {}, S.order_u32), + ); + try testing.expectEqual( + @as(?usize, null), + binarySearch(u32, @as(u32, 1), &[_]u32{0}, {}, S.order_u32), + ); + try testing.expectEqual( + @as(?usize, null), + binarySearch(u32, @as(u32, 0), &[_]u32{1}, {}, S.order_u32), + ); + try testing.expectEqual( + @as(?usize, 4), + binarySearch(u32, @as(u32, 5), &[_]u32{ 1, 2, 3, 4, 5 }, {}, S.order_u32), + ); + try testing.expectEqual( + @as(?usize, 0), + binarySearch(u32, @as(u32, 2), &[_]u32{ 2, 4, 8, 16, 32, 64 }, {}, S.order_u32), + ); + try testing.expectEqual( + @as(?usize, 1), + binarySearch(i32, @as(i32, -4), &[_]i32{ -7, -4, 0, 9, 10 }, {}, S.order_i32), + ); + try testing.expectEqual( + @as(?usize, 3), + binarySearch(i32, @as(i32, 98), &[_]i32{ -100, -25, 2, 98, 99, 100 }, {}, S.order_i32), + ); + const R = struct { + b: i32, + e: i32, + + fn r(b: i32, e: i32) @This() { + return @This(){ .b = b, .e = e }; + } + + fn order(context: void, key: i32, mid_item: @This()) math.Order { + _ = context; + + if (key < mid_item.b) { + return .lt; + } + + if (key > mid_item.e) { + return 
.gt; + } + + return .eq; + } + }; + try testing.expectEqual( + @as(?usize, null), + binarySearch(R, @as(i32, -45), &[_]R{ R.r(-100, -50), R.r(-40, -20), R.r(-10, 20), R.r(30, 40) }, {}, R.order), + ); + try testing.expectEqual( + @as(?usize, 2), + binarySearch(R, @as(i32, 10), &[_]R{ R.r(-100, -50), R.r(-40, -20), R.r(-10, 20), R.r(30, 40) }, {}, R.order), + ); + try testing.expectEqual( + @as(?usize, 1), + binarySearch(R, @as(i32, -20), &[_]R{ R.r(-100, -50), R.r(-40, -20), R.r(-10, 20), R.r(30, 40) }, {}, R.order), + ); } pub fn argMin( diff --git a/lib/std/sort/block.zig b/lib/std/sort/block.zig new file mode 100644 index 0000000000..6c1be9c6c2 --- /dev/null +++ b/lib/std/sort/block.zig @@ -0,0 +1,1066 @@ +const std = @import("../std.zig"); +const sort = std.sort; +const math = std.math; +const mem = std.mem; + +const Range = struct { + start: usize, + end: usize, + + fn init(start: usize, end: usize) Range { + return Range{ + .start = start, + .end = end, + }; + } + + fn length(self: Range) usize { + return self.end - self.start; + } +}; + +const Iterator = struct { + size: usize, + power_of_two: usize, + numerator: usize, + decimal: usize, + denominator: usize, + decimal_step: usize, + numerator_step: usize, + + fn init(size2: usize, min_level: usize) Iterator { + const power_of_two = math.floorPowerOfTwo(usize, size2); + const denominator = power_of_two / min_level; + return Iterator{ + .numerator = 0, + .decimal = 0, + .size = size2, + .power_of_two = power_of_two, + .denominator = denominator, + .decimal_step = size2 / denominator, + .numerator_step = size2 % denominator, + }; + } + + fn begin(self: *Iterator) void { + self.numerator = 0; + self.decimal = 0; + } + + fn nextRange(self: *Iterator) Range { + const start = self.decimal; + + self.decimal += self.decimal_step; + self.numerator += self.numerator_step; + if (self.numerator >= self.denominator) { + self.numerator -= self.denominator; + self.decimal += 1; + } + + return Range{ + .start = start, + .end = self.decimal, + }; + } + + fn finished(self: *Iterator) bool { + return self.decimal >= self.size; + } + + fn nextLevel(self: *Iterator) bool { + self.decimal_step += self.decimal_step; + self.numerator_step += self.numerator_step; + if (self.numerator_step >= self.denominator) { + self.numerator_step -= self.denominator; + self.decimal_step += 1; + } + + return (self.decimal_step < self.size); + } + + fn length(self: *Iterator) usize { + return self.decimal_step; + } +}; + +const Pull = struct { + from: usize, + to: usize, + count: usize, + range: Range, +}; + +/// Stable in-place sort. O(n) best case, O(n*log(n)) worst case and average case. +/// O(1) memory (no allocator required). +/// Sorts in ascending order with respect to the given `lessThan` function. 
+///
+/// NOTE: the algorithm only works when the comparison is less-than or greater-than
+/// (See https://github.com/ziglang/zig/issues/8289)
+pub fn block(
+    comptime T: type,
+    items: []T,
+    context: anytype,
+    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
+) void {
+
+    // Implementation ported from https://github.com/BonzaiThePenguin/WikiSort/blob/master/WikiSort.c
+    var cache: [512]T = undefined;
+
+    if (items.len < 4) {
+        if (items.len == 3) {
+            // hard coded insertion sort
+            if (lessThan(context, items[1], items[0])) mem.swap(T, &items[0], &items[1]);
+            if (lessThan(context, items[2], items[1])) {
+                mem.swap(T, &items[1], &items[2]);
+                if (lessThan(context, items[1], items[0])) mem.swap(T, &items[0], &items[1]);
+            }
+        } else if (items.len == 2) {
+            if (lessThan(context, items[1], items[0])) mem.swap(T, &items[0], &items[1]);
+        }
+        return;
+    }
+
+    // sort groups of 4-8 items at a time using an unstable sorting network,
+    // but keep track of the original item orders to force it to be stable
+    // http://pages.ripco.net/~jgamble/nw.html
+    var iterator = Iterator.init(items.len, 4);
+    while (!iterator.finished()) {
+        var order = [_]u8{ 0, 1, 2, 3, 4, 5, 6, 7 };
+        const range = iterator.nextRange();
+
+        const sliced_items = items[range.start..];
+        switch (range.length()) {
+            8 => {
+                swap(T, sliced_items, &order, 0, 1, context, lessThan);
+                swap(T, sliced_items, &order, 2, 3, context, lessThan);
+                swap(T, sliced_items, &order, 4, 5, context, lessThan);
+                swap(T, sliced_items, &order, 6, 7, context, lessThan);
+                swap(T, sliced_items, &order, 0, 2, context, lessThan);
+                swap(T, sliced_items, &order, 1, 3, context, lessThan);
+                swap(T, sliced_items, &order, 4, 6, context, lessThan);
+                swap(T, sliced_items, &order, 5, 7, context, lessThan);
+                swap(T, sliced_items, &order, 1, 2, context, lessThan);
+                swap(T, sliced_items, &order, 5, 6, context, lessThan);
+                swap(T, sliced_items, &order, 0, 4, context, lessThan);
+                swap(T, sliced_items, &order, 3, 7, context, lessThan);
+                swap(T, sliced_items, &order, 1, 5, context, lessThan);
+                swap(T, sliced_items, &order, 2, 6, context, lessThan);
+                swap(T, sliced_items, &order, 1, 4, context, lessThan);
+                swap(T, sliced_items, &order, 3, 6, context, lessThan);
+                swap(T, sliced_items, &order, 2, 4, context, lessThan);
+                swap(T, sliced_items, &order, 3, 5, context, lessThan);
+                swap(T, sliced_items, &order, 3, 4, context, lessThan);
+            },
+            7 => {
+                swap(T, sliced_items, &order, 1, 2, context, lessThan);
+                swap(T, sliced_items, &order, 3, 4, context, lessThan);
+                swap(T, sliced_items, &order, 5, 6, context, lessThan);
+                swap(T, sliced_items, &order, 0, 2, context, lessThan);
+                swap(T, sliced_items, &order, 3, 5, context, lessThan);
+                swap(T, sliced_items, &order, 4, 6, context, lessThan);
+                swap(T, sliced_items, &order, 0, 1, context, lessThan);
+                swap(T, sliced_items, &order, 4, 5, context, lessThan);
+                swap(T, sliced_items, &order, 2, 6, context, lessThan);
+                swap(T, sliced_items, &order, 0, 4, context, lessThan);
+                swap(T, sliced_items, &order, 1, 5, context, lessThan);
+                swap(T, sliced_items, &order, 0, 3, context, lessThan);
+                swap(T, sliced_items, &order, 2, 5, context, lessThan);
+                swap(T, sliced_items, &order, 1, 3, context, lessThan);
+                swap(T, sliced_items, &order, 2, 4, context, lessThan);
+                swap(T, sliced_items, &order, 2, 3, context, lessThan);
+            },
+            6 => {
+                swap(T, sliced_items, &order, 1, 2, context, lessThan);
+                swap(T, sliced_items, &order, 4, 5, context, lessThan);
+                swap(T, sliced_items, &order, 0, 2, context, lessThan);
+ swap(T, sliced_items, &order, 3, 5, context, lessThan); + swap(T, sliced_items, &order, 0, 1, context, lessThan); + swap(T, sliced_items, &order, 3, 4, context, lessThan); + swap(T, sliced_items, &order, 2, 5, context, lessThan); + swap(T, sliced_items, &order, 0, 3, context, lessThan); + swap(T, sliced_items, &order, 1, 4, context, lessThan); + swap(T, sliced_items, &order, 2, 4, context, lessThan); + swap(T, sliced_items, &order, 1, 3, context, lessThan); + swap(T, sliced_items, &order, 2, 3, context, lessThan); + }, + 5 => { + swap(T, sliced_items, &order, 0, 1, context, lessThan); + swap(T, sliced_items, &order, 3, 4, context, lessThan); + swap(T, sliced_items, &order, 2, 4, context, lessThan); + swap(T, sliced_items, &order, 2, 3, context, lessThan); + swap(T, sliced_items, &order, 1, 4, context, lessThan); + swap(T, sliced_items, &order, 0, 3, context, lessThan); + swap(T, sliced_items, &order, 0, 2, context, lessThan); + swap(T, sliced_items, &order, 1, 3, context, lessThan); + swap(T, sliced_items, &order, 1, 2, context, lessThan); + }, + 4 => { + swap(T, sliced_items, &order, 0, 1, context, lessThan); + swap(T, sliced_items, &order, 2, 3, context, lessThan); + swap(T, sliced_items, &order, 0, 2, context, lessThan); + swap(T, sliced_items, &order, 1, 3, context, lessThan); + swap(T, sliced_items, &order, 1, 2, context, lessThan); + }, + else => {}, + } + } + if (items.len < 8) return; + + // then merge sort the higher levels, which can be 8-15, 16-31, 32-63, 64-127, etc. + while (true) { + // if every A and B block will fit into the cache, use a special branch + // specifically for merging with the cache + // (we use < rather than <= since the block size might be one more than + // iterator.length()) + if (iterator.length() < cache.len) { + // if four subarrays fit into the cache, it's faster to merge both + // pairs of subarrays into the cache, + // then merge the two merged subarrays from the cache back into the original array + if ((iterator.length() + 1) * 4 <= cache.len and iterator.length() * 4 <= items.len) { + iterator.begin(); + while (!iterator.finished()) { + // merge A1 and B1 into the cache + var A1 = iterator.nextRange(); + var B1 = iterator.nextRange(); + var A2 = iterator.nextRange(); + var B2 = iterator.nextRange(); + + if (lessThan(context, items[B1.end - 1], items[A1.start])) { + // the two ranges are in reverse order, so copy them in reverse order into the cache + const a1_items = items[A1.start..A1.end]; + @memcpy(cache[B1.length()..][0..a1_items.len], a1_items); + const b1_items = items[B1.start..B1.end]; + @memcpy(cache[0..b1_items.len], b1_items); + } else if (lessThan(context, items[B1.start], items[A1.end - 1])) { + // these two ranges weren't already in order, so merge them into the cache + mergeInto(T, items, A1, B1, cache[0..], context, lessThan); + } else { + // if A1, B1, A2, and B2 are all in order, skip doing anything else + if (!lessThan(context, items[B2.start], items[A2.end - 1]) and !lessThan(context, items[A2.start], items[B1.end - 1])) continue; + + // copy A1 and B1 into the cache in the same order + const a1_items = items[A1.start..A1.end]; + @memcpy(cache[0..a1_items.len], a1_items); + const b1_items = items[B1.start..B1.end]; + @memcpy(cache[A1.length()..][0..b1_items.len], b1_items); + } + A1 = Range.init(A1.start, B1.end); + + // merge A2 and B2 into the cache + if (lessThan(context, items[B2.end - 1], items[A2.start])) { + // the two ranges are in reverse order, so copy them in reverse order into the cache + const a2_items = 
items[A2.start..A2.end]; + @memcpy(cache[A1.length() + B2.length() ..][0..a2_items.len], a2_items); + const b2_items = items[B2.start..B2.end]; + @memcpy(cache[A1.length()..][0..b2_items.len], b2_items); + } else if (lessThan(context, items[B2.start], items[A2.end - 1])) { + // these two ranges weren't already in order, so merge them into the cache + mergeInto(T, items, A2, B2, cache[A1.length()..], context, lessThan); + } else { + // copy A2 and B2 into the cache in the same order + const a2_items = items[A2.start..A2.end]; + @memcpy(cache[A1.length()..][0..a2_items.len], a2_items); + const b2_items = items[B2.start..B2.end]; + @memcpy(cache[A1.length() + A2.length() ..][0..b2_items.len], b2_items); + } + A2 = Range.init(A2.start, B2.end); + + // merge A1 and A2 from the cache into the items + const A3 = Range.init(0, A1.length()); + const B3 = Range.init(A1.length(), A1.length() + A2.length()); + + if (lessThan(context, cache[B3.end - 1], cache[A3.start])) { + // the two ranges are in reverse order, so copy them in reverse order into the items + const a3_items = cache[A3.start..A3.end]; + @memcpy(items[A1.start + A2.length() ..][0..a3_items.len], a3_items); + const b3_items = cache[B3.start..B3.end]; + @memcpy(items[A1.start..][0..b3_items.len], b3_items); + } else if (lessThan(context, cache[B3.start], cache[A3.end - 1])) { + // these two ranges weren't already in order, so merge them back into the items + mergeInto(T, cache[0..], A3, B3, items[A1.start..], context, lessThan); + } else { + // copy A3 and B3 into the items in the same order + const a3_items = cache[A3.start..A3.end]; + @memcpy(items[A1.start..][0..a3_items.len], a3_items); + const b3_items = cache[B3.start..B3.end]; + @memcpy(items[A1.start + A1.length() ..][0..b3_items.len], b3_items); + } + } + + // we merged two levels at the same time, so we're done with this level already + // (iterator.nextLevel() is called again at the bottom of this outer merge loop) + _ = iterator.nextLevel(); + } else { + iterator.begin(); + while (!iterator.finished()) { + var A = iterator.nextRange(); + var B = iterator.nextRange(); + + if (lessThan(context, items[B.end - 1], items[A.start])) { + // the two ranges are in reverse order, so a simple rotation should fix it + mem.rotate(T, items[A.start..B.end], A.length()); + } else if (lessThan(context, items[B.start], items[A.end - 1])) { + // these two ranges weren't already in order, so we'll need to merge them! + const a_items = items[A.start..A.end]; + @memcpy(cache[0..a_items.len], a_items); + mergeExternal(T, items, A, B, cache[0..], context, lessThan); + } + } + } + } else { + // this is where the in-place merge logic starts! + // 1. pull out two internal buffers each containing √A unique values + // 1a. adjust block_size and buffer_size if we couldn't find enough unique values + // 2. loop over the A and B subarrays within this level of the merge sort + // 3. break A and B into blocks of size 'block_size' + // 4. "tag" each of the A blocks with values from the first internal buffer + // 5. roll the A blocks through the B blocks and drop/rotate them where they belong + // 6. merge each A block with any B values that follow, using the cache or the second internal buffer + // 7. sort the second internal buffer if it exists + // 8. 
redistribute the two internal buffers back into the items + var block_size: usize = math.sqrt(iterator.length()); + var buffer_size = iterator.length() / block_size + 1; + + // as an optimization, we really only need to pull out the internal buffers once for each level of merges + // after that we can reuse the same buffers over and over, then redistribute it when we're finished with this level + var A: Range = undefined; + var B: Range = undefined; + var index: usize = 0; + var last: usize = 0; + var count: usize = 0; + var find: usize = 0; + var start: usize = 0; + var pull_index: usize = 0; + var pull = [_]Pull{ + Pull{ + .from = 0, + .to = 0, + .count = 0, + .range = Range.init(0, 0), + }, + Pull{ + .from = 0, + .to = 0, + .count = 0, + .range = Range.init(0, 0), + }, + }; + + var buffer1 = Range.init(0, 0); + var buffer2 = Range.init(0, 0); + + // find two internal buffers of size 'buffer_size' each + find = buffer_size + buffer_size; + var find_separately = false; + + if (block_size <= cache.len) { + // if every A block fits into the cache then we won't need the second internal buffer, + // so we really only need to find 'buffer_size' unique values + find = buffer_size; + } else if (find > iterator.length()) { + // we can't fit both buffers into the same A or B subarray, so find two buffers separately + find = buffer_size; + find_separately = true; + } + + // we need to find either a single contiguous space containing 2√A unique values (which will be split up into two buffers of size √A each), + // or we need to find one buffer of < 2√A unique values, and a second buffer of √A unique values, + // OR if we couldn't find that many unique values, we need the largest possible buffer we can get + + // in the case where it couldn't find a single buffer of at least √A unique values, + // all of the Merge steps must be replaced by a different merge algorithm (MergeInPlace) + iterator.begin(); + while (!iterator.finished()) { + A = iterator.nextRange(); + B = iterator.nextRange(); + + // just store information about where the values will be pulled from and to, + // as well as how many values there are, to create the two internal buffers + + // check A for the number of unique values we need to fill an internal buffer + // these values will be pulled out to the start of A + last = A.start; + count = 1; + while (count < find) : ({ + last = index; + count += 1; + }) { + index = findLastForward(T, items, items[last], Range.init(last + 1, A.end), find - count, context, lessThan); + if (index == A.end) break; + } + index = last; + + if (count >= buffer_size) { + // keep track of the range within the items where we'll need to "pull out" these values to create the internal buffer + pull[pull_index] = Pull{ + .range = Range.init(A.start, B.end), + .count = count, + .from = index, + .to = A.start, + }; + pull_index = 1; + + if (count == buffer_size + buffer_size) { + // we were able to find a single contiguous section containing 2√A unique values, + // so this section can be used to contain both of the internal buffers we'll need + buffer1 = Range.init(A.start, A.start + buffer_size); + buffer2 = Range.init(A.start + buffer_size, A.start + count); + break; + } else if (find == buffer_size + buffer_size) { + // we found a buffer that contains at least √A unique values, but did not contain the full 2√A unique values, + // so we still need to find a second separate buffer of at least √A unique values + buffer1 = Range.init(A.start, A.start + count); + find = buffer_size; + } else if (block_size <= 
cache.len) {
+                        // we found the first and only internal buffer that we need, so we're done!
+                        buffer1 = Range.init(A.start, A.start + count);
+                        break;
+                    } else if (find_separately) {
+                        // found one buffer, but now find the other one
+                        buffer1 = Range.init(A.start, A.start + count);
+                        find_separately = false;
+                    } else {
+                        // we found a second buffer in an 'A' subarray containing √A unique values, so we're done!
+                        buffer2 = Range.init(A.start, A.start + count);
+                        break;
+                    }
+                } else if (pull_index == 0 and count > buffer1.length()) {
+                    // keep track of the largest buffer we were able to find
+                    buffer1 = Range.init(A.start, A.start + count);
+                    pull[pull_index] = Pull{
+                        .range = Range.init(A.start, B.end),
+                        .count = count,
+                        .from = index,
+                        .to = A.start,
+                    };
+                }
+
+                // check B for the number of unique values we need to fill an internal buffer
+                // these values will be pulled out to the end of B
+                last = B.end - 1;
+                count = 1;
+                while (count < find) : ({
+                    last = index - 1;
+                    count += 1;
+                }) {
+                    index = findFirstBackward(T, items, items[last], Range.init(B.start, last), find - count, context, lessThan);
+                    if (index == B.start) break;
+                }
+                index = last;
+
+                if (count >= buffer_size) {
+                    // keep track of the range within the items where we'll need to "pull out" these values to create the internal buffer
+                    pull[pull_index] = Pull{
+                        .range = Range.init(A.start, B.end),
+                        .count = count,
+                        .from = index,
+                        .to = B.end,
+                    };
+                    pull_index = 1;
+
+                    if (count == buffer_size + buffer_size) {
+                        // we were able to find a single contiguous section containing 2√A unique values,
+                        // so this section can be used to contain both of the internal buffers we'll need
+                        buffer1 = Range.init(B.end - count, B.end - buffer_size);
+                        buffer2 = Range.init(B.end - buffer_size, B.end);
+                        break;
+                    } else if (find == buffer_size + buffer_size) {
+                        // we found a buffer that contains at least √A unique values, but did not contain the full 2√A unique values,
+                        // so we still need to find a second separate buffer of at least √A unique values
+                        buffer1 = Range.init(B.end - count, B.end);
+                        find = buffer_size;
+                    } else if (block_size <= cache.len) {
+                        // we found the first and only internal buffer that we need, so we're done!
+                        buffer1 = Range.init(B.end - count, B.end);
+                        break;
+                    } else if (find_separately) {
+                        // found one buffer, but now find the other one
+                        buffer1 = Range.init(B.end - count, B.end);
+                        find_separately = false;
+                    } else {
+                        // buffer2 will be pulled out from a 'B' subarray, so if the first buffer was pulled out from the corresponding 'A' subarray,
+                        // we need to adjust the end point for that A subarray so it knows to stop redistributing its values before reaching buffer2
+                        if (pull[0].range.start == A.start) pull[0].range.end -= pull[1].count;
+
+                        // we found a second buffer in a 'B' subarray containing √A unique values, so we're done!
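+                        // (reaching this branch means buffer1 was already found on an earlier pass,
+                        // so carving buffer2 out of the end of this B subarray completes the pair)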
+ buffer2 = Range.init(B.end - count, B.end); + break; + } + } else if (pull_index == 0 and count > buffer1.length()) { + // keep track of the largest buffer we were able to find + buffer1 = Range.init(B.end - count, B.end); + pull[pull_index] = Pull{ + .range = Range.init(A.start, B.end), + .count = count, + .from = index, + .to = B.end, + }; + } + } + + // pull out the two ranges so we can use them as internal buffers + pull_index = 0; + while (pull_index < 2) : (pull_index += 1) { + const length = pull[pull_index].count; + + if (pull[pull_index].to < pull[pull_index].from) { + // we're pulling the values out to the left, which means the start of an A subarray + index = pull[pull_index].from; + count = 1; + while (count < length) : (count += 1) { + index = findFirstBackward(T, items, items[index - 1], Range.init(pull[pull_index].to, pull[pull_index].from - (count - 1)), length - count, context, lessThan); + const range = Range.init(index + 1, pull[pull_index].from + 1); + mem.rotate(T, items[range.start..range.end], range.length() - count); + pull[pull_index].from = index + count; + } + } else if (pull[pull_index].to > pull[pull_index].from) { + // we're pulling values out to the right, which means the end of a B subarray + index = pull[pull_index].from + 1; + count = 1; + while (count < length) : (count += 1) { + index = findLastForward(T, items, items[index], Range.init(index, pull[pull_index].to), length - count, context, lessThan); + const range = Range.init(pull[pull_index].from, index - 1); + mem.rotate(T, items[range.start..range.end], count); + pull[pull_index].from = index - 1 - count; + } + } + } + + // adjust block_size and buffer_size based on the values we were able to pull out + buffer_size = buffer1.length(); + block_size = iterator.length() / buffer_size + 1; + + // the first buffer NEEDS to be large enough to tag each of the evenly sized A blocks, + // so this was originally here to test the math for adjusting block_size above + // assert((iterator.length() + 1)/block_size <= buffer_size); + + // now that the two internal buffers have been created, it's time to merge each A+B combination at this level of the merge sort! + iterator.begin(); + while (!iterator.finished()) { + A = iterator.nextRange(); + B = iterator.nextRange(); + + // remove any parts of A or B that are being used by the internal buffers + start = A.start; + if (start == pull[0].range.start) { + if (pull[0].from > pull[0].to) { + A.start += pull[0].count; + + // if the internal buffer takes up the entire A or B subarray, then there's nothing to merge + // this only happens for very small subarrays, like √4 = 2, 2 * (2 internal buffers) = 4, + // which also only happens when cache.len is small or 0 since it'd otherwise use MergeExternal + if (A.length() == 0) continue; + } else if (pull[0].from < pull[0].to) { + B.end -= pull[0].count; + if (B.length() == 0) continue; + } + } + if (start == pull[1].range.start) { + if (pull[1].from > pull[1].to) { + A.start += pull[1].count; + if (A.length() == 0) continue; + } else if (pull[1].from < pull[1].to) { + B.end -= pull[1].count; + if (B.length() == 0) continue; + } + } + + if (lessThan(context, items[B.end - 1], items[A.start])) { + // the two ranges are in reverse order, so a simple rotation should fix it + mem.rotate(T, items[A.start..B.end], A.length()); + } else if (lessThan(context, items[A.end], items[A.end - 1])) { + // these two ranges weren't already in order, so we'll need to merge them! 
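+                    // (this is the block-roll merge from steps 3-6 of the overview above: break A
+                    // into fixed-size blocks, tag each block's first item with a value from buffer1,
+                    // roll the A blocks through B, and merge each dropped A block with the B values
+                    // that follow it)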
+ var findA: usize = undefined; + + // break the remainder of A into blocks. firstA is the uneven-sized first A block + var blockA = Range.init(A.start, A.end); + var firstA = Range.init(A.start, A.start + blockA.length() % block_size); + + // swap the first value of each A block with the value in buffer1 + var indexA = buffer1.start; + index = firstA.end; + while (index < blockA.end) : ({ + indexA += 1; + index += block_size; + }) { + mem.swap(T, &items[indexA], &items[index]); + } + + // start rolling the A blocks through the B blocks! + // whenever we leave an A block behind, we'll need to merge the previous A block with any B blocks that follow it, so track that information as well + var lastA = firstA; + var lastB = Range.init(0, 0); + var blockB = Range.init(B.start, B.start + math.min(block_size, B.length())); + blockA.start += firstA.length(); + indexA = buffer1.start; + + // if the first unevenly sized A block fits into the cache, copy it there for when we go to Merge it + // otherwise, if the second buffer is available, block swap the contents into that + if (lastA.length() <= cache.len) { + const last_a_items = items[lastA.start..lastA.end]; + @memcpy(cache[0..last_a_items.len], last_a_items); + } else if (buffer2.length() > 0) { + blockSwap(T, items, lastA.start, buffer2.start, lastA.length()); + } + + if (blockA.length() > 0) { + while (true) { + // if there's a previous B block and the first value of the minimum A block is <= the last value of the previous B block, + // then drop that minimum A block behind. or if there are no B blocks left then keep dropping the remaining A blocks. + if ((lastB.length() > 0 and !lessThan(context, items[lastB.end - 1], items[indexA])) or blockB.length() == 0) { + // figure out where to split the previous B block, and rotate it at the split + const B_split = binaryFirst(T, items, items[indexA], lastB, context, lessThan); + const B_remaining = lastB.end - B_split; + + // swap the minimum A block to the beginning of the rolling A blocks + var minA = blockA.start; + findA = minA + block_size; + while (findA < blockA.end) : (findA += block_size) { + if (lessThan(context, items[findA], items[minA])) { + minA = findA; + } + } + blockSwap(T, items, blockA.start, minA, block_size); + + // swap the first item of the previous A block back with its original value, which is stored in buffer1 + mem.swap(T, &items[blockA.start], &items[indexA]); + indexA += 1; + + // locally merge the previous A block with the B values that follow it + // if lastA fits into the external cache we'll use that (with MergeExternal), + // or if the second internal buffer exists we'll use that (with MergeInternal), + // or failing that we'll use a strictly in-place merge algorithm (MergeInPlace) + + if (lastA.length() <= cache.len) { + mergeExternal(T, items, lastA, Range.init(lastA.end, B_split), cache[0..], context, lessThan); + } else if (buffer2.length() > 0) { + mergeInternal(T, items, lastA, Range.init(lastA.end, B_split), buffer2, context, lessThan); + } else { + mergeInPlace(T, items, lastA, Range.init(lastA.end, B_split), context, lessThan); + } + + if (buffer2.length() > 0 or block_size <= cache.len) { + // copy the previous A block into the cache or buffer2, since that's where we need it to be when we go to merge it anyway + if (block_size <= cache.len) { + @memcpy(cache[0..block_size], items[blockA.start..][0..block_size]); + } else { + blockSwap(T, items, blockA.start, buffer2.start, block_size); + } + + // this is equivalent to rotating, but faster + // the area 
normally taken up by the A block is either the contents of buffer2, or data we don't need anymore since we memcopied it + // either way, we don't need to retain the order of those items, so instead of rotating we can just block swap B to where it belongs + blockSwap(T, items, B_split, blockA.start + block_size - B_remaining, B_remaining); + } else { + // we are unable to use the 'buffer2' trick to speed up the rotation operation since buffer2 doesn't exist, so perform a normal rotation + mem.rotate(T, items[B_split .. blockA.start + block_size], blockA.start - B_split); + } + + // update the range for the remaining A blocks, and the range remaining from the B block after it was split + lastA = Range.init(blockA.start - B_remaining, blockA.start - B_remaining + block_size); + lastB = Range.init(lastA.end, lastA.end + B_remaining); + + // if there are no more A blocks remaining, this step is finished! + blockA.start += block_size; + if (blockA.length() == 0) break; + } else if (blockB.length() < block_size) { + // move the last B block, which is unevenly sized, to before the remaining A blocks, by using a rotation + // the cache is disabled here since it might contain the contents of the previous A block + mem.rotate(T, items[blockA.start..blockB.end], blockB.start - blockA.start); + + lastB = Range.init(blockA.start, blockA.start + blockB.length()); + blockA.start += blockB.length(); + blockA.end += blockB.length(); + blockB.end = blockB.start; + } else { + // roll the leftmost A block to the end by swapping it with the next B block + blockSwap(T, items, blockA.start, blockB.start, block_size); + lastB = Range.init(blockA.start, blockA.start + block_size); + + blockA.start += block_size; + blockA.end += block_size; + blockB.start += block_size; + + if (blockB.end > B.end - block_size) { + blockB.end = B.end; + } else { + blockB.end += block_size; + } + } + } + } + + // merge the last A block with the remaining B values + if (lastA.length() <= cache.len) { + mergeExternal(T, items, lastA, Range.init(lastA.end, B.end), cache[0..], context, lessThan); + } else if (buffer2.length() > 0) { + mergeInternal(T, items, lastA, Range.init(lastA.end, B.end), buffer2, context, lessThan); + } else { + mergeInPlace(T, items, lastA, Range.init(lastA.end, B.end), context, lessThan); + } + } + } + + // when we're finished with this merge step we should have the one + // or two internal buffers left over, where the second buffer is all jumbled up + // insertion sort the second buffer, then redistribute the buffers + // back into the items using the opposite process used for creating the buffer + + // while an unstable sort like quicksort could be applied here, in benchmarks + // it was consistently slightly slower than a simple insertion sort, + // even for tens of millions of items. 
this may be because insertion + // sort is quite fast when the data is already somewhat sorted, like it is here + sort.insertion(T, items[buffer2.start..buffer2.end], context, lessThan); + + pull_index = 0; + while (pull_index < 2) : (pull_index += 1) { + var unique = pull[pull_index].count * 2; + if (pull[pull_index].from > pull[pull_index].to) { + // the values were pulled out to the left, so redistribute them back to the right + var buffer = Range.init(pull[pull_index].range.start, pull[pull_index].range.start + pull[pull_index].count); + while (buffer.length() > 0) { + index = findFirstForward(T, items, items[buffer.start], Range.init(buffer.end, pull[pull_index].range.end), unique, context, lessThan); + const amount = index - buffer.end; + mem.rotate(T, items[buffer.start..index], buffer.length()); + buffer.start += (amount + 1); + buffer.end += amount; + unique -= 2; + } + } else if (pull[pull_index].from < pull[pull_index].to) { + // the values were pulled out to the right, so redistribute them back to the left + var buffer = Range.init(pull[pull_index].range.end - pull[pull_index].count, pull[pull_index].range.end); + while (buffer.length() > 0) { + index = findLastBackward(T, items, items[buffer.end - 1], Range.init(pull[pull_index].range.start, buffer.start), unique, context, lessThan); + const amount = buffer.start - index; + mem.rotate(T, items[index..buffer.end], amount); + buffer.start -= amount; + buffer.end -= (amount + 1); + unique -= 2; + } + } + } + } + + // double the size of each A and B subarray that will be merged in the next level + if (!iterator.nextLevel()) break; + } +} +// merge operation without a buffer +fn mergeInPlace( + comptime T: type, + items: []T, + A_arg: Range, + B_arg: Range, + context: anytype, + comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool, +) void { + if (A_arg.length() == 0 or B_arg.length() == 0) return; + + // this just repeatedly binary searches into B and rotates A into position. + // the paper suggests using the 'rotation-based Hwang and Lin algorithm' here, + // but I decided to stick with this because it had better situational performance + // + // (Hwang and Lin is designed for merging subarrays of very different sizes, + // but WikiSort almost always uses subarrays that are roughly the same size) + // + // normally this is incredibly suboptimal, but this function is only called + // when none of the A or B blocks in any subarray contained 2√A unique values, + // which places a hard limit on the number of times this will ACTUALLY need + // to binary search and rotate. + // + // according to my analysis the worst case is √A rotations performed on √A items + // once the constant factors are removed, which ends up being O(n) + // + // again, this is NOT a general-purpose solution – it only works well in this case! 
+    // kind of like how the O(n^2) insertion sort is used in some places
+
+    var A = A_arg;
+    var B = B_arg;
+
+    while (true) {
+        // find the first place in B where the first item in A needs to be inserted
+        const mid = binaryFirst(T, items, items[A.start], B, context, lessThan);
+
+        // rotate A into place
+        const amount = mid - A.end;
+        mem.rotate(T, items[A.start..mid], A.length());
+        if (B.end == mid) break;
+
+        // calculate the new A and B ranges
+        B.start = mid;
+        A = Range.init(A.start + amount, B.start);
+        A.start = binaryLast(T, items, items[A.start], A, context, lessThan);
+        if (A.length() == 0) break;
+    }
+}
+
+// merge operation using an internal buffer
+fn mergeInternal(
+    comptime T: type,
+    items: []T,
+    A: Range,
+    B: Range,
+    buffer: Range,
+    context: anytype,
+    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
+) void {
+    // whenever we find a value to add to the final array, swap it with the value that's already in that spot
+    // when this algorithm is finished, 'buffer' will contain its original contents, but in a different order
+    var A_count: usize = 0;
+    var B_count: usize = 0;
+    var insert: usize = 0;
+
+    if (B.length() > 0 and A.length() > 0) {
+        while (true) {
+            if (!lessThan(context, items[B.start + B_count], items[buffer.start + A_count])) {
+                mem.swap(T, &items[A.start + insert], &items[buffer.start + A_count]);
+                A_count += 1;
+                insert += 1;
+                if (A_count >= A.length()) break;
+            } else {
+                mem.swap(T, &items[A.start + insert], &items[B.start + B_count]);
+                B_count += 1;
+                insert += 1;
+                if (B_count >= B.length()) break;
+            }
+        }
+    }
+
+    // swap the remainder of A into the final array
+    blockSwap(T, items, buffer.start + A_count, A.start + insert, A.length() - A_count);
+}
+
+fn blockSwap(comptime T: type, items: []T, start1: usize, start2: usize, block_size: usize) void {
+    var index: usize = 0;
+    while (index < block_size) : (index += 1) {
+        mem.swap(T, &items[start1 + index], &items[start2 + index]);
+    }
+}
+
+// combine a linear search with a binary search to reduce the number of comparisons in situations
+// where we have some idea as to how many unique values there are and where the next value might be
+fn findFirstForward(
+    comptime T: type,
+    items: []T,
+    value: T,
+    range: Range,
+    unique: usize,
+    context: anytype,
+    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
+) usize {
+    if (range.length() == 0) return range.start;
+    const skip = math.max(range.length() / unique, @as(usize, 1));
+
+    var index = range.start + skip;
+    while (lessThan(context, items[index - 1], value)) : (index += skip) {
+        if (index >= range.end - skip) {
+            return binaryFirst(T, items, value, Range.init(index, range.end), context, lessThan);
+        }
+    }
+
+    return binaryFirst(T, items, value, Range.init(index - skip, index), context, lessThan);
+}
+
+fn findFirstBackward(
+    comptime T: type,
+    items: []T,
+    value: T,
+    range: Range,
+    unique: usize,
+    context: anytype,
+    comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool,
+) usize {
+    if (range.length() == 0) return range.start;
+    const skip = math.max(range.length() / unique, @as(usize, 1));
+
+    var index = range.end - skip;
+    while (index > range.start and !lessThan(context, items[index - 1], value)) : (index -= skip) {
+        if (index < range.start + skip) {
+            return binaryFirst(T, items, value, Range.init(range.start, index), context, lessThan);
+        }
+    }
+
+    return binaryFirst(T, items, value, Range.init(index, index + skip), context, lessThan);
+}
+
+fn findLastForward(
+    comptime T: type,
+    items: 
[]T, + value: T, + range: Range, + unique: usize, + context: anytype, + comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool, +) usize { + if (range.length() == 0) return range.start; + const skip = math.max(range.length() / unique, @as(usize, 1)); + + var index = range.start + skip; + while (!lessThan(context, value, items[index - 1])) : (index += skip) { + if (index >= range.end - skip) { + return binaryLast(T, items, value, Range.init(index, range.end), context, lessThan); + } + } + + return binaryLast(T, items, value, Range.init(index - skip, index), context, lessThan); +} + +fn findLastBackward( + comptime T: type, + items: []T, + value: T, + range: Range, + unique: usize, + context: anytype, + comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool, +) usize { + if (range.length() == 0) return range.start; + const skip = math.max(range.length() / unique, @as(usize, 1)); + + var index = range.end - skip; + while (index > range.start and lessThan(context, value, items[index - 1])) : (index -= skip) { + if (index < range.start + skip) { + return binaryLast(T, items, value, Range.init(range.start, index), context, lessThan); + } + } + + return binaryLast(T, items, value, Range.init(index, index + skip), context, lessThan); +} + +fn binaryFirst( + comptime T: type, + items: []T, + value: T, + range: Range, + context: anytype, + comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool, +) usize { + var curr = range.start; + var size = range.length(); + if (range.start >= range.end) return range.end; + while (size > 0) { + const offset = size % 2; + + size /= 2; + const mid_item = items[curr + size]; + if (lessThan(context, mid_item, value)) { + curr += size + offset; + } + } + return curr; +} + +fn binaryLast( + comptime T: type, + items: []T, + value: T, + range: Range, + context: anytype, + comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool, +) usize { + var curr = range.start; + var size = range.length(); + if (range.start >= range.end) return range.end; + while (size > 0) { + const offset = size % 2; + + size /= 2; + const mid_item = items[curr + size]; + if (!lessThan(context, value, mid_item)) { + curr += size + offset; + } + } + return curr; +} + +fn mergeInto( + comptime T: type, + from: []T, + A: Range, + B: Range, + into: []T, + context: anytype, + comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool, +) void { + var A_index: usize = A.start; + var B_index: usize = B.start; + const A_last = A.end; + const B_last = B.end; + var insert_index: usize = 0; + + while (true) { + if (!lessThan(context, from[B_index], from[A_index])) { + into[insert_index] = from[A_index]; + A_index += 1; + insert_index += 1; + if (A_index == A_last) { + // copy the remainder of B into the final array + const from_b = from[B_index..B_last]; + @memcpy(into[insert_index..][0..from_b.len], from_b); + break; + } + } else { + into[insert_index] = from[B_index]; + B_index += 1; + insert_index += 1; + if (B_index == B_last) { + // copy the remainder of A into the final array + const from_a = from[A_index..A_last]; + @memcpy(into[insert_index..][0..from_a.len], from_a); + break; + } + } + } +} + +fn mergeExternal( + comptime T: type, + items: []T, + A: Range, + B: Range, + cache: []T, + context: anytype, + comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool, +) void { + // A fits into the cache, so use that instead of the internal buffer + var A_index: usize = 0; + var B_index: usize = B.start; + var insert_index: usize = A.start; + const A_last = A.length(); + 
const B_last = B.end; + + if (B.length() > 0 and A.length() > 0) { + while (true) { + if (!lessThan(context, items[B_index], cache[A_index])) { + items[insert_index] = cache[A_index]; + A_index += 1; + insert_index += 1; + if (A_index == A_last) break; + } else { + items[insert_index] = items[B_index]; + B_index += 1; + insert_index += 1; + if (B_index == B_last) break; + } + } + } + + // copy the remainder of A into the final array + const cache_a = cache[A_index..A_last]; + @memcpy(items[insert_index..][0..cache_a.len], cache_a); +} + +fn swap( + comptime T: type, + items: []T, + order: *[8]u8, + x: usize, + y: usize, + context: anytype, + comptime lessThan: fn (@TypeOf(context), lhs: T, rhs: T) bool, +) void { + if (lessThan(context, items[y], items[x]) or ((order.*)[x] > (order.*)[y] and !lessThan(context, items[x], items[y]))) { + mem.swap(T, &items[x], &items[y]); + mem.swap(u8, &(order.*)[x], &(order.*)[y]); + } +} diff --git a/lib/std/sort/pdq.zig b/lib/std/sort/pdq.zig new file mode 100644 index 0000000000..e7042b0c76 --- /dev/null +++ b/lib/std/sort/pdq.zig @@ -0,0 +1,331 @@ +const std = @import("../std.zig"); +const sort = std.sort; +const mem = std.mem; +const math = std.math; +const testing = std.testing; + +/// Unstable in-place sort. n best case, n*log(n) worst case and average case. +/// log(n) memory (no allocator required). +/// +/// Sorts in ascending order with respect to the given `lessThan` function. +pub fn pdq( + comptime T: type, + items: []T, + context: anytype, + comptime lessThanFn: fn (context: @TypeOf(context), lhs: T, rhs: T) bool, +) void { + const Context = struct { + items: []T, + sub_ctx: @TypeOf(context), + + pub fn lessThan(ctx: @This(), a: usize, b: usize) bool { + return lessThanFn(ctx.sub_ctx, ctx.items[a], ctx.items[b]); + } + + pub fn swap(ctx: @This(), a: usize, b: usize) void { + return mem.swap(T, &ctx.items[a], &ctx.items[b]); + } + }; + pdqContext(0, items.len, Context{ .items = items, .sub_ctx = context }); +} + +const Hint = enum { + increasing, + decreasing, + unknown, +}; + +/// Unstable in-place sort. O(n) best case, O(n*log(n)) worst case and average case. +/// O(log(n)) memory (no allocator required). +/// +/// Sorts in ascending order with respect to the given `lessThan` function. +pub fn pdqContext(a: usize, b: usize, context: anytype) void { + // slices of up to this length get sorted using insertion sort. + const max_insertion = 24; + // number of allowed imbalanced partitions before switching to heap sort. + const max_limit = std.math.floorPowerOfTwo(usize, b) + 1; + + // set upper bound on stack memory usage. + const Range = struct { a: usize, b: usize, limit: usize }; + const stack_size = math.log2(math.maxInt(usize) + 1); + var stack: [stack_size]Range = undefined; + var range = Range{ .a = a, .b = b, .limit = max_limit }; + var top: usize = 0; + + while (true) { + var was_balanced = true; + var was_partitioned = true; + + while (true) { + const len = range.b - range.a; + + // very short slices get sorted using insertion sort. + if (len <= max_insertion) { + break sort.insertionContext(range.a, range.b, context); + } + + // if too many bad pivot choices were made, simply fall back to heapsort in order to + // guarantee O(n*log(n)) worst-case. + if (range.limit == 0) { + break sort.heapContext(range.a, range.b, context); + } + + // if the last partitioning was imbalanced, try breaking patterns in the slice by shuffling + // some elements around. Hopefully we'll choose a better pivot this time. 
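+            // (breakPatterns, defined below, swaps the three elements around the
+            // middle of the slice with positions drawn from a cheap xorshift64
+            // generator)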
+            if (!was_balanced) {
+                breakPatterns(range.a, range.b, context);
+                range.limit -= 1;
+            }
+
+            // choose a pivot and try guessing whether the slice is already sorted.
+            var pivot: usize = 0;
+            var hint = chosePivot(range.a, range.b, &pivot, context);
+
+            if (hint == .decreasing) {
+                // The maximum number of swaps was performed, so items are likely
+                // in reverse order. Reverse it to make sorting faster.
+                reverseRange(range.a, range.b, context);
+                pivot = (range.b - 1) - (pivot - range.a);
+                hint = .increasing;
+            }
+
+            // if the last partitioning was decently balanced and didn't shuffle elements, and if pivot
+            // selection predicts the slice is likely already sorted...
+            if (was_balanced and was_partitioned and hint == .increasing) {
+                // try identifying several out-of-order elements and shifting them to correct
+                // positions. If the slice ends up being completely sorted, we're done.
+                if (partialInsertionSort(range.a, range.b, context)) break;
+            }
+
+            // if the chosen pivot is equal to the predecessor, then it's the smallest element in the
+            // slice. Partition the slice into elements equal to and elements greater than the pivot.
+            // This case is usually hit when the slice contains many duplicate elements.
+            if (range.a > 0 and !context.lessThan(range.a - 1, pivot)) {
+                range.a = partitionEqual(range.a, range.b, pivot, context);
+                continue;
+            }
+
+            // partition the slice.
+            var mid = pivot;
+            was_partitioned = partition(range.a, range.b, &mid, context);
+
+            const left_len = mid - range.a;
+            const right_len = range.b - mid;
+            const balanced_threshold = len / 8;
+            if (left_len < right_len) {
+                was_balanced = left_len >= balanced_threshold;
+                stack[top] = .{ .a = range.a, .b = mid, .limit = range.limit };
+                top += 1;
+                range.a = mid + 1;
+            } else {
+                was_balanced = right_len >= balanced_threshold;
+                stack[top] = .{ .a = mid + 1, .b = range.b, .limit = range.limit };
+                top += 1;
+                range.b = mid;
+            }
+        }
+
+        top = math.sub(usize, top, 1) catch break;
+        range = stack[top];
+    }
+}
+
+/// partitions `items[a..b]` into elements smaller than `items[pivot]`,
+/// followed by elements greater than or equal to `items[pivot]`.
+///
+/// sets the new pivot.
+/// returns `true` if already partitioned.
+fn partition(a: usize, b: usize, pivot: *usize, context: anytype) bool {
+    // move pivot to the first place
+    context.swap(a, pivot.*);
+
+    var i = a + 1;
+    var j = b - 1;
+
+    while (i <= j and context.lessThan(i, a)) i += 1;
+    while (i <= j and !context.lessThan(j, a)) j -= 1;
+
+    // check if items are already partitioned (no item to swap)
+    if (i > j) {
+        // put pivot back to the middle
+        context.swap(j, a);
+        pivot.* = j;
+        return true;
+    }
+
+    context.swap(i, j);
+    i += 1;
+    j -= 1;
+
+    while (true) {
+        while (i <= j and context.lessThan(i, a)) i += 1;
+        while (i <= j and !context.lessThan(j, a)) j -= 1;
+        if (i > j) break;
+
+        context.swap(i, j);
+        i += 1;
+        j -= 1;
+    }
+
+    // TODO: Enable the BlockQuicksort optimization
+
+    context.swap(j, a);
+    pivot.* = j;
+    return false;
+}
+
+/// partitions items into elements equal to `items[pivot]`
+/// followed by elements greater than `items[pivot]`.
+///
+/// it is assumed that `items[a..b]` does not contain elements smaller than `items[pivot]`.
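+///
+/// illustrative example (values not taken from the original source): given
+/// `items = { 3, 3, 3, 9, 3 }` with `pivot` pointing at the first 3, the
+/// equal elements end up grouped at the front and the returned index is 4,
+/// the first position holding a strictly greater value (the 9).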
+fn partitionEqual(a: usize, b: usize, pivot: usize, context: anytype) usize {
+    // move pivot to the first place
+    context.swap(a, pivot);
+
+    var i = a + 1;
+    var j = b - 1;
+
+    while (true) {
+        while (i <= j and !context.lessThan(a, i)) i += 1;
+        while (i <= j and context.lessThan(a, j)) j -= 1;
+        if (i > j) break;
+
+        context.swap(i, j);
+        i += 1;
+        j -= 1;
+    }
+
+    return i;
+}
+
+/// partially sorts a slice by shifting several out-of-order elements around.
+///
+/// returns `true` if the slice is sorted at the end. This function is `O(n)` worst-case.
+fn partialInsertionSort(a: usize, b: usize, context: anytype) bool {
+    @setCold(true);
+
+    // maximum number of adjacent out-of-order pairs that will get shifted
+    const max_steps = 5;
+    // if the slice is shorter than this, don't shift any elements
+    const shortest_shifting = 50;
+
+    var i = a + 1;
+    for (0..max_steps) |_| {
+        // find the next pair of adjacent out-of-order elements.
+        while (i < b and !context.lessThan(i, i - 1)) i += 1;
+
+        // are we done?
+        if (i == b) return true;
+
+        // don't shift elements on short arrays; that has a performance cost.
+        if (b - a < shortest_shifting) return false;
+
+        // swap the found pair of elements. This puts them in correct order.
+        context.swap(i, i - 1);
+
+        // shift the smaller element to the left.
+        if (i - a >= 2) {
+            var j = i - 1;
+            while (j >= 1) : (j -= 1) {
+                if (!context.lessThan(j, j - 1)) break;
+                context.swap(j, j - 1);
+            }
+        }
+
+        // shift the greater element to the right.
+        if (b - i >= 2) {
+            var j = i + 1;
+            while (j < b) : (j += 1) {
+                if (!context.lessThan(j, j - 1)) break;
+                context.swap(j, j - 1);
+            }
+        }
+    }
+
+    return false;
+}
+
+fn breakPatterns(a: usize, b: usize, context: anytype) void {
+    @setCold(true);
+
+    const len = b - a;
+    if (len < 8) return;
+
+    var rand = @intCast(u64, len);
+    const modulus = math.ceilPowerOfTwoAssert(u64, len);
+
+    var i = a + (len / 4) * 2 - 1;
+    while (i <= a + (len / 4) * 2 + 1) : (i += 1) {
+        // xorshift64
+        rand ^= rand << 13;
+        rand ^= rand >> 7;
+        rand ^= rand << 17;
+
+        var other = @intCast(usize, rand & (modulus - 1));
+        if (other >= len) other -= len;
+        context.swap(i, a + other);
+    }
+}
+
+/// chooses a pivot in `items[a..b]` and returns a hint as to whether
+/// `items[a..b]` seems to be already sorted.
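+///
+/// for slices of at least 50 elements the sample is taken using Tukey's
+/// ninther (the median of three medians of three); the number of swaps made
+/// while sorting the sample doubles as the sortedness hint: 0 swaps suggests
+/// an increasing slice, the maximum of 12 a decreasing one.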
+fn chosePivot(a: usize, b: usize, pivot: *usize, context: anytype) Hint {
+    // minimum length for using Tukey's ninther method
+    const shortest_ninther = 50;
+    // max_swaps is the maximum number of swaps allowed in this function
+    const max_swaps = 4 * 3;
+
+    var len = b - a;
+    var i = a + len / 4 * 1;
+    var j = a + len / 4 * 2;
+    var k = a + len / 4 * 3;
+    var swaps: usize = 0;
+
+    if (len >= 8) {
+        if (len >= shortest_ninther) {
+            // find medians in the neighborhoods of `i`, `j` and `k`
+            i = sort3(i - 1, i, i + 1, &swaps, context);
+            j = sort3(j - 1, j, j + 1, &swaps, context);
+            k = sort3(k - 1, k, k + 1, &swaps, context);
+        }
+
+        // find the median among `i`, `j` and `k`
+        j = sort3(i, j, k, &swaps, context);
+    }
+
+    pivot.* = j;
+    return switch (swaps) {
+        0 => .increasing,
+        max_swaps => .decreasing,
+        else => .unknown,
+    };
+}
+
+fn sort3(a: usize, b: usize, c: usize, swaps: *usize, context: anytype) usize {
+    if (context.lessThan(b, a)) {
+        swaps.* += 1;
+        context.swap(b, a);
+    }
+
+    if (context.lessThan(c, b)) {
+        swaps.* += 1;
+        context.swap(c, b);
+    }
+
+    if (context.lessThan(b, a)) {
+        swaps.* += 1;
+        context.swap(b, a);
+    }
+
+    return b;
+}
+
+fn reverseRange(a: usize, b: usize, context: anytype) void {
+    var i = a;
+    var j = b - 1;
+    while (i < j) {
+        context.swap(i, j);
+        i += 1;
+        j -= 1;
+    }
+}
diff --git a/src/Compilation.zig b/src/Compilation.zig
index b485800329..cc2e2a916b 100644
--- a/src/Compilation.zig
+++ b/src/Compilation.zig
@@ -672,7 +672,7 @@ fn addPackageTableToCacheHash(
         }
     }
     // Sort the slice by package name
-    std.sort.sort(Package.Table.KV, packages, {}, struct {
+    mem.sort(Package.Table.KV, packages, {}, struct {
         fn lessThan(_: void, lhs: Package.Table.KV, rhs: Package.Table.KV) bool {
             return std.mem.lessThan(u8, lhs.key, rhs.key);
         }
diff --git a/src/Package.zig b/src/Package.zig
index f28aac885d..cde3f38e28 100644
--- a/src/Package.zig
+++ b/src/Package.zig
@@ -672,7 +672,7 @@ fn computePackageHash(
         }
     }
 
-    std.sort.sort(*HashedFile, all_files.items, {}, HashedFile.lessThan);
+    mem.sort(*HashedFile, all_files.items, {}, HashedFile.lessThan);
 
     var hasher = Manifest.Hash.init(.{});
     var any_failures = false;
diff --git a/src/RangeSet.zig b/src/RangeSet.zig
index 7e501f984b..aa051ff424 100644
--- a/src/RangeSet.zig
+++ b/src/RangeSet.zig
@@ -60,7 +60,7 @@ pub fn spans(self: *RangeSet, first: Value, last: Value, ty: Type) !bool {
     if (self.ranges.items.len == 0)
         return false;
 
-    std.sort.sort(Range, self.ranges.items, LessThanContext{
+    std.mem.sort(Range, self.ranges.items, LessThanContext{
         .ty = ty,
         .module = self.module,
     }, lessThan);
diff --git a/src/Sema.zig b/src/Sema.zig
index 9178392d27..76c9891467 100644
--- a/src/Sema.zig
+++ b/src/Sema.zig
@@ -30979,7 +30979,7 @@ fn resolveStructLayout(sema: *Sema, ty: Type) CompileError!void {
                 ctx.struct_obj.fields.values()[b].ty.abiAlignment(target);
         }
     };
-    std.sort.sort(u32, optimized_order, AlignSortContext{
+    mem.sort(u32, optimized_order, AlignSortContext{
         .struct_obj = struct_obj,
         .sema = sema,
     }, AlignSortContext.lessThan);
diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig
index e835242379..55a9694fd3 100644
--- a/src/arch/x86_64/CodeGen.zig
+++ b/src/arch/x86_64/CodeGen.zig
@@ -2176,7 +2176,7 @@ fn computeFrameLayout(self: *Self) !FrameLayout {
         }
     };
     const sort_context = SortContext{ .frame_align = frame_align };
-    std.sort.sort(FrameIndex, stack_frame_order, sort_context, SortContext.lessThan);
+    mem.sort(FrameIndex, stack_frame_order, sort_context, SortContext.lessThan);
     }
 
     const 
call_frame_align = frame_align[@enumToInt(FrameIndex.call_frame)]; diff --git a/src/arch/x86_64/Encoding.zig b/src/arch/x86_64/Encoding.zig index 6ed0aeeff4..625a5283b9 100644 --- a/src/arch/x86_64/Encoding.zig +++ b/src/arch/x86_64/Encoding.zig @@ -770,7 +770,7 @@ const mnemonic_to_encodings_map = init: { @setEvalBranchQuota(30_000); const encodings = @import("encodings.zig"); var entries = encodings.table; - std.sort.sort(encodings.Entry, &entries, {}, struct { + std.mem.sort(encodings.Entry, &entries, {}, struct { fn lessThan(_: void, lhs: encodings.Entry, rhs: encodings.Entry) bool { return @enumToInt(lhs[0]) < @enumToInt(rhs[0]); } diff --git a/src/codegen/c/type.zig b/src/codegen/c/type.zig index 892914ea3d..8494ae7353 100644 --- a/src/codegen/c/type.zig +++ b/src/codegen/c/type.zig @@ -1292,7 +1292,7 @@ pub const CType = extern union { fn sortFields(self: *@This(), fields_len: usize) []Payload.Fields.Field { const Field = Payload.Fields.Field; const slice = self.storage.anon.fields[0..fields_len]; - std.sort.sort(Field, slice, {}, struct { + mem.sort(Field, slice, {}, struct { fn before(_: void, lhs: Field, rhs: Field) bool { return lhs.alignas.@"align" > rhs.alignas.@"align"; } diff --git a/src/link/Coff.zig b/src/link/Coff.zig index 81e8c57bdd..01f18a73b3 100644 --- a/src/link/Coff.zig +++ b/src/link/Coff.zig @@ -1837,7 +1837,7 @@ fn writeBaseRelocations(self: *Coff) !void { pages.appendAssumeCapacity(page.*); } } - std.sort.sort(u32, pages.items, {}, std.sort.asc(u32)); + mem.sort(u32, pages.items, {}, std.sort.asc(u32)); var buffer = std.ArrayList(u8).init(gpa); defer buffer.deinit(); diff --git a/src/link/MachO/Object.zig b/src/link/MachO/Object.zig index 7cc6f78c7d..b218fdbd2d 100644 --- a/src/link/MachO/Object.zig +++ b/src/link/MachO/Object.zig @@ -209,7 +209,7 @@ pub fn parse(self: *Object, allocator: Allocator, cpu_arch: std.Target.Cpu.Arch) // afterwards by address in each group. Normally, dysymtab should // be enough to guarantee the sort, but turns out not every compiler // is kind enough to specify the symbols in the correct order. 
- sort.sort(SymbolAtIndex, sorted_all_syms.items, self, SymbolAtIndex.lessThan); + mem.sort(SymbolAtIndex, sorted_all_syms.items, self, SymbolAtIndex.lessThan); var prev_sect_id: u8 = 0; var section_index_lookup: ?Entry = null; @@ -462,7 +462,7 @@ pub fn splitRegularSections(self: *Object, zld: *Zld, object_id: u32) !void { sorted_sections[id] = .{ .header = sect, .id = @intCast(u8, id) }; } - std.sort.sort(SortedSection, sorted_sections, {}, sectionLessThanByAddress); + mem.sort(SortedSection, sorted_sections, {}, sectionLessThanByAddress); var sect_sym_index: u32 = 0; for (sorted_sections) |section| { @@ -663,7 +663,7 @@ fn parseRelocs(self: *Object, gpa: Allocator, sect_id: u8) !void { if (self.getSourceRelocs(section)) |relocs| { try self.relocations.ensureUnusedCapacity(gpa, relocs.len); self.relocations.appendUnalignedSliceAssumeCapacity(relocs); - std.sort.sort(macho.relocation_info, self.relocations.items[start..], {}, relocGreaterThan); + mem.sort(macho.relocation_info, self.relocations.items[start..], {}, relocGreaterThan); } self.section_relocs_lookup.items[sect_id] = start; } @@ -901,7 +901,7 @@ pub fn parseDataInCode(self: *Object, gpa: Allocator) !void { const dice = @ptrCast([*]align(1) const macho.data_in_code_entry, self.contents.ptr + cmd.dataoff)[0..ndice]; try self.data_in_code.ensureTotalCapacityPrecise(gpa, dice.len); self.data_in_code.appendUnalignedSliceAssumeCapacity(dice); - std.sort.sort(macho.data_in_code_entry, self.data_in_code.items, {}, diceLessThan); + mem.sort(macho.data_in_code_entry, self.data_in_code.items, {}, diceLessThan); } fn diceLessThan(ctx: void, lhs: macho.data_in_code_entry, rhs: macho.data_in_code_entry) bool { diff --git a/src/link/MachO/UnwindInfo.zig b/src/link/MachO/UnwindInfo.zig index 0071657f8b..8d2a36be9d 100644 --- a/src/link/MachO/UnwindInfo.zig +++ b/src/link/MachO/UnwindInfo.zig @@ -411,7 +411,7 @@ pub fn collect(info: *UnwindInfo, zld: *Zld) !void { } var slice = common_encodings_counts.values(); - std.sort.sort(CommonEncWithCount, slice, {}, CommonEncWithCount.greaterThan); + mem.sort(CommonEncWithCount, slice, {}, CommonEncWithCount.greaterThan); var i: u7 = 0; while (i < slice.len) : (i += 1) { diff --git a/src/link/MachO/dyld_info/Rebase.zig b/src/link/MachO/dyld_info/Rebase.zig index 1d7a0c94c0..5b386a8136 100644 --- a/src/link/MachO/dyld_info/Rebase.zig +++ b/src/link/MachO/dyld_info/Rebase.zig @@ -39,7 +39,7 @@ pub fn finalize(rebase: *Rebase, gpa: Allocator) !void { const writer = rebase.buffer.writer(gpa); - std.sort.sort(Entry, rebase.entries.items, {}, Entry.lessThan); + std.mem.sort(Entry, rebase.entries.items, {}, Entry.lessThan); try setTypePointer(writer); diff --git a/src/link/MachO/dyld_info/bind.zig b/src/link/MachO/dyld_info/bind.zig index 98a693920a..14ce1587aa 100644 --- a/src/link/MachO/dyld_info/bind.zig +++ b/src/link/MachO/dyld_info/bind.zig @@ -47,7 +47,7 @@ pub fn Bind(comptime Ctx: type, comptime Target: type) type { const writer = self.buffer.writer(gpa); - std.sort.sort(Entry, self.entries.items, ctx, Entry.lessThan); + std.mem.sort(Entry, self.entries.items, ctx, Entry.lessThan); var start: usize = 0; var seg_id: ?u8 = null; diff --git a/src/link/MachO/zld.zig b/src/link/MachO/zld.zig index 7e6870ecbc..b151aee19b 100644 --- a/src/link/MachO/zld.zig +++ b/src/link/MachO/zld.zig @@ -1441,7 +1441,7 @@ pub const Zld = struct { } } - std.sort.sort(Section, sections.items, {}, SortSection.lessThan); + mem.sort(Section, sections.items, {}, SortSection.lessThan); self.sections.shrinkRetainingCapacity(0); 
         for (sections.items) |out| {
@@ -2237,7 +2237,7 @@ pub const Zld = struct {
             }
         }
 
-        std.sort.sort(u64, addresses.items, {}, asc_u64);
+        mem.sort(u64, addresses.items, {}, asc_u64);
 
         var offsets = std.ArrayList(u32).init(gpa);
         defer offsets.deinit();
diff --git a/src/link/Wasm.zig b/src/link/Wasm.zig
index cd9c44d656..5dfc91d4ce 100644
--- a/src/link/Wasm.zig
+++ b/src/link/Wasm.zig
@@ -2143,7 +2143,7 @@ fn sortDataSegments(wasm: *Wasm) !void {
         }
     };
 
-    std.sort.sort([]const u8, keys, {}, SortContext.sort);
+    mem.sort([]const u8, keys, {}, SortContext.sort);
     for (keys) |key| {
         const segment_index = wasm.data_segments.get(key).?;
         new_mapping.putAssumeCapacity(key, segment_index);
@@ -2187,7 +2187,7 @@ fn setupInitFunctions(wasm: *Wasm) !void {
     }
 
     // sort the initfunctions based on their priority
-    std.sort.sort(InitFuncLoc, wasm.init_funcs.items, {}, InitFuncLoc.lessThan);
+    mem.sort(InitFuncLoc, wasm.init_funcs.items, {}, InitFuncLoc.lessThan);
 }
 
 /// Generates an atom containing the global error set's size.
@@ -3687,7 +3687,7 @@ fn writeToFile(
         }
     }.sort;
 
-    std.sort.sort(*Atom, sorted_atoms.items, wasm, atom_sort_fn);
+    mem.sort(*Atom, sorted_atoms.items, wasm, atom_sort_fn);
 
     for (sorted_atoms.items) |sorted_atom| {
         try leb.writeULEB128(binary_writer, sorted_atom.size);
@@ -4050,8 +4050,8 @@ fn emitNameSection(wasm: *Wasm, binary_bytes: *std.ArrayList(u8), arena: std.mem
         data_segment_index += 1;
     }
 
-    std.sort.sort(Name, funcs.values(), {}, Name.lessThan);
-    std.sort.sort(Name, globals.items, {}, Name.lessThan);
+    mem.sort(Name, funcs.values(), {}, Name.lessThan);
+    mem.sort(Name, globals.items, {}, Name.lessThan);
 
     const header_offset = try reserveCustomSectionHeader(binary_bytes);
     const writer = binary_bytes.writer();
diff --git a/src/objcopy.zig b/src/objcopy.zig
index 12129aba9c..c5d0e8dcb3 100644
--- a/src/objcopy.zig
+++ b/src/objcopy.zig
@@ -402,7 +402,7 @@ const BinaryElfOutput = struct {
             }
         }
 
-        std.sort.sort(*BinaryElfSegment, self.segments.items, {}, segmentSortCompare);
+        mem.sort(*BinaryElfSegment, self.segments.items, {}, segmentSortCompare);
 
         for (self.segments.items, 0..) |firstSegment, i| {
             if (firstSegment.firstSection) |firstSection| {
@@ -427,7 +427,7 @@ const BinaryElfOutput = struct {
             }
         }
 
-        std.sort.sort(*BinaryElfSection, self.sections.items, {}, sectionSortCompare);
+        mem.sort(*BinaryElfSection, self.sections.items, {}, sectionSortCompare);
 
         return self;
     }
diff --git a/test/src/Cases.zig b/test/src/Cases.zig
index 0451079a0e..63dd2fd3da 100644
--- a/test/src/Cases.zig
+++ b/test/src/Cases.zig
@@ -607,7 +607,7 @@ fn sortTestFilenames(filenames: [][]const u8) void {
             };
         }
     };
-    std.sort.sort([]const u8, filenames, Context{}, Context.lessThan);
+    std.mem.sort([]const u8, filenames, Context{}, Context.lessThan);
 }
 
 /// Iterates a set of filenames extracting batches that are either incremental
diff --git a/tools/gen_stubs.zig b/tools/gen_stubs.zig
index bc2637e197..95787b719a 100644
--- a/tools/gen_stubs.zig
+++ b/tools/gen_stubs.zig
@@ -437,7 +437,7 @@ fn parseElf(parse: Parse, comptime is_64: bool, comptime endian: builtin.Endian)
     const dynstr = elf_bytes[dynstr_offset..];
 
     // Sort the list by address, ascending. 
- std.sort.sort(Sym, @alignCast(8, dyn_syms), {}, S.symbolAddrLessThan); + mem.sort(Sym, @alignCast(8, dyn_syms), {}, S.symbolAddrLessThan); for (dyn_syms) |sym| { const this_section = s(sym.st_shndx); diff --git a/tools/generate_JSONTestSuite.zig b/tools/generate_JSONTestSuite.zig index b8550959c7..2229cf4012 100644 --- a/tools/generate_JSONTestSuite.zig +++ b/tools/generate_JSONTestSuite.zig @@ -23,7 +23,7 @@ pub fn main() !void { while (try it.next()) |entry| { try names.append(try allocator.dupe(u8, entry.name)); } - std.sort.sort([]const u8, names.items, {}, (struct { + std.mem.sort([]const u8, names.items, {}, (struct { fn lessThan(_: void, a: []const u8, b: []const u8) bool { return std.mem.lessThan(u8, a, b); } diff --git a/tools/process_headers.zig b/tools/process_headers.zig index a6550a2573..0321c0e0eb 100644 --- a/tools/process_headers.zig +++ b/tools/process_headers.zig @@ -460,7 +460,7 @@ pub fn main() !void { try contents_list.append(contents); } } - std.sort.sort(*Contents, contents_list.items, {}, Contents.hitCountLessThan); + std.mem.sort(*Contents, contents_list.items, {}, Contents.hitCountLessThan); const best_contents = contents_list.popOrNull().?; if (best_contents.hit_count > 1) { // worth it to make it generic diff --git a/tools/update-linux-headers.zig b/tools/update-linux-headers.zig index 38fbab6645..0f31e5e893 100644 --- a/tools/update-linux-headers.zig +++ b/tools/update-linux-headers.zig @@ -260,7 +260,7 @@ pub fn main() !void { try contents_list.append(contents); } } - std.sort.sort(*Contents, contents_list.items, {}, Contents.hitCountLessThan); + std.mem.sort(*Contents, contents_list.items, {}, Contents.hitCountLessThan); const best_contents = contents_list.popOrNull().?; if (best_contents.hit_count > 1) { // worth it to make it generic diff --git a/tools/update_clang_options.zig b/tools/update_clang_options.zig index 682ec7e152..feefeb0a83 100644 --- a/tools/update_clang_options.zig +++ b/tools/update_clang_options.zig @@ -646,7 +646,7 @@ pub fn main() anyerror!void { } // Some options have multiple matches. As an example, "-Wl,foo" matches both // "W" and "Wl,". So we sort this list in order of descending priority. 
- std.sort.sort(*json.ObjectMap, all_objects.items, {}, objectLessThan); + std.mem.sort(*json.ObjectMap, all_objects.items, {}, objectLessThan); var buffered_stdout = std.io.bufferedWriter(std.io.getStdOut().writer()); const stdout = buffered_stdout.writer(); diff --git a/tools/update_cpu_features.zig b/tools/update_cpu_features.zig index 53bb365f41..d5c3d48852 100644 --- a/tools/update_cpu_features.zig +++ b/tools/update_cpu_features.zig @@ -1187,8 +1187,8 @@ fn processOneTarget(job: Job) anyerror!void { for (llvm_target.extra_cpus) |extra_cpu| { try all_cpus.append(extra_cpu); } - std.sort.sort(Feature, all_features.items, {}, featureLessThan); - std.sort.sort(Cpu, all_cpus.items, {}, cpuLessThan); + mem.sort(Feature, all_features.items, {}, featureLessThan); + mem.sort(Cpu, all_cpus.items, {}, cpuLessThan); const target_sub_path = try fs.path.join(arena, &.{ "lib", "std", "target" }); var target_dir = try job.zig_src_dir.makeOpenPath(target_sub_path, .{}); @@ -1283,7 +1283,7 @@ fn processOneTarget(job: Job) anyerror!void { try dependencies.append(key.*); } } - std.sort.sort([]const u8, dependencies.items, {}, asciiLessThan); + mem.sort([]const u8, dependencies.items, {}, asciiLessThan); if (dependencies.items.len == 0) { try w.writeAll( @@ -1328,7 +1328,7 @@ fn processOneTarget(job: Job) anyerror!void { try cpu_features.append(key.*); } } - std.sort.sort([]const u8, cpu_features.items, {}, asciiLessThan); + mem.sort([]const u8, cpu_features.items, {}, asciiLessThan); if (cpu.llvm_name) |llvm_name| { try w.print( \\ pub const {} = CpuModel{{ diff --git a/tools/update_spirv_features.zig b/tools/update_spirv_features.zig index 8d398f58de..44d8b6a445 100644 --- a/tools/update_spirv_features.zig +++ b/tools/update_spirv_features.zig @@ -303,7 +303,7 @@ fn gatherVersions(allocator: Allocator, registry: g.CoreRegistry) ![]const Versi } } - std.sort.sort(Version, versions.items, {}, Version.lessThan); + std.mem.sort(Version, versions.items, {}, Version.lessThan); return versions.items; } -- cgit v1.2.3