diff options
| author | Jacob Young <jacobly0@users.noreply.github.com> | 2023-05-07 20:42:46 -0400 |
|---|---|---|
| committer | Jacob Young <jacobly0@users.noreply.github.com> | 2023-05-08 07:36:20 -0400 |
| commit | 6778da4516e68c271cb50fe9c252ab4084daf16b (patch) | |
| tree | 46e3bb0082569ecb678039e36fa2a422ae02a994 /src | |
| parent | f8708e2c4d93eece5b3e131fd2d1b5b210806cd6 (diff) | |
| download | zig-6778da4516e68c271cb50fe9c252ab4084daf16b.tar.gz zig-6778da4516e68c271cb50fe9c252ab4084daf16b.zip | |
x86_64: implement binary operations for `f16` and `f16` vectors
Diffstat (limited to 'src')
| -rw-r--r-- | src/arch/x86_64/CodeGen.zig | 261 | ||||
| -rw-r--r-- | src/arch/x86_64/Encoding.zig | 23 | ||||
| -rw-r--r-- | src/arch/x86_64/Lower.zig | 22 | ||||
| -rw-r--r-- | src/arch/x86_64/Mir.zig | 44 | ||||
| -rw-r--r-- | src/arch/x86_64/encodings.zig | 20 |
5 files changed, 339 insertions, 31 deletions
diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig index 3e2d418105..154b909a21 100644 --- a/src/arch/x86_64/CodeGen.zig +++ b/src/arch/x86_64/CodeGen.zig @@ -4497,14 +4497,15 @@ fn airFloatSign(self: *Self, inst: Air.Inst.Index) !void { const tag = self.air.instructions.items(.tag)[inst]; try self.genBinOpMir(switch (ty_bits) { // No point using an extra prefix byte for *pd which performs the same operation. - 32, 64 => switch (tag) { + 16, 32, 64, 128 => switch (tag) { .neg => .xorps, .fabs => .andnps, else => unreachable, }, - else => return self.fail("TODO implement airFloatSign for {}", .{ + 80 => return self.fail("TODO implement airFloatSign for {}", .{ ty.fmt(self.bin_file.options.module.?), }), + else => unreachable, }, vec_ty, dst_mcv, sign_mcv); return self.finishAir(inst, dst_mcv, .{ un_op, .none, .none }); } @@ -6112,9 +6113,53 @@ fn genBinOp( return dst_mcv; } + const dst_reg = registerAlias(dst_mcv.getReg().?, abi_size); const mir_tag = if (@as(?Mir.Inst.Tag, switch (lhs_ty.zigTypeTag()) { else => unreachable, .Float => switch (lhs_ty.floatBits(self.target.*)) { + 16 => if (self.hasFeature(.f16c)) { + const tmp_reg = (try self.register_manager.allocReg(null, sse)).to128(); + const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg); + defer self.register_manager.unlockReg(tmp_lock); + + if (src_mcv.isMemory()) try self.asmRegisterRegisterMemoryImmediate( + .vpinsrw, + dst_reg, + dst_reg, + src_mcv.mem(.word), + Immediate.u(1), + ) else try self.asmRegisterRegisterRegister( + .vpunpcklwd, + dst_reg, + dst_reg, + (if (src_mcv.isRegister()) + src_mcv.getReg().? 
+ else + try self.copyToTmpRegister(rhs_ty, src_mcv)).to128(), + ); + try self.asmRegisterRegister(.vcvtph2ps, dst_reg, dst_reg); + try self.asmRegisterRegister(.vmovshdup, tmp_reg, dst_reg); + try self.asmRegisterRegisterRegister( + switch (air_tag) { + .add => .vaddss, + .sub => .vsubss, + .div_float, .div_trunc, .div_floor, .div_exact => .vdivss, + .max => .vmaxss, + .min => .vminss, + else => unreachable, + }, + dst_reg, + dst_reg, + tmp_reg, + ); + try self.asmRegisterRegisterImmediate( + .vcvtps2ph, + dst_reg, + dst_reg, + Immediate.u(0b1_00), + ); + return dst_mcv; + } else null, 32 => switch (air_tag) { .add => if (self.hasFeature(.avx)) .vaddss else .addss, .sub => if (self.hasFeature(.avx)) .vsubss else .subss, @@ -6141,12 +6186,178 @@ fn genBinOp( .min => if (self.hasFeature(.avx)) .vminsd else .minsd, else => unreachable, }, - 16, 80, 128 => null, + 80, 128 => null, else => unreachable, }, .Vector => switch (lhs_ty.childType().zigTypeTag()) { else => null, .Float => switch (lhs_ty.childType().floatBits(self.target.*)) { + 16 => if (self.hasFeature(.f16c)) switch (lhs_ty.vectorLen()) { + 1 => { + const tmp_reg = (try self.register_manager.allocReg(null, sse)).to128(); + const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg); + defer self.register_manager.unlockReg(tmp_lock); + + if (src_mcv.isMemory()) try self.asmRegisterRegisterMemoryImmediate( + .vpinsrw, + dst_reg, + dst_reg, + src_mcv.mem(.word), + Immediate.u(1), + ) else try self.asmRegisterRegisterRegister( + .vpunpcklwd, + dst_reg, + dst_reg, + (if (src_mcv.isRegister()) + src_mcv.getReg().? 
+ else + try self.copyToTmpRegister(rhs_ty, src_mcv)).to128(), + ); + try self.asmRegisterRegister(.vcvtph2ps, dst_reg, dst_reg); + try self.asmRegisterRegister(.vmovshdup, tmp_reg, dst_reg); + try self.asmRegisterRegisterRegister( + switch (air_tag) { + .add => .vaddss, + .sub => .vsubss, + .div_float, .div_trunc, .div_floor, .div_exact => .vdivss, + .max => .vmaxss, + .min => .vminss, + else => unreachable, + }, + dst_reg, + dst_reg, + tmp_reg, + ); + try self.asmRegisterRegisterImmediate( + .vcvtps2ph, + dst_reg, + dst_reg, + Immediate.u(0b1_00), + ); + return dst_mcv; + }, + 2 => { + const tmp_reg = (try self.register_manager.allocReg(null, sse)).to128(); + const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg); + defer self.register_manager.unlockReg(tmp_lock); + + if (src_mcv.isMemory()) try self.asmRegisterMemoryImmediate( + .vpinsrd, + dst_reg, + src_mcv.mem(.dword), + Immediate.u(1), + ) else try self.asmRegisterRegisterRegister( + .vunpcklps, + dst_reg, + dst_reg, + (if (src_mcv.isRegister()) + src_mcv.getReg().? 
+ else + try self.copyToTmpRegister(rhs_ty, src_mcv)).to128(), + ); + try self.asmRegisterRegister(.vcvtph2ps, dst_reg, dst_reg); + try self.asmRegisterRegisterRegister(.vmovhlps, tmp_reg, dst_reg, dst_reg); + try self.asmRegisterRegisterRegister( + switch (air_tag) { + .add => .vaddps, + .sub => .vsubps, + .div_float, .div_trunc, .div_floor, .div_exact => .vdivps, + .max => .vmaxps, + .min => .vminps, + else => unreachable, + }, + dst_reg, + dst_reg, + tmp_reg, + ); + try self.asmRegisterRegisterImmediate( + .vcvtps2ph, + dst_reg, + dst_reg, + Immediate.u(0b1_00), + ); + return dst_mcv; + }, + 3...4 => { + const tmp_reg = (try self.register_manager.allocReg(null, sse)).to128(); + const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg); + defer self.register_manager.unlockReg(tmp_lock); + + try self.asmRegisterRegister(.vcvtph2ps, dst_reg, dst_reg); + if (src_mcv.isMemory()) try self.asmRegisterMemory( + .vcvtph2ps, + tmp_reg, + src_mcv.mem(.qword), + ) else try self.asmRegisterRegister( + .vcvtph2ps, + tmp_reg, + (if (src_mcv.isRegister()) + src_mcv.getReg().? 
+ else + try self.copyToTmpRegister(rhs_ty, src_mcv)).to128(), + ); + try self.asmRegisterRegisterRegister( + switch (air_tag) { + .add => .vaddps, + .sub => .vsubps, + .div_float, .div_trunc, .div_floor, .div_exact => .vdivps, + .max => .vmaxps, + .min => .vminps, + else => unreachable, + }, + dst_reg, + dst_reg, + tmp_reg, + ); + try self.asmRegisterRegisterImmediate( + .vcvtps2ph, + dst_reg, + dst_reg, + Immediate.u(0b1_00), + ); + return dst_mcv; + }, + 5...8 => { + const tmp_reg = (try self.register_manager.allocReg(null, sse)).to256(); + const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg); + defer self.register_manager.unlockReg(tmp_lock); + + try self.asmRegisterRegister(.vcvtph2ps, dst_reg.to256(), dst_reg); + if (src_mcv.isMemory()) try self.asmRegisterMemory( + .vcvtph2ps, + tmp_reg, + src_mcv.mem(.xword), + ) else try self.asmRegisterRegister( + .vcvtph2ps, + tmp_reg, + (if (src_mcv.isRegister()) + src_mcv.getReg().? + else + try self.copyToTmpRegister(rhs_ty, src_mcv)).to128(), + ); + try self.asmRegisterRegisterRegister( + switch (air_tag) { + .add => .vaddps, + .sub => .vsubps, + .div_float, .div_trunc, .div_floor, .div_exact => .vdivps, + .max => .vmaxps, + .min => .vminps, + else => unreachable, + }, + dst_reg.to256(), + dst_reg.to256(), + tmp_reg, + ); + try self.asmRegisterRegisterImmediate( + .vcvtps2ph, + dst_reg, + dst_reg.to256(), + Immediate.u(0b1_00), + ); + return dst_mcv; + }, + else => null, + } else null, 32 => switch (lhs_ty.vectorLen()) { 1 => switch (air_tag) { .add => if (self.hasFeature(.avx)) .vaddss else .addss, @@ -6223,14 +6434,13 @@ fn genBinOp( } else null, else => null, }, - 16, 80, 128 => null, + 80, 128 => null, else => unreachable, }, }, })) |tag| tag else return self.fail("TODO implement genBinOp for {s} {}", .{ @tagName(air_tag), lhs_ty.fmt(self.bin_file.options.module.?), }); - const dst_reg = registerAlias(dst_mcv.getReg().?, abi_size); if (self.hasFeature(.avx)) { const src1_alias = if (copied_to_dst) 
dst_reg else registerAlias(lhs_mcv.getReg().?, abi_size); @@ -7139,21 +7349,21 @@ fn airCmp(self: *Self, inst: Air.Inst.Index, op: math.CompareOperator) !void { const tmp2_lock = self.register_manager.lockRegAssumeUnused(tmp2_reg); defer self.register_manager.unlockReg(tmp2_lock); - if (src_mcv.isRegister()) - try self.asmRegisterRegisterRegister( - .vpunpcklwd, - tmp1_reg, - dst_reg.to128(), - src_mcv.getReg().?.to128(), - ) - else - try self.asmRegisterRegisterMemoryImmediate( - .vpinsrw, - tmp1_reg, - dst_reg.to128(), - src_mcv.mem(.word), - Immediate.u(1), - ); + if (src_mcv.isMemory()) try self.asmRegisterRegisterMemoryImmediate( + .vpinsrw, + tmp1_reg, + dst_reg.to128(), + src_mcv.mem(.word), + Immediate.u(1), + ) else try self.asmRegisterRegisterRegister( + .vpunpcklwd, + tmp1_reg, + dst_reg.to128(), + (if (src_mcv.isRegister()) + src_mcv.getReg().? + else + try self.copyToTmpRegister(ty, src_mcv)).to128(), + ); try self.asmRegisterRegister(.vcvtph2ps, tmp1_reg, tmp1_reg); try self.asmRegisterRegister(.vmovshdup, tmp2_reg, tmp1_reg); try self.genBinOpMir(.ucomiss, ty, tmp1_mcv, tmp2_mcv); @@ -8139,7 +8349,16 @@ fn movMirTag(self: *Self, ty: Type, aligned: bool) !Mir.Inst.Tag { }, .Vector => switch (ty.childType().zigTypeTag()) { .Float => switch (ty.childType().floatBits(self.target.*)) { - 16 => unreachable, // needs special handling + 16 => switch (ty.vectorLen()) { + 1 => unreachable, // needs special handling + 2 => return if (self.hasFeature(.avx)) .vmovss else .movss, + 3...4 => return if (self.hasFeature(.avx)) .vmovsd else .movsd, + 5...8 => return if (self.hasFeature(.avx)) + if (aligned) .vmovaps else .vmovups + else if (aligned) .movaps else .movups, + 9...16 => if (self.hasFeature(.avx)) return if (aligned) .vmovaps else .vmovups, + else => {}, + }, 32 => switch (ty.vectorLen()) { 1 => return if (self.hasFeature(.avx)) .vmovss else .movss, 2...4 => return if (self.hasFeature(.avx)) diff --git a/src/arch/x86_64/Encoding.zig 
b/src/arch/x86_64/Encoding.zig index b8ccc9efba..3235b29358 100644 --- a/src/arch/x86_64/Encoding.zig +++ b/src/arch/x86_64/Encoding.zig @@ -270,7 +270,7 @@ pub const Mnemonic = enum { divps, divss, maxps, maxss, minps, minss, - movaps, movss, movups, + movaps, movhlps, movss, movups, mulps, mulss, orps, pextrw, pinsrw, @@ -303,6 +303,8 @@ pub const Mnemonic = enum { // SSE3 movddup, movshdup, movsldup, // SSE4.1 + pextrb, pextrd, pextrq, + pinsrb, pinsrd, pinsrq, roundpd, roundps, roundsd, roundss, // AVX vaddpd, vaddps, vaddsd, vaddss, @@ -311,13 +313,14 @@ pub const Mnemonic = enum { vmaxpd, vmaxps, vmaxsd, vmaxss, vminpd, vminps, vminsd, vminss, vmovapd, vmovaps, - vmovddup, + vmovddup, vmovhlps, vmovsd, vmovshdup, vmovsldup, vmovss, vmovupd, vmovups, vmulpd, vmulps, vmulsd, vmulss, - vpextrw, vpinsrw, + vpextrb, vpextrd, vpextrq, vpextrw, + vpinsrb, vpinsrd, vpinsrq, vpinsrw, vpshufhw, vpshuflw, vpsrld, vpsrlq, vpsrlw, vpunpckhbw, vpunpckhdq, vpunpckhqdq, vpunpckhwd, @@ -359,7 +362,7 @@ pub const Op = enum { cl, r8, r16, r32, r64, rm8, rm16, rm32, rm64, - r32_m16, r64_m16, + r32_m8, r32_m16, r64_m16, m8, m16, m32, m64, m80, m128, m256, rel8, rel16, rel32, m, @@ -444,7 +447,7 @@ pub const Op = enum { pub fn immBitSize(op: Op) u64 { return switch (op) { .none, .o16, .o32, .o64, .moffs, .m, .sreg => unreachable, - .al, .cl, .r8, .rm8 => unreachable, + .al, .cl, .r8, .rm8, .r32_m8 => unreachable, .ax, .r16, .rm16 => unreachable, .eax, .r32, .rm32, .r32_m16 => unreachable, .rax, .r64, .rm64, .r64_m16 => unreachable, @@ -467,7 +470,7 @@ pub const Op = enum { .m8, .m16, .m32, .m64, .m80, .m128, .m256 => unreachable, .al, .cl, .r8, .rm8 => 8, .ax, .r16, .rm16 => 16, - .eax, .r32, .rm32, .r32_m16 => 32, + .eax, .r32, .rm32, .r32_m8, .r32_m16 => 32, .rax, .r64, .rm64, .r64_m16 => 64, .xmm, .xmm_m32, .xmm_m64, .xmm_m128 => 128, .ymm, .ymm_m256 => 256, @@ -480,7 +483,7 @@ pub const Op = enum { .unity, .imm8, .imm8s, .imm16, .imm16s, .imm32, .imm32s, .imm64 => unreachable, 
.rel8, .rel16, .rel32 => unreachable, .al, .cl, .r8, .ax, .r16, .eax, .r32, .rax, .r64, .xmm, .ymm => unreachable, - .m8, .rm8 => 8, + .m8, .rm8, .r32_m8 => 8, .m16, .rm16, .r32_m16, .r64_m16 => 16, .m32, .rm32, .xmm_m32 => 32, .m64, .rm64, .xmm_m64 => 64, @@ -509,7 +512,7 @@ pub const Op = enum { .al, .ax, .eax, .rax, .r8, .r16, .r32, .r64, .rm8, .rm16, .rm32, .rm64, - .r32_m16, .r64_m16, + .r32_m8, .r32_m16, .r64_m16, .xmm, .xmm_m32, .xmm_m64, .xmm_m128, .ymm, .ymm_m256, => true, @@ -535,7 +538,7 @@ pub const Op = enum { // zig fmt: off return switch (op) { .rm8, .rm16, .rm32, .rm64, - .r32_m16, .r64_m16, + .r32_m8, .r32_m16, .r64_m16, .m8, .m16, .m32, .m64, .m80, .m128, .m256, .m, .xmm_m32, .xmm_m64, .xmm_m128, @@ -559,7 +562,7 @@ pub const Op = enum { .al, .ax, .eax, .rax, .cl => .general_purpose, .r8, .r16, .r32, .r64 => .general_purpose, .rm8, .rm16, .rm32, .rm64 => .general_purpose, - .r32_m16, .r64_m16 => .general_purpose, + .r32_m8, .r32_m16, .r64_m16 => .general_purpose, .sreg => .segment, .xmm, .xmm_m32, .xmm_m64, .xmm_m128 => .floating_point, .ymm, .ymm_m256 => .floating_point, diff --git a/src/arch/x86_64/Lower.zig b/src/arch/x86_64/Lower.zig index 2cfa25ac84..5c079f4768 100644 --- a/src/arch/x86_64/Lower.zig +++ b/src/arch/x86_64/Lower.zig @@ -137,6 +137,7 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct { .minps, .minss, .movaps, + .movhlps, .movss, .movups, .mulps, @@ -149,6 +150,8 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct { .subps, .subss, .ucomiss, + .unpckhps, + .unpcklps, .xorps, .addpd, @@ -187,12 +190,20 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct { .subpd, .subsd, .ucomisd, + .unpckhpd, + .unpcklpd, .xorpd, .movddup, .movshdup, .movsldup, + .pextrb, + .pextrd, + .pextrq, + .pinsrb, + .pinsrd, + .pinsrq, .roundpd, .roundps, .roundsd, @@ -221,6 +232,7 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct { .vmovapd, .vmovaps, .vmovddup, + .vmovhlps, 
.vmovsd, .vmovshdup, .vmovsldup, @@ -231,7 +243,13 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct { .vmulps, .vmulsd, .vmulss, + .vpextrb, + .vpextrd, + .vpextrq, .vpextrw, + .vpinsrb, + .vpinsrd, + .vpinsrq, .vpinsrw, .vpshufhw, .vpshuflw, @@ -258,6 +276,10 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct { .vsubps, .vsubsd, .vsubss, + .vunpckhpd, + .vunpckhps, + .vunpcklpd, + .vunpcklps, .vcvtph2ps, .vcvtps2ph, diff --git a/src/arch/x86_64/Mir.zig b/src/arch/x86_64/Mir.zig index c0450406cf..442cfabebb 100644 --- a/src/arch/x86_64/Mir.zig +++ b/src/arch/x86_64/Mir.zig @@ -192,6 +192,8 @@ pub const Inst = struct { minss, /// Move aligned packed single-precision floating-point values movaps, + /// Move packed single-precision floating-point values high to low + movhlps, /// Move scalar single-precision floating-point value movss, /// Move unaligned packed single-precision floating-point values @@ -216,6 +218,10 @@ pub const Inst = struct { subss, /// Unordered compare scalar single-precision floating-point values ucomiss, + /// Unpack and interleave high packed single-precision floating-point values + unpckhps, + /// Unpack and interleave low packed single-precision floating-point values + unpcklps, /// Bitwise logical xor of packed single precision floating-point values xorps, @@ -291,6 +297,10 @@ pub const Inst = struct { subsd, /// Unordered compare scalar double-precision floating-point values ucomisd, + /// Unpack and interleave high packed double-precision floating-point values + unpckhpd, + /// Unpack and interleave low packed double-precision floating-point values + unpcklpd, /// Bitwise logical xor of packed double precision floating-point values xorpd, @@ -301,6 +311,18 @@ pub const Inst = struct { /// Replicate single floating-point values movsldup, + /// Extract Byte + pextrb, + /// Extract Doubleword + pextrd, + /// Extract Quadword + pextrq, + /// Insert Byte + pinsrb, + /// Insert Doubleword + pinsrd, + /// 
Insert Quadword + pinsrq, /// Round packed double-precision floating-point values roundpd, /// Round packed single-precision floating-point values @@ -354,6 +376,8 @@ pub const Inst = struct { vmovapd, /// Move aligned packed single-precision floating-point values vmovaps, + /// Move packed single-precision floating-point values high to low + vmovhlps, /// Replicate double floating-point values vmovddup, /// Move or merge scalar double-precision floating-point value @@ -376,8 +400,20 @@ pub const Inst = struct { vmulsd, /// Multiply scalar single-precision floating-point values vmulss, + /// Extract Byte + vpextrb, + /// Extract Doubleword + vpextrd, + /// Extract Quadword + vpextrq, /// Extract word vpextrw, + /// Insert Byte + vpinsrb, + /// Insert Doubleword + vpinsrd, + /// Insert Quadword + vpinsrq, /// Insert word vpinsrw, /// Shuffle packed high words @@ -430,6 +466,14 @@ pub const Inst = struct { vsubsd, /// Subtract scalar single-precision floating-point values vsubss, + /// Unpack and interleave high packed double-precision floating-point values + vunpckhpd, + /// Unpack and interleave high packed single-precision floating-point values + vunpckhps, + /// Unpack and interleave low packed double-precision floating-point values + vunpcklpd, + /// Unpack and interleave low packed single-precision floating-point values + vunpcklps, /// Convert 16-bit floating-point values to single-precision floating-point values vcvtph2ps, diff --git a/src/arch/x86_64/encodings.zig b/src/arch/x86_64/encodings.zig index c41f0ea4e7..2b9d530c1e 100644 --- a/src/arch/x86_64/encodings.zig +++ b/src/arch/x86_64/encodings.zig @@ -865,6 +865,8 @@ pub const table = [_]Entry{ .{ .movaps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x28 }, 0, .none, .sse }, .{ .movaps, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x0f, 0x29 }, 0, .none, .sse }, + .{ .movhlps, .rm, &.{ .xmm, .xmm }, &.{ 0x0f, 0x12 }, 0, .none, .sse }, + .{ .movss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x10 }, 0, .none, .sse }, .{ 
.movss, .mr, &.{ .xmm_m32, .xmm }, &.{ 0xf3, 0x0f, 0x11 }, 0, .none, .sse }, @@ -988,8 +990,16 @@ pub const table = [_]Entry{ .{ .movsldup, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0xf3, 0x0f, 0x12 }, 0, .none, .sse3 }, // SSE4.1 + .{ .pextrb, .mri, &.{ .r32_m8, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x14 }, 0, .none, .sse4_1 }, + .{ .pextrd, .mri, &.{ .rm32, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .none, .sse4_1 }, + .{ .pextrq, .mri, &.{ .rm64, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .long, .sse4_1 }, + .{ .pextrw, .mri, &.{ .r32_m16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .none, .sse4_1 }, + .{ .pinsrb, .rmi, &.{ .xmm, .r32_m8, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x20 }, 0, .none, .sse4_1 }, + .{ .pinsrd, .rmi, &.{ .xmm, .rm32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x22 }, 0, .none, .sse4_1 }, + .{ .pinsrq, .rmi, &.{ .xmm, .rm64, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x22 }, 0, .long, .sse4_1 }, + .{ .roundpd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x09 }, 0, .none, .sse4_1 }, .{ .roundps, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x08 }, 0, .none, .sse4_1 }, @@ -1062,6 +1072,8 @@ pub const table = [_]Entry{ .{ .vmovddup, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x12 }, 0, .vex_128_wig, .avx }, .{ .vmovddup, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0xf2, 0x0f, 0x12 }, 0, .vex_256_wig, .avx }, + .{ .vmovhlps, .rvm, &.{ .xmm, .xmm, .xmm }, &.{ 0x0f, 0x12 }, 0, .vex_128_wig, .avx }, + .{ .vmovsd, .rvm, &.{ .xmm, .xmm, .xmm }, &.{ 0xf2, 0x0f, 0x10 }, 0, .vex_lig_wig, .avx }, .{ .vmovsd, .rm, &.{ .xmm, .m64 }, &.{ 0xf2, 0x0f, 0x10 }, 0, .vex_lig_wig, .avx }, .{ .vmovsd, .mvr, &.{ .xmm, .xmm, .xmm }, &.{ 0xf2, 0x0f, 0x11 }, 0, .vex_lig_wig, .avx }, @@ -1098,9 +1110,17 @@ pub const table = [_]Entry{ .{ .vmulss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x59 }, 0, .vex_lig_wig, .avx }, + .{ .vpextrb, .mri, &.{ .r32_m8, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x14 }, 0, .vex_128_w0, .avx }, + .{ .vpextrd, .mri, &.{ .rm32, .xmm, .imm8 }, &.{ 
0x66, 0x0f, 0x3a, 0x16 }, 0, .vex_128_w0, .avx }, + .{ .vpextrq, .mri, &.{ .rm64, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .vex_128_w1, .avx }, + .{ .vpextrw, .rmi, &.{ .r32, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x15 }, 0, .vex_128_wig, .avx }, .{ .vpextrw, .mri, &.{ .r32_m16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .vex_128_wig, .avx }, + .{ .vpinsrb, .rmi, &.{ .xmm, .r32_m8, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x20 }, 0, .vex_128_w0, .avx }, + .{ .vpinsrd, .rmi, &.{ .xmm, .rm32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x22 }, 0, .vex_128_w0, .avx }, + .{ .vpinsrq, .rmi, &.{ .xmm, .rm64, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x22 }, 0, .vex_128_w1, .avx }, + .{ .vpinsrw, .rvmi, &.{ .xmm, .xmm, .r32_m16, .imm8 }, &.{ 0x66, 0x0f, 0xc4 }, 0, .vex_128_wig, .avx }, .{ .vpsrlw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd1 }, 0, .vex_128_wig, .avx }, |
