path: root/src
author	Jacob Young <jacobly0@users.noreply.github.com>	2023-05-07 20:42:46 -0400
committer	Jacob Young <jacobly0@users.noreply.github.com>	2023-05-08 07:36:20 -0400
commit	6778da4516e68c271cb50fe9c252ab4084daf16b (patch)
tree	46e3bb0082569ecb678039e36fa2a422ae02a994 /src
parent	f8708e2c4d93eece5b3e131fd2d1b5b210806cd6 (diff)
download	zig-6778da4516e68c271cb50fe9c252ab4084daf16b.tar.gz
	zig-6778da4516e68c271cb50fe9c252ab4084daf16b.zip
x86_64: implement binary operations for `f16` and `f16` vectors
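
For illustration only (not part of the commit), a minimal Zig sketch of the kind of code the new paths lower; the values and names are invented for the example:

```zig
const std = @import("std");

// Scalar f16 and f16-vector binary operations that the x86_64 backend can
// now lower (via F16C conversions when the CPU supports them).
pub fn main() void {
    const a: f16 = 1.5;
    const b: f16 = 2.25;
    std.debug.print("{d}\n", .{@as(f32, a + b)}); // scalar f16 add

    const u: @Vector(4, f16) = .{ 1.0, 2.0, 3.0, 4.0 };
    const v: @Vector(4, f16) = .{ 0.5, 0.5, 0.5, 0.5 };
    const w = u / v; // element-wise divide on an f16 vector
    std.debug.print("{d}\n", .{@as(f32, w[0])});
}
```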
Diffstat (limited to 'src')
-rw-r--r--	src/arch/x86_64/CodeGen.zig	261
-rw-r--r--	src/arch/x86_64/Encoding.zig	23
-rw-r--r--	src/arch/x86_64/Lower.zig	22
-rw-r--r--	src/arch/x86_64/Mir.zig	44
-rw-r--r--	src/arch/x86_64/encodings.zig	20
5 files changed, 339 insertions, 31 deletions
diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig
index 3e2d418105..154b909a21 100644
--- a/src/arch/x86_64/CodeGen.zig
+++ b/src/arch/x86_64/CodeGen.zig
@@ -4497,14 +4497,15 @@ fn airFloatSign(self: *Self, inst: Air.Inst.Index) !void {
const tag = self.air.instructions.items(.tag)[inst];
try self.genBinOpMir(switch (ty_bits) {
// No point using an extra prefix byte for *pd which performs the same operation.
- 32, 64 => switch (tag) {
+ 16, 32, 64, 128 => switch (tag) {
.neg => .xorps,
.fabs => .andnps,
else => unreachable,
},
- else => return self.fail("TODO implement airFloatSign for {}", .{
+ 80 => return self.fail("TODO implement airFloatSign for {}", .{
ty.fmt(self.bin_file.options.module.?),
}),
+ else => unreachable,
}, vec_ty, dst_mcv, sign_mcv);
return self.finishAir(inst, dst_mcv, .{ un_op, .none, .none });
}
@@ -6112,9 +6113,53 @@ fn genBinOp(
return dst_mcv;
}
+ const dst_reg = registerAlias(dst_mcv.getReg().?, abi_size);
const mir_tag = if (@as(?Mir.Inst.Tag, switch (lhs_ty.zigTypeTag()) {
else => unreachable,
.Float => switch (lhs_ty.floatBits(self.target.*)) {
+ 16 => if (self.hasFeature(.f16c)) {
+ const tmp_reg = (try self.register_manager.allocReg(null, sse)).to128();
+ const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
+ defer self.register_manager.unlockReg(tmp_lock);
+
+ if (src_mcv.isMemory()) try self.asmRegisterRegisterMemoryImmediate(
+ .vpinsrw,
+ dst_reg,
+ dst_reg,
+ src_mcv.mem(.word),
+ Immediate.u(1),
+ ) else try self.asmRegisterRegisterRegister(
+ .vpunpcklwd,
+ dst_reg,
+ dst_reg,
+ (if (src_mcv.isRegister())
+ src_mcv.getReg().?
+ else
+ try self.copyToTmpRegister(rhs_ty, src_mcv)).to128(),
+ );
+ try self.asmRegisterRegister(.vcvtph2ps, dst_reg, dst_reg);
+ try self.asmRegisterRegister(.vmovshdup, tmp_reg, dst_reg);
+ try self.asmRegisterRegisterRegister(
+ switch (air_tag) {
+ .add => .vaddss,
+ .sub => .vsubss,
+ .div_float, .div_trunc, .div_floor, .div_exact => .vdivss,
+ .max => .vmaxss,
+ .min => .vminss,
+ else => unreachable,
+ },
+ dst_reg,
+ dst_reg,
+ tmp_reg,
+ );
+ try self.asmRegisterRegisterImmediate(
+ .vcvtps2ph,
+ dst_reg,
+ dst_reg,
+ Immediate.u(0b1_00),
+ );
+ return dst_mcv;
+ } else null,
32 => switch (air_tag) {
.add => if (self.hasFeature(.avx)) .vaddss else .addss,
.sub => if (self.hasFeature(.avx)) .vsubss else .subss,
@@ -6141,12 +6186,178 @@ fn genBinOp(
.min => if (self.hasFeature(.avx)) .vminsd else .minsd,
else => unreachable,
},
- 16, 80, 128 => null,
+ 80, 128 => null,
else => unreachable,
},
.Vector => switch (lhs_ty.childType().zigTypeTag()) {
else => null,
.Float => switch (lhs_ty.childType().floatBits(self.target.*)) {
+ 16 => if (self.hasFeature(.f16c)) switch (lhs_ty.vectorLen()) {
+ 1 => {
+ const tmp_reg = (try self.register_manager.allocReg(null, sse)).to128();
+ const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
+ defer self.register_manager.unlockReg(tmp_lock);
+
+ if (src_mcv.isMemory()) try self.asmRegisterRegisterMemoryImmediate(
+ .vpinsrw,
+ dst_reg,
+ dst_reg,
+ src_mcv.mem(.word),
+ Immediate.u(1),
+ ) else try self.asmRegisterRegisterRegister(
+ .vpunpcklwd,
+ dst_reg,
+ dst_reg,
+ (if (src_mcv.isRegister())
+ src_mcv.getReg().?
+ else
+ try self.copyToTmpRegister(rhs_ty, src_mcv)).to128(),
+ );
+ try self.asmRegisterRegister(.vcvtph2ps, dst_reg, dst_reg);
+ try self.asmRegisterRegister(.vmovshdup, tmp_reg, dst_reg);
+ try self.asmRegisterRegisterRegister(
+ switch (air_tag) {
+ .add => .vaddss,
+ .sub => .vsubss,
+ .div_float, .div_trunc, .div_floor, .div_exact => .vdivss,
+ .max => .vmaxss,
+ .min => .vminss,
+ else => unreachable,
+ },
+ dst_reg,
+ dst_reg,
+ tmp_reg,
+ );
+ try self.asmRegisterRegisterImmediate(
+ .vcvtps2ph,
+ dst_reg,
+ dst_reg,
+ Immediate.u(0b1_00),
+ );
+ return dst_mcv;
+ },
+ 2 => {
+ const tmp_reg = (try self.register_manager.allocReg(null, sse)).to128();
+ const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
+ defer self.register_manager.unlockReg(tmp_lock);
+
+ if (src_mcv.isMemory()) try self.asmRegisterMemoryImmediate(
+ .vpinsrd,
+ dst_reg,
+ src_mcv.mem(.dword),
+ Immediate.u(1),
+ ) else try self.asmRegisterRegisterRegister(
+ .vunpcklps,
+ dst_reg,
+ dst_reg,
+ (if (src_mcv.isRegister())
+ src_mcv.getReg().?
+ else
+ try self.copyToTmpRegister(rhs_ty, src_mcv)).to128(),
+ );
+ try self.asmRegisterRegister(.vcvtph2ps, dst_reg, dst_reg);
+ try self.asmRegisterRegisterRegister(.vmovhlps, tmp_reg, dst_reg, dst_reg);
+ try self.asmRegisterRegisterRegister(
+ switch (air_tag) {
+ .add => .vaddps,
+ .sub => .vsubps,
+ .div_float, .div_trunc, .div_floor, .div_exact => .vdivps,
+ .max => .vmaxps,
+ .min => .vminps,
+ else => unreachable,
+ },
+ dst_reg,
+ dst_reg,
+ tmp_reg,
+ );
+ try self.asmRegisterRegisterImmediate(
+ .vcvtps2ph,
+ dst_reg,
+ dst_reg,
+ Immediate.u(0b1_00),
+ );
+ return dst_mcv;
+ },
+ 3...4 => {
+ const tmp_reg = (try self.register_manager.allocReg(null, sse)).to128();
+ const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
+ defer self.register_manager.unlockReg(tmp_lock);
+
+ try self.asmRegisterRegister(.vcvtph2ps, dst_reg, dst_reg);
+ if (src_mcv.isMemory()) try self.asmRegisterMemory(
+ .vcvtph2ps,
+ tmp_reg,
+ src_mcv.mem(.qword),
+ ) else try self.asmRegisterRegister(
+ .vcvtph2ps,
+ tmp_reg,
+ (if (src_mcv.isRegister())
+ src_mcv.getReg().?
+ else
+ try self.copyToTmpRegister(rhs_ty, src_mcv)).to128(),
+ );
+ try self.asmRegisterRegisterRegister(
+ switch (air_tag) {
+ .add => .vaddps,
+ .sub => .vsubps,
+ .div_float, .div_trunc, .div_floor, .div_exact => .vdivps,
+ .max => .vmaxps,
+ .min => .vminps,
+ else => unreachable,
+ },
+ dst_reg,
+ dst_reg,
+ tmp_reg,
+ );
+ try self.asmRegisterRegisterImmediate(
+ .vcvtps2ph,
+ dst_reg,
+ dst_reg,
+ Immediate.u(0b1_00),
+ );
+ return dst_mcv;
+ },
+ 5...8 => {
+ const tmp_reg = (try self.register_manager.allocReg(null, sse)).to256();
+ const tmp_lock = self.register_manager.lockRegAssumeUnused(tmp_reg);
+ defer self.register_manager.unlockReg(tmp_lock);
+
+ try self.asmRegisterRegister(.vcvtph2ps, dst_reg.to256(), dst_reg);
+ if (src_mcv.isMemory()) try self.asmRegisterMemory(
+ .vcvtph2ps,
+ tmp_reg,
+ src_mcv.mem(.xword),
+ ) else try self.asmRegisterRegister(
+ .vcvtph2ps,
+ tmp_reg,
+ (if (src_mcv.isRegister())
+ src_mcv.getReg().?
+ else
+ try self.copyToTmpRegister(rhs_ty, src_mcv)).to128(),
+ );
+ try self.asmRegisterRegisterRegister(
+ switch (air_tag) {
+ .add => .vaddps,
+ .sub => .vsubps,
+ .div_float, .div_trunc, .div_floor, .div_exact => .vdivps,
+ .max => .vmaxps,
+ .min => .vminps,
+ else => unreachable,
+ },
+ dst_reg.to256(),
+ dst_reg.to256(),
+ tmp_reg,
+ );
+ try self.asmRegisterRegisterImmediate(
+ .vcvtps2ph,
+ dst_reg,
+ dst_reg.to256(),
+ Immediate.u(0b1_00),
+ );
+ return dst_mcv;
+ },
+ else => null,
+ } else null,
32 => switch (lhs_ty.vectorLen()) {
1 => switch (air_tag) {
.add => if (self.hasFeature(.avx)) .vaddss else .addss,
@@ -6223,14 +6434,13 @@ fn genBinOp(
} else null,
else => null,
},
- 16, 80, 128 => null,
+ 80, 128 => null,
else => unreachable,
},
},
})) |tag| tag else return self.fail("TODO implement genBinOp for {s} {}", .{
@tagName(air_tag), lhs_ty.fmt(self.bin_file.options.module.?),
});
- const dst_reg = registerAlias(dst_mcv.getReg().?, abi_size);
if (self.hasFeature(.avx)) {
const src1_alias =
if (copied_to_dst) dst_reg else registerAlias(lhs_mcv.getReg().?, abi_size);
@@ -7139,21 +7349,21 @@ fn airCmp(self: *Self, inst: Air.Inst.Index, op: math.CompareOperator) !void {
const tmp2_lock = self.register_manager.lockRegAssumeUnused(tmp2_reg);
defer self.register_manager.unlockReg(tmp2_lock);
- if (src_mcv.isRegister())
- try self.asmRegisterRegisterRegister(
- .vpunpcklwd,
- tmp1_reg,
- dst_reg.to128(),
- src_mcv.getReg().?.to128(),
- )
- else
- try self.asmRegisterRegisterMemoryImmediate(
- .vpinsrw,
- tmp1_reg,
- dst_reg.to128(),
- src_mcv.mem(.word),
- Immediate.u(1),
- );
+ if (src_mcv.isMemory()) try self.asmRegisterRegisterMemoryImmediate(
+ .vpinsrw,
+ tmp1_reg,
+ dst_reg.to128(),
+ src_mcv.mem(.word),
+ Immediate.u(1),
+ ) else try self.asmRegisterRegisterRegister(
+ .vpunpcklwd,
+ tmp1_reg,
+ dst_reg.to128(),
+ (if (src_mcv.isRegister())
+ src_mcv.getReg().?
+ else
+ try self.copyToTmpRegister(ty, src_mcv)).to128(),
+ );
try self.asmRegisterRegister(.vcvtph2ps, tmp1_reg, tmp1_reg);
try self.asmRegisterRegister(.vmovshdup, tmp2_reg, tmp1_reg);
try self.genBinOpMir(.ucomiss, ty, tmp1_mcv, tmp2_mcv);
@@ -8139,7 +8349,16 @@ fn movMirTag(self: *Self, ty: Type, aligned: bool) !Mir.Inst.Tag {
},
.Vector => switch (ty.childType().zigTypeTag()) {
.Float => switch (ty.childType().floatBits(self.target.*)) {
- 16 => unreachable, // needs special handling
+ 16 => switch (ty.vectorLen()) {
+ 1 => unreachable, // needs special handling
+ 2 => return if (self.hasFeature(.avx)) .vmovss else .movss,
+ 3...4 => return if (self.hasFeature(.avx)) .vmovsd else .movsd,
+ 5...8 => return if (self.hasFeature(.avx))
+ if (aligned) .vmovaps else .vmovups
+ else if (aligned) .movaps else .movups,
+ 9...16 => if (self.hasFeature(.avx)) return if (aligned) .vmovaps else .vmovups,
+ else => {},
+ },
32 => switch (ty.vectorLen()) {
1 => return if (self.hasFeature(.avx)) .vmovss else .movss,
2...4 => return if (self.hasFeature(.avx))
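
The genBinOp changes above all follow the same F16C strategy: widen the f16 operands to f32 with vcvtph2ps, perform the arithmetic in single precision, then narrow the result back with vcvtps2ph. A hypothetical plain-Zig model of the scalar case (illustrative only, not compiler code; the function name is invented, and the builtin syntax matches the Zig of this commit's era):

```zig
// Models the scalar f16 path in genBinOp: this is semantically what the
// emitted vcvtph2ps / vaddss / vcvtps2ph sequence computes.
fn binOpF16ViaF32(a: f16, b: f16) f16 {
    const wa: f32 = a; // vcvtph2ps widens f16 -> f32
    const wb: f32 = b;
    const r = wa + wb; // vaddss (vsubss/vdivss/vmaxss/vminss for other ops)
    return @floatCast(f16, r); // vcvtps2ph narrows back to f16
}
```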
diff --git a/src/arch/x86_64/Encoding.zig b/src/arch/x86_64/Encoding.zig
index b8ccc9efba..3235b29358 100644
--- a/src/arch/x86_64/Encoding.zig
+++ b/src/arch/x86_64/Encoding.zig
@@ -270,7 +270,7 @@ pub const Mnemonic = enum {
divps, divss,
maxps, maxss,
minps, minss,
- movaps, movss, movups,
+ movaps, movhlps, movss, movups,
mulps, mulss,
orps,
pextrw, pinsrw,
@@ -303,6 +303,8 @@ pub const Mnemonic = enum {
// SSE3
movddup, movshdup, movsldup,
// SSE4.1
+ pextrb, pextrd, pextrq,
+ pinsrb, pinsrd, pinsrq,
roundpd, roundps, roundsd, roundss,
// AVX
vaddpd, vaddps, vaddsd, vaddss,
@@ -311,13 +313,14 @@ pub const Mnemonic = enum {
vmaxpd, vmaxps, vmaxsd, vmaxss,
vminpd, vminps, vminsd, vminss,
vmovapd, vmovaps,
- vmovddup,
+ vmovddup, vmovhlps,
vmovsd,
vmovshdup, vmovsldup,
vmovss,
vmovupd, vmovups,
vmulpd, vmulps, vmulsd, vmulss,
- vpextrw, vpinsrw,
+ vpextrb, vpextrd, vpextrq, vpextrw,
+ vpinsrb, vpinsrd, vpinsrq, vpinsrw,
vpshufhw, vpshuflw,
vpsrld, vpsrlq, vpsrlw,
vpunpckhbw, vpunpckhdq, vpunpckhqdq, vpunpckhwd,
@@ -359,7 +362,7 @@ pub const Op = enum {
cl,
r8, r16, r32, r64,
rm8, rm16, rm32, rm64,
- r32_m16, r64_m16,
+ r32_m8, r32_m16, r64_m16,
m8, m16, m32, m64, m80, m128, m256,
rel8, rel16, rel32,
m,
@@ -444,7 +447,7 @@ pub const Op = enum {
pub fn immBitSize(op: Op) u64 {
return switch (op) {
.none, .o16, .o32, .o64, .moffs, .m, .sreg => unreachable,
- .al, .cl, .r8, .rm8 => unreachable,
+ .al, .cl, .r8, .rm8, .r32_m8 => unreachable,
.ax, .r16, .rm16 => unreachable,
.eax, .r32, .rm32, .r32_m16 => unreachable,
.rax, .r64, .rm64, .r64_m16 => unreachable,
@@ -467,7 +470,7 @@ pub const Op = enum {
.m8, .m16, .m32, .m64, .m80, .m128, .m256 => unreachable,
.al, .cl, .r8, .rm8 => 8,
.ax, .r16, .rm16 => 16,
- .eax, .r32, .rm32, .r32_m16 => 32,
+ .eax, .r32, .rm32, .r32_m8, .r32_m16 => 32,
.rax, .r64, .rm64, .r64_m16 => 64,
.xmm, .xmm_m32, .xmm_m64, .xmm_m128 => 128,
.ymm, .ymm_m256 => 256,
@@ -480,7 +483,7 @@ pub const Op = enum {
.unity, .imm8, .imm8s, .imm16, .imm16s, .imm32, .imm32s, .imm64 => unreachable,
.rel8, .rel16, .rel32 => unreachable,
.al, .cl, .r8, .ax, .r16, .eax, .r32, .rax, .r64, .xmm, .ymm => unreachable,
- .m8, .rm8 => 8,
+ .m8, .rm8, .r32_m8 => 8,
.m16, .rm16, .r32_m16, .r64_m16 => 16,
.m32, .rm32, .xmm_m32 => 32,
.m64, .rm64, .xmm_m64 => 64,
@@ -509,7 +512,7 @@ pub const Op = enum {
.al, .ax, .eax, .rax,
.r8, .r16, .r32, .r64,
.rm8, .rm16, .rm32, .rm64,
- .r32_m16, .r64_m16,
+ .r32_m8, .r32_m16, .r64_m16,
.xmm, .xmm_m32, .xmm_m64, .xmm_m128,
.ymm, .ymm_m256,
=> true,
@@ -535,7 +538,7 @@ pub const Op = enum {
// zig fmt: off
return switch (op) {
.rm8, .rm16, .rm32, .rm64,
- .r32_m16, .r64_m16,
+ .r32_m8, .r32_m16, .r64_m16,
.m8, .m16, .m32, .m64, .m80, .m128, .m256,
.m,
.xmm_m32, .xmm_m64, .xmm_m128,
@@ -559,7 +562,7 @@ pub const Op = enum {
.al, .ax, .eax, .rax, .cl => .general_purpose,
.r8, .r16, .r32, .r64 => .general_purpose,
.rm8, .rm16, .rm32, .rm64 => .general_purpose,
- .r32_m16, .r64_m16 => .general_purpose,
+ .r32_m8, .r32_m16, .r64_m16 => .general_purpose,
.sreg => .segment,
.xmm, .xmm_m32, .xmm_m64, .xmm_m128 => .floating_point,
.ymm, .ymm_m256 => .floating_point,
diff --git a/src/arch/x86_64/Lower.zig b/src/arch/x86_64/Lower.zig
index 2cfa25ac84..5c079f4768 100644
--- a/src/arch/x86_64/Lower.zig
+++ b/src/arch/x86_64/Lower.zig
@@ -137,6 +137,7 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct {
.minps,
.minss,
.movaps,
+ .movhlps,
.movss,
.movups,
.mulps,
@@ -149,6 +150,8 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct {
.subps,
.subss,
.ucomiss,
+ .unpckhps,
+ .unpcklps,
.xorps,
.addpd,
@@ -187,12 +190,20 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct {
.subpd,
.subsd,
.ucomisd,
+ .unpckhpd,
+ .unpcklpd,
.xorpd,
.movddup,
.movshdup,
.movsldup,
+ .pextrb,
+ .pextrd,
+ .pextrq,
+ .pinsrb,
+ .pinsrd,
+ .pinsrq,
.roundpd,
.roundps,
.roundsd,
@@ -221,6 +232,7 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct {
.vmovapd,
.vmovaps,
.vmovddup,
+ .vmovhlps,
.vmovsd,
.vmovshdup,
.vmovsldup,
@@ -231,7 +243,13 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct {
.vmulps,
.vmulsd,
.vmulss,
+ .vpextrb,
+ .vpextrd,
+ .vpextrq,
.vpextrw,
+ .vpinsrb,
+ .vpinsrd,
+ .vpinsrq,
.vpinsrw,
.vpshufhw,
.vpshuflw,
@@ -258,6 +276,10 @@ pub fn lowerMir(lower: *Lower, index: Mir.Inst.Index) Error!struct {
.vsubps,
.vsubsd,
.vsubss,
+ .vunpckhpd,
+ .vunpckhps,
+ .vunpcklpd,
+ .vunpcklps,
.vcvtph2ps,
.vcvtps2ph,
diff --git a/src/arch/x86_64/Mir.zig b/src/arch/x86_64/Mir.zig
index c0450406cf..442cfabebb 100644
--- a/src/arch/x86_64/Mir.zig
+++ b/src/arch/x86_64/Mir.zig
@@ -192,6 +192,8 @@ pub const Inst = struct {
minss,
/// Move aligned packed single-precision floating-point values
movaps,
+ /// Move packed single-precision floating-point values high to low
+ movhlps,
/// Move scalar single-precision floating-point value
movss,
/// Move unaligned packed single-precision floating-point values
@@ -216,6 +218,10 @@ pub const Inst = struct {
subss,
/// Unordered compare scalar single-precision floating-point values
ucomiss,
+ /// Unpack and interleave high packed single-precision floating-point values
+ unpckhps,
+ /// Unpack and interleave low packed single-precision floating-point values
+ unpcklps,
/// Bitwise logical xor of packed single precision floating-point values
xorps,
@@ -291,6 +297,10 @@ pub const Inst = struct {
subsd,
/// Unordered compare scalar double-precision floating-point values
ucomisd,
+ /// Unpack and interleave high packed double-precision floating-point values
+ unpckhpd,
+ /// Unpack and interleave low packed double-precision floating-point values
+ unpcklpd,
/// Bitwise logical xor of packed double precision floating-point values
xorpd,
@@ -301,6 +311,18 @@ pub const Inst = struct {
/// Replicate single floating-point values
movsldup,
+ /// Extract byte
+ pextrb,
+ /// Extract doubleword
+ pextrd,
+ /// Extract quadword
+ pextrq,
+ /// Insert byte
+ pinsrb,
+ /// Insert doubleword
+ pinsrd,
+ /// Insert quadword
+ pinsrq,
/// Round packed double-precision floating-point values
roundpd,
/// Round packed single-precision floating-point values
@@ -354,6 +376,8 @@ pub const Inst = struct {
vmovapd,
/// Move aligned packed single-precision floating-point values
vmovaps,
+ /// Move packed single-precision floating-point values high to low
+ vmovhlps,
/// Replicate double floating-point values
vmovddup,
/// Move or merge scalar double-precision floating-point value
@@ -376,8 +400,20 @@ pub const Inst = struct {
vmulsd,
/// Multiply scalar single-precision floating-point values
vmulss,
+ /// Extract byte
+ vpextrb,
+ /// Extract doubleword
+ vpextrd,
+ /// Extract quadword
+ vpextrq,
/// Extract word
vpextrw,
+ /// Insert byte
+ vpinsrb,
+ /// Insert doubleword
+ vpinsrd,
+ /// Insert quadword
+ vpinsrq,
/// Insert word
vpinsrw,
/// Shuffle packed high words
@@ -430,6 +466,14 @@ pub const Inst = struct {
vsubsd,
/// Subtract scalar single-precision floating-point values
vsubss,
+ /// Unpack and interleave high packed double-precision floating-point values
+ vunpckhpd,
+ /// Unpack and interleave high packed single-precision floating-point values
+ vunpckhps,
+ /// Unpack and interleave low packed double-precision floating-point values
+ vunpcklpd,
+ /// Unpack and interleave low packed single-precision floating-point values
+ vunpcklps,
/// Convert 16-bit floating-point values to single-precision floating-point values
vcvtph2ps,
diff --git a/src/arch/x86_64/encodings.zig b/src/arch/x86_64/encodings.zig
index c41f0ea4e7..2b9d530c1e 100644
--- a/src/arch/x86_64/encodings.zig
+++ b/src/arch/x86_64/encodings.zig
@@ -865,6 +865,8 @@ pub const table = [_]Entry{
.{ .movaps, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x0f, 0x28 }, 0, .none, .sse },
.{ .movaps, .mr, &.{ .xmm_m128, .xmm }, &.{ 0x0f, 0x29 }, 0, .none, .sse },
+ .{ .movhlps, .rm, &.{ .xmm, .xmm }, &.{ 0x0f, 0x12 }, 0, .none, .sse },
+
.{ .movss, .rm, &.{ .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x10 }, 0, .none, .sse },
.{ .movss, .mr, &.{ .xmm_m32, .xmm }, &.{ 0xf3, 0x0f, 0x11 }, 0, .none, .sse },
@@ -988,8 +990,16 @@ pub const table = [_]Entry{
.{ .movsldup, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0xf3, 0x0f, 0x12 }, 0, .none, .sse3 },
// SSE4.1
+ .{ .pextrb, .mri, &.{ .r32_m8, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x14 }, 0, .none, .sse4_1 },
+ .{ .pextrd, .mri, &.{ .rm32, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .none, .sse4_1 },
+ .{ .pextrq, .mri, &.{ .rm64, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .long, .sse4_1 },
+
.{ .pextrw, .mri, &.{ .r32_m16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .none, .sse4_1 },
+ .{ .pinsrb, .rmi, &.{ .xmm, .r32_m8, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x20 }, 0, .none, .sse4_1 },
+ .{ .pinsrd, .rmi, &.{ .xmm, .rm32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x22 }, 0, .none, .sse4_1 },
+ .{ .pinsrq, .rmi, &.{ .xmm, .rm64, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x22 }, 0, .long, .sse4_1 },
+
.{ .roundpd, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x09 }, 0, .none, .sse4_1 },
.{ .roundps, .rmi, &.{ .xmm, .xmm_m128, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x08 }, 0, .none, .sse4_1 },
@@ -1062,6 +1072,8 @@ pub const table = [_]Entry{
.{ .vmovddup, .rm, &.{ .xmm, .xmm_m64 }, &.{ 0xf2, 0x0f, 0x12 }, 0, .vex_128_wig, .avx },
.{ .vmovddup, .rm, &.{ .ymm, .ymm_m256 }, &.{ 0xf2, 0x0f, 0x12 }, 0, .vex_256_wig, .avx },
+ .{ .vmovhlps, .rvm, &.{ .xmm, .xmm, .xmm }, &.{ 0x0f, 0x12 }, 0, .vex_128_wig, .avx },
+
.{ .vmovsd, .rvm, &.{ .xmm, .xmm, .xmm }, &.{ 0xf2, 0x0f, 0x10 }, 0, .vex_lig_wig, .avx },
.{ .vmovsd, .rm, &.{ .xmm, .m64 }, &.{ 0xf2, 0x0f, 0x10 }, 0, .vex_lig_wig, .avx },
.{ .vmovsd, .mvr, &.{ .xmm, .xmm, .xmm }, &.{ 0xf2, 0x0f, 0x11 }, 0, .vex_lig_wig, .avx },
@@ -1098,9 +1110,17 @@ pub const table = [_]Entry{
.{ .vmulss, .rvm, &.{ .xmm, .xmm, .xmm_m32 }, &.{ 0xf3, 0x0f, 0x59 }, 0, .vex_lig_wig, .avx },
+ .{ .vpextrb, .mri, &.{ .r32_m8, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x14 }, 0, .vex_128_w0, .avx },
+ .{ .vpextrd, .mri, &.{ .rm32, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .vex_128_w0, .avx },
+ .{ .vpextrq, .mri, &.{ .rm64, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .vex_128_w1, .avx },
+
.{ .vpextrw, .rmi, &.{ .r32, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0xc5 }, 0, .vex_128_wig, .avx },
.{ .vpextrw, .mri, &.{ .r32_m16, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x15 }, 0, .vex_128_wig, .avx },
+ .{ .vpinsrb, .rmi, &.{ .xmm, .r32_m8, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x20 }, 0, .vex_128_w0, .avx },
+ .{ .vpinsrd, .rmi, &.{ .xmm, .rm32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x22 }, 0, .vex_128_w0, .avx },
+ .{ .vpinsrq, .rmi, &.{ .xmm, .rm64, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x22 }, 0, .vex_128_w1, .avx },
+
.{ .vpinsrw, .rvmi, &.{ .xmm, .xmm, .r32_m16, .imm8 }, &.{ 0x66, 0x0f, 0xc4 }, 0, .vex_128_wig, .avx },
.{ .vpsrlw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xd1 }, 0, .vex_128_wig, .avx },