diff options
| author | Jacob Young <jacobly0@users.noreply.github.com> | 2023-05-17 00:23:11 -0400 |
|---|---|---|
| committer | Jacob Young <jacobly0@users.noreply.github.com> | 2023-05-18 20:42:38 -0400 |
| commit | 35da95fe8765874a1ccffb0d7bfd523b14f44a4a (patch) | |
| tree | 7a4bbc7644c11e42d3b742d04df133835fae32e3 /src | |
| parent | 28c445addde840eec49ed0fd19cc19384a040085 (diff) | |
| download | zig-35da95fe8765874a1ccffb0d7bfd523b14f44a4a.tar.gz zig-35da95fe8765874a1ccffb0d7bfd523b14f44a4a.zip | |
x86_64: implement integer vector `@truncate`
Diffstat (limited to 'src')
| -rw-r--r-- | src/arch/x86_64/CodeGen.zig | 124 | ||||
| -rw-r--r-- | src/arch/x86_64/Encoding.zig | 3 | ||||
| -rw-r--r-- | src/arch/x86_64/Mir.zig | 8 | ||||
| -rw-r--r-- | src/arch/x86_64/encodings.zig | 21 |
4 files changed, 136 insertions, 20 deletions
diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig index bdcbed2629..a258f732f0 100644 --- a/src/arch/x86_64/CodeGen.zig +++ b/src/arch/x86_64/CodeGen.zig @@ -2709,28 +2709,112 @@ fn airTrunc(self: *Self, inst: Air.Inst.Index) !void { const ty_op = self.air.instructions.items(.data)[inst].ty_op; const dst_ty = self.air.typeOfIndex(inst); - const dst_abi_size = dst_ty.abiSize(self.target.*); - if (dst_abi_size > 8) { - return self.fail("TODO implement trunc for abi sizes larger than 8", .{}); - } + const dst_abi_size = @intCast(u32, dst_ty.abiSize(self.target.*)); + const src_ty = self.air.typeOf(ty_op.operand); + const src_abi_size = @intCast(u32, src_ty.abiSize(self.target.*)); - const src_mcv = try self.resolveInst(ty_op.operand); - const src_lock = switch (src_mcv) { - .register => |reg| self.register_manager.lockRegAssumeUnused(reg), - else => null, - }; - defer if (src_lock) |lock| self.register_manager.unlockReg(lock); + const result = result: { + const src_mcv = try self.resolveInst(ty_op.operand); + const src_lock = + if (src_mcv.getReg()) |reg| self.register_manager.lockRegAssumeUnused(reg) else null; + defer if (src_lock) |lock| self.register_manager.unlockReg(lock); - const dst_mcv = if (src_mcv.isRegister() and self.reuseOperand(inst, ty_op.operand, 0, src_mcv)) - src_mcv - else - try self.copyToRegisterWithInstTracking(inst, dst_ty, src_mcv); + const dst_mcv = if (src_mcv.isRegister() and self.reuseOperand(inst, ty_op.operand, 0, src_mcv)) + src_mcv + else + try self.copyToRegisterWithInstTracking(inst, dst_ty, src_mcv); + + if (dst_ty.zigTypeTag() == .Vector) { + assert(src_ty.zigTypeTag() == .Vector and dst_ty.vectorLen() == src_ty.vectorLen()); + const dst_info = dst_ty.childType().intInfo(self.target.*); + const src_info = src_ty.childType().intInfo(self.target.*); + const mir_tag = if (@as(?Mir.Inst.FixedTag, switch (dst_info.bits) { + 8 => switch (src_info.bits) { + 16 => switch (dst_ty.vectorLen()) { + 1...8 => if (self.hasFeature(.avx)) .{ .vp_b, .ackusw } else .{ .p_b, .ackusw }, + 9...16 => if (self.hasFeature(.avx2)) .{ .vp_b, .ackusw } else null, + else => null, + }, + else => null, + }, + 16 => switch (src_info.bits) { + 32 => switch (dst_ty.vectorLen()) { + 1...4 => if (self.hasFeature(.avx)) + .{ .vp_w, .ackusd } + else if (self.hasFeature(.sse4_1)) + .{ .p_w, .ackusd } + else + null, + 5...8 => if (self.hasFeature(.avx2)) .{ .vp_w, .ackusd } else null, + else => null, + }, + else => null, + }, + else => null, + })) |tag| tag else return self.fail("TODO implement airTrunc for {}", .{ + dst_ty.fmt(self.bin_file.options.module.?), + }); - // when truncating a `u16` to `u5`, for example, those top 3 bits in the result - // have to be removed. this only happens if the dst if not a power-of-two size. - if (self.regExtraBits(dst_ty) > 0) try self.truncateRegister(dst_ty, dst_mcv.register.to64()); + var mask_pl = Value.Payload.U64{ + .base = .{ .tag = .int_u64 }, + .data = @as(u64, math.maxInt(u64)) >> @intCast(u6, 64 - dst_info.bits), + }; + const mask_val = Value.initPayload(&mask_pl.base); - return self.finishAir(inst, dst_mcv, .{ ty_op.operand, .none, .none }); + var splat_pl = Value.Payload.SubValue{ + .base = .{ .tag = .repeated }, + .data = mask_val, + }; + const splat_val = Value.initPayload(&splat_pl.base); + + var full_pl = Type.Payload.Array{ + .base = .{ .tag = .vector }, + .data = .{ + .len = @divExact(@as(u64, if (src_abi_size > 16) 256 else 128), src_info.bits), + .elem_type = src_ty.childType(), + }, + }; + const full_ty = Type.initPayload(&full_pl.base); + const full_abi_size = @intCast(u32, full_ty.abiSize(self.target.*)); + + const splat_mcv = try self.genTypedValue(.{ .ty = full_ty, .val = splat_val }); + const splat_addr_mcv: MCValue = switch (splat_mcv) { + .memory, .indirect, .load_frame => splat_mcv.address(), + else => .{ .register = try self.copyToTmpRegister(Type.usize, splat_mcv.address()) }, + }; + + const dst_reg = registerAlias(dst_mcv.getReg().?, src_abi_size); + if (self.hasFeature(.avx)) { + try self.asmRegisterRegisterMemory( + .{ .vp_, .@"and" }, + dst_reg, + dst_reg, + splat_addr_mcv.deref().mem(Memory.PtrSize.fromSize(full_abi_size)), + ); + try self.asmRegisterRegisterRegister(mir_tag, dst_reg, dst_reg, dst_reg); + } else { + try self.asmRegisterMemory( + .{ .p_, .@"and" }, + dst_reg, + splat_addr_mcv.deref().mem(Memory.PtrSize.fromSize(full_abi_size)), + ); + try self.asmRegisterRegister(mir_tag, dst_reg, dst_reg); + } + break :result dst_mcv; + } + + if (dst_abi_size > 8) { + return self.fail("TODO implement trunc for abi sizes larger than 8", .{}); + } + + // when truncating a `u16` to `u5`, for example, those top 3 bits in the result + // have to be removed. this only happens if the dst if not a power-of-two size. + if (self.regExtraBits(dst_ty) > 0) + try self.truncateRegister(dst_ty, dst_mcv.register.to64()); + + break :result dst_mcv; + }; + return self.finishAir(inst, result, .{ ty_op.operand, .none, .none }); } fn airBoolToInt(self: *Self, inst: Air.Inst.Index) !void { @@ -11081,8 +11165,8 @@ fn airSelect(self: *Self, inst: Air.Inst.Index) !void { } fn airShuffle(self: *Self, inst: Air.Inst.Index) !void { - const ty_op = self.air.instructions.items(.data)[inst].ty_op; - _ = ty_op; + const ty_pl = self.air.instructions.items(.data)[inst].ty_pl; + _ = ty_pl; return self.fail("TODO implement airShuffle for x86_64", .{}); //return self.finishAir(inst, result, .{ ty_op.operand, .none, .none }); } diff --git a/src/arch/x86_64/Encoding.zig b/src/arch/x86_64/Encoding.zig index 0aaf12013d..6ed0aeeff4 100644 --- a/src/arch/x86_64/Encoding.zig +++ b/src/arch/x86_64/Encoding.zig @@ -263,6 +263,7 @@ pub const Mnemonic = enum { fisttp, fld, // MMX movd, movq, + packssdw, packsswb, packuswb, paddb, paddd, paddq, paddsb, paddsw, paddusb, paddusw, paddw, pand, pandn, por, pxor, pmulhw, pmullw, @@ -319,6 +320,7 @@ pub const Mnemonic = enum { blendpd, blendps, blendvpd, blendvps, extractps, insertps, + packusdw, pextrb, pextrd, pextrq, pinsrb, pinsrd, pinsrq, pmaxsb, pmaxsd, pmaxud, pmaxuw, pminsb, pminsd, pminud, pminuw, @@ -351,6 +353,7 @@ pub const Mnemonic = enum { vmovupd, vmovups, vmulpd, vmulps, vmulsd, vmulss, vorpd, vorps, + vpackssdw, vpacksswb, vpackusdw, vpackuswb, vpaddb, vpaddd, vpaddq, vpaddsb, vpaddsw, vpaddusb, vpaddusw, vpaddw, vpand, vpandn, vpextrb, vpextrd, vpextrq, vpextrw, diff --git a/src/arch/x86_64/Mir.zig b/src/arch/x86_64/Mir.zig index 9f59a2afba..96b7742929 100644 --- a/src/arch/x86_64/Mir.zig +++ b/src/arch/x86_64/Mir.zig @@ -446,6 +446,12 @@ pub const Inst = struct { /// Bitwise logical xor of packed double-precision floating-point values xor, + /// Pack with signed saturation + ackssw, + /// Pack with signed saturation + ackssd, + /// Pack with unsigned saturation + ackusw, /// Add packed signed integers with signed saturation adds, /// Add packed unsigned integers with unsigned saturation @@ -596,6 +602,8 @@ pub const Inst = struct { /// Replicate single floating-point values movsldup, + /// Pack with unsigned saturation + ackusd, /// Blend packed single-precision floating-point values /// Blend scalar single-precision floating-point values /// Blend packed double-precision floating-point values diff --git a/src/arch/x86_64/encodings.zig b/src/arch/x86_64/encodings.zig index e087f6dfc7..a0cd1af0a7 100644 --- a/src/arch/x86_64/encodings.zig +++ b/src/arch/x86_64/encodings.zig @@ -996,6 +996,11 @@ pub const table = [_]Entry{ .{ .orpd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x56 }, 0, .none, .sse2 }, + .{ .packsswb, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x63 }, 0, .none, .sse2 }, + .{ .packssdw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x6b }, 0, .none, .sse2 }, + + .{ .packuswb, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x67 }, 0, .none, .sse2 }, + .{ .paddb, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xfc }, 0, .none, .sse2 }, .{ .paddw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xfd }, 0, .none, .sse2 }, .{ .paddd, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xfe }, 0, .none, .sse2 }, @@ -1101,6 +1106,8 @@ pub const table = [_]Entry{ .{ .insertps, .rmi, &.{ .xmm, .xmm_m32, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x21 }, 0, .none, .sse4_1 }, + .{ .packusdw, .rm, &.{ .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x2b }, 0, .none, .sse4_1 }, + .{ .pextrb, .mri, &.{ .r32_m8, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x14 }, 0, .none, .sse4_1 }, .{ .pextrd, .mri, &.{ .rm32, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .none, .sse4_1 }, .{ .pextrq, .mri, &.{ .rm64, .xmm, .imm8 }, &.{ 0x66, 0x0f, 0x3a, 0x16 }, 0, .long, .sse4_1 }, @@ -1346,6 +1353,13 @@ pub const table = [_]Entry{ .{ .vorps, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x0f, 0x56 }, 0, .vex_128_wig, .avx }, .{ .vorps, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x0f, 0x56 }, 0, .vex_256_wig, .avx }, + .{ .vpacksswb, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x63 }, 0, .vex_128_wig, .avx }, + .{ .vpackssdw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x6b }, 0, .vex_128_wig, .avx }, + + .{ .vpackusdw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x38, 0x2b }, 0, .vex_128_wig, .avx }, + + .{ .vpackuswb, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0x67 }, 0, .vex_128_wig, .avx }, + .{ .vpaddb, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xfc }, 0, .vex_128_wig, .avx }, .{ .vpaddw, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xfd }, 0, .vex_128_wig, .avx }, .{ .vpaddd, .rvm, &.{ .xmm, .xmm, .xmm_m128 }, &.{ 0x66, 0x0f, 0xfe }, 0, .vex_128_wig, .avx }, @@ -1508,6 +1522,13 @@ pub const table = [_]Entry{ .{ .vbroadcastss, .rm, &.{ .ymm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x18 }, 0, .vex_256_w0, .avx2 }, .{ .vbroadcastsd, .rm, &.{ .ymm, .xmm }, &.{ 0x66, 0x0f, 0x38, 0x19 }, 0, .vex_256_w0, .avx2 }, + .{ .vpacksswb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x63 }, 0, .vex_256_wig, .avx2 }, + .{ .vpackssdw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x6b }, 0, .vex_256_wig, .avx2 }, + + .{ .vpackusdw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x38, 0x2b }, 0, .vex_256_wig, .avx2 }, + + .{ .vpackuswb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0x67 }, 0, .vex_256_wig, .avx2 }, + .{ .vpaddb, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xfc }, 0, .vex_256_wig, .avx2 }, .{ .vpaddw, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xfd }, 0, .vex_256_wig, .avx2 }, .{ .vpaddd, .rvm, &.{ .ymm, .ymm, .ymm_m256 }, &.{ 0x66, 0x0f, 0xfe }, 0, .vex_256_wig, .avx2 }, |
