| author | Andrew Kelley <superjoe30@gmail.com> | 2017-10-23 21:43:18 -0400 |
|---|---|---|
| committer | Andrew Kelley <superjoe30@gmail.com> | 2017-11-02 21:54:24 -0400 |
| commit | 94ec2190f8d8c41d19b668511bf31fae32bcd095 (patch) | |
| tree | c5ab9b20fbaf8f017661f9a159082d1ecaf9f943 /c_headers | |
| parent | abff1b688420eb30d98145d8bc48e7d08f259885 (diff) | |
| download | zig-94ec2190f8d8c41d19b668511bf31fae32bcd095.tar.gz zig-94ec2190f8d8c41d19b668511bf31fae32bcd095.zip | |
update to llvm master
Diffstat (limited to 'c_headers')
| -rw-r--r-- | c_headers/__clang_cuda_intrinsics.h | 124 |
| -rw-r--r-- | c_headers/__clang_cuda_runtime_wrapper.h | 30 |
| -rw-r--r-- | c_headers/arm64intr.h | 49 |
| -rw-r--r-- | c_headers/avx2intrin.h | 12 |
| -rw-r--r-- | c_headers/avx512bwintrin.h | 71 |
| -rw-r--r-- | c_headers/avx512dqintrin.h | 38 |
| -rw-r--r-- | c_headers/avx512fintrin.h | 54 |
| -rw-r--r-- | c_headers/avx512vlbwintrin.h | 50 |
| -rw-r--r-- | c_headers/avx512vldqintrin.h | 54 |
| -rw-r--r-- | c_headers/avx512vlintrin.h | 69 |
| -rw-r--r-- | c_headers/clflushoptintrin.h | 2 |
| -rw-r--r-- | c_headers/clwbintrin.h | 52 |
| -rw-r--r-- | c_headers/cuda_wrappers/new | 51 |
| -rw-r--r-- | c_headers/emmintrin.h | 12 |
| -rw-r--r-- | c_headers/float.h | 14 |
| -rw-r--r-- | c_headers/immintrin.h | 4 |
| -rw-r--r-- | c_headers/intrin.h | 6 |
| -rw-r--r-- | c_headers/opencl-c.h | 359 |
| -rw-r--r-- | c_headers/unwind.h | 80 |
19 files changed, 601 insertions, 530 deletions
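One pattern recurs throughout the diff below (emmintrin.h, avx2intrin.h, avx512bwintrin.h): the `_mm*_avg_epu8` and `_mm*_avg_epu16` intrinsics no longer call the `__builtin_ia32_pavg*` builtins and instead spell out the rounded average with generic vector operations: widen each element, add the two vectors, add one, and shift right by one. A minimal scalar sketch of that identity in C (the helper name is hypothetical, not part of the headers):

```c
#include <stdint.h>
#include <stdio.h>

/* Scalar model of the widen-add-shift pattern the updated headers apply
 * element-wise: compute (a + b + 1) >> 1 in a wider type so the sum
 * cannot overflow.  This matches PAVGB's round-half-up behaviour. */
static uint8_t avg_epu8_scalar(uint8_t a, uint8_t b)
{
    return (uint8_t)(((uint16_t)a + (uint16_t)b + 1u) >> 1);
}

int main(void)
{
    printf("%d\n", avg_epu8_scalar(250, 251)); /* 250.5 rounds up to 251 */
    return 0;
}
```

Widening before the add is what keeps the intermediate sum exact; the masked variants are then expressed as a plain select over this unmasked result, which is the same restructuring applied to the broadcast and set1 intrinsics elsewhere in the diff.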
diff --git a/c_headers/__clang_cuda_intrinsics.h b/c_headers/__clang_cuda_intrinsics.h index b43ce21d0b..bc5b876577 100644 --- a/c_headers/__clang_cuda_intrinsics.h +++ b/c_headers/__clang_cuda_intrinsics.h @@ -92,6 +92,130 @@ __MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f); #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 +#if CUDA_VERSION >= 9000 +#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300) +// __shfl_sync_* variants available in CUDA-9 +#pragma push_macro("__MAKE_SYNC_SHUFFLES") +#define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, \ + __Mask) \ + inline __device__ int __FnName(unsigned int __mask, int __val, int __offset, \ + int __width = warpSize) { \ + return __IntIntrinsic(__mask, __val, __offset, \ + ((warpSize - __width) << 8) | (__Mask)); \ + } \ + inline __device__ float __FnName(unsigned int __mask, float __val, \ + int __offset, int __width = warpSize) { \ + return __FloatIntrinsic(__mask, __val, __offset, \ + ((warpSize - __width) << 8) | (__Mask)); \ + } \ + inline __device__ unsigned int __FnName(unsigned int __mask, \ + unsigned int __val, int __offset, \ + int __width = warpSize) { \ + return static_cast<unsigned int>( \ + ::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \ + } \ + inline __device__ long long __FnName(unsigned int __mask, long long __val, \ + int __offset, int __width = warpSize) { \ + struct __Bits { \ + int __a, __b; \ + }; \ + _Static_assert(sizeof(__val) == sizeof(__Bits)); \ + _Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \ + __Bits __tmp; \ + memcpy(&__val, &__tmp, sizeof(__val)); \ + __tmp.__a = ::__FnName(__mask, __tmp.__a, __offset, __width); \ + __tmp.__b = ::__FnName(__mask, __tmp.__b, __offset, __width); \ + long long __ret; \ + memcpy(&__ret, &__tmp, sizeof(__tmp)); \ + return __ret; \ + } \ + inline __device__ unsigned long long __FnName( \ + unsigned int __mask, unsigned long long __val, int __offset, \ + int __width = warpSize) { \ + return static_cast<unsigned long long>(::__FnName( \ + __mask, static_cast<unsigned long long>(__val), __offset, __width)); \ + } \ + inline __device__ double __FnName(unsigned int __mask, double __val, \ + int __offset, int __width = warpSize) { \ + long long __tmp; \ + _Static_assert(sizeof(__tmp) == sizeof(__val)); \ + memcpy(&__tmp, &__val, sizeof(__val)); \ + __tmp = ::__FnName(__mask, __tmp, __offset, __width); \ + double __ret; \ + memcpy(&__ret, &__tmp, sizeof(__ret)); \ + return __ret; \ + } +__MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32, + __nvvm_shfl_sync_idx_f32, 0x1f); +// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >= +// maxLane. 
+__MAKE_SYNC_SHUFFLES(__shfl_up_sync, __nvvm_shfl_sync_up_i32, + __nvvm_shfl_sync_up_f32, 0); +__MAKE_SYNC_SHUFFLES(__shfl_down_sync, __nvvm_shfl_sync_down_i32, + __nvvm_shfl_sync_down_f32, 0x1f); +__MAKE_SYNC_SHUFFLES(__shfl_xor_sync, __nvvm_shfl_sync_bfly_i32, + __nvvm_shfl_sync_bfly_f32, 0x1f); +#pragma pop_macro("__MAKE_SYNC_SHUFFLES") + +inline __device__ void __syncwarp(unsigned int mask = 0xffffffff) { + return __nvvm_bar_warp_sync(mask); +} + +inline __device__ void __barrier_sync(unsigned int id) { + __nvvm_barrier_sync(id); +} + +inline __device__ void __barrier_sync_count(unsigned int id, + unsigned int count) { + __nvvm_barrier_sync_cnt(id, count); +} + +inline __device__ int __all_sync(unsigned int mask, int pred) { + return __nvvm_vote_all_sync(mask, pred); +} + +inline __device__ int __any_sync(unsigned int mask, int pred) { + return __nvvm_vote_any_sync(mask, pred); +} + +inline __device__ int __uni_sync(unsigned int mask, int pred) { + return __nvvm_vote_uni_sync(mask, pred); +} + +inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) { + return __nvvm_vote_ballot_sync(mask, pred); +} + +inline __device__ unsigned int __activemask() { return __nvvm_vote_ballot(1); } + +#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 + +// Define __match* builtins CUDA-9 headers expect to see. +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +inline __device__ unsigned int __match32_any_sync(unsigned int mask, + unsigned int value) { + return __nvvm_match_any_sync_i32(mask, value); +} + +inline __device__ unsigned long long +__match64_any_sync(unsigned int mask, unsigned long long value) { + return __nvvm_match_any_sync_i64(mask, value); +} + +inline __device__ unsigned int +__match32_all_sync(unsigned int mask, unsigned int value, int *pred) { + return __nvvm_match_all_sync_i32p(mask, value, pred); +} + +inline __device__ unsigned long long +__match64_all_sync(unsigned int mask, unsigned long long value, int *pred) { + return __nvvm_match_all_sync_i64p(mask, value, pred); +} +#include "crt/sm_70_rt.hpp" + +#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +#endif // __CUDA_VERSION >= 9000 + // sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}. // Prevent the vanilla sm_32 intrinsics header from being included. diff --git a/c_headers/__clang_cuda_runtime_wrapper.h b/c_headers/__clang_cuda_runtime_wrapper.h index 931d44b696..b8ffc2ce9f 100644 --- a/c_headers/__clang_cuda_runtime_wrapper.h +++ b/c_headers/__clang_cuda_runtime_wrapper.h @@ -62,7 +62,7 @@ #include "cuda.h" #if !defined(CUDA_VERSION) #error "cuda.h did not define CUDA_VERSION" -#elif CUDA_VERSION < 7000 || CUDA_VERSION > 8000 +#elif CUDA_VERSION < 7000 || CUDA_VERSION > 9000 #error "Unsupported CUDA version!" #endif @@ -86,7 +86,11 @@ #define __COMMON_FUNCTIONS_H__ #undef __CUDACC__ +#if CUDA_VERSION < 9000 #define __CUDABE__ +#else +#define __CUDA_LIBDEVICE__ +#endif // Disables definitions of device-side runtime support stubs in // cuda_device_runtime_api.h #include "driver_types.h" @@ -94,6 +98,7 @@ #include "host_defines.h" #undef __CUDABE__ +#undef __CUDA_LIBDEVICE__ #define __CUDACC__ #include "cuda_runtime.h" @@ -105,7 +110,9 @@ #define __nvvm_memcpy(s, d, n, a) __builtin_memcpy(s, d, n) #define __nvvm_memset(d, c, n, a) __builtin_memset(d, c, n) +#if CUDA_VERSION < 9000 #include "crt/device_runtime.h" +#endif #include "crt/host_runtime.h" // device_runtime.h defines __cxa_* macros that will conflict with // cxxabi.h. 
@@ -166,7 +173,18 @@ inline __host__ double __signbitd(double x) { // __device__. #pragma push_macro("__forceinline__") #define __forceinline__ __device__ __inline__ __attribute__((always_inline)) + +#pragma push_macro("__float2half_rn") +#if CUDA_VERSION >= 9000 +// CUDA-9 has conflicting prototypes for __float2half_rn(float f) in +// cuda_fp16.h[pp] and device_functions.hpp. We need to get the one in +// device_functions.hpp out of the way. +#define __float2half_rn __float2half_rn_disabled +#endif + #include "device_functions.hpp" +#pragma pop_macro("__float2half_rn") + // math_function.hpp uses the __USE_FAST_MATH__ macro to determine whether we // get the slow-but-accurate or fast-but-inaccurate versions of functions like @@ -247,7 +265,17 @@ static inline __device__ void __brkpt(int __c) { __brkpt(); } #pragma push_macro("__GNUC__") #undef __GNUC__ #define signbit __ignored_cuda_signbit + +// CUDA-9 omits device-side definitions of some math functions if it sees +// include guard from math.h wrapper from libstdc++. We have to undo the header +// guard temporarily to get the definitions we need. +#pragma push_macro("_GLIBCXX_MATH_H") +#if CUDA_VERSION >= 9000 +#undef _GLIBCXX_MATH_H +#endif + #include "math_functions.hpp" +#pragma pop_macro("_GLIBCXX_MATH_H") #pragma pop_macro("__GNUC__") #pragma pop_macro("signbit") diff --git a/c_headers/arm64intr.h b/c_headers/arm64intr.h new file mode 100644 index 0000000000..be52283618 --- /dev/null +++ b/c_headers/arm64intr.h @@ -0,0 +1,49 @@ +/*===---- arm64intr.h - ARM64 Windows intrinsics -------------------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ + +/* Only include this if we're compiling for the windows platform. 
*/ +#ifndef _MSC_VER +#include_next <arm64intr.h> +#else + +#ifndef __ARM64INTR_H +#define __ARM64INTR_H + +typedef enum +{ + _ARM64_BARRIER_SY = 0xF, + _ARM64_BARRIER_ST = 0xE, + _ARM64_BARRIER_LD = 0xD, + _ARM64_BARRIER_ISH = 0xB, + _ARM64_BARRIER_ISHST = 0xA, + _ARM64_BARRIER_ISHLD = 0x9, + _ARM64_BARRIER_NSH = 0x7, + _ARM64_BARRIER_NSHST = 0x6, + _ARM64_BARRIER_NSHLD = 0x5, + _ARM64_BARRIER_OSH = 0x3, + _ARM64_BARRIER_OSHST = 0x2, + _ARM64_BARRIER_OSHLD = 0x1 +} _ARM64INTR_BARRIER_TYPE; + +#endif /* __ARM64INTR_H */ +#endif /* _MSC_VER */ diff --git a/c_headers/avx2intrin.h b/c_headers/avx2intrin.h index 576f761b25..caf4ced920 100644 --- a/c_headers/avx2intrin.h +++ b/c_headers/avx2intrin.h @@ -145,13 +145,21 @@ _mm256_andnot_si256(__m256i __a, __m256i __b) static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_avg_epu8(__m256i __a, __m256i __b) { - return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b); + typedef unsigned short __v32hu __attribute__((__vector_size__(64))); + return (__m256i)__builtin_convertvector( + ((__builtin_convertvector((__v32qu)__a, __v32hu) + + __builtin_convertvector((__v32qu)__b, __v32hu)) + 1) + >> 1, __v32qu); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_avg_epu16(__m256i __a, __m256i __b) { - return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b); + typedef unsigned int __v16su __attribute__((__vector_size__(64))); + return (__m256i)__builtin_convertvector( + ((__builtin_convertvector((__v16hu)__a, __v16su) + + __builtin_convertvector((__v16hu)__b, __v16su)) + 1) + >> 1, __v16hu); } static __inline__ __m256i __DEFAULT_FN_ATTRS diff --git a/c_headers/avx512bwintrin.h b/c_headers/avx512bwintrin.h index 41958b7214..53da5869d3 100644 --- a/c_headers/avx512bwintrin.h +++ b/c_headers/avx512bwintrin.h @@ -706,57 +706,55 @@ _mm512_maskz_adds_epu16 (__mmask32 __U, __m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_avg_epu8 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) _mm512_setzero_qi(), - (__mmask64) -1); + typedef unsigned short __v64hu __attribute__((__vector_size__(128))); + return (__m512i)__builtin_convertvector( + ((__builtin_convertvector((__v64qu) __A, __v64hu) + + __builtin_convertvector((__v64qu) __B, __v64hu)) + 1) + >> 1, __v64qu); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_avg_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) __W, - (__mmask64) __U); + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_avg_epu8(__A, __B), + (__v64qi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_avg_epu8 (__mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) _mm512_setzero_qi(), - (__mmask64) __U); + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_avg_epu8(__A, __B), + (__v64qi)_mm512_setzero_qi()); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_avg_epu16 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) _mm512_setzero_hi(), - (__mmask32) -1); + typedef unsigned int __v32su __attribute__((__vector_size__(128))); + return (__m512i)__builtin_convertvector( + ((__builtin_convertvector((__v32hu) __A, __v32su) + + __builtin_convertvector((__v32hu) __B, __v32su)) + 1) + >> 1, __v32hu); } 
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_avg_epu16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) __W, - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_avg_epu16(__A, __B), + (__v32hi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_avg_epu16 (__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) _mm512_setzero_hi(), - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_avg_epu16(__A, __B), + (__v32hi) _mm512_setzero_hi()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -2028,18 +2026,17 @@ _mm512_maskz_mov_epi8 (__mmask64 __U, __m512i __A) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_set1_epi8 (__m512i __O, __mmask64 __M, char __A) { - return (__m512i) __builtin_ia32_pbroadcastb512_gpr_mask (__A, - (__v64qi) __O, - __M); + return (__m512i) __builtin_ia32_selectb_512(__M, + (__v64qi)_mm512_set1_epi8(__A), + (__v64qi) __O); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_set1_epi8 (__mmask64 __M, char __A) { - return (__m512i) __builtin_ia32_pbroadcastb512_gpr_mask (__A, - (__v64qi) - _mm512_setzero_qi(), - __M); + return (__m512i) __builtin_ia32_selectb_512(__M, + (__v64qi) _mm512_set1_epi8(__A), + (__v64qi) _mm512_setzero_si512()); } static __inline__ __mmask64 __DEFAULT_FN_ATTRS @@ -2219,17 +2216,17 @@ _mm512_maskz_broadcastb_epi8 (__mmask64 __M, __m128i __A) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_set1_epi16 (__m512i __O, __mmask32 __M, short __A) { - return (__m512i) __builtin_ia32_pbroadcastw512_gpr_mask (__A, - (__v32hi) __O, - __M); + return (__m512i) __builtin_ia32_selectw_512(__M, + (__v32hi) _mm512_set1_epi16(__A), + (__v32hi) __O); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_set1_epi16 (__mmask32 __M, short __A) { - return (__m512i) __builtin_ia32_pbroadcastw512_gpr_mask (__A, - (__v32hi) _mm512_setzero_hi(), - __M); + return (__m512i) __builtin_ia32_selectw_512(__M, + (__v32hi) _mm512_set1_epi16(__A), + (__v32hi) _mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS diff --git a/c_headers/avx512dqintrin.h b/c_headers/avx512dqintrin.h index 4fd1add773..2c431d9740 100644 --- a/c_headers/avx512dqintrin.h +++ b/c_headers/avx512dqintrin.h @@ -973,25 +973,26 @@ _mm512_movepi64_mask (__m512i __A) static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_broadcast_f32x2 (__m128 __A) { - return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A, - (__v16sf)_mm512_undefined_ps(), - (__mmask16) -1); + return (__m512)__builtin_shufflevector((__v4sf)__A, + (__v4sf)_mm_undefined_ps(), + 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask_broadcast_f32x2 (__m512 __O, __mmask16 __M, __m128 __A) { - return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A, - (__v16sf) - __O, __M); + return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, + (__v16sf)_mm512_broadcast_f32x2(__A), + (__v16sf)__O); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_maskz_broadcast_f32x2 (__mmask16 __M, __m128 __A) { - return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A, - (__v16sf)_mm512_setzero_ps (), - __M); + return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, + (__v16sf)_mm512_broadcast_f32x2(__A), + 
(__v16sf)_mm512_setzero_ps()); } static __inline__ __m512 __DEFAULT_FN_ATTRS @@ -1044,25 +1045,26 @@ _mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_broadcast_i32x2 (__m128i __A) { - return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si) __A, - (__v16si)_mm512_setzero_si512(), - (__mmask16) -1); + return (__m512i)__builtin_shufflevector((__v4si)__A, + (__v4si)_mm_undefined_si128(), + 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_broadcast_i32x2 (__m512i __O, __mmask16 __M, __m128i __A) { - return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si) __A, - (__v16si) - __O, __M); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_broadcast_i32x2(__A), + (__v16si)__O); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_broadcast_i32x2 (__mmask16 __M, __m128i __A) { - return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si) __A, - (__v16si)_mm512_setzero_si512 (), - __M); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_broadcast_i32x2(__A), + (__v16si)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS diff --git a/c_headers/avx512fintrin.h b/c_headers/avx512fintrin.h index 4ce6945311..247ac879ea 100644 --- a/c_headers/avx512fintrin.h +++ b/c_headers/avx512fintrin.h @@ -258,30 +258,6 @@ _mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) (__v8di) _mm512_setzero_si512()); } -static __inline __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_set1_epi32(__mmask16 __M, int __A) -{ - return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A, - (__v16si) - _mm512_setzero_si512 (), - __M); -} - -static __inline __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_set1_epi64(__mmask8 __M, long long __A) -{ -#ifdef __x86_64__ - return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A, - (__v8di) - _mm512_setzero_si512 (), - __M); -#else - return (__m512i) __builtin_ia32_pbroadcastq512_mem_mask (__A, - (__v8di) - _mm512_setzero_si512 (), - __M); -#endif -} static __inline __m512 __DEFAULT_FN_ATTRS _mm512_setzero_ps(void) @@ -341,11 +317,29 @@ _mm512_set1_epi32(int __s) } static __inline __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_set1_epi32(__mmask16 __M, int __A) +{ + return (__m512i)__builtin_ia32_selectd_512(__M, + (__v16si)_mm512_set1_epi32(__A), + (__v16si)_mm512_setzero_si512()); +} + +static __inline __m512i __DEFAULT_FN_ATTRS _mm512_set1_epi64(long long __d) { return (__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d }; } +#ifdef __x86_64__ +static __inline __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_set1_epi64(__mmask8 __M, long long __A) +{ + return (__m512i)__builtin_ia32_selectq_512(__M, + (__v8di)_mm512_set1_epi64(__A), + (__v8di)_mm512_setzero_si512()); +} +#endif + static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_broadcastss_ps(__m128 __A) { @@ -9040,7 +9034,7 @@ _mm512_stream_si512 (__m512i * __P, __m512i __A) } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_stream_load_si512 (void *__P) +_mm512_stream_load_si512 (void const *__P) { typedef __v8di __v8di_aligned __attribute__((aligned(64))); return (__m512i) __builtin_nontemporal_load((const __v8di_aligned *)__P); @@ -9742,16 +9736,18 @@ _mm_cvtu64_ss (__m128 __A, unsigned long long __B) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A) { - return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A, (__v16si) __O, - __M); + return (__m512i) 
__builtin_ia32_selectd_512(__M, + (__v16si) _mm512_set1_epi32(__A), + (__v16si) __O); } #ifdef __x86_64__ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A) { - return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A, (__v8di) __O, - __M); + return (__m512i) __builtin_ia32_selectq_512(__M, + (__v8di) _mm512_set1_epi64(__A), + (__v8di) __O); } #endif diff --git a/c_headers/avx512vlbwintrin.h b/c_headers/avx512vlbwintrin.h index 3b58d04339..4ab785bdbb 100644 --- a/c_headers/avx512vlbwintrin.h +++ b/c_headers/avx512vlbwintrin.h @@ -2660,35 +2660,33 @@ _mm256_maskz_mov_epi8 (__mmask32 __U, __m256i __A) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A) { - return (__m128i) __builtin_ia32_pbroadcastb128_gpr_mask (__A, - (__v16qi) __O, - __M); + return (__m128i) __builtin_ia32_selectb_128(__M, + (__v16qi) _mm_set1_epi8(__A), + (__v16qi) __O); } static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_maskz_set1_epi8 (__mmask16 __M, char __A) { - return (__m128i) __builtin_ia32_pbroadcastb128_gpr_mask (__A, - (__v16qi) - _mm_setzero_si128 (), - __M); + return (__m128i) __builtin_ia32_selectb_128(__M, + (__v16qi) _mm_set1_epi8(__A), + (__v16qi) _mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_mask_set1_epi8 (__m256i __O, __mmask32 __M, char __A) { - return (__m256i) __builtin_ia32_pbroadcastb256_gpr_mask (__A, - (__v32qi) __O, - __M); + return (__m256i) __builtin_ia32_selectb_256(__M, + (__v32qi) _mm256_set1_epi8(__A), + (__v32qi) __O); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_maskz_set1_epi8 (__mmask32 __M, char __A) { - return (__m256i) __builtin_ia32_pbroadcastb256_gpr_mask (__A, - (__v32qi) - _mm256_setzero_si256 (), - __M); + return (__m256i) __builtin_ia32_selectb_256(__M, + (__v32qi) _mm256_set1_epi8(__A), + (__v32qi) _mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS @@ -3025,33 +3023,33 @@ _mm256_maskz_broadcastw_epi16 (__mmask16 __M, __m128i __A) static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_mask_set1_epi16 (__m256i __O, __mmask16 __M, short __A) { - return (__m256i) __builtin_ia32_pbroadcastw256_gpr_mask (__A, - (__v16hi) __O, - __M); + return (__m256i) __builtin_ia32_selectw_256 (__M, + (__v16hi) _mm256_set1_epi16(__A), + (__v16hi) __O); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_maskz_set1_epi16 (__mmask16 __M, short __A) { - return (__m256i) __builtin_ia32_pbroadcastw256_gpr_mask (__A, - (__v16hi) _mm256_setzero_si256 (), - __M); + return (__m256i) __builtin_ia32_selectw_256(__M, + (__v16hi)_mm256_set1_epi16(__A), + (__v16hi) _mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mask_set1_epi16 (__m128i __O, __mmask8 __M, short __A) { - return (__m128i) __builtin_ia32_pbroadcastw128_gpr_mask (__A, - (__v8hi) __O, - __M); + return (__m128i) __builtin_ia32_selectw_128(__M, + (__v8hi) _mm_set1_epi16(__A), + (__v8hi) __O); } static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_maskz_set1_epi16 (__mmask8 __M, short __A) { - return (__m128i) __builtin_ia32_pbroadcastw128_gpr_mask (__A, - (__v8hi) _mm_setzero_si128 (), - __M); + return (__m128i) __builtin_ia32_selectw_128(__M, + (__v8hi) _mm_set1_epi16(__A), + (__v8hi) _mm_setzero_si128()); } static __inline__ __m128i __DEFAULT_FN_ATTRS diff --git a/c_headers/avx512vldqintrin.h b/c_headers/avx512vldqintrin.h index aecd7df34d..d80df9eaff 100644 --- a/c_headers/avx512vldqintrin.h +++ b/c_headers/avx512vldqintrin.h @@ -978,25 +978,25 @@ 
_mm256_movepi64_mask (__m256i __A) static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_broadcast_f32x2 (__m128 __A) { - return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A, - (__v8sf)_mm256_undefined_ps(), - (__mmask8) -1); + return (__m256)__builtin_shufflevector((__v4sf)__A, + (__v4sf)_mm_undefined_ps(), + 0, 1, 0, 1, 0, 1, 0, 1); } static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_mask_broadcast_f32x2 (__m256 __O, __mmask8 __M, __m128 __A) { - return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A, - (__v8sf) __O, - __M); + return (__m256)__builtin_ia32_selectps_256((__mmask8)__M, + (__v8sf)_mm256_broadcast_f32x2(__A), + (__v8sf)__O); } static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_maskz_broadcast_f32x2 (__mmask8 __M, __m128 __A) { - return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A, - (__v8sf) _mm256_setzero_ps (), - __M); + return (__m256)__builtin_ia32_selectps_256((__mmask8)__M, + (__v8sf)_mm256_broadcast_f32x2(__A), + (__v8sf)_mm256_setzero_ps()); } static __inline__ __m256d __DEFAULT_FN_ATTRS @@ -1025,49 +1025,49 @@ _mm256_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_broadcast_i32x2 (__m128i __A) { - return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si) __A, - (__v4si)_mm_undefined_si128(), - (__mmask8) -1); + return (__m128i)__builtin_shufflevector((__v4si)__A, + (__v4si)_mm_undefined_si128(), + 0, 1, 0, 1); } static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mask_broadcast_i32x2 (__m128i __O, __mmask8 __M, __m128i __A) { - return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si) __A, - (__v4si) __O, - __M); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, + (__v4si)_mm_broadcast_i32x2(__A), + (__v4si)__O); } static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A) { - return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si) __A, - (__v4si) _mm_setzero_si128 (), - __M); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, + (__v4si)_mm_broadcast_i32x2(__A), + (__v4si)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_broadcast_i32x2 (__m128i __A) { - return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si) __A, - (__v8si)_mm256_undefined_si256(), - (__mmask8) -1); + return (__m256i)__builtin_shufflevector((__v4si)__A, + (__v4si)_mm_undefined_si128(), + 0, 1, 0, 1, 0, 1, 0, 1); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_mask_broadcast_i32x2 (__m256i __O, __mmask8 __M, __m128i __A) { - return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si) __A, - (__v8si) __O, - __M); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, + (__v8si)_mm256_broadcast_i32x2(__A), + (__v8si)__O); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A) { - return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si) __A, - (__v8si) _mm256_setzero_si256 (), - __M); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, + (__v8si)_mm256_broadcast_i32x2(__A), + (__v8si)_mm256_setzero_si256()); } static __inline__ __m256i __DEFAULT_FN_ATTRS diff --git a/c_headers/avx512vlintrin.h b/c_headers/avx512vlintrin.h index 99bb050de4..7e17cff05f 100644 --- a/c_headers/avx512vlintrin.h +++ b/c_headers/avx512vlintrin.h @@ -5723,59 +5723,72 @@ _mm256_maskz_movedup_pd (__mmask8 __U, __m256d __A) (__v4df)_mm256_setzero_pd()); } +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_mask_set1_epi32(__m128i __O, 
__mmask8 __M, int __A) +{ + return (__m128i)__builtin_ia32_selectd_128(__M, + (__v4si) _mm_set1_epi32(__A), + (__v4si)__O); +} -#define _mm_mask_set1_epi32(O, M, A) __extension__ ({ \ - (__m128i)__builtin_ia32_pbroadcastd128_gpr_mask((int)(A), \ - (__v4si)(__m128i)(O), \ - (__mmask8)(M)); }) +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_maskz_set1_epi32( __mmask8 __M, int __A) +{ + return (__m128i)__builtin_ia32_selectd_128(__M, + (__v4si) _mm_set1_epi32(__A), + (__v4si)_mm_setzero_si128()); +} -#define _mm_maskz_set1_epi32(M, A) __extension__ ({ \ - (__m128i)__builtin_ia32_pbroadcastd128_gpr_mask((int)(A), \ - (__v4si)_mm_setzero_si128(), \ - (__mmask8)(M)); }) +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_set1_epi32(__m256i __O, __mmask8 __M, int __A) +{ + return (__m256i)__builtin_ia32_selectd_256(__M, + (__v8si) _mm256_set1_epi32(__A), + (__v8si)__O); +} -#define _mm256_mask_set1_epi32(O, M, A) __extension__ ({ \ - (__m256i)__builtin_ia32_pbroadcastd256_gpr_mask((int)(A), \ - (__v8si)(__m256i)(O), \ - (__mmask8)(M)); }) +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_set1_epi32( __mmask8 __M, int __A) +{ + return (__m256i)__builtin_ia32_selectd_256(__M, + (__v8si) _mm256_set1_epi32(__A), + (__v8si)_mm256_setzero_si256()); +} -#define _mm256_maskz_set1_epi32(M, A) __extension__ ({ \ - (__m256i)__builtin_ia32_pbroadcastd256_gpr_mask((int)(A), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)(M)); }) #ifdef __x86_64__ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A) { - return (__m128i) __builtin_ia32_pbroadcastq128_gpr_mask (__A, (__v2di) __O, - __M); + return (__m128i) __builtin_ia32_selectq_128(__M, + (__v2di) _mm_set1_epi64x(__A), + (__v2di) __O); } static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_maskz_set1_epi64 (__mmask8 __M, long long __A) { - return (__m128i) __builtin_ia32_pbroadcastq128_gpr_mask (__A, - (__v2di) - _mm_setzero_si128 (), - __M); + return (__m128i) __builtin_ia32_selectq_128(__M, + (__v2di) _mm_set1_epi64x(__A), + (__v2di) _mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_mask_set1_epi64 (__m256i __O, __mmask8 __M, long long __A) { - return (__m256i) __builtin_ia32_pbroadcastq256_gpr_mask (__A, (__v4di) __O, - __M); + return (__m256i) __builtin_ia32_selectq_256(__M, + (__v4di) _mm256_set1_epi64x(__A), + (__v4di) __O) ; } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_maskz_set1_epi64 (__mmask8 __M, long long __A) { - return (__m256i) __builtin_ia32_pbroadcastq256_gpr_mask (__A, - (__v4di) - _mm256_setzero_si256 (), - __M); + return (__m256i) __builtin_ia32_selectq_256(__M, + (__v4di) _mm256_set1_epi64x(__A), + (__v4di) _mm256_setzero_si256()); } + #endif #define _mm_fixupimm_pd(A, B, C, imm) __extension__ ({ \ diff --git a/c_headers/clflushoptintrin.h b/c_headers/clflushoptintrin.h index 60e0ead762..f1f1330234 100644 --- a/c_headers/clflushoptintrin.h +++ b/c_headers/clflushoptintrin.h @@ -32,7 +32,7 @@ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("clflushopt"))) static __inline__ void __DEFAULT_FN_ATTRS -_mm_clflushopt(char * __m) { +_mm_clflushopt(void const * __m) { __builtin_ia32_clflushopt(__m); } diff --git a/c_headers/clwbintrin.h b/c_headers/clwbintrin.h new file mode 100644 index 0000000000..2594a6c387 --- /dev/null +++ b/c_headers/clwbintrin.h @@ -0,0 +1,52 @@ +/*===---- clwbintrin.h - CLWB intrinsic ------------------------------------=== + * + * Permission is hereby granted, free of charge, to 
any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use <clwbintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef __CLWBINTRIN_H +#define __CLWBINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("clwb"))) + +/// \brief Writes back to memory the cache line (if modified) that contains the +/// linear address specified in \a __p from any level of the cache hierarchy in +/// the cache coherence domain +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> CLWB </c> instruction. +/// +/// \param __p +/// A pointer to the memory location used to identify the cache line to be +/// written back. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_clwb(void const *__p) { + __builtin_ia32_clwb(__p); +} + +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/c_headers/cuda_wrappers/new b/c_headers/cuda_wrappers/new index b77131af0e..71b7a52363 100644 --- a/c_headers/cuda_wrappers/new +++ b/c_headers/cuda_wrappers/new @@ -26,7 +26,6 @@ #include_next <new> -// Device overrides for placement new and delete. #pragma push_macro("CUDA_NOEXCEPT") #if __cplusplus >= 201103L #define CUDA_NOEXCEPT noexcept @@ -34,6 +33,55 @@ #define CUDA_NOEXCEPT #endif +// Device overrides for non-placement new and delete. +__device__ inline void *operator new(__SIZE_TYPE__ size) { + if (size == 0) { + size = 1; + } + return ::malloc(size); +} +__device__ inline void *operator new(__SIZE_TYPE__ size, + const std::nothrow_t &) CUDA_NOEXCEPT { + return ::operator new(size); +} + +__device__ inline void *operator new[](__SIZE_TYPE__ size) { + return ::operator new(size); +} +__device__ inline void *operator new[](__SIZE_TYPE__ size, + const std::nothrow_t &) { + return ::operator new(size); +} + +__device__ inline void operator delete(void* ptr) CUDA_NOEXCEPT { + if (ptr) { + ::free(ptr); + } +} +__device__ inline void operator delete(void *ptr, + const std::nothrow_t &) CUDA_NOEXCEPT { + ::operator delete(ptr); +} + +__device__ inline void operator delete[](void* ptr) CUDA_NOEXCEPT { + ::operator delete(ptr); +} +__device__ inline void operator delete[](void *ptr, + const std::nothrow_t &) CUDA_NOEXCEPT { + ::operator delete(ptr); +} + +// Sized delete, C++14 only. 
+#if __cplusplus >= 201402L +__device__ void operator delete(void *ptr, __SIZE_TYPE__ size) CUDA_NOEXCEPT { + ::operator delete(ptr); +} +__device__ void operator delete[](void *ptr, __SIZE_TYPE__ size) CUDA_NOEXCEPT { + ::operator delete(ptr); +} +#endif + +// Device overrides for placement new and delete. __device__ inline void *operator new(__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT { return __ptr; } @@ -42,6 +90,7 @@ __device__ inline void *operator new[](__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT } __device__ inline void operator delete(void *, void *) CUDA_NOEXCEPT {} __device__ inline void operator delete[](void *, void *) CUDA_NOEXCEPT {} + #pragma pop_macro("CUDA_NOEXCEPT") #endif // include guard diff --git a/c_headers/emmintrin.h b/c_headers/emmintrin.h index 709815cbb4..3372508a7f 100644 --- a/c_headers/emmintrin.h +++ b/c_headers/emmintrin.h @@ -2258,7 +2258,11 @@ _mm_adds_epu16(__m128i __a, __m128i __b) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a, __m128i __b) { - return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b); + typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32))); + return (__m128i)__builtin_convertvector( + ((__builtin_convertvector((__v16qu)__a, __v16hu) + + __builtin_convertvector((__v16qu)__b, __v16hu)) + 1) + >> 1, __v16qu); } /// \brief Computes the rounded avarages of corresponding elements of two @@ -2278,7 +2282,11 @@ _mm_avg_epu8(__m128i __a, __m128i __b) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a, __m128i __b) { - return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b); + typedef unsigned int __v8su __attribute__ ((__vector_size__ (32))); + return (__m128i)__builtin_convertvector( + ((__builtin_convertvector((__v8hu)__a, __v8su) + + __builtin_convertvector((__v8hu)__b, __v8su)) + 1) + >> 1, __v8hu); } /// \brief Multiplies the corresponding elements of two 128-bit signed [8 x i16] diff --git a/c_headers/float.h b/c_headers/float.h index 502143d4e4..44d4d05494 100644 --- a/c_headers/float.h +++ b/c_headers/float.h @@ -143,4 +143,18 @@ # define LDBL_DECIMAL_DIG __LDBL_DECIMAL_DIG__ #endif +#ifdef __STDC_WANT_IEC_60559_TYPES_EXT__ +# define FLT16_MANT_DIG __FLT16_MANT_DIG__ +# define FLT16_DECIMAL_DIG __FLT16_DECIMAL_DIG__ +# define FLT16_DIG __FLT16_DIG__ +# define FLT16_MIN_EXP __FLT16_MIN_EXP__ +# define FLT16_MIN_10_EXP __FLT16_MIN_10_EXP__ +# define FLT16_MAX_EXP __FLT16_MAX_EXP__ +# define FLT16_MAX_10_EXP __FLT16_MAX_10_EXP__ +# define FLT16_MAX __FLT16_MAX__ +# define FLT16_EPSILON __FLT16_EPSILON__ +# define FLT16_MIN __FLT16_MIN__ +# define FLT16_TRUE_MIN __FLT16_TRUE_MIN__ +#endif /* __STDC_WANT_IEC_60559_TYPES_EXT__ */ + #endif /* __FLOAT_H */ diff --git a/c_headers/immintrin.h b/c_headers/immintrin.h index c5f25bfcb5..d86e0efb82 100644 --- a/c_headers/immintrin.h +++ b/c_headers/immintrin.h @@ -58,6 +58,10 @@ #include <clflushoptintrin.h> #endif +#if !defined(_MSC_VER) || __has_feature(modules) || defined(__CLWB__) +#include <clwbintrin.h> +#endif + #if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX__) #include <avxintrin.h> #endif diff --git a/c_headers/intrin.h b/c_headers/intrin.h index 881d05c0d1..b30aa215a4 100644 --- a/c_headers/intrin.h +++ b/c_headers/intrin.h @@ -38,6 +38,10 @@ #include <armintr.h> #endif +#if defined(_M_ARM64) +#include <arm64intr.h> +#endif + /* For the definition of jmp_buf. 
*/ #if __STDC_HOSTED__ #include <setjmp.h> @@ -828,7 +832,7 @@ _InterlockedCompareExchange_nf(long volatile *_Destination, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED); return _Comparand; } -static __inline__ short __DEFAULT_FN_ATTRS +static __inline__ long __DEFAULT_FN_ATTRS _InterlockedCompareExchange_rel(long volatile *_Destination, long _Exchange, long _Comparand) { __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0, diff --git a/c_headers/opencl-c.h b/c_headers/opencl-c.h index 58c8daf3a5..35fb0a82bc 100644 --- a/c_headers/opencl-c.h +++ b/c_headers/opencl-c.h @@ -11381,6 +11381,8 @@ half16 __ovld __cnfn bitselect(half16 a, half16 b, half16 c); * For each component of a vector type, * result[i] = if MSB of c[i] is set ? b[i] : a[i]. * For a scalar type, result = c ? b : a. + * b and a must have the same type. + * c must have the same number of elements and bits as a. */ char __ovld __cnfn select(char a, char b, char c); uchar __ovld __cnfn select(uchar a, uchar b, char c); @@ -11394,60 +11396,7 @@ char8 __ovld __cnfn select(char8 a, char8 b, char8 c); uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, char8 c); char16 __ovld __cnfn select(char16 a, char16 b, char16 c); uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, char16 c); -short __ovld __cnfn select(short a, short b, char c); -ushort __ovld __cnfn select(ushort a, ushort b, char c); -short2 __ovld __cnfn select(short2 a, short2 b, char2 c); -ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, char2 c); -short3 __ovld __cnfn select(short3 a, short3 b, char3 c); -ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, char3 c); -short4 __ovld __cnfn select(short4 a, short4 b, char4 c); -ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, char4 c); -short8 __ovld __cnfn select(short8 a, short8 b, char8 c); -ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, char8 c); -short16 __ovld __cnfn select(short16 a, short16 b, char16 c); -ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, char16 c); -int __ovld __cnfn select(int a, int b, char c); -uint __ovld __cnfn select(uint a, uint b, char c); -int2 __ovld __cnfn select(int2 a, int2 b, char2 c); -uint2 __ovld __cnfn select(uint2 a, uint2 b, char2 c); -int3 __ovld __cnfn select(int3 a, int3 b, char3 c); -uint3 __ovld __cnfn select(uint3 a, uint3 b, char3 c); -int4 __ovld __cnfn select(int4 a, int4 b, char4 c); -uint4 __ovld __cnfn select(uint4 a, uint4 b, char4 c); -int8 __ovld __cnfn select(int8 a, int8 b, char8 c); -uint8 __ovld __cnfn select(uint8 a, uint8 b, char8 c); -int16 __ovld __cnfn select(int16 a, int16 b, char16 c); -uint16 __ovld __cnfn select(uint16 a, uint16 b, char16 c); -long __ovld __cnfn select(long a, long b, char c); -ulong __ovld __cnfn select(ulong a, ulong b, char c); -long2 __ovld __cnfn select(long2 a, long2 b, char2 c); -ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, char2 c); -long3 __ovld __cnfn select(long3 a, long3 b, char3 c); -ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, char3 c); -long4 __ovld __cnfn select(long4 a, long4 b, char4 c); -ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, char4 c); -long8 __ovld __cnfn select(long8 a, long8 b, char8 c); -ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, char8 c); -long16 __ovld __cnfn select(long16 a, long16 b, char16 c); -ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, char16 c); -float __ovld __cnfn select(float a, float b, char c); -float2 __ovld __cnfn select(float2 a, float2 b, char2 c); -float3 __ovld __cnfn select(float3 a, float3 b, char3 c); -float4 __ovld __cnfn select(float4 a, float4 b, 
char4 c); -float8 __ovld __cnfn select(float8 a, float8 b, char8 c); -float16 __ovld __cnfn select(float16 a, float16 b, char16 c); -char __ovld __cnfn select(char a, char b, short c); -uchar __ovld __cnfn select(uchar a, uchar b, short c); -char2 __ovld __cnfn select(char2 a, char2 b, short2 c); -uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, short2 c); -char3 __ovld __cnfn select(char3 a, char3 b, short3 c); -uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, short3 c); -char4 __ovld __cnfn select(char4 a, char4 b, short4 c); -uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, short4 c); -char8 __ovld __cnfn select(char8 a, char8 b, short8 c); -uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, short8 c); -char16 __ovld __cnfn select(char16 a, char16 b, short16 c); -uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, short16 c); + short __ovld __cnfn select(short a, short b, short c); ushort __ovld __cnfn select(ushort a, ushort b, short c); short2 __ovld __cnfn select(short2 a, short2 b, short2 c); @@ -11460,60 +11409,7 @@ short8 __ovld __cnfn select(short8 a, short8 b, short8 c); ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, short8 c); short16 __ovld __cnfn select(short16 a, short16 b, short16 c); ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, short16 c); -int __ovld __cnfn select(int a, int b, short c); -uint __ovld __cnfn select(uint a, uint b, short c); -int2 __ovld __cnfn select(int2 a, int2 b, short2 c); -uint2 __ovld __cnfn select(uint2 a, uint2 b, short2 c); -int3 __ovld __cnfn select(int3 a, int3 b, short3 c); -uint3 __ovld __cnfn select(uint3 a, uint3 b, short3 c); -int4 __ovld __cnfn select(int4 a, int4 b, short4 c); -uint4 __ovld __cnfn select(uint4 a, uint4 b, short4 c); -int8 __ovld __cnfn select(int8 a, int8 b, short8 c); -uint8 __ovld __cnfn select(uint8 a, uint8 b, short8 c); -int16 __ovld __cnfn select(int16 a, int16 b, short16 c); -uint16 __ovld __cnfn select(uint16 a, uint16 b, short16 c); -long __ovld __cnfn select(long a, long b, short c); -ulong __ovld __cnfn select(ulong a, ulong b, short c); -long2 __ovld __cnfn select(long2 a, long2 b, short2 c); -ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, short2 c); -long3 __ovld __cnfn select(long3 a, long3 b, short3 c); -ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, short3 c); -long4 __ovld __cnfn select(long4 a, long4 b, short4 c); -ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, short4 c); -long8 __ovld __cnfn select(long8 a, long8 b, short8 c); -ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, short8 c); -long16 __ovld __cnfn select(long16 a, long16 b, short16 c); -ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, short16 c); -float __ovld __cnfn select(float a, float b, short c); -float2 __ovld __cnfn select(float2 a, float2 b, short2 c); -float3 __ovld __cnfn select(float3 a, float3 b, short3 c); -float4 __ovld __cnfn select(float4 a, float4 b, short4 c); -float8 __ovld __cnfn select(float8 a, float8 b, short8 c); -float16 __ovld __cnfn select(float16 a, float16 b, short16 c); -char __ovld __cnfn select(char a, char b, int c); -uchar __ovld __cnfn select(uchar a, uchar b, int c); -char2 __ovld __cnfn select(char2 a, char2 b, int2 c); -uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, int2 c); -char3 __ovld __cnfn select(char3 a, char3 b, int3 c); -uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, int3 c); -char4 __ovld __cnfn select(char4 a, char4 b, int4 c); -uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, int4 c); -char8 __ovld __cnfn select(char8 a, char8 b, int8 c); -uchar8 __ovld __cnfn select(uchar8 a, 
uchar8 b, int8 c); -char16 __ovld __cnfn select(char16 a, char16 b, int16 c); -uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, int16 c); -short __ovld __cnfn select(short a, short b, int c); -ushort __ovld __cnfn select(ushort a, ushort b, int c); -short2 __ovld __cnfn select(short2 a, short2 b, int2 c); -ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, int2 c); -short3 __ovld __cnfn select(short3 a, short3 b, int3 c); -ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, int3 c); -short4 __ovld __cnfn select(short4 a, short4 b, int4 c); -ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, int4 c); -short8 __ovld __cnfn select(short8 a, short8 b, int8 c); -ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, int8 c); -short16 __ovld __cnfn select(short16 a, short16 b, int16 c); -ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, int16 c); + int __ovld __cnfn select(int a, int b, int c); uint __ovld __cnfn select(uint a, uint b, int c); int2 __ovld __cnfn select(int2 a, int2 b, int2 c); @@ -11526,60 +11422,13 @@ int8 __ovld __cnfn select(int8 a, int8 b, int8 c); uint8 __ovld __cnfn select(uint8 a, uint8 b, int8 c); int16 __ovld __cnfn select(int16 a, int16 b, int16 c); uint16 __ovld __cnfn select(uint16 a, uint16 b, int16 c); -long __ovld __cnfn select(long a, long b, int c); -ulong __ovld __cnfn select(ulong a, ulong b, int c); -long2 __ovld __cnfn select(long2 a, long2 b, int2 c); -ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, int2 c); -long3 __ovld __cnfn select(long3 a, long3 b, int3 c); -ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, int3 c); -long4 __ovld __cnfn select(long4 a, long4 b, int4 c); -ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, int4 c); -long8 __ovld __cnfn select(long8 a, long8 b, int8 c); -ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, int8 c); -long16 __ovld __cnfn select(long16 a, long16 b, int16 c); -ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, int16 c); float __ovld __cnfn select(float a, float b, int c); float2 __ovld __cnfn select(float2 a, float2 b, int2 c); float3 __ovld __cnfn select(float3 a, float3 b, int3 c); float4 __ovld __cnfn select(float4 a, float4 b, int4 c); float8 __ovld __cnfn select(float8 a, float8 b, int8 c); float16 __ovld __cnfn select(float16 a, float16 b, int16 c); -char __ovld __cnfn select(char a, char b, long c); -uchar __ovld __cnfn select(uchar a, uchar b, long c); -char2 __ovld __cnfn select(char2 a, char2 b, long2 c); -uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, long2 c); -char3 __ovld __cnfn select(char3 a, char3 b, long3 c); -uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, long3 c); -char4 __ovld __cnfn select(char4 a, char4 b, long4 c); -uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, long4 c); -char8 __ovld __cnfn select(char8 a, char8 b, long8 c); -uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, long8 c); -char16 __ovld __cnfn select(char16 a, char16 b, long16 c); -uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, long16 c); -short __ovld __cnfn select(short a, short b, long c); -ushort __ovld __cnfn select(ushort a, ushort b, long c); -short2 __ovld __cnfn select(short2 a, short2 b, long2 c); -ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, long2 c); -short3 __ovld __cnfn select(short3 a, short3 b, long3 c); -ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, long3 c); -short4 __ovld __cnfn select(short4 a, short4 b, long4 c); -ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, long4 c); -short8 __ovld __cnfn select(short8 a, short8 b, long8 c); -ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, long8 c); 
-short16 __ovld __cnfn select(short16 a, short16 b, long16 c); -ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, long16 c); -int __ovld __cnfn select(int a, int b, long c); -uint __ovld __cnfn select(uint a, uint b, long c); -int2 __ovld __cnfn select(int2 a, int2 b, long2 c); -uint2 __ovld __cnfn select(uint2 a, uint2 b, long2 c); -int3 __ovld __cnfn select(int3 a, int3 b, long3 c); -uint3 __ovld __cnfn select(uint3 a, uint3 b, long3 c); -int4 __ovld __cnfn select(int4 a, int4 b, long4 c); -uint4 __ovld __cnfn select(uint4 a, uint4 b, long4 c); -int8 __ovld __cnfn select(int8 a, int8 b, long8 c); -uint8 __ovld __cnfn select(uint8 a, uint8 b, long8 c); -int16 __ovld __cnfn select(int16 a, int16 b, long16 c); -uint16 __ovld __cnfn select(uint16 a, uint16 b, long16 c); + long __ovld __cnfn select(long a, long b, long c); ulong __ovld __cnfn select(ulong a, ulong b, long c); long2 __ovld __cnfn select(long2 a, long2 b, long2 c); @@ -11592,12 +11441,7 @@ long8 __ovld __cnfn select(long8 a, long8 b, long8 c); ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, long8 c); long16 __ovld __cnfn select(long16 a, long16 b, long16 c); ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, long16 c); -float __ovld __cnfn select(float a, float b, long c); -float2 __ovld __cnfn select(float2 a, float2 b, long2 c); -float3 __ovld __cnfn select(float3 a, float3 b, long3 c); -float4 __ovld __cnfn select(float4 a, float4 b, long4 c); -float8 __ovld __cnfn select(float8 a, float8 b, long8 c); -float16 __ovld __cnfn select(float16 a, float16 b, long16 c); + char __ovld __cnfn select(char a, char b, uchar c); uchar __ovld __cnfn select(uchar a, uchar b, uchar c); char2 __ovld __cnfn select(char2 a, char2 b, uchar2 c); @@ -11610,60 +11454,7 @@ char8 __ovld __cnfn select(char8 a, char8 b, uchar8 c); uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, uchar8 c); char16 __ovld __cnfn select(char16 a, char16 b, uchar16 c); uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, uchar16 c); -short __ovld __cnfn select(short a, short b, uchar c); -ushort __ovld __cnfn select(ushort a, ushort b, uchar c); -short2 __ovld __cnfn select(short2 a, short2 b, uchar2 c); -ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, uchar2 c); -short3 __ovld __cnfn select(short3 a, short3 b, uchar3 c); -ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, uchar3 c); -short4 __ovld __cnfn select(short4 a, short4 b, uchar4 c); -ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, uchar4 c); -short8 __ovld __cnfn select(short8 a, short8 b, uchar8 c); -ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, uchar8 c); -short16 __ovld __cnfn select(short16 a, short16 b, uchar16 c); -ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, uchar16 c); -int __ovld __cnfn select(int a, int b, uchar c); -uint __ovld __cnfn select(uint a, uint b, uchar c); -int2 __ovld __cnfn select(int2 a, int2 b, uchar2 c); -uint2 __ovld __cnfn select(uint2 a, uint2 b, uchar2 c); -int3 __ovld __cnfn select(int3 a, int3 b, uchar3 c); -uint3 __ovld __cnfn select(uint3 a, uint3 b, uchar3 c); -int4 __ovld __cnfn select(int4 a, int4 b, uchar4 c); -uint4 __ovld __cnfn select(uint4 a, uint4 b, uchar4 c); -int8 __ovld __cnfn select(int8 a, int8 b, uchar8 c); -uint8 __ovld __cnfn select(uint8 a, uint8 b, uchar8 c); -int16 __ovld __cnfn select(int16 a, int16 b, uchar16 c); -uint16 __ovld __cnfn select(uint16 a, uint16 b, uchar16 c); -long __ovld __cnfn select(long a, long b, uchar c); -ulong __ovld __cnfn select(ulong a, ulong b, uchar c); -long2 __ovld __cnfn select(long2 a, long2 b, uchar2 
c);
-ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, uchar2 c);
-long3 __ovld __cnfn select(long3 a, long3 b, uchar3 c);
-ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, uchar3 c);
-long4 __ovld __cnfn select(long4 a, long4 b, uchar4 c);
-ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, uchar4 c);
-long8 __ovld __cnfn select(long8 a, long8 b, uchar8 c);
-ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, uchar8 c);
-long16 __ovld __cnfn select(long16 a, long16 b, uchar16 c);
-ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, uchar16 c);
-float __ovld __cnfn select(float a, float b, uchar c);
-float2 __ovld __cnfn select(float2 a, float2 b, uchar2 c);
-float3 __ovld __cnfn select(float3 a, float3 b, uchar3 c);
-float4 __ovld __cnfn select(float4 a, float4 b, uchar4 c);
-float8 __ovld __cnfn select(float8 a, float8 b, uchar8 c);
-float16 __ovld __cnfn select(float16 a, float16 b, uchar16 c);
-char __ovld __cnfn select(char a, char b, ushort c);
-uchar __ovld __cnfn select(uchar a, uchar b, ushort c);
-char2 __ovld __cnfn select(char2 a, char2 b, ushort2 c);
-uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, ushort2 c);
-char3 __ovld __cnfn select(char3 a, char3 b, ushort3 c);
-uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, ushort3 c);
-char4 __ovld __cnfn select(char4 a, char4 b, ushort4 c);
-uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, ushort4 c);
-char8 __ovld __cnfn select(char8 a, char8 b, ushort8 c);
-uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, ushort8 c);
-char16 __ovld __cnfn select(char16 a, char16 b, ushort16 c);
-uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, ushort16 c);
+
 short __ovld __cnfn select(short a, short b, ushort c);
 ushort __ovld __cnfn select(ushort a, ushort b, ushort c);
 short2 __ovld __cnfn select(short2 a, short2 b, ushort2 c);
@@ -11676,60 +11467,7 @@ short8 __ovld __cnfn select(short8 a, short8 b, ushort8 c);
 ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, ushort8 c);
 short16 __ovld __cnfn select(short16 a, short16 b, ushort16 c);
 ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, ushort16 c);
-int __ovld __cnfn select(int a, int b, ushort c);
-uint __ovld __cnfn select(uint a, uint b, ushort c);
-int2 __ovld __cnfn select(int2 a, int2 b, ushort2 c);
-uint2 __ovld __cnfn select(uint2 a, uint2 b, ushort2 c);
-int3 __ovld __cnfn select(int3 a, int3 b, ushort3 c);
-uint3 __ovld __cnfn select(uint3 a, uint3 b, ushort3 c);
-int4 __ovld __cnfn select(int4 a, int4 b, ushort4 c);
-uint4 __ovld __cnfn select(uint4 a, uint4 b, ushort4 c);
-int8 __ovld __cnfn select(int8 a, int8 b, ushort8 c);
-uint8 __ovld __cnfn select(uint8 a, uint8 b, ushort8 c);
-int16 __ovld __cnfn select(int16 a, int16 b, ushort16 c);
-uint16 __ovld __cnfn select(uint16 a, uint16 b, ushort16 c);
-long __ovld __cnfn select(long a, long b, ushort c);
-ulong __ovld __cnfn select(ulong a, ulong b, ushort c);
-long2 __ovld __cnfn select(long2 a, long2 b, ushort2 c);
-ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, ushort2 c);
-long3 __ovld __cnfn select(long3 a, long3 b, ushort3 c);
-ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, ushort3 c);
-long4 __ovld __cnfn select(long4 a, long4 b, ushort4 c);
-ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, ushort4 c);
-long8 __ovld __cnfn select(long8 a, long8 b, ushort8 c);
-ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, ushort8 c);
-long16 __ovld __cnfn select(long16 a, long16 b, ushort16 c);
-ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, ushort16 c);
-float __ovld __cnfn select(float a, float b, ushort c);
-float2 __ovld __cnfn select(float2 a, float2 b, ushort2 c);
-float3 __ovld __cnfn select(float3 a, float3 b, ushort3 c);
-float4 __ovld __cnfn select(float4 a, float4 b, ushort4 c);
-float8 __ovld __cnfn select(float8 a, float8 b, ushort8 c);
-float16 __ovld __cnfn select(float16 a, float16 b, ushort16 c);
-char __ovld __cnfn select(char a, char b, uint c);
-uchar __ovld __cnfn select(uchar a, uchar b, uint c);
-char2 __ovld __cnfn select(char2 a, char2 b, uint2 c);
-uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, uint2 c);
-char3 __ovld __cnfn select(char3 a, char3 b, uint3 c);
-uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, uint3 c);
-char4 __ovld __cnfn select(char4 a, char4 b, uint4 c);
-uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, uint4 c);
-char8 __ovld __cnfn select(char8 a, char8 b, uint8 c);
-uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, uint8 c);
-char16 __ovld __cnfn select(char16 a, char16 b, uint16 c);
-uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, uint16 c);
-short __ovld __cnfn select(short a, short b, uint c);
-ushort __ovld __cnfn select(ushort a, ushort b, uint c);
-short2 __ovld __cnfn select(short2 a, short2 b, uint2 c);
-ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, uint2 c);
-short3 __ovld __cnfn select(short3 a, short3 b, uint3 c);
-ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, uint3 c);
-short4 __ovld __cnfn select(short4 a, short4 b, uint4 c);
-ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, uint4 c);
-short8 __ovld __cnfn select(short8 a, short8 b, uint8 c);
-ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, uint8 c);
-short16 __ovld __cnfn select(short16 a, short16 b, uint16 c);
-ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, uint16 c);
+
 int __ovld __cnfn select(int a, int b, uint c);
 uint __ovld __cnfn select(uint a, uint b, uint c);
 int2 __ovld __cnfn select(int2 a, int2 b, uint2 c);
@@ -11742,60 +11480,13 @@ int8 __ovld __cnfn select(int8 a, int8 b, uint8 c);
 uint8 __ovld __cnfn select(uint8 a, uint8 b, uint8 c);
 int16 __ovld __cnfn select(int16 a, int16 b, uint16 c);
 uint16 __ovld __cnfn select(uint16 a, uint16 b, uint16 c);
-long __ovld __cnfn select(long a, long b, uint c);
-ulong __ovld __cnfn select(ulong a, ulong b, uint c);
-long2 __ovld __cnfn select(long2 a, long2 b, uint2 c);
-ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, uint2 c);
-long3 __ovld __cnfn select(long3 a, long3 b, uint3 c);
-ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, uint3 c);
-long4 __ovld __cnfn select(long4 a, long4 b, uint4 c);
-ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, uint4 c);
-long8 __ovld __cnfn select(long8 a, long8 b, uint8 c);
-ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, uint8 c);
-long16 __ovld __cnfn select(long16 a, long16 b, uint16 c);
-ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, uint16 c);
 float __ovld __cnfn select(float a, float b, uint c);
 float2 __ovld __cnfn select(float2 a, float2 b, uint2 c);
 float3 __ovld __cnfn select(float3 a, float3 b, uint3 c);
 float4 __ovld __cnfn select(float4 a, float4 b, uint4 c);
 float8 __ovld __cnfn select(float8 a, float8 b, uint8 c);
 float16 __ovld __cnfn select(float16 a, float16 b, uint16 c);
-char __ovld __cnfn select(char a, char b, ulong c);
-uchar __ovld __cnfn select(uchar a, uchar b, ulong c);
-char2 __ovld __cnfn select(char2 a, char2 b, ulong2 c);
-uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, ulong2 c);
-char3 __ovld __cnfn select(char3 a, char3 b, ulong3 c);
-uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, ulong3 c);
-char4 __ovld __cnfn select(char4 a, char4 b, ulong4 c);
-uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, ulong4 c);
-char8 __ovld __cnfn select(char8 a, char8 b, ulong8 c);
-uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, ulong8 c);
-char16 __ovld __cnfn select(char16 a, char16 b, ulong16 c);
-uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, ulong16 c);
-short __ovld __cnfn select(short a, short b, ulong c);
-ushort __ovld __cnfn select(ushort a, ushort b, ulong c);
-short2 __ovld __cnfn select(short2 a, short2 b, ulong2 c);
-ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, ulong2 c);
-short3 __ovld __cnfn select(short3 a, short3 b, ulong3 c);
-ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, ulong3 c);
-short4 __ovld __cnfn select(short4 a, short4 b, ulong4 c);
-ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, ulong4 c);
-short8 __ovld __cnfn select(short8 a, short8 b, ulong8 c);
-ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, ulong8 c);
-short16 __ovld __cnfn select(short16 a, short16 b, ulong16 c);
-ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, ulong16 c);
-int __ovld __cnfn select(int a, int b, ulong c);
-uint __ovld __cnfn select(uint a, uint b, ulong c);
-int2 __ovld __cnfn select(int2 a, int2 b, ulong2 c);
-uint2 __ovld __cnfn select(uint2 a, uint2 b, ulong2 c);
-int3 __ovld __cnfn select(int3 a, int3 b, ulong3 c);
-uint3 __ovld __cnfn select(uint3 a, uint3 b, ulong3 c);
-int4 __ovld __cnfn select(int4 a, int4 b, ulong4 c);
-uint4 __ovld __cnfn select(uint4 a, uint4 b, ulong4 c);
-int8 __ovld __cnfn select(int8 a, int8 b, ulong8 c);
-uint8 __ovld __cnfn select(uint8 a, uint8 b, ulong8 c);
-int16 __ovld __cnfn select(int16 a, int16 b, ulong16 c);
-uint16 __ovld __cnfn select(uint16 a, uint16 b, ulong16 c);
+
 long __ovld __cnfn select(long a, long b, ulong c);
 ulong __ovld __cnfn select(ulong a, ulong b, ulong c);
 long2 __ovld __cnfn select(long2 a, long2 b, ulong2 c);
@@ -11808,12 +11499,7 @@ long8 __ovld __cnfn select(long8 a, long8 b, ulong8 c);
 ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, ulong8 c);
 long16 __ovld __cnfn select(long16 a, long16 b, ulong16 c);
 ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, ulong16 c);
-float __ovld __cnfn select(float a, float b, ulong c);
-float2 __ovld __cnfn select(float2 a, float2 b, ulong2 c);
-float3 __ovld __cnfn select(float3 a, float3 b, ulong3 c);
-float4 __ovld __cnfn select(float4 a, float4 b, ulong4 c);
-float8 __ovld __cnfn select(float8 a, float8 b, ulong8 c);
-float16 __ovld __cnfn select(float16 a, float16 b, ulong16 c);
+
 #ifdef cl_khr_fp64
 double __ovld __cnfn select(double a, double b, long c);
 double2 __ovld __cnfn select(double2 a, double2 b, long2 c);
@@ -13141,13 +12827,14 @@ void __ovld __conv barrier(cl_mem_fence_flags flags);
 
 #if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
 
-typedef enum memory_scope
-{
-  memory_scope_work_item,
-  memory_scope_work_group,
-  memory_scope_device,
-  memory_scope_all_svm_devices,
-  memory_scope_sub_group
+typedef enum memory_scope {
+  memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
+  memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
+  memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
+  memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
+#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups)
+  memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
+#endif
 } memory_scope;
 
 void __ovld __conv work_group_barrier(cl_mem_fence_flags flags, memory_scope scope);
@@ -13952,11 +13639,11 @@ unsigned long __ovld atom_xor(volatile __local unsigned long *p, unsigned long v
 // enum values aligned with what clang uses in EmitAtomicExpr()
 typedef enum memory_order
 {
-  memory_order_relaxed,
-  memory_order_acquire,
-  memory_order_release,
-  memory_order_acq_rel,
-  memory_order_seq_cst
+  memory_order_relaxed = __ATOMIC_RELAXED,
+  memory_order_acquire = __ATOMIC_ACQUIRE,
+  memory_order_release = __ATOMIC_RELEASE,
+  memory_order_acq_rel = __ATOMIC_ACQ_REL,
+  memory_order_seq_cst = __ATOMIC_SEQ_CST
 } memory_order;
 
 // double atomics support requires extensions cl_khr_int64_base_atomics and cl_khr_int64_extended_atomics
diff --git a/c_headers/unwind.h b/c_headers/unwind.h
index 4f74a34787..345fa4d0c1 100644
--- a/c_headers/unwind.h
+++ b/c_headers/unwind.h
@@ -76,7 +76,13 @@ typedef intptr_t _sleb128_t;
 typedef uintptr_t _uleb128_t;
 
 struct _Unwind_Context;
+#if defined(__arm__) && !(defined(__USING_SJLJ_EXCEPTIONS__) || defined(__ARM_DWARF_EH__))
+struct _Unwind_Control_Block;
+typedef struct _Unwind_Control_Block _Unwind_Exception; /* Alias */
+#else
 struct _Unwind_Exception;
+typedef struct _Unwind_Exception _Unwind_Exception;
+#endif
 typedef enum {
   _URC_NO_REASON = 0,
 #if defined(__arm__) && !defined(__USING_SJLJ_EXCEPTIONS__) && \
@@ -109,8 +115,42 @@ typedef enum {
 } _Unwind_Action;
 
 typedef void (*_Unwind_Exception_Cleanup_Fn)(_Unwind_Reason_Code,
-                                             struct _Unwind_Exception *);
-
+                                             _Unwind_Exception *);
+
+#if defined(__arm__) && !(defined(__USING_SJLJ_EXCEPTIONS__) || defined(__ARM_DWARF_EH__))
+typedef struct _Unwind_Control_Block _Unwind_Control_Block;
+typedef uint32_t _Unwind_EHT_Header;
+
+struct _Unwind_Control_Block {
+  uint64_t exception_class;
+  void (*exception_cleanup)(_Unwind_Reason_Code, _Unwind_Control_Block *);
+  /* unwinder cache (private fields for the unwinder's use) */
+  struct {
+    uint32_t reserved1; /* forced unwind stop function, 0 if not forced */
+    uint32_t reserved2; /* personality routine */
+    uint32_t reserved3; /* callsite */
+    uint32_t reserved4; /* forced unwind stop argument */
+    uint32_t reserved5;
+  } unwinder_cache;
+  /* propagation barrier cache (valid after phase 1) */
+  struct {
+    uint32_t sp;
+    uint32_t bitpattern[5];
+  } barrier_cache;
+  /* cleanup cache (preserved over cleanup) */
+  struct {
+    uint32_t bitpattern[4];
+  } cleanup_cache;
+  /* personality cache (for personality's benefit) */
+  struct {
+    uint32_t fnstart;         /* function start address */
+    _Unwind_EHT_Header *ehtp; /* pointer to EHT entry header word */
+    uint32_t additional;      /* additional data */
+    uint32_t reserved1;
+  } pr_cache;
+  long long int : 0; /* force alignment of next item to 8-byte boundary */
+} __attribute__((__aligned__(8)));
+#else
 struct _Unwind_Exception {
   _Unwind_Exception_Class exception_class;
   _Unwind_Exception_Cleanup_Fn exception_cleanup;
@@ -120,23 +160,24 @@ struct _Unwind_Exception {
  * aligned". GCC has interpreted this to mean "use the maximum useful
 * alignment for the target"; so do we. */
 } __attribute__((__aligned__));
+#endif
 
 typedef _Unwind_Reason_Code (*_Unwind_Stop_Fn)(int, _Unwind_Action,
                                                _Unwind_Exception_Class,
-                                               struct _Unwind_Exception *,
+                                               _Unwind_Exception *,
                                                struct _Unwind_Context *,
                                                void *);
 
-typedef _Unwind_Reason_Code (*_Unwind_Personality_Fn)(
-    int, _Unwind_Action, _Unwind_Exception_Class, struct _Unwind_Exception *,
-    struct _Unwind_Context *);
+typedef _Unwind_Reason_Code (*_Unwind_Personality_Fn)(int, _Unwind_Action,
+                                                      _Unwind_Exception_Class,
+                                                      _Unwind_Exception *,
+                                                      struct _Unwind_Context *);
 typedef _Unwind_Personality_Fn __personality_routine;
 
 typedef _Unwind_Reason_Code (*_Unwind_Trace_Fn)(struct _Unwind_Context *,
                                                 void *);
 
-#if defined(__arm__) && !defined(__APPLE__)
-
+#if defined(__arm__) && !(defined(__USING_SJLJ_EXCEPTIONS__) || defined(__ARM_DWARF_EH__))
 typedef enum {
   _UVRSC_CORE = 0, /* integer register */
   _UVRSC_VFP = 1, /* vfp */
@@ -158,14 +199,12 @@ typedef enum {
   _UVRSR_FAILED = 2
 } _Unwind_VRS_Result;
 
-#if !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__ARM_DWARF_EH__)
 typedef uint32_t _Unwind_State;
 #define _US_VIRTUAL_UNWIND_FRAME ((_Unwind_State)0)
 #define _US_UNWIND_FRAME_STARTING ((_Unwind_State)1)
 #define _US_UNWIND_FRAME_RESUME ((_Unwind_State)2)
 #define _US_ACTION_MASK ((_Unwind_State)3)
 #define _US_FORCE_UNWIND ((_Unwind_State)8)
-#endif
 
 _Unwind_VRS_Result _Unwind_VRS_Get(struct _Unwind_Context *__context,
                                    _Unwind_VRS_RegClass __regclass,
@@ -224,13 +263,12 @@ _Unwind_Ptr _Unwind_GetRegionStart(struct _Unwind_Context *);
 
 /* DWARF EH functions; currently not available on Darwin/ARM */
 #if !defined(__APPLE__) || !defined(__arm__)
-
-_Unwind_Reason_Code _Unwind_RaiseException(struct _Unwind_Exception *);
-_Unwind_Reason_Code _Unwind_ForcedUnwind(struct _Unwind_Exception *,
-                                         _Unwind_Stop_Fn, void *);
-void _Unwind_DeleteException(struct _Unwind_Exception *);
-void _Unwind_Resume(struct _Unwind_Exception *);
-_Unwind_Reason_Code _Unwind_Resume_or_Rethrow(struct _Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_RaiseException(_Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_ForcedUnwind(_Unwind_Exception *, _Unwind_Stop_Fn,
+                                         void *);
+void _Unwind_DeleteException(_Unwind_Exception *);
+void _Unwind_Resume(_Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_Resume_or_Rethrow(_Unwind_Exception *);
 
 #endif
 
@@ -241,11 +279,11 @@ typedef struct SjLj_Function_Context *_Unwind_FunctionContext_t;
 
 void _Unwind_SjLj_Register(_Unwind_FunctionContext_t);
 void _Unwind_SjLj_Unregister(_Unwind_FunctionContext_t);
-_Unwind_Reason_Code _Unwind_SjLj_RaiseException(struct _Unwind_Exception *);
-_Unwind_Reason_Code _Unwind_SjLj_ForcedUnwind(struct _Unwind_Exception *,
+_Unwind_Reason_Code _Unwind_SjLj_RaiseException(_Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_SjLj_ForcedUnwind(_Unwind_Exception *,
                                               _Unwind_Stop_Fn, void *);
-void _Unwind_SjLj_Resume(struct _Unwind_Exception *);
-_Unwind_Reason_Code _Unwind_SjLj_Resume_or_Rethrow(struct _Unwind_Exception *);
+void _Unwind_SjLj_Resume(_Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_SjLj_Resume_or_Rethrow(_Unwind_Exception *);
 
 void *_Unwind_FindEnclosingFunction(void *);
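For context, the remapped opencl-c.h enums above are what the OpenCL 2.0 explicit atomic builtins consume. A minimal sketch of how a kernel passes them (not part of this commit; the kernel name and arguments are invented for illustration):

```c
// Hypothetical OpenCL 2.0 kernel: memory_order_relaxed and memory_scope_device
// now carry clang's __ATOMIC_RELAXED / __OPENCL_MEMORY_SCOPE_DEVICE values,
// so the builtin can forward them to EmitAtomicExpr() without translation.
kernel void count_nonzero(global const int *in,
                          volatile global atomic_int *counter) {
  size_t gid = get_global_id(0);
  if (in[gid] != 0) {
    // Relaxed read-modify-write, visible to all work-groups on the device.
    atomic_fetch_add_explicit(counter, 1, memory_order_relaxed,
                              memory_scope_device);
  }
}
```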

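Likewise, the unwind.h change lets callers spell the exception type as `_Unwind_Exception` without the `struct` keyword; on ARM EHABI the same spelling resolves to `_Unwind_Control_Block`, so one set of declarations serves both ABIs. A rough, illustrative caller under the new spelling (not from this commit; the helper names and exception-class tag are made up):

```c
#include <stdlib.h>
#include <unwind.h>

/* Cleanup callback the unwinder invokes if the exception is never resumed. */
static void example_cleanup(_Unwind_Reason_Code reason, _Unwind_Exception *exc) {
  (void)reason;
  free(exc);
}

/* Allocate and raise an exception object; on ARM EHABI, _Unwind_Exception
 * here is the 8-byte-aligned _Unwind_Control_Block. */
static _Unwind_Reason_Code raise_example(void) {
  _Unwind_Exception *exc = calloc(1, sizeof(*exc));
  if (!exc)
    return _URC_FATAL_PHASE1_ERROR;
  exc->exception_class = 0x4558414d504c4531ULL; /* arbitrary 8-byte tag */
  exc->exception_cleanup = example_cleanup;
  /* Returns only if unwinding failed or reached the end of the stack. */
  return _Unwind_RaiseException(exc);
}
```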