aboutsummaryrefslogtreecommitdiff
path: root/c_headers
diff options
context:
space:
mode:
authorAndrew Kelley <superjoe30@gmail.com>2017-10-23 21:43:18 -0400
committerAndrew Kelley <superjoe30@gmail.com>2017-11-02 21:54:24 -0400
commit94ec2190f8d8c41d19b668511bf31fae32bcd095 (patch)
treec5ab9b20fbaf8f017661f9a159082d1ecaf9f943 /c_headers
parentabff1b688420eb30d98145d8bc48e7d08f259885 (diff)
downloadzig-94ec2190f8d8c41d19b668511bf31fae32bcd095.tar.gz
zig-94ec2190f8d8c41d19b668511bf31fae32bcd095.zip
update to llvm master
Diffstat (limited to 'c_headers')
-rw-r--r--c_headers/__clang_cuda_intrinsics.h124
-rw-r--r--c_headers/__clang_cuda_runtime_wrapper.h30
-rw-r--r--c_headers/arm64intr.h49
-rw-r--r--c_headers/avx2intrin.h12
-rw-r--r--c_headers/avx512bwintrin.h71
-rw-r--r--c_headers/avx512dqintrin.h38
-rw-r--r--c_headers/avx512fintrin.h54
-rw-r--r--c_headers/avx512vlbwintrin.h50
-rw-r--r--c_headers/avx512vldqintrin.h54
-rw-r--r--c_headers/avx512vlintrin.h69
-rw-r--r--c_headers/clflushoptintrin.h2
-rw-r--r--c_headers/clwbintrin.h52
-rw-r--r--c_headers/cuda_wrappers/new51
-rw-r--r--c_headers/emmintrin.h12
-rw-r--r--c_headers/float.h14
-rw-r--r--c_headers/immintrin.h4
-rw-r--r--c_headers/intrin.h6
-rw-r--r--c_headers/opencl-c.h359
-rw-r--r--c_headers/unwind.h80
19 files changed, 601 insertions, 530 deletions
diff --git a/c_headers/__clang_cuda_intrinsics.h b/c_headers/__clang_cuda_intrinsics.h
index b43ce21d0b..bc5b876577 100644
--- a/c_headers/__clang_cuda_intrinsics.h
+++ b/c_headers/__clang_cuda_intrinsics.h
@@ -92,6 +92,130 @@ __MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f);
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
+#if CUDA_VERSION >= 9000
+#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300)
+// __shfl_sync_* variants available in CUDA-9
+#pragma push_macro("__MAKE_SYNC_SHUFFLES")
+#define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, \
+ __Mask) \
+ inline __device__ int __FnName(unsigned int __mask, int __val, int __offset, \
+ int __width = warpSize) { \
+ return __IntIntrinsic(__mask, __val, __offset, \
+ ((warpSize - __width) << 8) | (__Mask)); \
+ } \
+ inline __device__ float __FnName(unsigned int __mask, float __val, \
+ int __offset, int __width = warpSize) { \
+ return __FloatIntrinsic(__mask, __val, __offset, \
+ ((warpSize - __width) << 8) | (__Mask)); \
+ } \
+ inline __device__ unsigned int __FnName(unsigned int __mask, \
+ unsigned int __val, int __offset, \
+ int __width = warpSize) { \
+ return static_cast<unsigned int>( \
+ ::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
+ } \
+ inline __device__ long long __FnName(unsigned int __mask, long long __val, \
+ int __offset, int __width = warpSize) { \
+ struct __Bits { \
+ int __a, __b; \
+ }; \
+ _Static_assert(sizeof(__val) == sizeof(__Bits)); \
+ _Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \
+ __Bits __tmp; \
+ memcpy(&__val, &__tmp, sizeof(__val)); \
+ __tmp.__a = ::__FnName(__mask, __tmp.__a, __offset, __width); \
+ __tmp.__b = ::__FnName(__mask, __tmp.__b, __offset, __width); \
+ long long __ret; \
+ memcpy(&__ret, &__tmp, sizeof(__tmp)); \
+ return __ret; \
+ } \
+ inline __device__ unsigned long long __FnName( \
+ unsigned int __mask, unsigned long long __val, int __offset, \
+ int __width = warpSize) { \
+ return static_cast<unsigned long long>(::__FnName( \
+ __mask, static_cast<unsigned long long>(__val), __offset, __width)); \
+ } \
+ inline __device__ double __FnName(unsigned int __mask, double __val, \
+ int __offset, int __width = warpSize) { \
+ long long __tmp; \
+ _Static_assert(sizeof(__tmp) == sizeof(__val)); \
+ memcpy(&__tmp, &__val, sizeof(__val)); \
+ __tmp = ::__FnName(__mask, __tmp, __offset, __width); \
+ double __ret; \
+ memcpy(&__ret, &__tmp, sizeof(__ret)); \
+ return __ret; \
+ }
+__MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32,
+ __nvvm_shfl_sync_idx_f32, 0x1f);
+// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
+// maxLane.
+__MAKE_SYNC_SHUFFLES(__shfl_up_sync, __nvvm_shfl_sync_up_i32,
+ __nvvm_shfl_sync_up_f32, 0);
+__MAKE_SYNC_SHUFFLES(__shfl_down_sync, __nvvm_shfl_sync_down_i32,
+ __nvvm_shfl_sync_down_f32, 0x1f);
+__MAKE_SYNC_SHUFFLES(__shfl_xor_sync, __nvvm_shfl_sync_bfly_i32,
+ __nvvm_shfl_sync_bfly_f32, 0x1f);
+#pragma pop_macro("__MAKE_SYNC_SHUFFLES")
+
+inline __device__ void __syncwarp(unsigned int mask = 0xffffffff) {
+ return __nvvm_bar_warp_sync(mask);
+}
+
+inline __device__ void __barrier_sync(unsigned int id) {
+ __nvvm_barrier_sync(id);
+}
+
+inline __device__ void __barrier_sync_count(unsigned int id,
+ unsigned int count) {
+ __nvvm_barrier_sync_cnt(id, count);
+}
+
+inline __device__ int __all_sync(unsigned int mask, int pred) {
+ return __nvvm_vote_all_sync(mask, pred);
+}
+
+inline __device__ int __any_sync(unsigned int mask, int pred) {
+ return __nvvm_vote_any_sync(mask, pred);
+}
+
+inline __device__ int __uni_sync(unsigned int mask, int pred) {
+ return __nvvm_vote_uni_sync(mask, pred);
+}
+
+inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) {
+ return __nvvm_vote_ballot_sync(mask, pred);
+}
+
+inline __device__ unsigned int __activemask() { return __nvvm_vote_ballot(1); }
+
+#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
+
+// Define __match* builtins CUDA-9 headers expect to see.
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
+inline __device__ unsigned int __match32_any_sync(unsigned int mask,
+ unsigned int value) {
+ return __nvvm_match_any_sync_i32(mask, value);
+}
+
+inline __device__ unsigned long long
+__match64_any_sync(unsigned int mask, unsigned long long value) {
+ return __nvvm_match_any_sync_i64(mask, value);
+}
+
+inline __device__ unsigned int
+__match32_all_sync(unsigned int mask, unsigned int value, int *pred) {
+ return __nvvm_match_all_sync_i32p(mask, value, pred);
+}
+
+inline __device__ unsigned long long
+__match64_all_sync(unsigned int mask, unsigned long long value, int *pred) {
+ return __nvvm_match_all_sync_i64p(mask, value, pred);
+}
+#include "crt/sm_70_rt.hpp"
+
+#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
+#endif // __CUDA_VERSION >= 9000
+
// sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.
// Prevent the vanilla sm_32 intrinsics header from being included.
diff --git a/c_headers/__clang_cuda_runtime_wrapper.h b/c_headers/__clang_cuda_runtime_wrapper.h
index 931d44b696..b8ffc2ce9f 100644
--- a/c_headers/__clang_cuda_runtime_wrapper.h
+++ b/c_headers/__clang_cuda_runtime_wrapper.h
@@ -62,7 +62,7 @@
#include "cuda.h"
#if !defined(CUDA_VERSION)
#error "cuda.h did not define CUDA_VERSION"
-#elif CUDA_VERSION < 7000 || CUDA_VERSION > 8000
+#elif CUDA_VERSION < 7000 || CUDA_VERSION > 9000
#error "Unsupported CUDA version!"
#endif
@@ -86,7 +86,11 @@
#define __COMMON_FUNCTIONS_H__
#undef __CUDACC__
+#if CUDA_VERSION < 9000
#define __CUDABE__
+#else
+#define __CUDA_LIBDEVICE__
+#endif
// Disables definitions of device-side runtime support stubs in
// cuda_device_runtime_api.h
#include "driver_types.h"
@@ -94,6 +98,7 @@
#include "host_defines.h"
#undef __CUDABE__
+#undef __CUDA_LIBDEVICE__
#define __CUDACC__
#include "cuda_runtime.h"
@@ -105,7 +110,9 @@
#define __nvvm_memcpy(s, d, n, a) __builtin_memcpy(s, d, n)
#define __nvvm_memset(d, c, n, a) __builtin_memset(d, c, n)
+#if CUDA_VERSION < 9000
#include "crt/device_runtime.h"
+#endif
#include "crt/host_runtime.h"
// device_runtime.h defines __cxa_* macros that will conflict with
// cxxabi.h.
@@ -166,7 +173,18 @@ inline __host__ double __signbitd(double x) {
// __device__.
#pragma push_macro("__forceinline__")
#define __forceinline__ __device__ __inline__ __attribute__((always_inline))
+
+#pragma push_macro("__float2half_rn")
+#if CUDA_VERSION >= 9000
+// CUDA-9 has conflicting prototypes for __float2half_rn(float f) in
+// cuda_fp16.h[pp] and device_functions.hpp. We need to get the one in
+// device_functions.hpp out of the way.
+#define __float2half_rn __float2half_rn_disabled
+#endif
+
#include "device_functions.hpp"
+#pragma pop_macro("__float2half_rn")
+
// math_function.hpp uses the __USE_FAST_MATH__ macro to determine whether we
// get the slow-but-accurate or fast-but-inaccurate versions of functions like
@@ -247,7 +265,17 @@ static inline __device__ void __brkpt(int __c) { __brkpt(); }
#pragma push_macro("__GNUC__")
#undef __GNUC__
#define signbit __ignored_cuda_signbit
+
+// CUDA-9 omits device-side definitions of some math functions if it sees
+// include guard from math.h wrapper from libstdc++. We have to undo the header
+// guard temporarily to get the definitions we need.
+#pragma push_macro("_GLIBCXX_MATH_H")
+#if CUDA_VERSION >= 9000
+#undef _GLIBCXX_MATH_H
+#endif
+
#include "math_functions.hpp"
+#pragma pop_macro("_GLIBCXX_MATH_H")
#pragma pop_macro("__GNUC__")
#pragma pop_macro("signbit")
diff --git a/c_headers/arm64intr.h b/c_headers/arm64intr.h
new file mode 100644
index 0000000000..be52283618
--- /dev/null
+++ b/c_headers/arm64intr.h
@@ -0,0 +1,49 @@
+/*===---- arm64intr.h - ARM64 Windows intrinsics -------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+/* Only include this if we're compiling for the windows platform. */
+#ifndef _MSC_VER
+#include_next <arm64intr.h>
+#else
+
+#ifndef __ARM64INTR_H
+#define __ARM64INTR_H
+
+typedef enum
+{
+ _ARM64_BARRIER_SY = 0xF,
+ _ARM64_BARRIER_ST = 0xE,
+ _ARM64_BARRIER_LD = 0xD,
+ _ARM64_BARRIER_ISH = 0xB,
+ _ARM64_BARRIER_ISHST = 0xA,
+ _ARM64_BARRIER_ISHLD = 0x9,
+ _ARM64_BARRIER_NSH = 0x7,
+ _ARM64_BARRIER_NSHST = 0x6,
+ _ARM64_BARRIER_NSHLD = 0x5,
+ _ARM64_BARRIER_OSH = 0x3,
+ _ARM64_BARRIER_OSHST = 0x2,
+ _ARM64_BARRIER_OSHLD = 0x1
+} _ARM64INTR_BARRIER_TYPE;
+
+#endif /* __ARM64INTR_H */
+#endif /* _MSC_VER */
diff --git a/c_headers/avx2intrin.h b/c_headers/avx2intrin.h
index 576f761b25..caf4ced920 100644
--- a/c_headers/avx2intrin.h
+++ b/c_headers/avx2intrin.h
@@ -145,13 +145,21 @@ _mm256_andnot_si256(__m256i __a, __m256i __b)
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_avg_epu8(__m256i __a, __m256i __b)
{
- return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b);
+ typedef unsigned short __v32hu __attribute__((__vector_size__(64)));
+ return (__m256i)__builtin_convertvector(
+ ((__builtin_convertvector((__v32qu)__a, __v32hu) +
+ __builtin_convertvector((__v32qu)__b, __v32hu)) + 1)
+ >> 1, __v32qu);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_avg_epu16(__m256i __a, __m256i __b)
{
- return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b);
+ typedef unsigned int __v16su __attribute__((__vector_size__(64)));
+ return (__m256i)__builtin_convertvector(
+ ((__builtin_convertvector((__v16hu)__a, __v16su) +
+ __builtin_convertvector((__v16hu)__b, __v16su)) + 1)
+ >> 1, __v16hu);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
diff --git a/c_headers/avx512bwintrin.h b/c_headers/avx512bwintrin.h
index 41958b7214..53da5869d3 100644
--- a/c_headers/avx512bwintrin.h
+++ b/c_headers/avx512bwintrin.h
@@ -706,57 +706,55 @@ _mm512_maskz_adds_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_avg_epu8 (__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A,
- (__v64qi) __B,
- (__v64qi) _mm512_setzero_qi(),
- (__mmask64) -1);
+ typedef unsigned short __v64hu __attribute__((__vector_size__(128)));
+ return (__m512i)__builtin_convertvector(
+ ((__builtin_convertvector((__v64qu) __A, __v64hu) +
+ __builtin_convertvector((__v64qu) __B, __v64hu)) + 1)
+ >> 1, __v64qu);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_avg_epu8 (__m512i __W, __mmask64 __U, __m512i __A,
__m512i __B)
{
- return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A,
- (__v64qi) __B,
- (__v64qi) __W,
- (__mmask64) __U);
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+ (__v64qi)_mm512_avg_epu8(__A, __B),
+ (__v64qi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_avg_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A,
- (__v64qi) __B,
- (__v64qi) _mm512_setzero_qi(),
- (__mmask64) __U);
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+ (__v64qi)_mm512_avg_epu8(__A, __B),
+ (__v64qi)_mm512_setzero_qi());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_avg_epu16 (__m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi) _mm512_setzero_hi(),
- (__mmask32) -1);
+ typedef unsigned int __v32su __attribute__((__vector_size__(128)));
+ return (__m512i)__builtin_convertvector(
+ ((__builtin_convertvector((__v32hu) __A, __v32su) +
+ __builtin_convertvector((__v32hu) __B, __v32su)) + 1)
+ >> 1, __v32hu);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_avg_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
__m512i __B)
{
- return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi) __W,
- (__mmask32) __U);
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_avg_epu16(__A, __B),
+ (__v32hi)__W);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_avg_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
{
- return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A,
- (__v32hi) __B,
- (__v32hi) _mm512_setzero_hi(),
- (__mmask32) __U);
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_avg_epu16(__A, __B),
+ (__v32hi) _mm512_setzero_hi());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
@@ -2028,18 +2026,17 @@ _mm512_maskz_mov_epi8 (__mmask64 __U, __m512i __A)
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_set1_epi8 (__m512i __O, __mmask64 __M, char __A)
{
- return (__m512i) __builtin_ia32_pbroadcastb512_gpr_mask (__A,
- (__v64qi) __O,
- __M);
+ return (__m512i) __builtin_ia32_selectb_512(__M,
+ (__v64qi)_mm512_set1_epi8(__A),
+ (__v64qi) __O);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_set1_epi8 (__mmask64 __M, char __A)
{
- return (__m512i) __builtin_ia32_pbroadcastb512_gpr_mask (__A,
- (__v64qi)
- _mm512_setzero_qi(),
- __M);
+ return (__m512i) __builtin_ia32_selectb_512(__M,
+ (__v64qi) _mm512_set1_epi8(__A),
+ (__v64qi) _mm512_setzero_si512());
}
static __inline__ __mmask64 __DEFAULT_FN_ATTRS
@@ -2219,17 +2216,17 @@ _mm512_maskz_broadcastb_epi8 (__mmask64 __M, __m128i __A)
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_set1_epi16 (__m512i __O, __mmask32 __M, short __A)
{
- return (__m512i) __builtin_ia32_pbroadcastw512_gpr_mask (__A,
- (__v32hi) __O,
- __M);
+ return (__m512i) __builtin_ia32_selectw_512(__M,
+ (__v32hi) _mm512_set1_epi16(__A),
+ (__v32hi) __O);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_set1_epi16 (__mmask32 __M, short __A)
{
- return (__m512i) __builtin_ia32_pbroadcastw512_gpr_mask (__A,
- (__v32hi) _mm512_setzero_hi(),
- __M);
+ return (__m512i) __builtin_ia32_selectw_512(__M,
+ (__v32hi) _mm512_set1_epi16(__A),
+ (__v32hi) _mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
diff --git a/c_headers/avx512dqintrin.h b/c_headers/avx512dqintrin.h
index 4fd1add773..2c431d9740 100644
--- a/c_headers/avx512dqintrin.h
+++ b/c_headers/avx512dqintrin.h
@@ -973,25 +973,26 @@ _mm512_movepi64_mask (__m512i __A)
static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_broadcast_f32x2 (__m128 __A)
{
- return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A,
- (__v16sf)_mm512_undefined_ps(),
- (__mmask16) -1);
+ return (__m512)__builtin_shufflevector((__v4sf)__A,
+ (__v4sf)_mm_undefined_ps(),
+ 0, 1, 0, 1, 0, 1, 0, 1,
+ 0, 1, 0, 1, 0, 1, 0, 1);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_mask_broadcast_f32x2 (__m512 __O, __mmask16 __M, __m128 __A)
{
- return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A,
- (__v16sf)
- __O, __M);
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
+ (__v16sf)_mm512_broadcast_f32x2(__A),
+ (__v16sf)__O);
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_maskz_broadcast_f32x2 (__mmask16 __M, __m128 __A)
{
- return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A,
- (__v16sf)_mm512_setzero_ps (),
- __M);
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
+ (__v16sf)_mm512_broadcast_f32x2(__A),
+ (__v16sf)_mm512_setzero_ps());
}
static __inline__ __m512 __DEFAULT_FN_ATTRS
@@ -1044,25 +1045,26 @@ _mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A)
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_broadcast_i32x2 (__m128i __A)
{
- return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si) __A,
- (__v16si)_mm512_setzero_si512(),
- (__mmask16) -1);
+ return (__m512i)__builtin_shufflevector((__v4si)__A,
+ (__v4si)_mm_undefined_si128(),
+ 0, 1, 0, 1, 0, 1, 0, 1,
+ 0, 1, 0, 1, 0, 1, 0, 1);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_broadcast_i32x2 (__m512i __O, __mmask16 __M, __m128i __A)
{
- return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si) __A,
- (__v16si)
- __O, __M);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+ (__v16si)_mm512_broadcast_i32x2(__A),
+ (__v16si)__O);
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_broadcast_i32x2 (__mmask16 __M, __m128i __A)
{
- return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si) __A,
- (__v16si)_mm512_setzero_si512 (),
- __M);
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+ (__v16si)_mm512_broadcast_i32x2(__A),
+ (__v16si)_mm512_setzero_si512());
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
diff --git a/c_headers/avx512fintrin.h b/c_headers/avx512fintrin.h
index 4ce6945311..247ac879ea 100644
--- a/c_headers/avx512fintrin.h
+++ b/c_headers/avx512fintrin.h
@@ -258,30 +258,6 @@ _mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
(__v8di) _mm512_setzero_si512());
}
-static __inline __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_set1_epi32(__mmask16 __M, int __A)
-{
- return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A,
- (__v16si)
- _mm512_setzero_si512 (),
- __M);
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_set1_epi64(__mmask8 __M, long long __A)
-{
-#ifdef __x86_64__
- return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A,
- (__v8di)
- _mm512_setzero_si512 (),
- __M);
-#else
- return (__m512i) __builtin_ia32_pbroadcastq512_mem_mask (__A,
- (__v8di)
- _mm512_setzero_si512 (),
- __M);
-#endif
-}
static __inline __m512 __DEFAULT_FN_ATTRS
_mm512_setzero_ps(void)
@@ -341,11 +317,29 @@ _mm512_set1_epi32(int __s)
}
static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_set1_epi32(__mmask16 __M, int __A)
+{
+ return (__m512i)__builtin_ia32_selectd_512(__M,
+ (__v16si)_mm512_set1_epi32(__A),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS
_mm512_set1_epi64(long long __d)
{
return (__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d };
}
+#ifdef __x86_64__
+static __inline __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_set1_epi64(__mmask8 __M, long long __A)
+{
+ return (__m512i)__builtin_ia32_selectq_512(__M,
+ (__v8di)_mm512_set1_epi64(__A),
+ (__v8di)_mm512_setzero_si512());
+}
+#endif
+
static __inline__ __m512 __DEFAULT_FN_ATTRS
_mm512_broadcastss_ps(__m128 __A)
{
@@ -9040,7 +9034,7 @@ _mm512_stream_si512 (__m512i * __P, __m512i __A)
}
static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_stream_load_si512 (void *__P)
+_mm512_stream_load_si512 (void const *__P)
{
typedef __v8di __v8di_aligned __attribute__((aligned(64)));
return (__m512i) __builtin_nontemporal_load((const __v8di_aligned *)__P);
@@ -9742,16 +9736,18 @@ _mm_cvtu64_ss (__m128 __A, unsigned long long __B)
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A)
{
- return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A, (__v16si) __O,
- __M);
+ return (__m512i) __builtin_ia32_selectd_512(__M,
+ (__v16si) _mm512_set1_epi32(__A),
+ (__v16si) __O);
}
#ifdef __x86_64__
static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A)
{
- return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A, (__v8di) __O,
- __M);
+ return (__m512i) __builtin_ia32_selectq_512(__M,
+ (__v8di) _mm512_set1_epi64(__A),
+ (__v8di) __O);
}
#endif
diff --git a/c_headers/avx512vlbwintrin.h b/c_headers/avx512vlbwintrin.h
index 3b58d04339..4ab785bdbb 100644
--- a/c_headers/avx512vlbwintrin.h
+++ b/c_headers/avx512vlbwintrin.h
@@ -2660,35 +2660,33 @@ _mm256_maskz_mov_epi8 (__mmask32 __U, __m256i __A)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A)
{
- return (__m128i) __builtin_ia32_pbroadcastb128_gpr_mask (__A,
- (__v16qi) __O,
- __M);
+ return (__m128i) __builtin_ia32_selectb_128(__M,
+ (__v16qi) _mm_set1_epi8(__A),
+ (__v16qi) __O);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maskz_set1_epi8 (__mmask16 __M, char __A)
{
- return (__m128i) __builtin_ia32_pbroadcastb128_gpr_mask (__A,
- (__v16qi)
- _mm_setzero_si128 (),
- __M);
+ return (__m128i) __builtin_ia32_selectb_128(__M,
+ (__v16qi) _mm_set1_epi8(__A),
+ (__v16qi) _mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_set1_epi8 (__m256i __O, __mmask32 __M, char __A)
{
- return (__m256i) __builtin_ia32_pbroadcastb256_gpr_mask (__A,
- (__v32qi) __O,
- __M);
+ return (__m256i) __builtin_ia32_selectb_256(__M,
+ (__v32qi) _mm256_set1_epi8(__A),
+ (__v32qi) __O);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_set1_epi8 (__mmask32 __M, char __A)
{
- return (__m256i) __builtin_ia32_pbroadcastb256_gpr_mask (__A,
- (__v32qi)
- _mm256_setzero_si256 (),
- __M);
+ return (__m256i) __builtin_ia32_selectb_256(__M,
+ (__v32qi) _mm256_set1_epi8(__A),
+ (__v32qi) _mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
@@ -3025,33 +3023,33 @@ _mm256_maskz_broadcastw_epi16 (__mmask16 __M, __m128i __A)
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_set1_epi16 (__m256i __O, __mmask16 __M, short __A)
{
- return (__m256i) __builtin_ia32_pbroadcastw256_gpr_mask (__A,
- (__v16hi) __O,
- __M);
+ return (__m256i) __builtin_ia32_selectw_256 (__M,
+ (__v16hi) _mm256_set1_epi16(__A),
+ (__v16hi) __O);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_set1_epi16 (__mmask16 __M, short __A)
{
- return (__m256i) __builtin_ia32_pbroadcastw256_gpr_mask (__A,
- (__v16hi) _mm256_setzero_si256 (),
- __M);
+ return (__m256i) __builtin_ia32_selectw_256(__M,
+ (__v16hi)_mm256_set1_epi16(__A),
+ (__v16hi) _mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask_set1_epi16 (__m128i __O, __mmask8 __M, short __A)
{
- return (__m128i) __builtin_ia32_pbroadcastw128_gpr_mask (__A,
- (__v8hi) __O,
- __M);
+ return (__m128i) __builtin_ia32_selectw_128(__M,
+ (__v8hi) _mm_set1_epi16(__A),
+ (__v8hi) __O);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maskz_set1_epi16 (__mmask8 __M, short __A)
{
- return (__m128i) __builtin_ia32_pbroadcastw128_gpr_mask (__A,
- (__v8hi) _mm_setzero_si128 (),
- __M);
+ return (__m128i) __builtin_ia32_selectw_128(__M,
+ (__v8hi) _mm_set1_epi16(__A),
+ (__v8hi) _mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
diff --git a/c_headers/avx512vldqintrin.h b/c_headers/avx512vldqintrin.h
index aecd7df34d..d80df9eaff 100644
--- a/c_headers/avx512vldqintrin.h
+++ b/c_headers/avx512vldqintrin.h
@@ -978,25 +978,25 @@ _mm256_movepi64_mask (__m256i __A)
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_broadcast_f32x2 (__m128 __A)
{
- return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A,
- (__v8sf)_mm256_undefined_ps(),
- (__mmask8) -1);
+ return (__m256)__builtin_shufflevector((__v4sf)__A,
+ (__v4sf)_mm_undefined_ps(),
+ 0, 1, 0, 1, 0, 1, 0, 1);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_mask_broadcast_f32x2 (__m256 __O, __mmask8 __M, __m128 __A)
{
- return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A,
- (__v8sf) __O,
- __M);
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__M,
+ (__v8sf)_mm256_broadcast_f32x2(__A),
+ (__v8sf)__O);
}
static __inline__ __m256 __DEFAULT_FN_ATTRS
_mm256_maskz_broadcast_f32x2 (__mmask8 __M, __m128 __A)
{
- return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A,
- (__v8sf) _mm256_setzero_ps (),
- __M);
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__M,
+ (__v8sf)_mm256_broadcast_f32x2(__A),
+ (__v8sf)_mm256_setzero_ps());
}
static __inline__ __m256d __DEFAULT_FN_ATTRS
@@ -1025,49 +1025,49 @@ _mm256_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_broadcast_i32x2 (__m128i __A)
{
- return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si) __A,
- (__v4si)_mm_undefined_si128(),
- (__mmask8) -1);
+ return (__m128i)__builtin_shufflevector((__v4si)__A,
+ (__v4si)_mm_undefined_si128(),
+ 0, 1, 0, 1);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask_broadcast_i32x2 (__m128i __O, __mmask8 __M, __m128i __A)
{
- return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si) __A,
- (__v4si) __O,
- __M);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+ (__v4si)_mm_broadcast_i32x2(__A),
+ (__v4si)__O);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A)
{
- return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si) __A,
- (__v4si) _mm_setzero_si128 (),
- __M);
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+ (__v4si)_mm_broadcast_i32x2(__A),
+ (__v4si)_mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_broadcast_i32x2 (__m128i __A)
{
- return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si) __A,
- (__v8si)_mm256_undefined_si256(),
- (__mmask8) -1);
+ return (__m256i)__builtin_shufflevector((__v4si)__A,
+ (__v4si)_mm_undefined_si128(),
+ 0, 1, 0, 1, 0, 1, 0, 1);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_broadcast_i32x2 (__m256i __O, __mmask8 __M, __m128i __A)
{
- return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si) __A,
- (__v8si) __O,
- __M);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_broadcast_i32x2(__A),
+ (__v8si)__O);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A)
{
- return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si) __A,
- (__v8si) _mm256_setzero_si256 (),
- __M);
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_broadcast_i32x2(__A),
+ (__v8si)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
diff --git a/c_headers/avx512vlintrin.h b/c_headers/avx512vlintrin.h
index 99bb050de4..7e17cff05f 100644
--- a/c_headers/avx512vlintrin.h
+++ b/c_headers/avx512vlintrin.h
@@ -5723,59 +5723,72 @@ _mm256_maskz_movedup_pd (__mmask8 __U, __m256d __A)
(__v4df)_mm256_setzero_pd());
}
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mask_set1_epi32(__m128i __O, __mmask8 __M, int __A)
+{
+ return (__m128i)__builtin_ia32_selectd_128(__M,
+ (__v4si) _mm_set1_epi32(__A),
+ (__v4si)__O);
+}
-#define _mm_mask_set1_epi32(O, M, A) __extension__ ({ \
- (__m128i)__builtin_ia32_pbroadcastd128_gpr_mask((int)(A), \
- (__v4si)(__m128i)(O), \
- (__mmask8)(M)); })
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maskz_set1_epi32( __mmask8 __M, int __A)
+{
+ return (__m128i)__builtin_ia32_selectd_128(__M,
+ (__v4si) _mm_set1_epi32(__A),
+ (__v4si)_mm_setzero_si128());
+}
-#define _mm_maskz_set1_epi32(M, A) __extension__ ({ \
- (__m128i)__builtin_ia32_pbroadcastd128_gpr_mask((int)(A), \
- (__v4si)_mm_setzero_si128(), \
- (__mmask8)(M)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_mask_set1_epi32(__m256i __O, __mmask8 __M, int __A)
+{
+ return (__m256i)__builtin_ia32_selectd_256(__M,
+ (__v8si) _mm256_set1_epi32(__A),
+ (__v8si)__O);
+}
-#define _mm256_mask_set1_epi32(O, M, A) __extension__ ({ \
- (__m256i)__builtin_ia32_pbroadcastd256_gpr_mask((int)(A), \
- (__v8si)(__m256i)(O), \
- (__mmask8)(M)); })
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_maskz_set1_epi32( __mmask8 __M, int __A)
+{
+ return (__m256i)__builtin_ia32_selectd_256(__M,
+ (__v8si) _mm256_set1_epi32(__A),
+ (__v8si)_mm256_setzero_si256());
+}
-#define _mm256_maskz_set1_epi32(M, A) __extension__ ({ \
- (__m256i)__builtin_ia32_pbroadcastd256_gpr_mask((int)(A), \
- (__v8si)_mm256_setzero_si256(), \
- (__mmask8)(M)); })
#ifdef __x86_64__
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A)
{
- return (__m128i) __builtin_ia32_pbroadcastq128_gpr_mask (__A, (__v2di) __O,
- __M);
+ return (__m128i) __builtin_ia32_selectq_128(__M,
+ (__v2di) _mm_set1_epi64x(__A),
+ (__v2di) __O);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maskz_set1_epi64 (__mmask8 __M, long long __A)
{
- return (__m128i) __builtin_ia32_pbroadcastq128_gpr_mask (__A,
- (__v2di)
- _mm_setzero_si128 (),
- __M);
+ return (__m128i) __builtin_ia32_selectq_128(__M,
+ (__v2di) _mm_set1_epi64x(__A),
+ (__v2di) _mm_setzero_si128());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_mask_set1_epi64 (__m256i __O, __mmask8 __M, long long __A)
{
- return (__m256i) __builtin_ia32_pbroadcastq256_gpr_mask (__A, (__v4di) __O,
- __M);
+ return (__m256i) __builtin_ia32_selectq_256(__M,
+ (__v4di) _mm256_set1_epi64x(__A),
+ (__v4di) __O) ;
}
static __inline__ __m256i __DEFAULT_FN_ATTRS
_mm256_maskz_set1_epi64 (__mmask8 __M, long long __A)
{
- return (__m256i) __builtin_ia32_pbroadcastq256_gpr_mask (__A,
- (__v4di)
- _mm256_setzero_si256 (),
- __M);
+ return (__m256i) __builtin_ia32_selectq_256(__M,
+ (__v4di) _mm256_set1_epi64x(__A),
+ (__v4di) _mm256_setzero_si256());
}
+
#endif
#define _mm_fixupimm_pd(A, B, C, imm) __extension__ ({ \
diff --git a/c_headers/clflushoptintrin.h b/c_headers/clflushoptintrin.h
index 60e0ead762..f1f1330234 100644
--- a/c_headers/clflushoptintrin.h
+++ b/c_headers/clflushoptintrin.h
@@ -32,7 +32,7 @@
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("clflushopt")))
static __inline__ void __DEFAULT_FN_ATTRS
-_mm_clflushopt(char * __m) {
+_mm_clflushopt(void const * __m) {
__builtin_ia32_clflushopt(__m);
}
diff --git a/c_headers/clwbintrin.h b/c_headers/clwbintrin.h
new file mode 100644
index 0000000000..2594a6c387
--- /dev/null
+++ b/c_headers/clwbintrin.h
@@ -0,0 +1,52 @@
+/*===---- clwbintrin.h - CLWB intrinsic ------------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <clwbintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __CLWBINTRIN_H
+#define __CLWBINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("clwb")))
+
+/// \brief Writes back to memory the cache line (if modified) that contains the
+/// linear address specified in \a __p from any level of the cache hierarchy in
+/// the cache coherence domain
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> CLWB </c> instruction.
+///
+/// \param __p
+/// A pointer to the memory location used to identify the cache line to be
+/// written back.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_clwb(void const *__p) {
+ __builtin_ia32_clwb(__p);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
diff --git a/c_headers/cuda_wrappers/new b/c_headers/cuda_wrappers/new
index b77131af0e..71b7a52363 100644
--- a/c_headers/cuda_wrappers/new
+++ b/c_headers/cuda_wrappers/new
@@ -26,7 +26,6 @@
#include_next <new>
-// Device overrides for placement new and delete.
#pragma push_macro("CUDA_NOEXCEPT")
#if __cplusplus >= 201103L
#define CUDA_NOEXCEPT noexcept
@@ -34,6 +33,55 @@
#define CUDA_NOEXCEPT
#endif
+// Device overrides for non-placement new and delete.
+__device__ inline void *operator new(__SIZE_TYPE__ size) {
+ if (size == 0) {
+ size = 1;
+ }
+ return ::malloc(size);
+}
+__device__ inline void *operator new(__SIZE_TYPE__ size,
+ const std::nothrow_t &) CUDA_NOEXCEPT {
+ return ::operator new(size);
+}
+
+__device__ inline void *operator new[](__SIZE_TYPE__ size) {
+ return ::operator new(size);
+}
+__device__ inline void *operator new[](__SIZE_TYPE__ size,
+ const std::nothrow_t &) {
+ return ::operator new(size);
+}
+
+__device__ inline void operator delete(void* ptr) CUDA_NOEXCEPT {
+ if (ptr) {
+ ::free(ptr);
+ }
+}
+__device__ inline void operator delete(void *ptr,
+ const std::nothrow_t &) CUDA_NOEXCEPT {
+ ::operator delete(ptr);
+}
+
+__device__ inline void operator delete[](void* ptr) CUDA_NOEXCEPT {
+ ::operator delete(ptr);
+}
+__device__ inline void operator delete[](void *ptr,
+ const std::nothrow_t &) CUDA_NOEXCEPT {
+ ::operator delete(ptr);
+}
+
+// Sized delete, C++14 only.
+#if __cplusplus >= 201402L
+__device__ void operator delete(void *ptr, __SIZE_TYPE__ size) CUDA_NOEXCEPT {
+ ::operator delete(ptr);
+}
+__device__ void operator delete[](void *ptr, __SIZE_TYPE__ size) CUDA_NOEXCEPT {
+ ::operator delete(ptr);
+}
+#endif
+
+// Device overrides for placement new and delete.
__device__ inline void *operator new(__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT {
return __ptr;
}
@@ -42,6 +90,7 @@ __device__ inline void *operator new[](__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT
}
__device__ inline void operator delete(void *, void *) CUDA_NOEXCEPT {}
__device__ inline void operator delete[](void *, void *) CUDA_NOEXCEPT {}
+
#pragma pop_macro("CUDA_NOEXCEPT")
#endif // include guard
diff --git a/c_headers/emmintrin.h b/c_headers/emmintrin.h
index 709815cbb4..3372508a7f 100644
--- a/c_headers/emmintrin.h
+++ b/c_headers/emmintrin.h
@@ -2258,7 +2258,11 @@ _mm_adds_epu16(__m128i __a, __m128i __b)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_avg_epu8(__m128i __a, __m128i __b)
{
- return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
+ typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
+ return (__m128i)__builtin_convertvector(
+ ((__builtin_convertvector((__v16qu)__a, __v16hu) +
+ __builtin_convertvector((__v16qu)__b, __v16hu)) + 1)
+ >> 1, __v16qu);
}
/// \brief Computes the rounded avarages of corresponding elements of two
@@ -2278,7 +2282,11 @@ _mm_avg_epu8(__m128i __a, __m128i __b)
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_avg_epu16(__m128i __a, __m128i __b)
{
- return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
+ typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
+ return (__m128i)__builtin_convertvector(
+ ((__builtin_convertvector((__v8hu)__a, __v8su) +
+ __builtin_convertvector((__v8hu)__b, __v8su)) + 1)
+ >> 1, __v8hu);
}
/// \brief Multiplies the corresponding elements of two 128-bit signed [8 x i16]
diff --git a/c_headers/float.h b/c_headers/float.h
index 502143d4e4..44d4d05494 100644
--- a/c_headers/float.h
+++ b/c_headers/float.h
@@ -143,4 +143,18 @@
# define LDBL_DECIMAL_DIG __LDBL_DECIMAL_DIG__
#endif
+#ifdef __STDC_WANT_IEC_60559_TYPES_EXT__
+# define FLT16_MANT_DIG __FLT16_MANT_DIG__
+# define FLT16_DECIMAL_DIG __FLT16_DECIMAL_DIG__
+# define FLT16_DIG __FLT16_DIG__
+# define FLT16_MIN_EXP __FLT16_MIN_EXP__
+# define FLT16_MIN_10_EXP __FLT16_MIN_10_EXP__
+# define FLT16_MAX_EXP __FLT16_MAX_EXP__
+# define FLT16_MAX_10_EXP __FLT16_MAX_10_EXP__
+# define FLT16_MAX __FLT16_MAX__
+# define FLT16_EPSILON __FLT16_EPSILON__
+# define FLT16_MIN __FLT16_MIN__
+# define FLT16_TRUE_MIN __FLT16_TRUE_MIN__
+#endif /* __STDC_WANT_IEC_60559_TYPES_EXT__ */
+
#endif /* __FLOAT_H */
diff --git a/c_headers/immintrin.h b/c_headers/immintrin.h
index c5f25bfcb5..d86e0efb82 100644
--- a/c_headers/immintrin.h
+++ b/c_headers/immintrin.h
@@ -58,6 +58,10 @@
#include <clflushoptintrin.h>
#endif
+#if !defined(_MSC_VER) || __has_feature(modules) || defined(__CLWB__)
+#include <clwbintrin.h>
+#endif
+
#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX__)
#include <avxintrin.h>
#endif
diff --git a/c_headers/intrin.h b/c_headers/intrin.h
index 881d05c0d1..b30aa215a4 100644
--- a/c_headers/intrin.h
+++ b/c_headers/intrin.h
@@ -38,6 +38,10 @@
#include <armintr.h>
#endif
+#if defined(_M_ARM64)
+#include <arm64intr.h>
+#endif
+
/* For the definition of jmp_buf. */
#if __STDC_HOSTED__
#include <setjmp.h>
@@ -828,7 +832,7 @@ _InterlockedCompareExchange_nf(long volatile *_Destination,
__ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
return _Comparand;
}
-static __inline__ short __DEFAULT_FN_ATTRS
+static __inline__ long __DEFAULT_FN_ATTRS
_InterlockedCompareExchange_rel(long volatile *_Destination,
long _Exchange, long _Comparand) {
__atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0,
diff --git a/c_headers/opencl-c.h b/c_headers/opencl-c.h
index 58c8daf3a5..35fb0a82bc 100644
--- a/c_headers/opencl-c.h
+++ b/c_headers/opencl-c.h
@@ -11381,6 +11381,8 @@ half16 __ovld __cnfn bitselect(half16 a, half16 b, half16 c);
* For each component of a vector type,
* result[i] = if MSB of c[i] is set ? b[i] : a[i].
* For a scalar type, result = c ? b : a.
+ * b and a must have the same type.
+ * c must have the same number of elements and bits as a.
*/
char __ovld __cnfn select(char a, char b, char c);
uchar __ovld __cnfn select(uchar a, uchar b, char c);
@@ -11394,60 +11396,7 @@ char8 __ovld __cnfn select(char8 a, char8 b, char8 c);
uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, char8 c);
char16 __ovld __cnfn select(char16 a, char16 b, char16 c);
uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, char16 c);
-short __ovld __cnfn select(short a, short b, char c);
-ushort __ovld __cnfn select(ushort a, ushort b, char c);
-short2 __ovld __cnfn select(short2 a, short2 b, char2 c);
-ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, char2 c);
-short3 __ovld __cnfn select(short3 a, short3 b, char3 c);
-ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, char3 c);
-short4 __ovld __cnfn select(short4 a, short4 b, char4 c);
-ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, char4 c);
-short8 __ovld __cnfn select(short8 a, short8 b, char8 c);
-ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, char8 c);
-short16 __ovld __cnfn select(short16 a, short16 b, char16 c);
-ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, char16 c);
-int __ovld __cnfn select(int a, int b, char c);
-uint __ovld __cnfn select(uint a, uint b, char c);
-int2 __ovld __cnfn select(int2 a, int2 b, char2 c);
-uint2 __ovld __cnfn select(uint2 a, uint2 b, char2 c);
-int3 __ovld __cnfn select(int3 a, int3 b, char3 c);
-uint3 __ovld __cnfn select(uint3 a, uint3 b, char3 c);
-int4 __ovld __cnfn select(int4 a, int4 b, char4 c);
-uint4 __ovld __cnfn select(uint4 a, uint4 b, char4 c);
-int8 __ovld __cnfn select(int8 a, int8 b, char8 c);
-uint8 __ovld __cnfn select(uint8 a, uint8 b, char8 c);
-int16 __ovld __cnfn select(int16 a, int16 b, char16 c);
-uint16 __ovld __cnfn select(uint16 a, uint16 b, char16 c);
-long __ovld __cnfn select(long a, long b, char c);
-ulong __ovld __cnfn select(ulong a, ulong b, char c);
-long2 __ovld __cnfn select(long2 a, long2 b, char2 c);
-ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, char2 c);
-long3 __ovld __cnfn select(long3 a, long3 b, char3 c);
-ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, char3 c);
-long4 __ovld __cnfn select(long4 a, long4 b, char4 c);
-ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, char4 c);
-long8 __ovld __cnfn select(long8 a, long8 b, char8 c);
-ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, char8 c);
-long16 __ovld __cnfn select(long16 a, long16 b, char16 c);
-ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, char16 c);
-float __ovld __cnfn select(float a, float b, char c);
-float2 __ovld __cnfn select(float2 a, float2 b, char2 c);
-float3 __ovld __cnfn select(float3 a, float3 b, char3 c);
-float4 __ovld __cnfn select(float4 a, float4 b, char4 c);
-float8 __ovld __cnfn select(float8 a, float8 b, char8 c);
-float16 __ovld __cnfn select(float16 a, float16 b, char16 c);
-char __ovld __cnfn select(char a, char b, short c);
-uchar __ovld __cnfn select(uchar a, uchar b, short c);
-char2 __ovld __cnfn select(char2 a, char2 b, short2 c);
-uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, short2 c);
-char3 __ovld __cnfn select(char3 a, char3 b, short3 c);
-uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, short3 c);
-char4 __ovld __cnfn select(char4 a, char4 b, short4 c);
-uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, short4 c);
-char8 __ovld __cnfn select(char8 a, char8 b, short8 c);
-uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, short8 c);
-char16 __ovld __cnfn select(char16 a, char16 b, short16 c);
-uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, short16 c);
+
short __ovld __cnfn select(short a, short b, short c);
ushort __ovld __cnfn select(ushort a, ushort b, short c);
short2 __ovld __cnfn select(short2 a, short2 b, short2 c);
@@ -11460,60 +11409,7 @@ short8 __ovld __cnfn select(short8 a, short8 b, short8 c);
ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, short8 c);
short16 __ovld __cnfn select(short16 a, short16 b, short16 c);
ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, short16 c);
-int __ovld __cnfn select(int a, int b, short c);
-uint __ovld __cnfn select(uint a, uint b, short c);
-int2 __ovld __cnfn select(int2 a, int2 b, short2 c);
-uint2 __ovld __cnfn select(uint2 a, uint2 b, short2 c);
-int3 __ovld __cnfn select(int3 a, int3 b, short3 c);
-uint3 __ovld __cnfn select(uint3 a, uint3 b, short3 c);
-int4 __ovld __cnfn select(int4 a, int4 b, short4 c);
-uint4 __ovld __cnfn select(uint4 a, uint4 b, short4 c);
-int8 __ovld __cnfn select(int8 a, int8 b, short8 c);
-uint8 __ovld __cnfn select(uint8 a, uint8 b, short8 c);
-int16 __ovld __cnfn select(int16 a, int16 b, short16 c);
-uint16 __ovld __cnfn select(uint16 a, uint16 b, short16 c);
-long __ovld __cnfn select(long a, long b, short c);
-ulong __ovld __cnfn select(ulong a, ulong b, short c);
-long2 __ovld __cnfn select(long2 a, long2 b, short2 c);
-ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, short2 c);
-long3 __ovld __cnfn select(long3 a, long3 b, short3 c);
-ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, short3 c);
-long4 __ovld __cnfn select(long4 a, long4 b, short4 c);
-ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, short4 c);
-long8 __ovld __cnfn select(long8 a, long8 b, short8 c);
-ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, short8 c);
-long16 __ovld __cnfn select(long16 a, long16 b, short16 c);
-ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, short16 c);
-float __ovld __cnfn select(float a, float b, short c);
-float2 __ovld __cnfn select(float2 a, float2 b, short2 c);
-float3 __ovld __cnfn select(float3 a, float3 b, short3 c);
-float4 __ovld __cnfn select(float4 a, float4 b, short4 c);
-float8 __ovld __cnfn select(float8 a, float8 b, short8 c);
-float16 __ovld __cnfn select(float16 a, float16 b, short16 c);
-char __ovld __cnfn select(char a, char b, int c);
-uchar __ovld __cnfn select(uchar a, uchar b, int c);
-char2 __ovld __cnfn select(char2 a, char2 b, int2 c);
-uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, int2 c);
-char3 __ovld __cnfn select(char3 a, char3 b, int3 c);
-uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, int3 c);
-char4 __ovld __cnfn select(char4 a, char4 b, int4 c);
-uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, int4 c);
-char8 __ovld __cnfn select(char8 a, char8 b, int8 c);
-uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, int8 c);
-char16 __ovld __cnfn select(char16 a, char16 b, int16 c);
-uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, int16 c);
-short __ovld __cnfn select(short a, short b, int c);
-ushort __ovld __cnfn select(ushort a, ushort b, int c);
-short2 __ovld __cnfn select(short2 a, short2 b, int2 c);
-ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, int2 c);
-short3 __ovld __cnfn select(short3 a, short3 b, int3 c);
-ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, int3 c);
-short4 __ovld __cnfn select(short4 a, short4 b, int4 c);
-ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, int4 c);
-short8 __ovld __cnfn select(short8 a, short8 b, int8 c);
-ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, int8 c);
-short16 __ovld __cnfn select(short16 a, short16 b, int16 c);
-ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, int16 c);
+
int __ovld __cnfn select(int a, int b, int c);
uint __ovld __cnfn select(uint a, uint b, int c);
int2 __ovld __cnfn select(int2 a, int2 b, int2 c);
@@ -11526,60 +11422,13 @@ int8 __ovld __cnfn select(int8 a, int8 b, int8 c);
uint8 __ovld __cnfn select(uint8 a, uint8 b, int8 c);
int16 __ovld __cnfn select(int16 a, int16 b, int16 c);
uint16 __ovld __cnfn select(uint16 a, uint16 b, int16 c);
-long __ovld __cnfn select(long a, long b, int c);
-ulong __ovld __cnfn select(ulong a, ulong b, int c);
-long2 __ovld __cnfn select(long2 a, long2 b, int2 c);
-ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, int2 c);
-long3 __ovld __cnfn select(long3 a, long3 b, int3 c);
-ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, int3 c);
-long4 __ovld __cnfn select(long4 a, long4 b, int4 c);
-ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, int4 c);
-long8 __ovld __cnfn select(long8 a, long8 b, int8 c);
-ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, int8 c);
-long16 __ovld __cnfn select(long16 a, long16 b, int16 c);
-ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, int16 c);
float __ovld __cnfn select(float a, float b, int c);
float2 __ovld __cnfn select(float2 a, float2 b, int2 c);
float3 __ovld __cnfn select(float3 a, float3 b, int3 c);
float4 __ovld __cnfn select(float4 a, float4 b, int4 c);
float8 __ovld __cnfn select(float8 a, float8 b, int8 c);
float16 __ovld __cnfn select(float16 a, float16 b, int16 c);
-char __ovld __cnfn select(char a, char b, long c);
-uchar __ovld __cnfn select(uchar a, uchar b, long c);
-char2 __ovld __cnfn select(char2 a, char2 b, long2 c);
-uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, long2 c);
-char3 __ovld __cnfn select(char3 a, char3 b, long3 c);
-uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, long3 c);
-char4 __ovld __cnfn select(char4 a, char4 b, long4 c);
-uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, long4 c);
-char8 __ovld __cnfn select(char8 a, char8 b, long8 c);
-uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, long8 c);
-char16 __ovld __cnfn select(char16 a, char16 b, long16 c);
-uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, long16 c);
-short __ovld __cnfn select(short a, short b, long c);
-ushort __ovld __cnfn select(ushort a, ushort b, long c);
-short2 __ovld __cnfn select(short2 a, short2 b, long2 c);
-ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, long2 c);
-short3 __ovld __cnfn select(short3 a, short3 b, long3 c);
-ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, long3 c);
-short4 __ovld __cnfn select(short4 a, short4 b, long4 c);
-ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, long4 c);
-short8 __ovld __cnfn select(short8 a, short8 b, long8 c);
-ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, long8 c);
-short16 __ovld __cnfn select(short16 a, short16 b, long16 c);
-ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, long16 c);
-int __ovld __cnfn select(int a, int b, long c);
-uint __ovld __cnfn select(uint a, uint b, long c);
-int2 __ovld __cnfn select(int2 a, int2 b, long2 c);
-uint2 __ovld __cnfn select(uint2 a, uint2 b, long2 c);
-int3 __ovld __cnfn select(int3 a, int3 b, long3 c);
-uint3 __ovld __cnfn select(uint3 a, uint3 b, long3 c);
-int4 __ovld __cnfn select(int4 a, int4 b, long4 c);
-uint4 __ovld __cnfn select(uint4 a, uint4 b, long4 c);
-int8 __ovld __cnfn select(int8 a, int8 b, long8 c);
-uint8 __ovld __cnfn select(uint8 a, uint8 b, long8 c);
-int16 __ovld __cnfn select(int16 a, int16 b, long16 c);
-uint16 __ovld __cnfn select(uint16 a, uint16 b, long16 c);
+
long __ovld __cnfn select(long a, long b, long c);
ulong __ovld __cnfn select(ulong a, ulong b, long c);
long2 __ovld __cnfn select(long2 a, long2 b, long2 c);
@@ -11592,12 +11441,7 @@ long8 __ovld __cnfn select(long8 a, long8 b, long8 c);
ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, long8 c);
long16 __ovld __cnfn select(long16 a, long16 b, long16 c);
ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, long16 c);
-float __ovld __cnfn select(float a, float b, long c);
-float2 __ovld __cnfn select(float2 a, float2 b, long2 c);
-float3 __ovld __cnfn select(float3 a, float3 b, long3 c);
-float4 __ovld __cnfn select(float4 a, float4 b, long4 c);
-float8 __ovld __cnfn select(float8 a, float8 b, long8 c);
-float16 __ovld __cnfn select(float16 a, float16 b, long16 c);
+
char __ovld __cnfn select(char a, char b, uchar c);
uchar __ovld __cnfn select(uchar a, uchar b, uchar c);
char2 __ovld __cnfn select(char2 a, char2 b, uchar2 c);
@@ -11610,60 +11454,7 @@ char8 __ovld __cnfn select(char8 a, char8 b, uchar8 c);
uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, uchar8 c);
char16 __ovld __cnfn select(char16 a, char16 b, uchar16 c);
uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, uchar16 c);
-short __ovld __cnfn select(short a, short b, uchar c);
-ushort __ovld __cnfn select(ushort a, ushort b, uchar c);
-short2 __ovld __cnfn select(short2 a, short2 b, uchar2 c);
-ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, uchar2 c);
-short3 __ovld __cnfn select(short3 a, short3 b, uchar3 c);
-ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, uchar3 c);
-short4 __ovld __cnfn select(short4 a, short4 b, uchar4 c);
-ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, uchar4 c);
-short8 __ovld __cnfn select(short8 a, short8 b, uchar8 c);
-ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, uchar8 c);
-short16 __ovld __cnfn select(short16 a, short16 b, uchar16 c);
-ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, uchar16 c);
-int __ovld __cnfn select(int a, int b, uchar c);
-uint __ovld __cnfn select(uint a, uint b, uchar c);
-int2 __ovld __cnfn select(int2 a, int2 b, uchar2 c);
-uint2 __ovld __cnfn select(uint2 a, uint2 b, uchar2 c);
-int3 __ovld __cnfn select(int3 a, int3 b, uchar3 c);
-uint3 __ovld __cnfn select(uint3 a, uint3 b, uchar3 c);
-int4 __ovld __cnfn select(int4 a, int4 b, uchar4 c);
-uint4 __ovld __cnfn select(uint4 a, uint4 b, uchar4 c);
-int8 __ovld __cnfn select(int8 a, int8 b, uchar8 c);
-uint8 __ovld __cnfn select(uint8 a, uint8 b, uchar8 c);
-int16 __ovld __cnfn select(int16 a, int16 b, uchar16 c);
-uint16 __ovld __cnfn select(uint16 a, uint16 b, uchar16 c);
-long __ovld __cnfn select(long a, long b, uchar c);
-ulong __ovld __cnfn select(ulong a, ulong b, uchar c);
-long2 __ovld __cnfn select(long2 a, long2 b, uchar2 c);
-ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, uchar2 c);
-long3 __ovld __cnfn select(long3 a, long3 b, uchar3 c);
-ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, uchar3 c);
-long4 __ovld __cnfn select(long4 a, long4 b, uchar4 c);
-ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, uchar4 c);
-long8 __ovld __cnfn select(long8 a, long8 b, uchar8 c);
-ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, uchar8 c);
-long16 __ovld __cnfn select(long16 a, long16 b, uchar16 c);
-ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, uchar16 c);
-float __ovld __cnfn select(float a, float b, uchar c);
-float2 __ovld __cnfn select(float2 a, float2 b, uchar2 c);
-float3 __ovld __cnfn select(float3 a, float3 b, uchar3 c);
-float4 __ovld __cnfn select(float4 a, float4 b, uchar4 c);
-float8 __ovld __cnfn select(float8 a, float8 b, uchar8 c);
-float16 __ovld __cnfn select(float16 a, float16 b, uchar16 c);
-char __ovld __cnfn select(char a, char b, ushort c);
-uchar __ovld __cnfn select(uchar a, uchar b, ushort c);
-char2 __ovld __cnfn select(char2 a, char2 b, ushort2 c);
-uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, ushort2 c);
-char3 __ovld __cnfn select(char3 a, char3 b, ushort3 c);
-uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, ushort3 c);
-char4 __ovld __cnfn select(char4 a, char4 b, ushort4 c);
-uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, ushort4 c);
-char8 __ovld __cnfn select(char8 a, char8 b, ushort8 c);
-uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, ushort8 c);
-char16 __ovld __cnfn select(char16 a, char16 b, ushort16 c);
-uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, ushort16 c);
+
short __ovld __cnfn select(short a, short b, ushort c);
ushort __ovld __cnfn select(ushort a, ushort b, ushort c);
short2 __ovld __cnfn select(short2 a, short2 b, ushort2 c);
@@ -11676,60 +11467,7 @@ short8 __ovld __cnfn select(short8 a, short8 b, ushort8 c);
ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, ushort8 c);
short16 __ovld __cnfn select(short16 a, short16 b, ushort16 c);
ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, ushort16 c);
-int __ovld __cnfn select(int a, int b, ushort c);
-uint __ovld __cnfn select(uint a, uint b, ushort c);
-int2 __ovld __cnfn select(int2 a, int2 b, ushort2 c);
-uint2 __ovld __cnfn select(uint2 a, uint2 b, ushort2 c);
-int3 __ovld __cnfn select(int3 a, int3 b, ushort3 c);
-uint3 __ovld __cnfn select(uint3 a, uint3 b, ushort3 c);
-int4 __ovld __cnfn select(int4 a, int4 b, ushort4 c);
-uint4 __ovld __cnfn select(uint4 a, uint4 b, ushort4 c);
-int8 __ovld __cnfn select(int8 a, int8 b, ushort8 c);
-uint8 __ovld __cnfn select(uint8 a, uint8 b, ushort8 c);
-int16 __ovld __cnfn select(int16 a, int16 b, ushort16 c);
-uint16 __ovld __cnfn select(uint16 a, uint16 b, ushort16 c);
-long __ovld __cnfn select(long a, long b, ushort c);
-ulong __ovld __cnfn select(ulong a, ulong b, ushort c);
-long2 __ovld __cnfn select(long2 a, long2 b, ushort2 c);
-ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, ushort2 c);
-long3 __ovld __cnfn select(long3 a, long3 b, ushort3 c);
-ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, ushort3 c);
-long4 __ovld __cnfn select(long4 a, long4 b, ushort4 c);
-ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, ushort4 c);
-long8 __ovld __cnfn select(long8 a, long8 b, ushort8 c);
-ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, ushort8 c);
-long16 __ovld __cnfn select(long16 a, long16 b, ushort16 c);
-ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, ushort16 c);
-float __ovld __cnfn select(float a, float b, ushort c);
-float2 __ovld __cnfn select(float2 a, float2 b, ushort2 c);
-float3 __ovld __cnfn select(float3 a, float3 b, ushort3 c);
-float4 __ovld __cnfn select(float4 a, float4 b, ushort4 c);
-float8 __ovld __cnfn select(float8 a, float8 b, ushort8 c);
-float16 __ovld __cnfn select(float16 a, float16 b, ushort16 c);
-char __ovld __cnfn select(char a, char b, uint c);
-uchar __ovld __cnfn select(uchar a, uchar b, uint c);
-char2 __ovld __cnfn select(char2 a, char2 b, uint2 c);
-uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, uint2 c);
-char3 __ovld __cnfn select(char3 a, char3 b, uint3 c);
-uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, uint3 c);
-char4 __ovld __cnfn select(char4 a, char4 b, uint4 c);
-uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, uint4 c);
-char8 __ovld __cnfn select(char8 a, char8 b, uint8 c);
-uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, uint8 c);
-char16 __ovld __cnfn select(char16 a, char16 b, uint16 c);
-uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, uint16 c);
-short __ovld __cnfn select(short a, short b, uint c);
-ushort __ovld __cnfn select(ushort a, ushort b, uint c);
-short2 __ovld __cnfn select(short2 a, short2 b, uint2 c);
-ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, uint2 c);
-short3 __ovld __cnfn select(short3 a, short3 b, uint3 c);
-ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, uint3 c);
-short4 __ovld __cnfn select(short4 a, short4 b, uint4 c);
-ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, uint4 c);
-short8 __ovld __cnfn select(short8 a, short8 b, uint8 c);
-ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, uint8 c);
-short16 __ovld __cnfn select(short16 a, short16 b, uint16 c);
-ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, uint16 c);
+
int __ovld __cnfn select(int a, int b, uint c);
uint __ovld __cnfn select(uint a, uint b, uint c);
int2 __ovld __cnfn select(int2 a, int2 b, uint2 c);
@@ -11742,60 +11480,13 @@ int8 __ovld __cnfn select(int8 a, int8 b, uint8 c);
uint8 __ovld __cnfn select(uint8 a, uint8 b, uint8 c);
int16 __ovld __cnfn select(int16 a, int16 b, uint16 c);
uint16 __ovld __cnfn select(uint16 a, uint16 b, uint16 c);
-long __ovld __cnfn select(long a, long b, uint c);
-ulong __ovld __cnfn select(ulong a, ulong b, uint c);
-long2 __ovld __cnfn select(long2 a, long2 b, uint2 c);
-ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, uint2 c);
-long3 __ovld __cnfn select(long3 a, long3 b, uint3 c);
-ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, uint3 c);
-long4 __ovld __cnfn select(long4 a, long4 b, uint4 c);
-ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, uint4 c);
-long8 __ovld __cnfn select(long8 a, long8 b, uint8 c);
-ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, uint8 c);
-long16 __ovld __cnfn select(long16 a, long16 b, uint16 c);
-ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, uint16 c);
float __ovld __cnfn select(float a, float b, uint c);
float2 __ovld __cnfn select(float2 a, float2 b, uint2 c);
float3 __ovld __cnfn select(float3 a, float3 b, uint3 c);
float4 __ovld __cnfn select(float4 a, float4 b, uint4 c);
float8 __ovld __cnfn select(float8 a, float8 b, uint8 c);
float16 __ovld __cnfn select(float16 a, float16 b, uint16 c);
-char __ovld __cnfn select(char a, char b, ulong c);
-uchar __ovld __cnfn select(uchar a, uchar b, ulong c);
-char2 __ovld __cnfn select(char2 a, char2 b, ulong2 c);
-uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, ulong2 c);
-char3 __ovld __cnfn select(char3 a, char3 b, ulong3 c);
-uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, ulong3 c);
-char4 __ovld __cnfn select(char4 a, char4 b, ulong4 c);
-uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, ulong4 c);
-char8 __ovld __cnfn select(char8 a, char8 b, ulong8 c);
-uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, ulong8 c);
-char16 __ovld __cnfn select(char16 a, char16 b, ulong16 c);
-uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, ulong16 c);
-short __ovld __cnfn select(short a, short b, ulong c);
-ushort __ovld __cnfn select(ushort a, ushort b, ulong c);
-short2 __ovld __cnfn select(short2 a, short2 b, ulong2 c);
-ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, ulong2 c);
-short3 __ovld __cnfn select(short3 a, short3 b, ulong3 c);
-ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, ulong3 c);
-short4 __ovld __cnfn select(short4 a, short4 b, ulong4 c);
-ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, ulong4 c);
-short8 __ovld __cnfn select(short8 a, short8 b, ulong8 c);
-ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, ulong8 c);
-short16 __ovld __cnfn select(short16 a, short16 b, ulong16 c);
-ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, ulong16 c);
-int __ovld __cnfn select(int a, int b, ulong c);
-uint __ovld __cnfn select(uint a, uint b, ulong c);
-int2 __ovld __cnfn select(int2 a, int2 b, ulong2 c);
-uint2 __ovld __cnfn select(uint2 a, uint2 b, ulong2 c);
-int3 __ovld __cnfn select(int3 a, int3 b, ulong3 c);
-uint3 __ovld __cnfn select(uint3 a, uint3 b, ulong3 c);
-int4 __ovld __cnfn select(int4 a, int4 b, ulong4 c);
-uint4 __ovld __cnfn select(uint4 a, uint4 b, ulong4 c);
-int8 __ovld __cnfn select(int8 a, int8 b, ulong8 c);
-uint8 __ovld __cnfn select(uint8 a, uint8 b, ulong8 c);
-int16 __ovld __cnfn select(int16 a, int16 b, ulong16 c);
-uint16 __ovld __cnfn select(uint16 a, uint16 b, ulong16 c);
+
long __ovld __cnfn select(long a, long b, ulong c);
ulong __ovld __cnfn select(ulong a, ulong b, ulong c);
long2 __ovld __cnfn select(long2 a, long2 b, ulong2 c);
@@ -11808,12 +11499,7 @@ long8 __ovld __cnfn select(long8 a, long8 b, ulong8 c);
ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, ulong8 c);
long16 __ovld __cnfn select(long16 a, long16 b, ulong16 c);
ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, ulong16 c);
-float __ovld __cnfn select(float a, float b, ulong c);
-float2 __ovld __cnfn select(float2 a, float2 b, ulong2 c);
-float3 __ovld __cnfn select(float3 a, float3 b, ulong3 c);
-float4 __ovld __cnfn select(float4 a, float4 b, ulong4 c);
-float8 __ovld __cnfn select(float8 a, float8 b, ulong8 c);
-float16 __ovld __cnfn select(float16 a, float16 b, ulong16 c);
+
#ifdef cl_khr_fp64
double __ovld __cnfn select(double a, double b, long c);
double2 __ovld __cnfn select(double2 a, double2 b, long2 c);
@@ -13141,13 +12827,14 @@ void __ovld __conv barrier(cl_mem_fence_flags flags);
#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
-typedef enum memory_scope
-{
- memory_scope_work_item,
- memory_scope_work_group,
- memory_scope_device,
- memory_scope_all_svm_devices,
- memory_scope_sub_group
+typedef enum memory_scope {
+ memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
+ memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
+ memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
+ memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
+#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups)
+ memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
+#endif
} memory_scope;
void __ovld __conv work_group_barrier(cl_mem_fence_flags flags, memory_scope scope);
@@ -13952,11 +13639,11 @@ unsigned long __ovld atom_xor(volatile __local unsigned long *p, unsigned long v
// enum values aligned with what clang uses in EmitAtomicExpr()
typedef enum memory_order
{
- memory_order_relaxed,
- memory_order_acquire,
- memory_order_release,
- memory_order_acq_rel,
- memory_order_seq_cst
+ memory_order_relaxed = __ATOMIC_RELAXED,
+ memory_order_acquire = __ATOMIC_ACQUIRE,
+ memory_order_release = __ATOMIC_RELEASE,
+ memory_order_acq_rel = __ATOMIC_ACQ_REL,
+ memory_order_seq_cst = __ATOMIC_SEQ_CST
} memory_order;
// double atomics support requires extensions cl_khr_int64_base_atomics and cl_khr_int64_extended_atomics
diff --git a/c_headers/unwind.h b/c_headers/unwind.h
index 4f74a34787..345fa4d0c1 100644
--- a/c_headers/unwind.h
+++ b/c_headers/unwind.h
@@ -76,7 +76,13 @@ typedef intptr_t _sleb128_t;
typedef uintptr_t _uleb128_t;
struct _Unwind_Context;
+#if defined(__arm__) && !(defined(__USING_SJLJ_EXCEPTIONS__) || defined(__ARM_DWARF_EH__))
+struct _Unwind_Control_Block;
+typedef struct _Unwind_Control_Block _Unwind_Exception; /* Alias */
+#else
struct _Unwind_Exception;
+typedef struct _Unwind_Exception _Unwind_Exception;
+#endif
typedef enum {
_URC_NO_REASON = 0,
#if defined(__arm__) && !defined(__USING_SJLJ_EXCEPTIONS__) && \
@@ -109,8 +115,42 @@ typedef enum {
} _Unwind_Action;
typedef void (*_Unwind_Exception_Cleanup_Fn)(_Unwind_Reason_Code,
- struct _Unwind_Exception *);
-
+ _Unwind_Exception *);
+
+#if defined(__arm__) && !(defined(__USING_SJLJ_EXCEPTIONS__) || defined(__ARM_DWARF_EH__))
+typedef struct _Unwind_Control_Block _Unwind_Control_Block;
+typedef uint32_t _Unwind_EHT_Header;
+
+struct _Unwind_Control_Block {
+ uint64_t exception_class;
+ void (*exception_cleanup)(_Unwind_Reason_Code, _Unwind_Control_Block *);
+ /* unwinder cache (private fields for the unwinder's use) */
+ struct {
+ uint32_t reserved1; /* forced unwind stop function, 0 if not forced */
+ uint32_t reserved2; /* personality routine */
+ uint32_t reserved3; /* callsite */
+ uint32_t reserved4; /* forced unwind stop argument */
+ uint32_t reserved5;
+ } unwinder_cache;
+ /* propagation barrier cache (valid after phase 1) */
+ struct {
+ uint32_t sp;
+ uint32_t bitpattern[5];
+ } barrier_cache;
+ /* cleanup cache (preserved over cleanup) */
+ struct {
+ uint32_t bitpattern[4];
+ } cleanup_cache;
+ /* personality cache (for personality's benefit) */
+ struct {
+ uint32_t fnstart; /* function start address */
+ _Unwind_EHT_Header *ehtp; /* pointer to EHT entry header word */
+ uint32_t additional; /* additional data */
+ uint32_t reserved1;
+ } pr_cache;
+ long long int : 0; /* force alignment of next item to 8-byte boundary */
+} __attribute__((__aligned__(8)));
+#else
struct _Unwind_Exception {
_Unwind_Exception_Class exception_class;
_Unwind_Exception_Cleanup_Fn exception_cleanup;
@@ -120,23 +160,24 @@ struct _Unwind_Exception {
* aligned". GCC has interpreted this to mean "use the maximum useful
* alignment for the target"; so do we. */
} __attribute__((__aligned__));
+#endif
typedef _Unwind_Reason_Code (*_Unwind_Stop_Fn)(int, _Unwind_Action,
_Unwind_Exception_Class,
- struct _Unwind_Exception *,
+ _Unwind_Exception *,
struct _Unwind_Context *,
void *);
-typedef _Unwind_Reason_Code (*_Unwind_Personality_Fn)(
- int, _Unwind_Action, _Unwind_Exception_Class, struct _Unwind_Exception *,
- struct _Unwind_Context *);
+typedef _Unwind_Reason_Code (*_Unwind_Personality_Fn)(int, _Unwind_Action,
+ _Unwind_Exception_Class,
+ _Unwind_Exception *,
+ struct _Unwind_Context *);
typedef _Unwind_Personality_Fn __personality_routine;
typedef _Unwind_Reason_Code (*_Unwind_Trace_Fn)(struct _Unwind_Context *,
void *);
-#if defined(__arm__) && !defined(__APPLE__)
-
+#if defined(__arm__) && !(defined(__USING_SJLJ_EXCEPTIONS__) || defined(__ARM_DWARF_EH__))
typedef enum {
_UVRSC_CORE = 0, /* integer register */
_UVRSC_VFP = 1, /* vfp */
@@ -158,14 +199,12 @@ typedef enum {
_UVRSR_FAILED = 2
} _Unwind_VRS_Result;
-#if !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__ARM_DWARF_EH__)
typedef uint32_t _Unwind_State;
#define _US_VIRTUAL_UNWIND_FRAME ((_Unwind_State)0)
#define _US_UNWIND_FRAME_STARTING ((_Unwind_State)1)
#define _US_UNWIND_FRAME_RESUME ((_Unwind_State)2)
#define _US_ACTION_MASK ((_Unwind_State)3)
#define _US_FORCE_UNWIND ((_Unwind_State)8)
-#endif
_Unwind_VRS_Result _Unwind_VRS_Get(struct _Unwind_Context *__context,
_Unwind_VRS_RegClass __regclass,
@@ -224,13 +263,12 @@ _Unwind_Ptr _Unwind_GetRegionStart(struct _Unwind_Context *);
/* DWARF EH functions; currently not available on Darwin/ARM */
#if !defined(__APPLE__) || !defined(__arm__)
-
-_Unwind_Reason_Code _Unwind_RaiseException(struct _Unwind_Exception *);
-_Unwind_Reason_Code _Unwind_ForcedUnwind(struct _Unwind_Exception *,
- _Unwind_Stop_Fn, void *);
-void _Unwind_DeleteException(struct _Unwind_Exception *);
-void _Unwind_Resume(struct _Unwind_Exception *);
-_Unwind_Reason_Code _Unwind_Resume_or_Rethrow(struct _Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_RaiseException(_Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_ForcedUnwind(_Unwind_Exception *, _Unwind_Stop_Fn,
+ void *);
+void _Unwind_DeleteException(_Unwind_Exception *);
+void _Unwind_Resume(_Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_Resume_or_Rethrow(_Unwind_Exception *);
#endif
@@ -241,11 +279,11 @@ typedef struct SjLj_Function_Context *_Unwind_FunctionContext_t;
void _Unwind_SjLj_Register(_Unwind_FunctionContext_t);
void _Unwind_SjLj_Unregister(_Unwind_FunctionContext_t);
-_Unwind_Reason_Code _Unwind_SjLj_RaiseException(struct _Unwind_Exception *);
-_Unwind_Reason_Code _Unwind_SjLj_ForcedUnwind(struct _Unwind_Exception *,
+_Unwind_Reason_Code _Unwind_SjLj_RaiseException(_Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_SjLj_ForcedUnwind(_Unwind_Exception *,
_Unwind_Stop_Fn, void *);
-void _Unwind_SjLj_Resume(struct _Unwind_Exception *);
-_Unwind_Reason_Code _Unwind_SjLj_Resume_or_Rethrow(struct _Unwind_Exception *);
+void _Unwind_SjLj_Resume(_Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_SjLj_Resume_or_Rethrow(_Unwind_Exception *);
void *_Unwind_FindEnclosingFunction(void *);