| author | Andrew Kelley <superjoe30@gmail.com> | 2017-10-23 21:43:18 -0400 |
|---|---|---|
| committer | Andrew Kelley <superjoe30@gmail.com> | 2017-11-02 21:54:24 -0400 |
| commit | 94ec2190f8d8c41d19b668511bf31fae32bcd095 (patch) | |
| tree | c5ab9b20fbaf8f017661f9a159082d1ecaf9f943 /c_headers | |
| parent | abff1b688420eb30d98145d8bc48e7d08f259885 (diff) | |
| download | zig-94ec2190f8d8c41d19b668511bf31fae32bcd095.tar.gz zig-94ec2190f8d8c41d19b668511bf31fae32bcd095.zip | |
update to llvm master
Diffstat (limited to 'c_headers')
| -rw-r--r-- | c_headers/__clang_cuda_intrinsics.h | 124 |
| -rw-r--r-- | c_headers/__clang_cuda_runtime_wrapper.h | 30 |
| -rw-r--r-- | c_headers/arm64intr.h | 49 |
| -rw-r--r-- | c_headers/avx2intrin.h | 12 |
| -rw-r--r-- | c_headers/avx512bwintrin.h | 71 |
| -rw-r--r-- | c_headers/avx512dqintrin.h | 38 |
| -rw-r--r-- | c_headers/avx512fintrin.h | 54 |
| -rw-r--r-- | c_headers/avx512vlbwintrin.h | 50 |
| -rw-r--r-- | c_headers/avx512vldqintrin.h | 54 |
| -rw-r--r-- | c_headers/avx512vlintrin.h | 69 |
| -rw-r--r-- | c_headers/clflushoptintrin.h | 2 |
| -rw-r--r-- | c_headers/clwbintrin.h | 52 |
| -rw-r--r-- | c_headers/cuda_wrappers/new | 51 |
| -rw-r--r-- | c_headers/emmintrin.h | 12 |
| -rw-r--r-- | c_headers/float.h | 14 |
| -rw-r--r-- | c_headers/immintrin.h | 4 |
| -rw-r--r-- | c_headers/intrin.h | 6 |
| -rw-r--r-- | c_headers/opencl-c.h | 359 |
| -rw-r--r-- | c_headers/unwind.h | 80 |
19 files changed, 601 insertions, 530 deletions
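One pattern recurs throughout the diff below (emmintrin.h, avx2intrin.h, avx512bwintrin.h): the `_mm*_avg_epu8` and `_mm*_avg_epu16` intrinsics no longer call the `__builtin_ia32_pavg*` builtins and instead spell out the rounded average with generic vector operations: widen each element, add the two vectors, add one, and shift right by one. A minimal scalar sketch of that identity in C (the helper name is hypothetical, not part of the headers):

```c
#include <stdint.h>
#include <stdio.h>

/* Scalar model of the widen-add-shift pattern the updated headers apply
 * element-wise: compute (a + b + 1) >> 1 in a wider type so the sum
 * cannot overflow.  This matches PAVGB's round-half-up behaviour. */
static uint8_t avg_epu8_scalar(uint8_t a, uint8_t b)
{
    return (uint8_t)(((uint16_t)a + (uint16_t)b + 1u) >> 1);
}

int main(void)
{
    printf("%d\n", avg_epu8_scalar(250, 251)); /* 250.5 rounds up to 251 */
    return 0;
}
```

Widening before the add is what keeps the intermediate sum exact; the masked variants are then expressed as a plain select over this unmasked result, which is the same restructuring applied to the broadcast and set1 intrinsics elsewhere in the diff.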
diff --git a/c_headers/__clang_cuda_intrinsics.h b/c_headers/__clang_cuda_intrinsics.h index b43ce21d0b..bc5b876577 100644 --- a/c_headers/__clang_cuda_intrinsics.h +++ b/c_headers/__clang_cuda_intrinsics.h @@ -92,6 +92,130 @@ __MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f); #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 +#if CUDA_VERSION >= 9000 +#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300) +// __shfl_sync_* variants available in CUDA-9 +#pragma push_macro("__MAKE_SYNC_SHUFFLES") +#define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, \ + __Mask) \ + inline __device__ int __FnName(unsigned int __mask, int __val, int __offset, \ + int __width = warpSize) { \ + return __IntIntrinsic(__mask, __val, __offset, \ + ((warpSize - __width) << 8) | (__Mask)); \ + } \ + inline __device__ float __FnName(unsigned int __mask, float __val, \ + int __offset, int __width = warpSize) { \ + return __FloatIntrinsic(__mask, __val, __offset, \ + ((warpSize - __width) << 8) | (__Mask)); \ + } \ + inline __device__ unsigned int __FnName(unsigned int __mask, \ + unsigned int __val, int __offset, \ + int __width = warpSize) { \ + return static_cast<unsigned int>( \ + ::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \ + } \ + inline __device__ long long __FnName(unsigned int __mask, long long __val, \ + int __offset, int __width = warpSize) { \ + struct __Bits { \ + int __a, __b; \ + }; \ + _Static_assert(sizeof(__val) == sizeof(__Bits)); \ + _Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \ + __Bits __tmp; \ + memcpy(&__val, &__tmp, sizeof(__val)); \ + __tmp.__a = ::__FnName(__mask, __tmp.__a, __offset, __width); \ + __tmp.__b = ::__FnName(__mask, __tmp.__b, __offset, __width); \ + long long __ret; \ + memcpy(&__ret, &__tmp, sizeof(__tmp)); \ + return __ret; \ + } \ + inline __device__ unsigned long long __FnName( \ + unsigned int __mask, unsigned long long __val, int __offset, \ + int __width = warpSize) { \ + return static_cast<unsigned long long>(::__FnName( \ + __mask, static_cast<unsigned long long>(__val), __offset, __width)); \ + } \ + inline __device__ double __FnName(unsigned int __mask, double __val, \ + int __offset, int __width = warpSize) { \ + long long __tmp; \ + _Static_assert(sizeof(__tmp) == sizeof(__val)); \ + memcpy(&__tmp, &__val, sizeof(__val)); \ + __tmp = ::__FnName(__mask, __tmp, __offset, __width); \ + double __ret; \ + memcpy(&__ret, &__tmp, sizeof(__ret)); \ + return __ret; \ + } +__MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32, + __nvvm_shfl_sync_idx_f32, 0x1f); +// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >= +// maxLane. 
+__MAKE_SYNC_SHUFFLES(__shfl_up_sync, __nvvm_shfl_sync_up_i32, + __nvvm_shfl_sync_up_f32, 0); +__MAKE_SYNC_SHUFFLES(__shfl_down_sync, __nvvm_shfl_sync_down_i32, + __nvvm_shfl_sync_down_f32, 0x1f); +__MAKE_SYNC_SHUFFLES(__shfl_xor_sync, __nvvm_shfl_sync_bfly_i32, + __nvvm_shfl_sync_bfly_f32, 0x1f); +#pragma pop_macro("__MAKE_SYNC_SHUFFLES") + +inline __device__ void __syncwarp(unsigned int mask = 0xffffffff) { + return __nvvm_bar_warp_sync(mask); +} + +inline __device__ void __barrier_sync(unsigned int id) { + __nvvm_barrier_sync(id); +} + +inline __device__ void __barrier_sync_count(unsigned int id, + unsigned int count) { + __nvvm_barrier_sync_cnt(id, count); +} + +inline __device__ int __all_sync(unsigned int mask, int pred) { + return __nvvm_vote_all_sync(mask, pred); +} + +inline __device__ int __any_sync(unsigned int mask, int pred) { + return __nvvm_vote_any_sync(mask, pred); +} + +inline __device__ int __uni_sync(unsigned int mask, int pred) { + return __nvvm_vote_uni_sync(mask, pred); +} + +inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) { + return __nvvm_vote_ballot_sync(mask, pred); +} + +inline __device__ unsigned int __activemask() { return __nvvm_vote_ballot(1); } + +#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 + +// Define __match* builtins CUDA-9 headers expect to see. +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +inline __device__ unsigned int __match32_any_sync(unsigned int mask, + unsigned int value) { + return __nvvm_match_any_sync_i32(mask, value); +} + +inline __device__ unsigned long long +__match64_any_sync(unsigned int mask, unsigned long long value) { + return __nvvm_match_any_sync_i64(mask, value); +} + +inline __device__ unsigned int +__match32_all_sync(unsigned int mask, unsigned int value, int *pred) { + return __nvvm_match_all_sync_i32p(mask, value, pred); +} + +inline __device__ unsigned long long +__match64_all_sync(unsigned int mask, unsigned long long value, int *pred) { + return __nvvm_match_all_sync_i64p(mask, value, pred); +} +#include "crt/sm_70_rt.hpp" + +#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 +#endif // __CUDA_VERSION >= 9000 + // sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}. // Prevent the vanilla sm_32 intrinsics header from being included. diff --git a/c_headers/__clang_cuda_runtime_wrapper.h b/c_headers/__clang_cuda_runtime_wrapper.h index 931d44b696..b8ffc2ce9f 100644 --- a/c_headers/__clang_cuda_runtime_wrapper.h +++ b/c_headers/__clang_cuda_runtime_wrapper.h @@ -62,7 +62,7 @@ #include "cuda.h" #if !defined(CUDA_VERSION) #error "cuda.h did not define CUDA_VERSION" -#elif CUDA_VERSION < 7000 || CUDA_VERSION > 8000 +#elif CUDA_VERSION < 7000 || CUDA_VERSION > 9000 #error "Unsupported CUDA version!" #endif @@ -86,7 +86,11 @@ #define __COMMON_FUNCTIONS_H__ #undef __CUDACC__ +#if CUDA_VERSION < 9000 #define __CUDABE__ +#else +#define __CUDA_LIBDEVICE__ +#endif // Disables definitions of device-side runtime support stubs in // cuda_device_runtime_api.h #include "driver_types.h" @@ -94,6 +98,7 @@ #include "host_defines.h" #undef __CUDABE__ +#undef __CUDA_LIBDEVICE__ #define __CUDACC__ #include "cuda_runtime.h" @@ -105,7 +110,9 @@ #define __nvvm_memcpy(s, d, n, a) __builtin_memcpy(s, d, n) #define __nvvm_memset(d, c, n, a) __builtin_memset(d, c, n) +#if CUDA_VERSION < 9000 #include "crt/device_runtime.h" +#endif #include "crt/host_runtime.h" // device_runtime.h defines __cxa_* macros that will conflict with // cxxabi.h. 
@@ -166,7 +173,18 @@ inline __host__ double __signbitd(double x) { // __device__. #pragma push_macro("__forceinline__") #define __forceinline__ __device__ __inline__ __attribute__((always_inline)) + +#pragma push_macro("__float2half_rn") +#if CUDA_VERSION >= 9000 +// CUDA-9 has conflicting prototypes for __float2half_rn(float f) in +// cuda_fp16.h[pp] and device_functions.hpp. We need to get the one in +// device_functions.hpp out of the way. +#define __float2half_rn __float2half_rn_disabled +#endif + #include "device_functions.hpp" +#pragma pop_macro("__float2half_rn") + // math_function.hpp uses the __USE_FAST_MATH__ macro to determine whether we // get the slow-but-accurate or fast-but-inaccurate versions of functions like @@ -247,7 +265,17 @@ static inline __device__ void __brkpt(int __c) { __brkpt(); } #pragma push_macro("__GNUC__") #undef __GNUC__ #define signbit __ignored_cuda_signbit + +// CUDA-9 omits device-side definitions of some math functions if it sees +// include guard from math.h wrapper from libstdc++. We have to undo the header +// guard temporarily to get the definitions we need. +#pragma push_macro("_GLIBCXX_MATH_H") +#if CUDA_VERSION >= 9000 +#undef _GLIBCXX_MATH_H +#endif + #include "math_functions.hpp" +#pragma pop_macro("_GLIBCXX_MATH_H") #pragma pop_macro("__GNUC__") #pragma pop_macro("signbit") diff --git a/c_headers/arm64intr.h b/c_headers/arm64intr.h new file mode 100644 index 0000000000..be52283618 --- /dev/null +++ b/c_headers/arm64intr.h @@ -0,0 +1,49 @@ +/*===---- arm64intr.h - ARM64 Windows intrinsics -------------------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ + +/* Only include this if we're compiling for the windows platform. 
*/ +#ifndef _MSC_VER +#include_next <arm64intr.h> +#else + +#ifndef __ARM64INTR_H +#define __ARM64INTR_H + +typedef enum +{ + _ARM64_BARRIER_SY = 0xF, + _ARM64_BARRIER_ST = 0xE, + _ARM64_BARRIER_LD = 0xD, + _ARM64_BARRIER_ISH = 0xB, + _ARM64_BARRIER_ISHST = 0xA, + _ARM64_BARRIER_ISHLD = 0x9, + _ARM64_BARRIER_NSH = 0x7, + _ARM64_BARRIER_NSHST = 0x6, + _ARM64_BARRIER_NSHLD = 0x5, + _ARM64_BARRIER_OSH = 0x3, + _ARM64_BARRIER_OSHST = 0x2, + _ARM64_BARRIER_OSHLD = 0x1 +} _ARM64INTR_BARRIER_TYPE; + +#endif /* __ARM64INTR_H */ +#endif /* _MSC_VER */ diff --git a/c_headers/avx2intrin.h b/c_headers/avx2intrin.h index 576f761b25..caf4ced920 100644 --- a/c_headers/avx2intrin.h +++ b/c_headers/avx2intrin.h @@ -145,13 +145,21 @@ _mm256_andnot_si256(__m256i __a, __m256i __b) static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_avg_epu8(__m256i __a, __m256i __b) { - return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b); + typedef unsigned short __v32hu __attribute__((__vector_size__(64))); + return (__m256i)__builtin_convertvector( + ((__builtin_convertvector((__v32qu)__a, __v32hu) + + __builtin_convertvector((__v32qu)__b, __v32hu)) + 1) + >> 1, __v32qu); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_avg_epu16(__m256i __a, __m256i __b) { - return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b); + typedef unsigned int __v16su __attribute__((__vector_size__(64))); + return (__m256i)__builtin_convertvector( + ((__builtin_convertvector((__v16hu)__a, __v16su) + + __builtin_convertvector((__v16hu)__b, __v16su)) + 1) + >> 1, __v16hu); } static __inline__ __m256i __DEFAULT_FN_ATTRS diff --git a/c_headers/avx512bwintrin.h b/c_headers/avx512bwintrin.h index 41958b7214..53da5869d3 100644 --- a/c_headers/avx512bwintrin.h +++ b/c_headers/avx512bwintrin.h @@ -706,57 +706,55 @@ _mm512_maskz_adds_epu16 (__mmask32 __U, __m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_avg_epu8 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) _mm512_setzero_qi(), - (__mmask64) -1); + typedef unsigned short __v64hu __attribute__((__vector_size__(128))); + return (__m512i)__builtin_convertvector( + ((__builtin_convertvector((__v64qu) __A, __v64hu) + + __builtin_convertvector((__v64qu) __B, __v64hu)) + 1) + >> 1, __v64qu); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_avg_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) __W, - (__mmask64) __U); + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_avg_epu8(__A, __B), + (__v64qi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_avg_epu8 (__mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) _mm512_setzero_qi(), - (__mmask64) __U); + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_avg_epu8(__A, __B), + (__v64qi)_mm512_setzero_qi()); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_avg_epu16 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) _mm512_setzero_hi(), - (__mmask32) -1); + typedef unsigned int __v32su __attribute__((__vector_size__(128))); + return (__m512i)__builtin_convertvector( + ((__builtin_convertvector((__v32hu) __A, __v32su) + + __builtin_convertvector((__v32hu) __B, __v32su)) + 1) + >> 1, __v32hu); } 
static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_avg_epu16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) __W, - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_avg_epu16(__A, __B), + (__v32hi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_avg_epu16 (__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) _mm512_setzero_hi(), - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_avg_epu16(__A, __B), + (__v32hi) _mm512_setzero_hi()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -2028,18 +2026,17 @@ _mm512_maskz_mov_epi8 (__mmask64 __U, __m512i __A) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_set1_epi8 (__m512i __O, __mmask64 __M, char __A) { - return (__m512i) __builtin_ia32_pbroadcastb512_gpr_mask (__A, - (__v64qi) __O, - __M); + return (__m512i) __builtin_ia32_selectb_512(__M, + (__v64qi)_mm512_set1_epi8(__A), + (__v64qi) __O); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_set1_epi8 (__mmask64 __M, char __A) { - return (__m512i) __builtin_ia32_pbroadcastb512_gpr_mask (__A, - (__v64qi) - _mm512_setzero_qi(), - __M); + return (__m512i) __builtin_ia32_selectb_512(__M, + (__v64qi) _mm512_set1_epi8(__A), + (__v64qi) _mm512_setzero_si512()); } static __inline__ __mmask64 __DEFAULT_FN_ATTRS @@ -2219,17 +2216,17 @@ _mm512_maskz_broadcastb_epi8 (__mmask64 __M, __m128i __A) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_set1_epi16 (__m512i __O, __mmask32 __M, short __A) { - return (__m512i) __builtin_ia32_pbroadcastw512_gpr_mask (__A, - (__v32hi) __O, - __M); + return (__m512i) __builtin_ia32_selectw_512(__M, + (__v32hi) _mm512_set1_epi16(__A), + (__v32hi) __O); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_set1_epi16 (__mmask32 __M, short __A) { - return (__m512i) __builtin_ia32_pbroadcastw512_gpr_mask (__A, - (__v32hi) _mm512_setzero_hi(), - __M); + return (__m512i) __builtin_ia32_selectw_512(__M, + (__v32hi) _mm512_set1_epi16(__A), + (__v32hi) _mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS diff --git a/c_headers/avx512dqintrin.h b/c_headers/avx512dqintrin.h index 4fd1add773..2c431d9740 100644 --- a/c_headers/avx512dqintrin.h +++ b/c_headers/avx512dqintrin.h @@ -973,25 +973,26 @@ _mm512_movepi64_mask (__m512i __A) static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_broadcast_f32x2 (__m128 __A) { - return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A, - (__v16sf)_mm512_undefined_ps(), - (__mmask16) -1); + return (__m512)__builtin_shufflevector((__v4sf)__A, + (__v4sf)_mm_undefined_ps(), + 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask_broadcast_f32x2 (__m512 __O, __mmask16 __M, __m128 __A) { - return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A, - (__v16sf) - __O, __M); + return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, + (__v16sf)_mm512_broadcast_f32x2(__A), + (__v16sf)__O); } static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_maskz_broadcast_f32x2 (__mmask16 __M, __m128 __A) { - return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A, - (__v16sf)_mm512_setzero_ps (), - __M); + return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, + (__v16sf)_mm512_broadcast_f32x2(__A), + 
(__v16sf)_mm512_setzero_ps()); } static __inline__ __m512 __DEFAULT_FN_ATTRS @@ -1044,25 +1045,26 @@ _mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_broadcast_i32x2 (__m128i __A) { - return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si) __A, - (__v16si)_mm512_setzero_si512(), - (__mmask16) -1); + return (__m512i)__builtin_shufflevector((__v4si)__A, + (__v4si)_mm_undefined_si128(), + 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_broadcast_i32x2 (__m512i __O, __mmask16 __M, __m128i __A) { - return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si) __A, - (__v16si) - __O, __M); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_broadcast_i32x2(__A), + (__v16si)__O); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_broadcast_i32x2 (__mmask16 __M, __m128i __A) { - return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si) __A, - (__v16si)_mm512_setzero_si512 (), - __M); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_broadcast_i32x2(__A), + (__v16si)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS diff --git a/c_headers/avx512fintrin.h b/c_headers/avx512fintrin.h index 4ce6945311..247ac879ea 100644 --- a/c_headers/avx512fintrin.h +++ b/c_headers/avx512fintrin.h @@ -258,30 +258,6 @@ _mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) (__v8di) _mm512_setzero_si512()); } -static __inline __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_set1_epi32(__mmask16 __M, int __A) -{ - return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A, - (__v16si) - _mm512_setzero_si512 (), - __M); -} - -static __inline __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_set1_epi64(__mmask8 __M, long long __A) -{ -#ifdef __x86_64__ - return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A, - (__v8di) - _mm512_setzero_si512 (), - __M); -#else - return (__m512i) __builtin_ia32_pbroadcastq512_mem_mask (__A, - (__v8di) - _mm512_setzero_si512 (), - __M); -#endif -} static __inline __m512 __DEFAULT_FN_ATTRS _mm512_setzero_ps(void) @@ -341,11 +317,29 @@ _mm512_set1_epi32(int __s) } static __inline __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_set1_epi32(__mmask16 __M, int __A) +{ + return (__m512i)__builtin_ia32_selectd_512(__M, + (__v16si)_mm512_set1_epi32(__A), + (__v16si)_mm512_setzero_si512()); +} + +static __inline __m512i __DEFAULT_FN_ATTRS _mm512_set1_epi64(long long __d) { return (__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d }; } +#ifdef __x86_64__ +static __inline __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_set1_epi64(__mmask8 __M, long long __A) +{ + return (__m512i)__builtin_ia32_selectq_512(__M, + (__v8di)_mm512_set1_epi64(__A), + (__v8di)_mm512_setzero_si512()); +} +#endif + static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_broadcastss_ps(__m128 __A) { @@ -9040,7 +9034,7 @@ _mm512_stream_si512 (__m512i * __P, __m512i __A) } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_stream_load_si512 (void *__P) +_mm512_stream_load_si512 (void const *__P) { typedef __v8di __v8di_aligned __attribute__((aligned(64))); return (__m512i) __builtin_nontemporal_load((const __v8di_aligned *)__P); @@ -9742,16 +9736,18 @@ _mm_cvtu64_ss (__m128 __A, unsigned long long __B) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A) { - return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A, (__v16si) __O, - __M); + return (__m512i) 
__builtin_ia32_selectd_512(__M, + (__v16si) _mm512_set1_epi32(__A), + (__v16si) __O); } #ifdef __x86_64__ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A) { - return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A, (__v8di) __O, - __M); + return (__m512i) __builtin_ia32_selectq_512(__M, + (__v8di) _mm512_set1_epi64(__A), + (__v8di) __O); } #endif diff --git a/c_headers/avx512vlbwintrin.h b/c_headers/avx512vlbwintrin.h index 3b58d04339..4ab785bdbb 100644 --- a/c_headers/avx512vlbwintrin.h +++ b/c_headers/avx512vlbwintrin.h @@ -2660,35 +2660,33 @@ _mm256_maskz_mov_epi8 (__mmask32 __U, __m256i __A) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A) { - return (__m128i) __builtin_ia32_pbroadcastb128_gpr_mask (__A, - (__v16qi) __O, - __M); + return (__m128i) __builtin_ia32_selectb_128(__M, + (__v16qi) _mm_set1_epi8(__A), + (__v16qi) __O); } static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_maskz_set1_epi8 (__mmask16 __M, char __A) { - return (__m128i) __builtin_ia32_pbroadcastb128_gpr_mask (__A, - (__v16qi) - _mm_setzero_si128 (), - __M); + return (__m128i) __builtin_ia32_selectb_128(__M, + (__v16qi) _mm_set1_epi8(__A), + (__v16qi) _mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_mask_set1_epi8 (__m256i __O, __mmask32 __M, char __A) { - return (__m256i) __builtin_ia32_pbroadcastb256_gpr_mask (__A, - (__v32qi) __O, - __M); + return (__m256i) __builtin_ia32_selectb_256(__M, + (__v32qi) _mm256_set1_epi8(__A), + (__v32qi) __O); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_maskz_set1_epi8 (__mmask32 __M, char __A) { - return (__m256i) __builtin_ia32_pbroadcastb256_gpr_mask (__A, - (__v32qi) - _mm256_setzero_si256 (), - __M); + return (__m256i) __builtin_ia32_selectb_256(__M, + (__v32qi) _mm256_set1_epi8(__A), + (__v32qi) _mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS @@ -3025,33 +3023,33 @@ _mm256_maskz_broadcastw_epi16 (__mmask16 __M, __m128i __A) static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_mask_set1_epi16 (__m256i __O, __mmask16 __M, short __A) { - return (__m256i) __builtin_ia32_pbroadcastw256_gpr_mask (__A, - (__v16hi) __O, - __M); + return (__m256i) __builtin_ia32_selectw_256 (__M, + (__v16hi) _mm256_set1_epi16(__A), + (__v16hi) __O); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_maskz_set1_epi16 (__mmask16 __M, short __A) { - return (__m256i) __builtin_ia32_pbroadcastw256_gpr_mask (__A, - (__v16hi) _mm256_setzero_si256 (), - __M); + return (__m256i) __builtin_ia32_selectw_256(__M, + (__v16hi)_mm256_set1_epi16(__A), + (__v16hi) _mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mask_set1_epi16 (__m128i __O, __mmask8 __M, short __A) { - return (__m128i) __builtin_ia32_pbroadcastw128_gpr_mask (__A, - (__v8hi) __O, - __M); + return (__m128i) __builtin_ia32_selectw_128(__M, + (__v8hi) _mm_set1_epi16(__A), + (__v8hi) __O); } static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_maskz_set1_epi16 (__mmask8 __M, short __A) { - return (__m128i) __builtin_ia32_pbroadcastw128_gpr_mask (__A, - (__v8hi) _mm_setzero_si128 (), - __M); + return (__m128i) __builtin_ia32_selectw_128(__M, + (__v8hi) _mm_set1_epi16(__A), + (__v8hi) _mm_setzero_si128()); } static __inline__ __m128i __DEFAULT_FN_ATTRS diff --git a/c_headers/avx512vldqintrin.h b/c_headers/avx512vldqintrin.h index aecd7df34d..d80df9eaff 100644 --- a/c_headers/avx512vldqintrin.h +++ b/c_headers/avx512vldqintrin.h @@ -978,25 +978,25 @@ 
_mm256_movepi64_mask (__m256i __A) static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_broadcast_f32x2 (__m128 __A) { - return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A, - (__v8sf)_mm256_undefined_ps(), - (__mmask8) -1); + return (__m256)__builtin_shufflevector((__v4sf)__A, + (__v4sf)_mm_undefined_ps(), + 0, 1, 0, 1, 0, 1, 0, 1); } static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_mask_broadcast_f32x2 (__m256 __O, __mmask8 __M, __m128 __A) { - return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A, - (__v8sf) __O, - __M); + return (__m256)__builtin_ia32_selectps_256((__mmask8)__M, + (__v8sf)_mm256_broadcast_f32x2(__A), + (__v8sf)__O); } static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_maskz_broadcast_f32x2 (__mmask8 __M, __m128 __A) { - return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A, - (__v8sf) _mm256_setzero_ps (), - __M); + return (__m256)__builtin_ia32_selectps_256((__mmask8)__M, + (__v8sf)_mm256_broadcast_f32x2(__A), + (__v8sf)_mm256_setzero_ps()); } static __inline__ __m256d __DEFAULT_FN_ATTRS @@ -1025,49 +1025,49 @@ _mm256_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_broadcast_i32x2 (__m128i __A) { - return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si) __A, - (__v4si)_mm_undefined_si128(), - (__mmask8) -1); + return (__m128i)__builtin_shufflevector((__v4si)__A, + (__v4si)_mm_undefined_si128(), + 0, 1, 0, 1); } static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mask_broadcast_i32x2 (__m128i __O, __mmask8 __M, __m128i __A) { - return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si) __A, - (__v4si) __O, - __M); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, + (__v4si)_mm_broadcast_i32x2(__A), + (__v4si)__O); } static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A) { - return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si) __A, - (__v4si) _mm_setzero_si128 (), - __M); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, + (__v4si)_mm_broadcast_i32x2(__A), + (__v4si)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_broadcast_i32x2 (__m128i __A) { - return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si) __A, - (__v8si)_mm256_undefined_si256(), - (__mmask8) -1); + return (__m256i)__builtin_shufflevector((__v4si)__A, + (__v4si)_mm_undefined_si128(), + 0, 1, 0, 1, 0, 1, 0, 1); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_mask_broadcast_i32x2 (__m256i __O, __mmask8 __M, __m128i __A) { - return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si) __A, - (__v8si) __O, - __M); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, + (__v8si)_mm256_broadcast_i32x2(__A), + (__v8si)__O); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A) { - return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si) __A, - (__v8si) _mm256_setzero_si256 (), - __M); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, + (__v8si)_mm256_broadcast_i32x2(__A), + (__v8si)_mm256_setzero_si256()); } static __inline__ __m256i __DEFAULT_FN_ATTRS diff --git a/c_headers/avx512vlintrin.h b/c_headers/avx512vlintrin.h index 99bb050de4..7e17cff05f 100644 --- a/c_headers/avx512vlintrin.h +++ b/c_headers/avx512vlintrin.h @@ -5723,59 +5723,72 @@ _mm256_maskz_movedup_pd (__mmask8 __U, __m256d __A) (__v4df)_mm256_setzero_pd()); } +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_mask_set1_epi32(__m128i __O, 
__mmask8 __M, int __A) +{ + return (__m128i)__builtin_ia32_selectd_128(__M, + (__v4si) _mm_set1_epi32(__A), + (__v4si)__O); +} -#define _mm_mask_set1_epi32(O, M, A) __extension__ ({ \ - (__m128i)__builtin_ia32_pbroadcastd128_gpr_mask((int)(A), \ - (__v4si)(__m128i)(O), \ - (__mmask8)(M)); }) +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_maskz_set1_epi32( __mmask8 __M, int __A) +{ + return (__m128i)__builtin_ia32_selectd_128(__M, + (__v4si) _mm_set1_epi32(__A), + (__v4si)_mm_setzero_si128()); +} -#define _mm_maskz_set1_epi32(M, A) __extension__ ({ \ - (__m128i)__builtin_ia32_pbroadcastd128_gpr_mask((int)(A), \ - (__v4si)_mm_setzero_si128(), \ - (__mmask8)(M)); }) +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_set1_epi32(__m256i __O, __mmask8 __M, int __A) +{ + return (__m256i)__builtin_ia32_selectd_256(__M, + (__v8si) _mm256_set1_epi32(__A), + (__v8si)__O); +} -#define _mm256_mask_set1_epi32(O, M, A) __extension__ ({ \ - (__m256i)__builtin_ia32_pbroadcastd256_gpr_mask((int)(A), \ - (__v8si)(__m256i)(O), \ - (__mmask8)(M)); }) +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_set1_epi32( __mmask8 __M, int __A) +{ + return (__m256i)__builtin_ia32_selectd_256(__M, + (__v8si) _mm256_set1_epi32(__A), + (__v8si)_mm256_setzero_si256()); +} -#define _mm256_maskz_set1_epi32(M, A) __extension__ ({ \ - (__m256i)__builtin_ia32_pbroadcastd256_gpr_mask((int)(A), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)(M)); }) #ifdef __x86_64__ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A) { - return (__m128i) __builtin_ia32_pbroadcastq128_gpr_mask (__A, (__v2di) __O, - __M); + return (__m128i) __builtin_ia32_selectq_128(__M, + (__v2di) _mm_set1_epi64x(__A), + (__v2di) __O); } static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_maskz_set1_epi64 (__mmask8 __M, long long __A) { - return (__m128i) __builtin_ia32_pbroadcastq128_gpr_mask (__A, - (__v2di) - _mm_setzero_si128 (), - __M); + return (__m128i) __builtin_ia32_selectq_128(__M, + (__v2di) _mm_set1_epi64x(__A), + (__v2di) _mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_mask_set1_epi64 (__m256i __O, __mmask8 __M, long long __A) { - return (__m256i) __builtin_ia32_pbroadcastq256_gpr_mask (__A, (__v4di) __O, - __M); + return (__m256i) __builtin_ia32_selectq_256(__M, + (__v4di) _mm256_set1_epi64x(__A), + (__v4di) __O) ; } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_maskz_set1_epi64 (__mmask8 __M, long long __A) { - return (__m256i) __builtin_ia32_pbroadcastq256_gpr_mask (__A, - (__v4di) - _mm256_setzero_si256 (), - __M); + return (__m256i) __builtin_ia32_selectq_256(__M, + (__v4di) _mm256_set1_epi64x(__A), + (__v4di) _mm256_setzero_si256()); } + #endif #define _mm_fixupimm_pd(A, B, C, imm) __extension__ ({ \ diff --git a/c_headers/clflushoptintrin.h b/c_headers/clflushoptintrin.h index 60e0ead762..f1f1330234 100644 --- a/c_headers/clflushoptintrin.h +++ b/c_headers/clflushoptintrin.h @@ -32,7 +32,7 @@ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("clflushopt"))) static __inline__ void __DEFAULT_FN_ATTRS -_mm_clflushopt(char * __m) { +_mm_clflushopt(void const * __m) { __builtin_ia32_clflushopt(__m); } diff --git a/c_headers/clwbintrin.h b/c_headers/clwbintrin.h new file mode 100644 index 0000000000..2594a6c387 --- /dev/null +++ b/c_headers/clwbintrin.h @@ -0,0 +1,52 @@ +/*===---- clwbintrin.h - CLWB intrinsic ------------------------------------=== + * + * Permission is hereby granted, free of charge, to 
any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use <clwbintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef __CLWBINTRIN_H +#define __CLWBINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("clwb"))) + +/// \brief Writes back to memory the cache line (if modified) that contains the +/// linear address specified in \a __p from any level of the cache hierarchy in +/// the cache coherence domain +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> CLWB </c> instruction. +/// +/// \param __p +/// A pointer to the memory location used to identify the cache line to be +/// written back. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_clwb(void const *__p) { + __builtin_ia32_clwb(__p); +} + +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/c_headers/cuda_wrappers/new b/c_headers/cuda_wrappers/new index b77131af0e..71b7a52363 100644 --- a/c_headers/cuda_wrappers/new +++ b/c_headers/cuda_wrappers/new @@ -26,7 +26,6 @@ #include_next <new> -// Device overrides for placement new and delete. #pragma push_macro("CUDA_NOEXCEPT") #if __cplusplus >= 201103L #define CUDA_NOEXCEPT noexcept @@ -34,6 +33,55 @@ #define CUDA_NOEXCEPT #endif +// Device overrides for non-placement new and delete. +__device__ inline void *operator new(__SIZE_TYPE__ size) { + if (size == 0) { + size = 1; + } + return ::malloc(size); +} +__device__ inline void *operator new(__SIZE_TYPE__ size, + const std::nothrow_t &) CUDA_NOEXCEPT { + return ::operator new(size); +} + +__device__ inline void *operator new[](__SIZE_TYPE__ size) { + return ::operator new(size); +} +__device__ inline void *operator new[](__SIZE_TYPE__ size, + const std::nothrow_t &) { + return ::operator new(size); +} + +__device__ inline void operator delete(void* ptr) CUDA_NOEXCEPT { + if (ptr) { + ::free(ptr); + } +} +__device__ inline void operator delete(void *ptr, + const std::nothrow_t &) CUDA_NOEXCEPT { + ::operator delete(ptr); +} + +__device__ inline void operator delete[](void* ptr) CUDA_NOEXCEPT { + ::operator delete(ptr); +} +__device__ inline void operator delete[](void *ptr, + const std::nothrow_t &) CUDA_NOEXCEPT { + ::operator delete(ptr); +} + +// Sized delete, C++14 only. 
+#if __cplusplus >= 201402L +__device__ void operator delete(void *ptr, __SIZE_TYPE__ size) CUDA_NOEXCEPT { + ::operator delete(ptr); +} +__device__ void operator delete[](void *ptr, __SIZE_TYPE__ size) CUDA_NOEXCEPT { + ::operator delete(ptr); +} +#endif + +// Device overrides for placement new and delete. __device__ inline void *operator new(__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT { return __ptr; } @@ -42,6 +90,7 @@ __device__ inline void *operator new[](__SIZE_TYPE__, void *__ptr) CUDA_NOEXCEPT } __device__ inline void operator delete(void *, void *) CUDA_NOEXCEPT {} __device__ inline void operator delete[](void *, void *) CUDA_NOEXCEPT {} + #pragma pop_macro("CUDA_NOEXCEPT") #endif // include guard diff --git a/c_headers/emmintrin.h b/c_headers/emmintrin.h index 709815cbb4..3372508a7f 100644 --- a/c_headers/emmintrin.h +++ b/c_headers/emmintrin.h @@ -2258,7 +2258,11 @@ _mm_adds_epu16(__m128i __a, __m128i __b) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a, __m128i __b) { - return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b); + typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32))); + return (__m128i)__builtin_convertvector( + ((__builtin_convertvector((__v16qu)__a, __v16hu) + + __builtin_convertvector((__v16qu)__b, __v16hu)) + 1) + >> 1, __v16qu); } /// \brief Computes the rounded avarages of corresponding elements of two @@ -2278,7 +2282,11 @@ _mm_avg_epu8(__m128i __a, __m128i __b) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a, __m128i __b) { - return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b); + typedef unsigned int __v8su __attribute__ ((__vector_size__ (32))); + return (__m128i)__builtin_convertvector( + ((__builtin_convertvector((__v8hu)__a, __v8su) + + __builtin_convertvector((__v8hu)__b, __v8su)) + 1) + >> 1, __v8hu); } /// \brief Multiplies the corresponding elements of two 128-bit signed [8 x i16] diff --git a/c_headers/float.h b/c_headers/float.h index 502143d4e4..44d4d05494 100644 --- a/c_headers/float.h +++ b/c_headers/float.h @@ -143,4 +143,18 @@ # define LDBL_DECIMAL_DIG __LDBL_DECIMAL_DIG__ #endif +#ifdef __STDC_WANT_IEC_60559_TYPES_EXT__ +# define FLT16_MANT_DIG __FLT16_MANT_DIG__ +# define FLT16_DECIMAL_DIG __FLT16_DECIMAL_DIG__ +# define FLT16_DIG __FLT16_DIG__ +# define FLT16_MIN_EXP __FLT16_MIN_EXP__ +# define FLT16_MIN_10_EXP __FLT16_MIN_10_EXP__ +# define FLT16_MAX_EXP __FLT16_MAX_EXP__ +# define FLT16_MAX_10_EXP __FLT16_MAX_10_EXP__ +# define FLT16_MAX __FLT16_MAX__ +# define FLT16_EPSILON __FLT16_EPSILON__ +# define FLT16_MIN __FLT16_MIN__ +# define FLT16_TRUE_MIN __FLT16_TRUE_MIN__ +#endif /* __STDC_WANT_IEC_60559_TYPES_EXT__ */ + #endif /* __FLOAT_H */ diff --git a/c_headers/immintrin.h b/c_headers/immintrin.h index c5f25bfcb5..d86e0efb82 100644 --- a/c_headers/immintrin.h +++ b/c_headers/immintrin.h @@ -58,6 +58,10 @@ #include <clflushoptintrin.h> #endif +#if !defined(_MSC_VER) || __has_feature(modules) || defined(__CLWB__) +#include <clwbintrin.h> +#endif + #if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX__) #include <avxintrin.h> #endif diff --git a/c_headers/intrin.h b/c_headers/intrin.h index 881d05c0d1..b30aa215a4 100644 --- a/c_headers/intrin.h +++ b/c_headers/intrin.h @@ -38,6 +38,10 @@ #include <armintr.h> #endif +#if defined(_M_ARM64) +#include <arm64intr.h> +#endif + /* For the definition of jmp_buf. 
*/ #if __STDC_HOSTED__ #include <setjmp.h> @@ -828,7 +832,7 @@ _InterlockedCompareExchange_nf(long volatile *_Destination, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED); return _Comparand; } -static __inline__ short __DEFAULT_FN_ATTRS +static __inline__ long __DEFAULT_FN_ATTRS _InterlockedCompareExchange_rel(long volatile *_Destination, long _Exchange, long _Comparand) { __atomic_compare_exchange(_Destination, &_Comparand, &_Exchange, 0, diff --git a/c_headers/opencl-c.h b/c_headers/opencl-c.h index 58c8daf3a5..35fb0a82bc 100644 --- a/c_headers/opencl-c.h +++ b/c_headers/opencl-c.h @@ -11381,6 +11381,8 @@ half16 __ovld __cnfn bitselect(half16 a, half16 b, half16 c); * For each component of a vector type, * result[i] = if MSB of c[i] is set ? b[i] : a[i]. * For a scalar type, result = c ? b : a. + * b and a must have the same type. + * c must have the same number of elements and bits as a. */ char __ovld __cnfn select(char a, char b, char c); uchar __ovld __cnfn select(uchar a, uchar b, char c); @@ -11394,60 +11396,7 @@ char8 __ovld __cnfn select(char8 a, char8 b, char8 c); uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, char8 c); char16 __ovld __cnfn select(char16 a, char16 b, char16 c); uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, char16 c); -short __ovld __cnfn select(short a, short b, char c); -ushort __ovld __cnfn select(ushort a, ushort b, char c); -short2 __ovld __cnfn select(short2 a, short2 b, char2 c); -ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, char2 c); -short3 __ovld __cnfn select(short3 a, short3 b, char3 c); -ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, char3 c); -short4 __ovld __cnfn select(short4 a, short4 b, char4 c); -ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, char4 c); -short8 __ovld __cnfn select(short8 a, short8 b, char8 c); -ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, char8 c); -short16 __ovld __cnfn select(short16 a, short16 b, char16 c); -ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, char16 c); -int __ovld __cnfn select(int a, int b, char c); -uint __ovld __cnfn select(uint a, uint b, char c); -int2 __ovld __cnfn select(int2 a, int2 b, char2 c); -uint2 __ovld __cnfn select(uint2 a, uint2 b, char2 c); -int3 __ovld __cnfn select(int3 a, int3 b, char3 c); -uint3 __ovld __cnfn select(uint3 a, uint3 b, char3 c); -int4 __ovld __cnfn select(int4 a, int4 b, char4 c); -uint4 __ovld __cnfn select(uint4 a, uint4 b, char4 c); -int8 __ovld __cnfn select(int8 a, int8 b, char8 c); -uint8 __ovld __cnfn select(uint8 a, uint8 b, char8 c); -int16 __ovld __cnfn select(int16 a, int16 b, char16 c); -uint16 __ovld __cnfn select(uint16 a, uint16 b, char16 c); -long __ovld __cnfn select(long a, long b, char c); -ulong __ovld __cnfn select(ulong a, ulong b, char c); -long2 __ovld __cnfn select(long2 a, long2 b, char2 c); -ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, char2 c); -long3 __ovld __cnfn select(long3 a, long3 b, char3 c); -ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, char3 c); -long4 __ovld __cnfn select(long4 a, long4 b, char4 c); -ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, char4 c); -long8 __ovld __cnfn select(long8 a, long8 b, char8 c); -ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, char8 c); -long16 __ovld __cnfn select(long16 a, long16 b, char16 c); -ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, char16 c); -float __ovld __cnfn select(float a, float b, char c); -float2 __ovld __cnfn select(float2 a, float2 b, char2 c); -float3 __ovld __cnfn select(float3 a, float3 b, char3 c); -float4 __ovld __cnfn select(float4 a, float4 b, 
char4 c); -float8 __ovld __cnfn select(float8 a, float8 b, char8 c); -float16 __ovld __cnfn select(float16 a, float16 b, char16 c); -char __ovld __cnfn select(char a, char b, short c); -uchar __ovld __cnfn select(uchar a, uchar b, short c); -char2 __ovld __cnfn select(char2 a, char2 b, short2 c); -uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, short2 c); -char3 __ovld __cnfn select(char3 a, char3 b, short3 c); -uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, short3 c); -char4 __ovld __cnfn select(char4 a, char4 b, short4 c); -uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, short4 c); -char8 __ovld __cnfn select(char8 a, char8 b, short8 c); -uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, short8 c); -char16 __ovld __cnfn select(char16 a, char16 b, short16 c); -uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, short16 c); + short __ovld __cnfn select(short a, short b, short c); ushort __ovld __cnfn select(ushort a, ushort b, short c); short2 __ovld __cnfn select(short2 a, short2 b, short2 c); @@ -11460,60 +11409,7 @@ short8 __ovld __cnfn select(short8 a, short8 b, short8 c); ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, short8 c); short16 __ovld __cnfn select(short16 a, short16 b, short16 c); ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, short16 c); -int __ovld __cnfn select(int a, int b, short c); -uint __ovld __cnfn select(uint a, uint b, short c); -int2 __ovld __cnfn select(int2 a, int2 b, short2 c); -uint2 __ovld __cnfn select(uint2 a, uint2 b, short2 c); -int3 __ovld __cnfn select(int3 a, int3 b, short3 c); -uint3 __ovld __cnfn select(uint3 a, uint3 b, short3 c); -int4 __ovld __cnfn select(int4 a, int4 b, short4 c); -uint4 __ovld __cnfn select(uint4 a, uint4 b, short4 c); -int8 __ovld __cnfn select(int8 a, int8 b, short8 c); -uint8 __ovld __cnfn select(uint8 a, uint8 b, short8 c); -int16 __ovld __cnfn select(int16 a, int16 b, short16 c); -uint16 __ovld __cnfn select(uint16 a, uint16 b, short16 c); -long __ovld __cnfn select(long a, long b, short c); -ulong __ovld __cnfn select(ulong a, ulong b, short c); -long2 __ovld __cnfn select(long2 a, long2 b, short2 c); -ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, short2 c); -long3 __ovld __cnfn select(long3 a, long3 b, short3 c); -ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, short3 c); -long4 __ovld __cnfn select(long4 a, long4 b, short4 c); -ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, short4 c); -long8 __ovld __cnfn select(long8 a, long8 b, short8 c); -ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, short8 c); -long16 __ovld __cnfn select(long16 a, long16 b, short16 c); -ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, short16 c); -float __ovld __cnfn select(float a, float b, short c); -float2 __ovld __cnfn select(float2 a, float2 b, short2 c); -float3 __ovld __cnfn select(float3 a, float3 b, short3 c); -float4 __ovld __cnfn select(float4 a, float4 b, short4 c); -float8 __ovld __cnfn select(float8 a, float8 b, short8 c); -float16 __ovld __cnfn select(float16 a, float16 b, short16 c); -char __ovld __cnfn select(char a, char b, int c); -uchar __ovld __cnfn select(uchar a, uchar b, int c); -char2 __ovld __cnfn select(char2 a, char2 b, int2 c); -uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, int2 c); -char3 __ovld __cnfn select(char3 a, char3 b, int3 c); -uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, int3 c); -char4 __ovld __cnfn select(char4 a, char4 b, int4 c); -uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, int4 c); -char8 __ovld __cnfn select(char8 a, char8 b, int8 c); -uchar8 __ovld __cnfn select(uchar8 a, 
uchar8 b, int8 c); -char16 __ovld __cnfn select(char16 a, char16 b, int16 c); -uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, int16 c); -short __ovld __cnfn select(short a, short b, int c); -ushort __ovld __cnfn select(ushort a, ushort b, int c); -short2 __ovld __cnfn select(short2 a, short2 b, int2 c); -ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, int2 c); -short3 __ovld __cnfn select(short3 a, short3 b, int3 c); -ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, int3 c); -short4 __ovld __cnfn select(short4 a, short4 b, int4 c); -ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, int4 c); -short8 __ovld __cnfn select(short8 a, short8 b, int8 c); -ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, int8 c); -short16 __ovld __cnfn select(short16 a, short16 b, int16 c); -ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, int16 c); + int __ovld __cnfn select(int a, int b, int c); uint __ovld __cnfn select(uint a, uint b, int c); int2 __ovld __cnfn select(int2 a, int2 b, int2 c); @@ -11526,60 +11422,13 @@ int8 __ovld __cnfn select(int8 a, int8 b, int8 c); uint8 __ovld __cnfn select(uint8 a, uint8 b, int8 c); int16 __ovld __cnfn select(int16 a, int16 b, int16 c); uint16 __ovld __cnfn select(uint16 a, uint16 b, int16 c); -long __ovld __cnfn select(long a, long b, int c); -ulong __ovld __cnfn select(ulong a, ulong b, int c); -long2 __ovld __cnfn select(long2 a, long2 b, int2 c); -ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, int2 c); -long3 __ovld __cnfn select(long3 a, long3 b, int3 c); -ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, int3 c); -long4 __ovld __cnfn select(long4 a, long4 b, int4 c); -ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, int4 c); -long8 __ovld __cnfn select(long8 a, long8 b, int8 c); -ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, int8 c); -long16 __ovld __cnfn select(long16 a, long16 b, int16 c); -ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, int16 c); float __ovld __cnfn select(float a, float b, int c); float2 __ovld __cnfn select(float2 a, float2 b, int2 c); float3 __ovld __cnfn select(float3 a, float3 b, int3 c); float4 __ovld __cnfn select(float4 a, float4 b, int4 c); float8 __ovld __cnfn select(float8 a, float8 b, int8 c); float16 __ovld __cnfn select(float16 a, float16 b, int16 c); -char __ovld __cnfn select(char a, char b, long c); -uchar __ovld __cnfn select(uchar a, uchar b, long c); -char2 __ovld __cnfn select(char2 a, char2 b, long2 c); -uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, long2 c); -char3 __ovld __cnfn select(char3 a, char3 b, long3 c); -uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, long3 c); -char4 __ovld __cnfn select(char4 a, char4 b, long4 c); -uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, long4 c); -char8 __ovld __cnfn select(char8 a, char8 b, long8 c); -uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, long8 c); -char16 __ovld __cnfn select(char16 a, char16 b, long16 c); -uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, long16 c); -short __ovld __cnfn select(short a, short b, long c); -ushort __ovld __cnfn select(ushort a, ushort b, long c); -short2 __ovld __cnfn select(short2 a, short2 b, long2 c); -ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, long2 c); -short3 __ovld __cnfn select(short3 a, short3 b, long3 c); -ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, long3 c); -short4 __ovld __cnfn select(short4 a, short4 b, long4 c); -ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, long4 c); -short8 __ovld __cnfn select(short8 a, short8 b, long8 c); -ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, long8 c); 
-short16 __ovld __cnfn select(short16 a, short16 b, long16 c); -ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, long16 c); -int __ovld __cnfn select(int a, int b, long c); -uint __ovld __cnfn select(uint a, uint b, long c); -int2 __ovld __cnfn select(int2 a, int2 b, long2 c); -uint2 __ovld __cnfn select(uint2 a, uint2 b, long2 c); -int3 __ovld __cnfn select(int3 a, int3 b, long3 c); -uint3 __ovld __cnfn select(uint3 a, uint3 b, long3 c); -int4 __ovld __cnfn select(int4 a, int4 b, long4 c); -uint4 __ovld __cnfn select(uint4 a, uint4 b, long4 c); -int8 __ovld __cnfn select(int8 a, int8 b, long8 c); -uint8 __ovld __cnfn select(uint8 a, uint8 b, long8 c); -int16 __ovld __cnfn select(int16 a, int16 b, long16 c); -uint16 __ovld __cnfn select(uint16 a, uint16 b, long16 c); + long __ovld __cnfn select(long a, long b, long c); ulong __ovld __cnfn select(ulong a, ulong b, long c); long2 __ovld __cnfn select(long2 a, long2 b, long2 c); @@ -11592,12 +11441,7 @@ long8 __ovld __cnfn select(long8 a, long8 b, long8 c); ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, long8 c); long16 __ovld __cnfn select(long16 a, long16 b, long16 c); ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, long16 c); -float __ovld __cnfn select(float a, float b, long c); -float2 __ovld __cnfn select(float2 a, float2 b, long2 c); -float3 __ovld __cnfn select(float3 a, float3 b, long3 c); -float4 __ovld __cnfn select(float4 a, float4 b, long4 c); -float8 __ovld __cnfn select(float8 a, float8 b, long8 c); -float16 __ovld __cnfn select(float16 a, float16 b, long16 c); + char __ovld __cnfn select(char a, char b, uchar c); uchar __ovld __cnfn select(uchar a, uchar b, uchar c); char2 __ovld __cnfn select(char2 a, char2 b, uchar2 c); @@ -11610,60 +11454,7 @@ char8 __ovld __cnfn select(char8 a, char8 b, uchar8 c); uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, uchar8 c); char16 __ovld __cnfn select(char16 a, char16 b, uchar16 c); uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, uchar16 c); -short __ovld __cnfn select(short a, short b, uchar c); -ushort __ovld __cnfn select(ushort a, ushort b, uchar c); -short2 __ovld __cnfn select(short2 a, short2 b, uchar2 c); -ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, uchar2 c); -short3 __ovld __cnfn select(short3 a, short3 b, uchar3 c); -ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, uchar3 c); -short4 __ovld __cnfn select(short4 a, short4 b, uchar4 c); -ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, uchar4 c); -short8 __ovld __cnfn select(short8 a, short8 b, uchar8 c); -ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, uchar8 c); -short16 __ovld __cnfn select(short16 a, short16 b, uchar16 c); -ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, uchar16 c); -int __ovld __cnfn select(int a, int b, uchar c); -uint __ovld __cnfn select(uint a, uint b, uchar c); -int2 __ovld __cnfn select(int2 a, int2 b, uchar2 c); -uint2 __ovld __cnfn select(uint2 a, uint2 b, uchar2 c); -int3 __ovld __cnfn select(int3 a, int3 b, uchar3 c); -uint3 __ovld __cnfn select(uint3 a, uint3 b, uchar3 c); -int4 __ovld __cnfn select(int4 a, int4 b, uchar4 c); -uint4 __ovld __cnfn select(uint4 a, uint4 b, uchar4 c); -int8 __ovld __cnfn select(int8 a, int8 b, uchar8 c); -uint8 __ovld __cnfn select(uint8 a, uint8 b, uchar8 c); -int16 __ovld __cnfn select(int16 a, int16 b, uchar16 c); -uint16 __ovld __cnfn select(uint16 a, uint16 b, uchar16 c); -long __ovld __cnfn select(long a, long b, uchar c); -ulong __ovld __cnfn select(ulong a, ulong b, uchar c); -long2 __ovld __cnfn select(long2 a, long2 b, uchar2 
c);
-ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, uchar2 c);
-long3 __ovld __cnfn select(long3 a, long3 b, uchar3 c);
-ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, uchar3 c);
-long4 __ovld __cnfn select(long4 a, long4 b, uchar4 c);
-ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, uchar4 c);
-long8 __ovld __cnfn select(long8 a, long8 b, uchar8 c);
-ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, uchar8 c);
-long16 __ovld __cnfn select(long16 a, long16 b, uchar16 c);
-ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, uchar16 c);
-float __ovld __cnfn select(float a, float b, uchar c);
-float2 __ovld __cnfn select(float2 a, float2 b, uchar2 c);
-float3 __ovld __cnfn select(float3 a, float3 b, uchar3 c);
-float4 __ovld __cnfn select(float4 a, float4 b, uchar4 c);
-float8 __ovld __cnfn select(float8 a, float8 b, uchar8 c);
-float16 __ovld __cnfn select(float16 a, float16 b, uchar16 c);
-char __ovld __cnfn select(char a, char b, ushort c);
-uchar __ovld __cnfn select(uchar a, uchar b, ushort c);
-char2 __ovld __cnfn select(char2 a, char2 b, ushort2 c);
-uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, ushort2 c);
-char3 __ovld __cnfn select(char3 a, char3 b, ushort3 c);
-uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, ushort3 c);
-char4 __ovld __cnfn select(char4 a, char4 b, ushort4 c);
-uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, ushort4 c);
-char8 __ovld __cnfn select(char8 a, char8 b, ushort8 c);
-uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, ushort8 c);
-char16 __ovld __cnfn select(char16 a, char16 b, ushort16 c);
-uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, ushort16 c);
+
 short __ovld __cnfn select(short a, short b, ushort c);
 ushort __ovld __cnfn select(ushort a, ushort b, ushort c);
 short2 __ovld __cnfn select(short2 a, short2 b, ushort2 c);
@@ -11676,60 +11467,7 @@ short8 __ovld __cnfn select(short8 a, short8 b, ushort8 c);
 ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, ushort8 c);
 short16 __ovld __cnfn select(short16 a, short16 b, ushort16 c);
 ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, ushort16 c);
-int __ovld __cnfn select(int a, int b, ushort c);
-uint __ovld __cnfn select(uint a, uint b, ushort c);
-int2 __ovld __cnfn select(int2 a, int2 b, ushort2 c);
-uint2 __ovld __cnfn select(uint2 a, uint2 b, ushort2 c);
-int3 __ovld __cnfn select(int3 a, int3 b, ushort3 c);
-uint3 __ovld __cnfn select(uint3 a, uint3 b, ushort3 c);
-int4 __ovld __cnfn select(int4 a, int4 b, ushort4 c);
-uint4 __ovld __cnfn select(uint4 a, uint4 b, ushort4 c);
-int8 __ovld __cnfn select(int8 a, int8 b, ushort8 c);
-uint8 __ovld __cnfn select(uint8 a, uint8 b, ushort8 c);
-int16 __ovld __cnfn select(int16 a, int16 b, ushort16 c);
-uint16 __ovld __cnfn select(uint16 a, uint16 b, ushort16 c);
-long __ovld __cnfn select(long a, long b, ushort c);
-ulong __ovld __cnfn select(ulong a, ulong b, ushort c);
-long2 __ovld __cnfn select(long2 a, long2 b, ushort2 c);
-ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, ushort2 c);
-long3 __ovld __cnfn select(long3 a, long3 b, ushort3 c);
-ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, ushort3 c);
-long4 __ovld __cnfn select(long4 a, long4 b, ushort4 c);
-ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, ushort4 c);
-long8 __ovld __cnfn select(long8 a, long8 b, ushort8 c);
-ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, ushort8 c);
-long16 __ovld __cnfn select(long16 a, long16 b, ushort16 c);
-ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, ushort16 c);
-float __ovld __cnfn select(float a, float b, ushort c);
-float2 __ovld __cnfn select(float2 a, float2 b, ushort2 c);
-float3 __ovld __cnfn select(float3 a, float3 b, ushort3 c);
-float4 __ovld __cnfn select(float4 a, float4 b, ushort4 c);
-float8 __ovld __cnfn select(float8 a, float8 b, ushort8 c);
-float16 __ovld __cnfn select(float16 a, float16 b, ushort16 c);
-char __ovld __cnfn select(char a, char b, uint c);
-uchar __ovld __cnfn select(uchar a, uchar b, uint c);
-char2 __ovld __cnfn select(char2 a, char2 b, uint2 c);
-uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, uint2 c);
-char3 __ovld __cnfn select(char3 a, char3 b, uint3 c);
-uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, uint3 c);
-char4 __ovld __cnfn select(char4 a, char4 b, uint4 c);
-uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, uint4 c);
-char8 __ovld __cnfn select(char8 a, char8 b, uint8 c);
-uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, uint8 c);
-char16 __ovld __cnfn select(char16 a, char16 b, uint16 c);
-uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, uint16 c);
-short __ovld __cnfn select(short a, short b, uint c);
-ushort __ovld __cnfn select(ushort a, ushort b, uint c);
-short2 __ovld __cnfn select(short2 a, short2 b, uint2 c);
-ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, uint2 c);
-short3 __ovld __cnfn select(short3 a, short3 b, uint3 c);
-ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, uint3 c);
-short4 __ovld __cnfn select(short4 a, short4 b, uint4 c);
-ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, uint4 c);
-short8 __ovld __cnfn select(short8 a, short8 b, uint8 c);
-ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, uint8 c);
-short16 __ovld __cnfn select(short16 a, short16 b, uint16 c);
-ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, uint16 c);
+
 int __ovld __cnfn select(int a, int b, uint c);
 uint __ovld __cnfn select(uint a, uint b, uint c);
 int2 __ovld __cnfn select(int2 a, int2 b, uint2 c);
@@ -11742,60 +11480,13 @@ int8 __ovld __cnfn select(int8 a, int8 b, uint8 c);
 uint8 __ovld __cnfn select(uint8 a, uint8 b, uint8 c);
 int16 __ovld __cnfn select(int16 a, int16 b, uint16 c);
 uint16 __ovld __cnfn select(uint16 a, uint16 b, uint16 c);
-long __ovld __cnfn select(long a, long b, uint c);
-ulong __ovld __cnfn select(ulong a, ulong b, uint c);
-long2 __ovld __cnfn select(long2 a, long2 b, uint2 c);
-ulong2 __ovld __cnfn select(ulong2 a, ulong2 b, uint2 c);
-long3 __ovld __cnfn select(long3 a, long3 b, uint3 c);
-ulong3 __ovld __cnfn select(ulong3 a, ulong3 b, uint3 c);
-long4 __ovld __cnfn select(long4 a, long4 b, uint4 c);
-ulong4 __ovld __cnfn select(ulong4 a, ulong4 b, uint4 c);
-long8 __ovld __cnfn select(long8 a, long8 b, uint8 c);
-ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, uint8 c);
-long16 __ovld __cnfn select(long16 a, long16 b, uint16 c);
-ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, uint16 c);
 float __ovld __cnfn select(float a, float b, uint c);
 float2 __ovld __cnfn select(float2 a, float2 b, uint2 c);
 float3 __ovld __cnfn select(float3 a, float3 b, uint3 c);
 float4 __ovld __cnfn select(float4 a, float4 b, uint4 c);
 float8 __ovld __cnfn select(float8 a, float8 b, uint8 c);
 float16 __ovld __cnfn select(float16 a, float16 b, uint16 c);
-char __ovld __cnfn select(char a, char b, ulong c);
-uchar __ovld __cnfn select(uchar a, uchar b, ulong c);
-char2 __ovld __cnfn select(char2 a, char2 b, ulong2 c);
-uchar2 __ovld __cnfn select(uchar2 a, uchar2 b, ulong2 c);
-char3 __ovld __cnfn select(char3 a, char3 b, ulong3 c);
-uchar3 __ovld __cnfn select(uchar3 a, uchar3 b, ulong3 c);
-char4 __ovld __cnfn select(char4 a, char4 b, ulong4 c);
-uchar4 __ovld __cnfn select(uchar4 a, uchar4 b, ulong4 c);
-char8 __ovld __cnfn select(char8 a, char8 b, ulong8 c);
-uchar8 __ovld __cnfn select(uchar8 a, uchar8 b, ulong8 c);
-char16 __ovld __cnfn select(char16 a, char16 b, ulong16 c);
-uchar16 __ovld __cnfn select(uchar16 a, uchar16 b, ulong16 c);
-short __ovld __cnfn select(short a, short b, ulong c);
-ushort __ovld __cnfn select(ushort a, ushort b, ulong c);
-short2 __ovld __cnfn select(short2 a, short2 b, ulong2 c);
-ushort2 __ovld __cnfn select(ushort2 a, ushort2 b, ulong2 c);
-short3 __ovld __cnfn select(short3 a, short3 b, ulong3 c);
-ushort3 __ovld __cnfn select(ushort3 a, ushort3 b, ulong3 c);
-short4 __ovld __cnfn select(short4 a, short4 b, ulong4 c);
-ushort4 __ovld __cnfn select(ushort4 a, ushort4 b, ulong4 c);
-short8 __ovld __cnfn select(short8 a, short8 b, ulong8 c);
-ushort8 __ovld __cnfn select(ushort8 a, ushort8 b, ulong8 c);
-short16 __ovld __cnfn select(short16 a, short16 b, ulong16 c);
-ushort16 __ovld __cnfn select(ushort16 a, ushort16 b, ulong16 c);
-int __ovld __cnfn select(int a, int b, ulong c);
-uint __ovld __cnfn select(uint a, uint b, ulong c);
-int2 __ovld __cnfn select(int2 a, int2 b, ulong2 c);
-uint2 __ovld __cnfn select(uint2 a, uint2 b, ulong2 c);
-int3 __ovld __cnfn select(int3 a, int3 b, ulong3 c);
-uint3 __ovld __cnfn select(uint3 a, uint3 b, ulong3 c);
-int4 __ovld __cnfn select(int4 a, int4 b, ulong4 c);
-uint4 __ovld __cnfn select(uint4 a, uint4 b, ulong4 c);
-int8 __ovld __cnfn select(int8 a, int8 b, ulong8 c);
-uint8 __ovld __cnfn select(uint8 a, uint8 b, ulong8 c);
-int16 __ovld __cnfn select(int16 a, int16 b, ulong16 c);
-uint16 __ovld __cnfn select(uint16 a, uint16 b, ulong16 c);
+
 long __ovld __cnfn select(long a, long b, ulong c);
 ulong __ovld __cnfn select(ulong a, ulong b, ulong c);
 long2 __ovld __cnfn select(long2 a, long2 b, ulong2 c);
@@ -11808,12 +11499,7 @@ long8 __ovld __cnfn select(long8 a, long8 b, ulong8 c);
 ulong8 __ovld __cnfn select(ulong8 a, ulong8 b, ulong8 c);
 long16 __ovld __cnfn select(long16 a, long16 b, ulong16 c);
 ulong16 __ovld __cnfn select(ulong16 a, ulong16 b, ulong16 c);
-float __ovld __cnfn select(float a, float b, ulong c);
-float2 __ovld __cnfn select(float2 a, float2 b, ulong2 c);
-float3 __ovld __cnfn select(float3 a, float3 b, ulong3 c);
-float4 __ovld __cnfn select(float4 a, float4 b, ulong4 c);
-float8 __ovld __cnfn select(float8 a, float8 b, ulong8 c);
-float16 __ovld __cnfn select(float16 a, float16 b, ulong16 c);
+
 #ifdef cl_khr_fp64
 double __ovld __cnfn select(double a, double b, long c);
 double2 __ovld __cnfn select(double2 a, double2 b, long2 c);
@@ -13141,13 +12827,14 @@ void __ovld __conv barrier(cl_mem_fence_flags flags);
 
 #if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
 
-typedef enum memory_scope
-{
-  memory_scope_work_item,
-  memory_scope_work_group,
-  memory_scope_device,
-  memory_scope_all_svm_devices,
-  memory_scope_sub_group
+typedef enum memory_scope {
+  memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
+  memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
+  memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
+  memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
+#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups)
+  memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
+#endif
 } memory_scope;
 
 void __ovld __conv work_group_barrier(cl_mem_fence_flags flags, memory_scope scope);
@@ -13952,11 +13639,11 @@ unsigned long __ovld atom_xor(volatile __local unsigned long *p, unsigned long v
 // enum values aligned with what clang uses in EmitAtomicExpr()
 typedef enum memory_order
 {
-  memory_order_relaxed,
-  memory_order_acquire,
-  memory_order_release,
-  memory_order_acq_rel,
-  memory_order_seq_cst
+  memory_order_relaxed = __ATOMIC_RELAXED,
+  memory_order_acquire = __ATOMIC_ACQUIRE,
+  memory_order_release = __ATOMIC_RELEASE,
+  memory_order_acq_rel = __ATOMIC_ACQ_REL,
+  memory_order_seq_cst = __ATOMIC_SEQ_CST
 } memory_order;
 
 // double atomics support requires extensions cl_khr_int64_base_atomics and cl_khr_int64_extended_atomics
diff --git a/c_headers/unwind.h b/c_headers/unwind.h
index 4f74a34787..345fa4d0c1 100644
--- a/c_headers/unwind.h
+++ b/c_headers/unwind.h
@@ -76,7 +76,13 @@ typedef intptr_t _sleb128_t;
 typedef uintptr_t _uleb128_t;
 
 struct _Unwind_Context;
+#if defined(__arm__) && !(defined(__USING_SJLJ_EXCEPTIONS__) || defined(__ARM_DWARF_EH__))
+struct _Unwind_Control_Block;
+typedef struct _Unwind_Control_Block _Unwind_Exception; /* Alias */
+#else
 struct _Unwind_Exception;
+typedef struct _Unwind_Exception _Unwind_Exception;
+#endif
 typedef enum {
   _URC_NO_REASON = 0,
 #if defined(__arm__) && !defined(__USING_SJLJ_EXCEPTIONS__) && \
@@ -109,8 +115,42 @@ typedef enum {
 } _Unwind_Action;
 
 typedef void (*_Unwind_Exception_Cleanup_Fn)(_Unwind_Reason_Code,
-                                             struct _Unwind_Exception *);
-
+                                             _Unwind_Exception *);
+
+#if defined(__arm__) && !(defined(__USING_SJLJ_EXCEPTIONS__) || defined(__ARM_DWARF_EH__))
+typedef struct _Unwind_Control_Block _Unwind_Control_Block;
+typedef uint32_t _Unwind_EHT_Header;
+
+struct _Unwind_Control_Block {
+  uint64_t exception_class;
+  void (*exception_cleanup)(_Unwind_Reason_Code, _Unwind_Control_Block *);
+  /* unwinder cache (private fields for the unwinder's use) */
+  struct {
+    uint32_t reserved1; /* forced unwind stop function, 0 if not forced */
+    uint32_t reserved2; /* personality routine */
+    uint32_t reserved3; /* callsite */
+    uint32_t reserved4; /* forced unwind stop argument */
+    uint32_t reserved5;
+  } unwinder_cache;
+  /* propagation barrier cache (valid after phase 1) */
+  struct {
+    uint32_t sp;
+    uint32_t bitpattern[5];
+  } barrier_cache;
+  /* cleanup cache (preserved over cleanup) */
+  struct {
+    uint32_t bitpattern[4];
+  } cleanup_cache;
+  /* personality cache (for personality's benefit) */
+  struct {
+    uint32_t fnstart;         /* function start address */
+    _Unwind_EHT_Header *ehtp; /* pointer to EHT entry header word */
+    uint32_t additional;      /* additional data */
+    uint32_t reserved1;
+  } pr_cache;
+  long long int : 0; /* force alignment of next item to 8-byte boundary */
+} __attribute__((__aligned__(8)));
+#else
 struct _Unwind_Exception {
   _Unwind_Exception_Class exception_class;
   _Unwind_Exception_Cleanup_Fn exception_cleanup;
@@ -120,23 +160,24 @@ struct _Unwind_Exception {
  * aligned". GCC has interpreted this to mean "use the maximum useful
 * alignment for the target"; so do we. */
 } __attribute__((__aligned__));
+#endif
 
 typedef _Unwind_Reason_Code (*_Unwind_Stop_Fn)(int, _Unwind_Action,
                                                _Unwind_Exception_Class,
-                                               struct _Unwind_Exception *,
+                                               _Unwind_Exception *,
                                                struct _Unwind_Context *,
                                                void *);
 
-typedef _Unwind_Reason_Code (*_Unwind_Personality_Fn)(
-    int, _Unwind_Action, _Unwind_Exception_Class, struct _Unwind_Exception *,
-    struct _Unwind_Context *);
+typedef _Unwind_Reason_Code (*_Unwind_Personality_Fn)(int, _Unwind_Action,
+                                                      _Unwind_Exception_Class,
+                                                      _Unwind_Exception *,
+                                                      struct _Unwind_Context *);
 typedef _Unwind_Personality_Fn __personality_routine;
 
 typedef _Unwind_Reason_Code (*_Unwind_Trace_Fn)(struct _Unwind_Context *,
                                                 void *);
 
-#if defined(__arm__) && !defined(__APPLE__)
-
+#if defined(__arm__) && !(defined(__USING_SJLJ_EXCEPTIONS__) || defined(__ARM_DWARF_EH__))
 typedef enum {
   _UVRSC_CORE = 0, /* integer register */
   _UVRSC_VFP = 1, /* vfp */
@@ -158,14 +199,12 @@ typedef enum {
   _UVRSR_FAILED = 2
 } _Unwind_VRS_Result;
 
-#if !defined(__USING_SJLJ_EXCEPTIONS__) && !defined(__ARM_DWARF_EH__)
 typedef uint32_t _Unwind_State;
 #define _US_VIRTUAL_UNWIND_FRAME ((_Unwind_State)0)
 #define _US_UNWIND_FRAME_STARTING ((_Unwind_State)1)
 #define _US_UNWIND_FRAME_RESUME ((_Unwind_State)2)
 #define _US_ACTION_MASK ((_Unwind_State)3)
 #define _US_FORCE_UNWIND ((_Unwind_State)8)
-#endif
 
 _Unwind_VRS_Result _Unwind_VRS_Get(struct _Unwind_Context *__context,
                                    _Unwind_VRS_RegClass __regclass,
@@ -224,13 +263,12 @@ _Unwind_Ptr _Unwind_GetRegionStart(struct _Unwind_Context *);
 
 /* DWARF EH functions; currently not available on Darwin/ARM */
 #if !defined(__APPLE__) || !defined(__arm__)
-
-_Unwind_Reason_Code _Unwind_RaiseException(struct _Unwind_Exception *);
-_Unwind_Reason_Code _Unwind_ForcedUnwind(struct _Unwind_Exception *,
-                                         _Unwind_Stop_Fn, void *);
-void _Unwind_DeleteException(struct _Unwind_Exception *);
-void _Unwind_Resume(struct _Unwind_Exception *);
-_Unwind_Reason_Code _Unwind_Resume_or_Rethrow(struct _Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_RaiseException(_Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_ForcedUnwind(_Unwind_Exception *, _Unwind_Stop_Fn,
+                                         void *);
+void _Unwind_DeleteException(_Unwind_Exception *);
+void _Unwind_Resume(_Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_Resume_or_Rethrow(_Unwind_Exception *);
 
 #endif
 
@@ -241,11 +279,11 @@ typedef struct SjLj_Function_Context *_Unwind_FunctionContext_t;
 
 void _Unwind_SjLj_Register(_Unwind_FunctionContext_t);
 void _Unwind_SjLj_Unregister(_Unwind_FunctionContext_t);
-_Unwind_Reason_Code _Unwind_SjLj_RaiseException(struct _Unwind_Exception *);
-_Unwind_Reason_Code _Unwind_SjLj_ForcedUnwind(struct _Unwind_Exception *,
+_Unwind_Reason_Code _Unwind_SjLj_RaiseException(_Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_SjLj_ForcedUnwind(_Unwind_Exception *,
                                               _Unwind_Stop_Fn, void *);
-void _Unwind_SjLj_Resume(struct _Unwind_Exception *);
-_Unwind_Reason_Code _Unwind_SjLj_Resume_or_Rethrow(struct _Unwind_Exception *);
+void _Unwind_SjLj_Resume(_Unwind_Exception *);
+_Unwind_Reason_Code _Unwind_SjLj_Resume_or_Rethrow(_Unwind_Exception *);
 
 void *_Unwind_FindEnclosingFunction(void *);
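For context, the remapped opencl-c.h enums above are what the OpenCL 2.0 explicit atomic builtins consume. A minimal sketch of how a kernel passes them (not part of this commit; the kernel name and arguments are invented for illustration):

```c
// Hypothetical OpenCL 2.0 kernel: memory_order_relaxed and memory_scope_device
// now carry clang's __ATOMIC_RELAXED / __OPENCL_MEMORY_SCOPE_DEVICE values,
// so the builtin can forward them to EmitAtomicExpr() without translation.
kernel void count_nonzero(global const int *in,
                          volatile global atomic_int *counter) {
  size_t gid = get_global_id(0);
  if (in[gid] != 0) {
    // Relaxed read-modify-write, visible to all work-groups on the device.
    atomic_fetch_add_explicit(counter, 1, memory_order_relaxed,
                              memory_scope_device);
  }
}
```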

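Likewise, the unwind.h change lets callers spell the exception type as `_Unwind_Exception` without the `struct` keyword; on ARM EHABI the same spelling resolves to `_Unwind_Control_Block`, so one set of declarations serves both ABIs. A rough, illustrative caller under the new spelling (not from this commit; the helper names and exception-class tag are made up):

```c
#include <stdlib.h>
#include <unwind.h>

/* Cleanup callback the unwinder invokes if the exception is never resumed. */
static void example_cleanup(_Unwind_Reason_Code reason, _Unwind_Exception *exc) {
  (void)reason;
  free(exc);
}

/* Allocate and raise an exception object; on ARM EHABI, _Unwind_Exception
 * here is the 8-byte-aligned _Unwind_Control_Block. */
static _Unwind_Reason_Code raise_example(void) {
  _Unwind_Exception *exc = calloc(1, sizeof(*exc));
  if (!exc)
    return _URC_FATAL_PHASE1_ERROR;
  exc->exception_class = 0x4558414d504c4531ULL; /* arbitrary 8-byte tag */
  exc->exception_cleanup = example_cleanup;
  /* Returns only if unwinding failed or reached the end of the stack. */
  return _Unwind_RaiseException(exc);
}
```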