| author | Andrew Kelley <superjoe30@gmail.com> | 2018-02-23 13:15:16 -0500 |
|---|---|---|
| committer | Andrew Kelley <superjoe30@gmail.com> | 2018-02-23 13:15:16 -0500 |
| commit | 4955c4b8f99bc45ad9aacb13de691614c4e0ad38 (patch) | |
| tree | fb918ba58cfc7da81ab747fe2bc51cfc316d45ec /c_headers | |
| parent | 1ba6e1641a4c5ea1d0d665fe500c9c66d69443a4 (diff) | |
| download | zig-4955c4b8f99bc45ad9aacb13de691614c4e0ad38.tar.gz zig-4955c4b8f99bc45ad9aacb13de691614c4e0ad38.zip | |
update C headers to clang 6.0.0rc3
Diffstat (limited to 'c_headers')
33 files changed, 8991 insertions, 3855 deletions
diff --git a/c_headers/__clang_cuda_cmath.h b/c_headers/__clang_cuda_cmath.h index 9bef82611a..5331ba401a 100644 --- a/c_headers/__clang_cuda_cmath.h +++ b/c_headers/__clang_cuda_cmath.h @@ -131,15 +131,6 @@ __DEVICE__ float ldexp(float __arg, int __exp) { __DEVICE__ float log(float __x) { return ::logf(__x); } __DEVICE__ float log10(float __x) { return ::log10f(__x); } __DEVICE__ float modf(float __x, float *__iptr) { return ::modff(__x, __iptr); } -__DEVICE__ float nexttoward(float __from, double __to) { - return __builtin_nexttowardf(__from, __to); -} -__DEVICE__ double nexttoward(double __from, double __to) { - return __builtin_nexttoward(__from, __to); -} -__DEVICE__ float nexttowardf(float __from, double __to) { - return __builtin_nexttowardf(__from, __to); -} __DEVICE__ float pow(float __base, float __exp) { return ::powf(__base, __exp); } @@ -157,6 +148,10 @@ __DEVICE__ float sqrt(float __x) { return ::sqrtf(__x); } __DEVICE__ float tan(float __x) { return ::tanf(__x); } __DEVICE__ float tanh(float __x) { return ::tanhf(__x); } +// Notably missing above is nexttoward. We omit it because +// libdevice doesn't provide an implementation, and we don't want to be in the +// business of implementing tricky libm functions in this header. + // Now we've defined everything we promised we'd define in // __clang_cuda_math_forward_declares.h. We need to do two additional things to // fix up our math functions. @@ -295,13 +290,6 @@ ldexp(__T __x, int __exp) { return std::ldexp((double)__x, __exp); } -template <typename __T> -__DEVICE__ typename __clang_cuda_enable_if<std::numeric_limits<__T>::is_integer, - double>::type -nexttoward(__T __from, double __to) { - return std::nexttoward((double)__from, __to); -} - template <typename __T1, typename __T2> __DEVICE__ typename __clang_cuda_enable_if< std::numeric_limits<__T1>::is_specialized && @@ -388,7 +376,6 @@ using ::lrint; using ::lround; using ::nearbyint; using ::nextafter; -using ::nexttoward; using ::pow; using ::remainder; using ::remquo; @@ -456,8 +443,6 @@ using ::lroundf; using ::modff; using ::nearbyintf; using ::nextafterf; -using ::nexttowardf; -using ::nexttowardf; using ::powf; using ::remainderf; using ::remquof; diff --git a/c_headers/__clang_cuda_intrinsics.h b/c_headers/__clang_cuda_intrinsics.h index bc5b876577..1794eb3dc1 100644 --- a/c_headers/__clang_cuda_intrinsics.h +++ b/c_headers/__clang_cuda_intrinsics.h @@ -34,23 +34,24 @@ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 #pragma push_macro("__MAKE_SHUFFLES") -#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask) \ - inline __device__ int __FnName(int __val, int __offset, \ +#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask, \ + __Type) \ + inline __device__ int __FnName(int __val, __Type __offset, \ int __width = warpSize) { \ return __IntIntrinsic(__val, __offset, \ ((warpSize - __width) << 8) | (__Mask)); \ } \ - inline __device__ float __FnName(float __val, int __offset, \ + inline __device__ float __FnName(float __val, __Type __offset, \ int __width = warpSize) { \ return __FloatIntrinsic(__val, __offset, \ ((warpSize - __width) << 8) | (__Mask)); \ } \ - inline __device__ unsigned int __FnName(unsigned int __val, int __offset, \ + inline __device__ unsigned int __FnName(unsigned int __val, __Type __offset, \ int __width = warpSize) { \ return static_cast<unsigned int>( \ ::__FnName(static_cast<int>(__val), __offset, __width)); \ } \ - inline __device__ long long __FnName(long long __val, int __offset, \ + inline 
__device__ long long __FnName(long long __val, __Type __offset, \ int __width = warpSize) { \ struct __Bits { \ int __a, __b; \ @@ -65,12 +66,29 @@ memcpy(&__ret, &__tmp, sizeof(__tmp)); \ return __ret; \ } \ + inline __device__ long __FnName(long __val, __Type __offset, \ + int __width = warpSize) { \ + _Static_assert(sizeof(long) == sizeof(long long) || \ + sizeof(long) == sizeof(int)); \ + if (sizeof(long) == sizeof(long long)) { \ + return static_cast<long>( \ + ::__FnName(static_cast<long long>(__val), __offset, __width)); \ + } else if (sizeof(long) == sizeof(int)) { \ + return static_cast<long>( \ + ::__FnName(static_cast<int>(__val), __offset, __width)); \ + } \ + } \ + inline __device__ unsigned long __FnName( \ + unsigned long __val, __Type __offset, int __width = warpSize) { \ + return static_cast<unsigned long>( \ + ::__FnName(static_cast<long>(__val), __offset, __width)); \ + } \ inline __device__ unsigned long long __FnName( \ - unsigned long long __val, int __offset, int __width = warpSize) { \ + unsigned long long __val, __Type __offset, int __width = warpSize) { \ return static_cast<unsigned long long>(::__FnName( \ static_cast<unsigned long long>(__val), __offset, __width)); \ } \ - inline __device__ double __FnName(double __val, int __offset, \ + inline __device__ double __FnName(double __val, __Type __offset, \ int __width = warpSize) { \ long long __tmp; \ _Static_assert(sizeof(__tmp) == sizeof(__val)); \ @@ -81,13 +99,15 @@ return __ret; \ } -__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f); +__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f, int); // We use 0 rather than 31 as our mask, because shfl.up applies to lanes >= // maxLane. -__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0); -__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f); -__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f); - +__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0, + unsigned int); +__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f, + unsigned int); +__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f, + int); #pragma pop_macro("__MAKE_SHUFFLES") #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 @@ -97,25 +117,26 @@ __MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f); // __shfl_sync_* variants available in CUDA-9 #pragma push_macro("__MAKE_SYNC_SHUFFLES") #define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, \ - __Mask) \ - inline __device__ int __FnName(unsigned int __mask, int __val, int __offset, \ - int __width = warpSize) { \ + __Mask, __Type) \ + inline __device__ int __FnName(unsigned int __mask, int __val, \ + __Type __offset, int __width = warpSize) { \ return __IntIntrinsic(__mask, __val, __offset, \ ((warpSize - __width) << 8) | (__Mask)); \ } \ inline __device__ float __FnName(unsigned int __mask, float __val, \ - int __offset, int __width = warpSize) { \ + __Type __offset, int __width = warpSize) { \ return __FloatIntrinsic(__mask, __val, __offset, \ ((warpSize - __width) << 8) | (__Mask)); \ } \ inline __device__ unsigned int __FnName(unsigned int __mask, \ - unsigned int __val, int __offset, \ + unsigned int __val, __Type __offset, \ int __width = warpSize) { \ return static_cast<unsigned int>( \ ::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \ } \ inline __device__ long long __FnName(unsigned int __mask, long long 
__val, \ - int __offset, int __width = warpSize) { \ + __Type __offset, \ + int __width = warpSize) { \ struct __Bits { \ int __a, __b; \ }; \ @@ -130,13 +151,31 @@ __MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f); return __ret; \ } \ inline __device__ unsigned long long __FnName( \ - unsigned int __mask, unsigned long long __val, int __offset, \ + unsigned int __mask, unsigned long long __val, __Type __offset, \ int __width = warpSize) { \ return static_cast<unsigned long long>(::__FnName( \ __mask, static_cast<unsigned long long>(__val), __offset, __width)); \ } \ + inline __device__ long __FnName(unsigned int __mask, long __val, \ + __Type __offset, int __width = warpSize) { \ + _Static_assert(sizeof(long) == sizeof(long long) || \ + sizeof(long) == sizeof(int)); \ + if (sizeof(long) == sizeof(long long)) { \ + return static_cast<long>(::__FnName( \ + __mask, static_cast<long long>(__val), __offset, __width)); \ + } else if (sizeof(long) == sizeof(int)) { \ + return static_cast<long>( \ + ::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \ + } \ + } \ + inline __device__ unsigned long __FnName( \ + unsigned int __mask, unsigned long __val, __Type __offset, \ + int __width = warpSize) { \ + return static_cast<unsigned long>( \ + ::__FnName(__mask, static_cast<long>(__val), __offset, __width)); \ + } \ inline __device__ double __FnName(unsigned int __mask, double __val, \ - int __offset, int __width = warpSize) { \ + __Type __offset, int __width = warpSize) { \ long long __tmp; \ _Static_assert(sizeof(__tmp) == sizeof(__val)); \ memcpy(&__tmp, &__val, sizeof(__val)); \ @@ -146,15 +185,15 @@ __MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f); return __ret; \ } __MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32, - __nvvm_shfl_sync_idx_f32, 0x1f); + __nvvm_shfl_sync_idx_f32, 0x1f, int); // We use 0 rather than 31 as our mask, because shfl.up applies to lanes >= // maxLane. __MAKE_SYNC_SHUFFLES(__shfl_up_sync, __nvvm_shfl_sync_up_i32, - __nvvm_shfl_sync_up_f32, 0); + __nvvm_shfl_sync_up_f32, 0, unsigned int); __MAKE_SYNC_SHUFFLES(__shfl_down_sync, __nvvm_shfl_sync_down_i32, - __nvvm_shfl_sync_down_f32, 0x1f); + __nvvm_shfl_sync_down_f32, 0x1f, unsigned int); __MAKE_SYNC_SHUFFLES(__shfl_xor_sync, __nvvm_shfl_sync_bfly_i32, - __nvvm_shfl_sync_bfly_f32, 0x1f); + __nvvm_shfl_sync_bfly_f32, 0x1f, int); #pragma pop_macro("__MAKE_SYNC_SHUFFLES") inline __device__ void __syncwarp(unsigned int mask = 0xffffffff) { @@ -188,6 +227,10 @@ inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) { inline __device__ unsigned int __activemask() { return __nvvm_vote_ballot(1); } +inline __device__ unsigned int __fns(unsigned mask, unsigned base, int offset) { + return __nvvm_fns(mask, base, offset); +} + #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 // Define __match* builtins CUDA-9 headers expect to see. 
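The net effect of the `__MAKE_SHUFFLES`/`__MAKE_SYNC_SHUFFLES` changes above is that the warp-shuffle builtins gain `long` and `unsigned long` overloads (dispatching on `sizeof(long)`) and take the lane-offset type appropriate to each shuffle flavor. A minimal usage sketch, illustrative only and not part of the commit, assuming device code compiled against these headers in CUDA-9 mode:

```c++
// Hypothetical warp-level sum reduction over a `long` value. The overloads
// added above forward the `long` through either the `long long` or the `int`
// shuffle variant, depending on sizeof(long) on the target.
__device__ long warp_sum(long value) {
  // 0xffffffff selects every lane in the warp.
  for (int delta = 16; delta > 0; delta >>= 1)
    value += __shfl_down_sync(0xffffffffu, value, delta);
  return value;
}
```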
diff --git a/c_headers/__clang_cuda_math_forward_declares.h b/c_headers/__clang_cuda_math_forward_declares.h index 49c805151d..c31b1f4cda 100644 --- a/c_headers/__clang_cuda_math_forward_declares.h +++ b/c_headers/__clang_cuda_math_forward_declares.h @@ -149,9 +149,6 @@ __DEVICE__ double nearbyint(double); __DEVICE__ float nearbyint(float); __DEVICE__ double nextafter(double, double); __DEVICE__ float nextafter(float, float); -__DEVICE__ double nexttoward(double, double); -__DEVICE__ float nexttoward(float, double); -__DEVICE__ float nexttowardf(float, double); __DEVICE__ double pow(double, double); __DEVICE__ double pow(double, int); __DEVICE__ float pow(float, float); @@ -185,6 +182,10 @@ __DEVICE__ float tgamma(float); __DEVICE__ double trunc(double); __DEVICE__ float trunc(float); +// Notably missing above is nexttoward, which we don't define on +// the device side because libdevice doesn't give us an implementation, and we +// don't want to be in the business of writing one ourselves. + // We need to define these overloads in exactly the namespace our standard // library uses (including the right inline namespace), otherwise they won't be // picked up by other functions in the standard library (e.g. functions in @@ -255,7 +256,6 @@ using ::nan; using ::nanf; using ::nearbyint; using ::nextafter; -using ::nexttoward; using ::pow; using ::remainder; using ::remquo; diff --git a/c_headers/__clang_cuda_runtime_wrapper.h b/c_headers/__clang_cuda_runtime_wrapper.h index b8ffc2ce9f..a82a8490f3 100644 --- a/c_headers/__clang_cuda_runtime_wrapper.h +++ b/c_headers/__clang_cuda_runtime_wrapper.h @@ -270,12 +270,18 @@ static inline __device__ void __brkpt(int __c) { __brkpt(); } // include guard from math.h wrapper from libstdc++. We have to undo the header // guard temporarily to get the definitions we need. 
#pragma push_macro("_GLIBCXX_MATH_H") +#pragma push_macro("_LIBCPP_VERSION") #if CUDA_VERSION >= 9000 #undef _GLIBCXX_MATH_H +// We also need to undo another guard that checks for libc++ 3.8+ +#ifdef _LIBCPP_VERSION +#define _LIBCPP_VERSION 3700 +#endif #endif #include "math_functions.hpp" #pragma pop_macro("_GLIBCXX_MATH_H") +#pragma pop_macro("_LIBCPP_VERSION") #pragma pop_macro("__GNUC__") #pragma pop_macro("signbit") diff --git a/c_headers/arm_neon.h b/c_headers/arm_neon.h index f5ca59bb32..3da63d994d 100644 --- a/c_headers/arm_neon.h +++ b/c_headers/arm_neon.h @@ -40461,6 +40461,3318 @@ __ai float32x2_t vfms_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) #endif #endif +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(__aarch64__) +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vabdq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vabdq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai float16x8_t vabdq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vabdq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vabd_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vabd_v((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4_t vabd_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vabd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vabsq_f16(float16x8_t __p0) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vabsq_v((int8x16_t)__p0, 40); + return __ret; +} +#else +__ai float16x8_t vabsq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vabsq_v((int8x16_t)__rev0, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vabs_f16(float16x4_t __p0) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vabs_v((int8x8_t)__p0, 8); + return __ret; +} +#else +__ai float16x4_t vabs_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vabs_v((int8x8_t)__rev0, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vaddq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = __p0 + __p1; + return __ret; +} +#else +__ai float16x8_t vaddq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret 
= __rev0 + __rev1; + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vadd_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = __p0 + __p1; + return __ret; +} +#else +__ai float16x4_t vadd_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = __rev0 + __rev1; + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vbslq_f16(uint16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vbslq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40); + return __ret; +} +#else +__ai float16x8_t vbslq_f16(uint16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { + uint16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vbslq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vbsl_f16(uint16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vbsl_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8); + return __ret; +} +#else +__ai float16x4_t vbsl_f16(uint16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { + uint16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vbsl_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x8_t vcageq_f16(float16x8_t __p0, float16x8_t __p1) { + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcageq_v((int8x16_t)__p0, (int8x16_t)__p1, 49); + return __ret; +} +#else +__ai uint16x8_t vcageq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcageq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x4_t vcage_f16(float16x4_t __p0, float16x4_t __p1) { + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcage_v((int8x8_t)__p0, (int8x8_t)__p1, 17); + return __ret; +} +#else +__ai uint16x4_t vcage_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcage_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17); + __ret = 
__builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x8_t vcagtq_f16(float16x8_t __p0, float16x8_t __p1) { + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcagtq_v((int8x16_t)__p0, (int8x16_t)__p1, 49); + return __ret; +} +#else +__ai uint16x8_t vcagtq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcagtq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x4_t vcagt_f16(float16x4_t __p0, float16x4_t __p1) { + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcagt_v((int8x8_t)__p0, (int8x8_t)__p1, 17); + return __ret; +} +#else +__ai uint16x4_t vcagt_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcagt_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x8_t vcaleq_f16(float16x8_t __p0, float16x8_t __p1) { + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcaleq_v((int8x16_t)__p0, (int8x16_t)__p1, 49); + return __ret; +} +#else +__ai uint16x8_t vcaleq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcaleq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x4_t vcale_f16(float16x4_t __p0, float16x4_t __p1) { + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcale_v((int8x8_t)__p0, (int8x8_t)__p1, 17); + return __ret; +} +#else +__ai uint16x4_t vcale_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcale_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x8_t vcaltq_f16(float16x8_t __p0, float16x8_t __p1) { + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcaltq_v((int8x16_t)__p0, (int8x16_t)__p1, 49); + return __ret; +} +#else +__ai uint16x8_t vcaltq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcaltq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 49); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x4_t vcalt_f16(float16x4_t __p0, float16x4_t __p1) { + uint16x4_t __ret; + __ret = 
(uint16x4_t) __builtin_neon_vcalt_v((int8x8_t)__p0, (int8x8_t)__p1, 17); + return __ret; +} +#else +__ai uint16x4_t vcalt_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcalt_v((int8x8_t)__rev0, (int8x8_t)__rev1, 17); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x8_t vceqq_f16(float16x8_t __p0, float16x8_t __p1) { + uint16x8_t __ret; + __ret = (uint16x8_t)(__p0 == __p1); + return __ret; +} +#else +__ai uint16x8_t vceqq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret; + __ret = (uint16x8_t)(__rev0 == __rev1); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x4_t vceq_f16(float16x4_t __p0, float16x4_t __p1) { + uint16x4_t __ret; + __ret = (uint16x4_t)(__p0 == __p1); + return __ret; +} +#else +__ai uint16x4_t vceq_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + uint16x4_t __ret; + __ret = (uint16x4_t)(__rev0 == __rev1); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x8_t vceqzq_f16(float16x8_t __p0) { + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vceqzq_v((int8x16_t)__p0, 49); + return __ret; +} +#else +__ai uint16x8_t vceqzq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vceqzq_v((int8x16_t)__rev0, 49); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x4_t vceqz_f16(float16x4_t __p0) { + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vceqz_v((int8x8_t)__p0, 17); + return __ret; +} +#else +__ai uint16x4_t vceqz_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vceqz_v((int8x8_t)__rev0, 17); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x8_t vcgeq_f16(float16x8_t __p0, float16x8_t __p1) { + uint16x8_t __ret; + __ret = (uint16x8_t)(__p0 >= __p1); + return __ret; +} +#else +__ai uint16x8_t vcgeq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret; + __ret = (uint16x8_t)(__rev0 >= __rev1); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x4_t vcge_f16(float16x4_t __p0, float16x4_t __p1) { + uint16x4_t __ret; + __ret = (uint16x4_t)(__p0 >= __p1); + return __ret; +} +#else +__ai uint16x4_t vcge_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = 
__builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + uint16x4_t __ret; + __ret = (uint16x4_t)(__rev0 >= __rev1); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x8_t vcgezq_f16(float16x8_t __p0) { + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcgezq_v((int8x16_t)__p0, 49); + return __ret; +} +#else +__ai uint16x8_t vcgezq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcgezq_v((int8x16_t)__rev0, 49); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x4_t vcgez_f16(float16x4_t __p0) { + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcgez_v((int8x8_t)__p0, 17); + return __ret; +} +#else +__ai uint16x4_t vcgez_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcgez_v((int8x8_t)__rev0, 17); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x8_t vcgtq_f16(float16x8_t __p0, float16x8_t __p1) { + uint16x8_t __ret; + __ret = (uint16x8_t)(__p0 > __p1); + return __ret; +} +#else +__ai uint16x8_t vcgtq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret; + __ret = (uint16x8_t)(__rev0 > __rev1); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x4_t vcgt_f16(float16x4_t __p0, float16x4_t __p1) { + uint16x4_t __ret; + __ret = (uint16x4_t)(__p0 > __p1); + return __ret; +} +#else +__ai uint16x4_t vcgt_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + uint16x4_t __ret; + __ret = (uint16x4_t)(__rev0 > __rev1); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x8_t vcgtzq_f16(float16x8_t __p0) { + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcgtzq_v((int8x16_t)__p0, 49); + return __ret; +} +#else +__ai uint16x8_t vcgtzq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcgtzq_v((int8x16_t)__rev0, 49); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x4_t vcgtz_f16(float16x4_t __p0) { + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcgtz_v((int8x8_t)__p0, 17); + return __ret; +} +#else +__ai uint16x4_t vcgtz_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcgtz_v((int8x8_t)__rev0, 17); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x8_t vcleq_f16(float16x8_t __p0, float16x8_t __p1) { + uint16x8_t __ret; 
+ __ret = (uint16x8_t)(__p0 <= __p1); + return __ret; +} +#else +__ai uint16x8_t vcleq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret; + __ret = (uint16x8_t)(__rev0 <= __rev1); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x4_t vcle_f16(float16x4_t __p0, float16x4_t __p1) { + uint16x4_t __ret; + __ret = (uint16x4_t)(__p0 <= __p1); + return __ret; +} +#else +__ai uint16x4_t vcle_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + uint16x4_t __ret; + __ret = (uint16x4_t)(__rev0 <= __rev1); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x8_t vclezq_f16(float16x8_t __p0) { + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vclezq_v((int8x16_t)__p0, 49); + return __ret; +} +#else +__ai uint16x8_t vclezq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vclezq_v((int8x16_t)__rev0, 49); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x4_t vclez_f16(float16x4_t __p0) { + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vclez_v((int8x8_t)__p0, 17); + return __ret; +} +#else +__ai uint16x4_t vclez_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vclez_v((int8x8_t)__rev0, 17); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x8_t vcltq_f16(float16x8_t __p0, float16x8_t __p1) { + uint16x8_t __ret; + __ret = (uint16x8_t)(__p0 < __p1); + return __ret; +} +#else +__ai uint16x8_t vcltq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret; + __ret = (uint16x8_t)(__rev0 < __rev1); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x4_t vclt_f16(float16x4_t __p0, float16x4_t __p1) { + uint16x4_t __ret; + __ret = (uint16x4_t)(__p0 < __p1); + return __ret; +} +#else +__ai uint16x4_t vclt_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + uint16x4_t __ret; + __ret = (uint16x4_t)(__rev0 < __rev1); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x8_t vcltzq_f16(float16x8_t __p0) { + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcltzq_v((int8x16_t)__p0, 49); + return __ret; +} +#else +__ai uint16x8_t vcltzq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret; + __ret = (uint16x8_t) 
__builtin_neon_vcltzq_v((int8x16_t)__rev0, 49); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x4_t vcltz_f16(float16x4_t __p0) { + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcltz_v((int8x8_t)__p0, 17); + return __ret; +} +#else +__ai uint16x4_t vcltz_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcltz_v((int8x8_t)__rev0, 17); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vcvtq_f16_u16(uint16x8_t __p0) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vcvtq_f16_v((int8x16_t)__p0, 49); + return __ret; +} +#else +__ai float16x8_t vcvtq_f16_u16(uint16x8_t __p0) { + uint16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vcvtq_f16_v((int8x16_t)__rev0, 49); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vcvtq_f16_s16(int16x8_t __p0) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vcvtq_f16_v((int8x16_t)__p0, 33); + return __ret; +} +#else +__ai float16x8_t vcvtq_f16_s16(int16x8_t __p0) { + int16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vcvtq_f16_v((int8x16_t)__rev0, 33); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vcvt_f16_u16(uint16x4_t __p0) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vcvt_f16_v((int8x8_t)__p0, 17); + return __ret; +} +#else +__ai float16x4_t vcvt_f16_u16(uint16x4_t __p0) { + uint16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vcvt_f16_v((int8x8_t)__rev0, 17); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vcvt_f16_s16(int16x4_t __p0) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vcvt_f16_v((int8x8_t)__p0, 1); + return __ret; +} +#else +__ai float16x4_t vcvt_f16_s16(int16x4_t __p0) { + int16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vcvt_f16_v((int8x8_t)__rev0, 1); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvtq_n_f16_u16(__p0, __p1) __extension__ ({ \ + uint16x8_t __s0 = __p0; \ + float16x8_t __ret; \ + __ret = (float16x8_t) __builtin_neon_vcvtq_n_f16_v((int8x16_t)__s0, __p1, 49); \ + __ret; \ +}) +#else +#define vcvtq_n_f16_u16(__p0, __p1) __extension__ ({ \ + uint16x8_t __s0 = __p0; \ + uint16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __ret; \ + __ret = (float16x8_t) __builtin_neon_vcvtq_n_f16_v((int8x16_t)__rev0, __p1, 49); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvtq_n_f16_s16(__p0, __p1) __extension__ ({ \ + int16x8_t __s0 = __p0; \ + float16x8_t __ret; \ + __ret = (float16x8_t) __builtin_neon_vcvtq_n_f16_v((int8x16_t)__s0, __p1, 
33); \ + __ret; \ +}) +#else +#define vcvtq_n_f16_s16(__p0, __p1) __extension__ ({ \ + int16x8_t __s0 = __p0; \ + int16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __ret; \ + __ret = (float16x8_t) __builtin_neon_vcvtq_n_f16_v((int8x16_t)__rev0, __p1, 33); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvt_n_f16_u16(__p0, __p1) __extension__ ({ \ + uint16x4_t __s0 = __p0; \ + float16x4_t __ret; \ + __ret = (float16x4_t) __builtin_neon_vcvt_n_f16_v((int8x8_t)__s0, __p1, 17); \ + __ret; \ +}) +#else +#define vcvt_n_f16_u16(__p0, __p1) __extension__ ({ \ + uint16x4_t __s0 = __p0; \ + uint16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16x4_t __ret; \ + __ret = (float16x4_t) __builtin_neon_vcvt_n_f16_v((int8x8_t)__rev0, __p1, 17); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvt_n_f16_s16(__p0, __p1) __extension__ ({ \ + int16x4_t __s0 = __p0; \ + float16x4_t __ret; \ + __ret = (float16x4_t) __builtin_neon_vcvt_n_f16_v((int8x8_t)__s0, __p1, 1); \ + __ret; \ +}) +#else +#define vcvt_n_f16_s16(__p0, __p1) __extension__ ({ \ + int16x4_t __s0 = __p0; \ + int16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16x4_t __ret; \ + __ret = (float16x4_t) __builtin_neon_vcvt_n_f16_v((int8x8_t)__rev0, __p1, 1); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvtq_n_s16_f16(__p0, __p1) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + int16x8_t __ret; \ + __ret = (int16x8_t) __builtin_neon_vcvtq_n_s16_v((int8x16_t)__s0, __p1, 33); \ + __ret; \ +}) +#else +#define vcvtq_n_s16_f16(__p0, __p1) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __ret; \ + __ret = (int16x8_t) __builtin_neon_vcvtq_n_s16_v((int8x16_t)__rev0, __p1, 33); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvt_n_s16_f16(__p0, __p1) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + int16x4_t __ret; \ + __ret = (int16x4_t) __builtin_neon_vcvt_n_s16_v((int8x8_t)__s0, __p1, 1); \ + __ret; \ +}) +#else +#define vcvt_n_s16_f16(__p0, __p1) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + int16x4_t __ret; \ + __ret = (int16x4_t) __builtin_neon_vcvt_n_s16_v((int8x8_t)__rev0, __p1, 1); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvtq_n_u16_f16(__p0, __p1) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + uint16x8_t __ret; \ + __ret = (uint16x8_t) __builtin_neon_vcvtq_n_u16_v((int8x16_t)__s0, __p1, 49); \ + __ret; \ +}) +#else +#define vcvtq_n_u16_f16(__p0, __p1) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x8_t __ret; \ + __ret = (uint16x8_t) __builtin_neon_vcvtq_n_u16_v((int8x16_t)__rev0, __p1, 49); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvt_n_u16_f16(__p0, __p1) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + uint16x4_t __ret; \ + 
__ret = (uint16x4_t) __builtin_neon_vcvt_n_u16_v((int8x8_t)__s0, __p1, 17); \ + __ret; \ +}) +#else +#define vcvt_n_u16_f16(__p0, __p1) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + uint16x4_t __ret; \ + __ret = (uint16x4_t) __builtin_neon_vcvt_n_u16_v((int8x8_t)__rev0, __p1, 17); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai int16x8_t vcvtq_s16_f16(float16x8_t __p0) { + int16x8_t __ret; + __ret = (int16x8_t) __builtin_neon_vcvtq_s16_v((int8x16_t)__p0, 33); + return __ret; +} +#else +__ai int16x8_t vcvtq_s16_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + int16x8_t __ret; + __ret = (int16x8_t) __builtin_neon_vcvtq_s16_v((int8x16_t)__rev0, 33); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai int16x4_t vcvt_s16_f16(float16x4_t __p0) { + int16x4_t __ret; + __ret = (int16x4_t) __builtin_neon_vcvt_s16_v((int8x8_t)__p0, 1); + return __ret; +} +#else +__ai int16x4_t vcvt_s16_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + int16x4_t __ret; + __ret = (int16x4_t) __builtin_neon_vcvt_s16_v((int8x8_t)__rev0, 1); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x8_t vcvtq_u16_f16(float16x8_t __p0) { + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcvtq_u16_v((int8x16_t)__p0, 49); + return __ret; +} +#else +__ai uint16x8_t vcvtq_u16_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcvtq_u16_v((int8x16_t)__rev0, 49); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x4_t vcvt_u16_f16(float16x4_t __p0) { + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcvt_u16_v((int8x8_t)__p0, 17); + return __ret; +} +#else +__ai uint16x4_t vcvt_u16_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcvt_u16_v((int8x8_t)__rev0, 17); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai int16x8_t vcvtaq_s16_f16(float16x8_t __p0) { + int16x8_t __ret; + __ret = (int16x8_t) __builtin_neon_vcvtaq_s16_v((int8x16_t)__p0, 33); + return __ret; +} +#else +__ai int16x8_t vcvtaq_s16_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + int16x8_t __ret; + __ret = (int16x8_t) __builtin_neon_vcvtaq_s16_v((int8x16_t)__rev0, 33); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai int16x4_t vcvta_s16_f16(float16x4_t __p0) { + int16x4_t __ret; + __ret = (int16x4_t) __builtin_neon_vcvta_s16_v((int8x8_t)__p0, 1); + return __ret; +} +#else +__ai int16x4_t vcvta_s16_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + int16x4_t __ret; + __ret = (int16x4_t) __builtin_neon_vcvta_s16_v((int8x8_t)__rev0, 1); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + 
+#ifdef __LITTLE_ENDIAN__ +__ai uint16x8_t vcvtaq_u16_f16(float16x8_t __p0) { + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcvtaq_u16_v((int8x16_t)__p0, 49); + return __ret; +} +#else +__ai uint16x8_t vcvtaq_u16_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcvtaq_u16_v((int8x16_t)__rev0, 49); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x4_t vcvta_u16_f16(float16x4_t __p0) { + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcvta_u16_v((int8x8_t)__p0, 17); + return __ret; +} +#else +__ai uint16x4_t vcvta_u16_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcvta_u16_v((int8x8_t)__rev0, 17); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai int16x8_t vcvtmq_s16_f16(float16x8_t __p0) { + int16x8_t __ret; + __ret = (int16x8_t) __builtin_neon_vcvtmq_s16_v((int8x16_t)__p0, 33); + return __ret; +} +#else +__ai int16x8_t vcvtmq_s16_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + int16x8_t __ret; + __ret = (int16x8_t) __builtin_neon_vcvtmq_s16_v((int8x16_t)__rev0, 33); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai int16x4_t vcvtm_s16_f16(float16x4_t __p0) { + int16x4_t __ret; + __ret = (int16x4_t) __builtin_neon_vcvtm_s16_v((int8x8_t)__p0, 1); + return __ret; +} +#else +__ai int16x4_t vcvtm_s16_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + int16x4_t __ret; + __ret = (int16x4_t) __builtin_neon_vcvtm_s16_v((int8x8_t)__rev0, 1); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x8_t vcvtmq_u16_f16(float16x8_t __p0) { + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcvtmq_u16_v((int8x16_t)__p0, 49); + return __ret; +} +#else +__ai uint16x8_t vcvtmq_u16_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcvtmq_u16_v((int8x16_t)__rev0, 49); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x4_t vcvtm_u16_f16(float16x4_t __p0) { + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcvtm_u16_v((int8x8_t)__p0, 17); + return __ret; +} +#else +__ai uint16x4_t vcvtm_u16_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcvtm_u16_v((int8x8_t)__rev0, 17); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai int16x8_t vcvtnq_s16_f16(float16x8_t __p0) { + int16x8_t __ret; + __ret = (int16x8_t) __builtin_neon_vcvtnq_s16_v((int8x16_t)__p0, 33); + return __ret; +} +#else +__ai int16x8_t vcvtnq_s16_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + int16x8_t __ret; + __ret = (int16x8_t) __builtin_neon_vcvtnq_s16_v((int8x16_t)__rev0, 33); 
+ __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai int16x4_t vcvtn_s16_f16(float16x4_t __p0) { + int16x4_t __ret; + __ret = (int16x4_t) __builtin_neon_vcvtn_s16_v((int8x8_t)__p0, 1); + return __ret; +} +#else +__ai int16x4_t vcvtn_s16_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + int16x4_t __ret; + __ret = (int16x4_t) __builtin_neon_vcvtn_s16_v((int8x8_t)__rev0, 1); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x8_t vcvtnq_u16_f16(float16x8_t __p0) { + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcvtnq_u16_v((int8x16_t)__p0, 49); + return __ret; +} +#else +__ai uint16x8_t vcvtnq_u16_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcvtnq_u16_v((int8x16_t)__rev0, 49); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x4_t vcvtn_u16_f16(float16x4_t __p0) { + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcvtn_u16_v((int8x8_t)__p0, 17); + return __ret; +} +#else +__ai uint16x4_t vcvtn_u16_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcvtn_u16_v((int8x8_t)__rev0, 17); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai int16x8_t vcvtpq_s16_f16(float16x8_t __p0) { + int16x8_t __ret; + __ret = (int16x8_t) __builtin_neon_vcvtpq_s16_v((int8x16_t)__p0, 33); + return __ret; +} +#else +__ai int16x8_t vcvtpq_s16_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + int16x8_t __ret; + __ret = (int16x8_t) __builtin_neon_vcvtpq_s16_v((int8x16_t)__rev0, 33); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai int16x4_t vcvtp_s16_f16(float16x4_t __p0) { + int16x4_t __ret; + __ret = (int16x4_t) __builtin_neon_vcvtp_s16_v((int8x8_t)__p0, 1); + return __ret; +} +#else +__ai int16x4_t vcvtp_s16_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + int16x4_t __ret; + __ret = (int16x4_t) __builtin_neon_vcvtp_s16_v((int8x8_t)__rev0, 1); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x8_t vcvtpq_u16_f16(float16x8_t __p0) { + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcvtpq_u16_v((int8x16_t)__p0, 49); + return __ret; +} +#else +__ai uint16x8_t vcvtpq_u16_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret; + __ret = (uint16x8_t) __builtin_neon_vcvtpq_u16_v((int8x16_t)__rev0, 49); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint16x4_t vcvtp_u16_f16(float16x4_t __p0) { + uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcvtp_u16_v((int8x8_t)__p0, 17); + return __ret; +} +#else +__ai uint16x4_t vcvtp_u16_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + 
uint16x4_t __ret; + __ret = (uint16x4_t) __builtin_neon_vcvtp_u16_v((int8x8_t)__rev0, 17); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vdivq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = __p0 / __p1; + return __ret; +} +#else +__ai float16x8_t vdivq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = __rev0 / __rev1; + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vdiv_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = __p0 / __p1; + return __ret; +} +#else +__ai float16x4_t vdiv_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = __rev0 / __rev1; + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vduph_lane_f16(__p0, __p1) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vduph_lane_f16((int8x8_t)__s0, __p1); \ + __ret; \ +}) +#else +#define vduph_lane_f16(__p0, __p1) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vduph_lane_f16((int8x8_t)__rev0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vduph_laneq_f16(__p0, __p1) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vduph_laneq_f16((int8x16_t)__s0, __p1); \ + __ret; \ +}) +#else +#define vduph_laneq_f16(__p0, __p1) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vduph_laneq_f16((int8x16_t)__rev0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vextq_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16x8_t __ret; \ + __ret = (float16x8_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 40); \ + __ret; \ +}) +#else +#define vextq_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __ret; \ + __ret = (float16x8_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 40); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vext_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x4_t __ret; \ + __ret = (float16x4_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 8); \ + __ret; \ +}) +#else +#define vext_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x4_t __rev0; __rev0 = 
__builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \ + float16x4_t __ret; \ + __ret = (float16x4_t) __builtin_neon_vext_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 8); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vfmaq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vfmaq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40); + return __ret; +} +#else +__ai float16x8_t vfmaq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vfmaq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +__ai float16x8_t __noswap_vfmaq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vfmaq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vfma_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vfma_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8); + return __ret; +} +#else +__ai float16x4_t vfma_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vfma_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +__ai float16x4_t __noswap_vfma_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vfma_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmah_lane_f16(__p0, __p1, __p2, __p3) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16x4_t __s2 = __p2; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vfmah_lane_f16(__s0, __s1, (int8x8_t)__s2, __p3); \ + __ret; \ +}) +#else +#define vfmah_lane_f16(__p0, __p1, __p2, __p3) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16x4_t __s2 = __p2; \ + float16x4_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vfmah_lane_f16(__s0, __s1, (int8x8_t)__rev2, __p3); \ + __ret; \ +}) +#define __noswap_vfmah_lane_f16(__p0, __p1, __p2, __p3) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16x4_t __s2 = __p2; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vfmah_lane_f16(__s0, __s1, (int8x8_t)__s2, __p3); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmaq_lane_f16(__p0, __p1, __p2, __p3) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __s1 = 
__p1; \ + float16x4_t __s2 = __p2; \ + float16x8_t __ret; \ + __ret = (float16x8_t) __builtin_neon_vfmaq_lane_v((int8x16_t)__s0, (int8x16_t)__s1, (int8x8_t)__s2, __p3, 40); \ + __ret; \ +}) +#else +#define vfmaq_lane_f16(__p0, __p1, __p2, __p3) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16x4_t __s2 = __p2; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \ + float16x8_t __ret; \ + __ret = (float16x8_t) __builtin_neon_vfmaq_lane_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x8_t)__rev2, __p3, 40); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#define __noswap_vfmaq_lane_f16(__p0, __p1, __p2, __p3) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16x4_t __s2 = __p2; \ + float16x8_t __ret; \ + __ret = (float16x8_t) __builtin_neon_vfmaq_lane_v((int8x16_t)__s0, (int8x16_t)__s1, (int8x8_t)__s2, __p3, 40); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfma_lane_f16(__p0, __p1, __p2, __p3) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x4_t __s2 = __p2; \ + float16x4_t __ret; \ + __ret = (float16x4_t) __builtin_neon_vfma_lane_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x8_t)__s2, __p3, 8); \ + __ret; \ +}) +#else +#define vfma_lane_f16(__p0, __p1, __p2, __p3) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x4_t __s2 = __p2; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \ + float16x4_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 3, 2, 1, 0); \ + float16x4_t __ret; \ + __ret = (float16x4_t) __builtin_neon_vfma_lane_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, __p3, 8); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#define __noswap_vfma_lane_f16(__p0, __p1, __p2, __p3) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x4_t __s2 = __p2; \ + float16x4_t __ret; \ + __ret = (float16x4_t) __builtin_neon_vfma_lane_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x8_t)__s2, __p3, 8); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmah_laneq_f16(__p0, __p1, __p2, __p3) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16x8_t __s2 = __p2; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vfmah_laneq_f16(__s0, __s1, (int8x16_t)__s2, __p3); \ + __ret; \ +}) +#else +#define vfmah_laneq_f16(__p0, __p1, __p2, __p3) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16x8_t __s2 = __p2; \ + float16x8_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vfmah_laneq_f16(__s0, __s1, (int8x16_t)__rev2, __p3); \ + __ret; \ +}) +#define __noswap_vfmah_laneq_f16(__p0, __p1, __p2, __p3) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16x8_t __s2 = __p2; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vfmah_laneq_f16(__s0, __s1, (int8x16_t)__s2, __p3); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmaq_laneq_f16(__p0, __p1, __p2, __p3) __extension__ ({ \ + float16x8_t __s0 = 
__p0; \ + float16x8_t __s1 = __p1; \ + float16x8_t __s2 = __p2; \ + float16x8_t __ret; \ + __ret = (float16x8_t) __builtin_neon_vfmaq_laneq_v((int8x16_t)__s0, (int8x16_t)__s1, (int8x16_t)__s2, __p3, 40); \ + __ret; \ +}) +#else +#define vfmaq_laneq_f16(__p0, __p1, __p2, __p3) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16x8_t __s2 = __p2; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __ret; \ + __ret = (float16x8_t) __builtin_neon_vfmaq_laneq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, __p3, 40); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#define __noswap_vfmaq_laneq_f16(__p0, __p1, __p2, __p3) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16x8_t __s2 = __p2; \ + float16x8_t __ret; \ + __ret = (float16x8_t) __builtin_neon_vfmaq_laneq_v((int8x16_t)__s0, (int8x16_t)__s1, (int8x16_t)__s2, __p3, 40); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfma_laneq_f16(__p0, __p1, __p2, __p3) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x8_t __s2 = __p2; \ + float16x4_t __ret; \ + __ret = (float16x4_t) __builtin_neon_vfma_laneq_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x16_t)__s2, __p3, 8); \ + __ret; \ +}) +#else +#define vfma_laneq_f16(__p0, __p1, __p2, __p3) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x8_t __s2 = __p2; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \ + float16x8_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __ret; \ + __ret = (float16x4_t) __builtin_neon_vfma_laneq_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x16_t)__rev2, __p3, 8); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#define __noswap_vfma_laneq_f16(__p0, __p1, __p2, __p3) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x8_t __s2 = __p2; \ + float16x4_t __ret; \ + __ret = (float16x4_t) __builtin_neon_vfma_laneq_v((int8x8_t)__s0, (int8x8_t)__s1, (int8x16_t)__s2, __p3, 8); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmaq_n_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16_t __s2 = __p2; \ + float16x8_t __ret; \ + __ret = vfmaq_f16(__s0, __s1, (float16x8_t) {__s2, __s2, __s2, __s2, __s2, __s2, __s2, __s2}); \ + __ret; \ +}) +#else +#define vfmaq_n_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16_t __s2 = __p2; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __ret; \ + __ret = __noswap_vfmaq_f16(__rev0, __rev1, (float16x8_t) {__s2, __s2, __s2, __s2, __s2, __s2, __s2, __s2}); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfma_n_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + 
float16_t __s2 = __p2; \ + float16x4_t __ret; \ + __ret = vfma_f16(__s0, __s1, (float16x4_t) {__s2, __s2, __s2, __s2}); \ + __ret; \ +}) +#else +#define vfma_n_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16_t __s2 = __p2; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \ + float16x4_t __ret; \ + __ret = __noswap_vfma_f16(__rev0, __rev1, (float16x4_t) {__s2, __s2, __s2, __s2}); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vfmsq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { + float16x8_t __ret; + __ret = vfmaq_f16(__p0, -__p1, __p2); + return __ret; +} +#else +__ai float16x8_t vfmsq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = __noswap_vfmaq_f16(__rev0, -__rev1, __rev2); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vfms_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { + float16x4_t __ret; + __ret = vfma_f16(__p0, -__p1, __p2); + return __ret; +} +#else +__ai float16x4_t vfms_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); + float16x4_t __ret; + __ret = __noswap_vfma_f16(__rev0, -__rev1, __rev2); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmsh_lane_f16(__p0_0, __p1_0, __p2_0, __p3_0) __extension__ ({ \ + float16_t __s0_0 = __p0_0; \ + float16_t __s1_0 = __p1_0; \ + float16x4_t __s2_0 = __p2_0; \ + float16_t __ret_0; \ + __ret_0 = vfmah_lane_f16(__s0_0, -__s1_0, __s2_0, __p3_0); \ + __ret_0; \ +}) +#else +#define vfmsh_lane_f16(__p0_1, __p1_1, __p2_1, __p3_1) __extension__ ({ \ + float16_t __s0_1 = __p0_1; \ + float16_t __s1_1 = __p1_1; \ + float16x4_t __s2_1 = __p2_1; \ + float16x4_t __rev2_1; __rev2_1 = __builtin_shufflevector(__s2_1, __s2_1, 3, 2, 1, 0); \ + float16_t __ret_1; \ + __ret_1 = __noswap_vfmah_lane_f16(__s0_1, -__s1_1, __rev2_1, __p3_1); \ + __ret_1; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmsq_lane_f16(__p0_2, __p1_2, __p2_2, __p3_2) __extension__ ({ \ + float16x8_t __s0_2 = __p0_2; \ + float16x8_t __s1_2 = __p1_2; \ + float16x4_t __s2_2 = __p2_2; \ + float16x8_t __ret_2; \ + __ret_2 = vfmaq_lane_f16(__s0_2, -__s1_2, __s2_2, __p3_2); \ + __ret_2; \ +}) +#else +#define vfmsq_lane_f16(__p0_3, __p1_3, __p2_3, __p3_3) __extension__ ({ \ + float16x8_t __s0_3 = __p0_3; \ + float16x8_t __s1_3 = __p1_3; \ + float16x4_t __s2_3 = __p2_3; \ + float16x8_t __rev0_3; __rev0_3 = __builtin_shufflevector(__s0_3, __s0_3, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev1_3; __rev1_3 = __builtin_shufflevector(__s1_3, __s1_3, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __rev2_3; __rev2_3 = __builtin_shufflevector(__s2_3, __s2_3, 3, 2, 1, 0); \ + float16x8_t 
__ret_3; \ + __ret_3 = __noswap_vfmaq_lane_f16(__rev0_3, -__rev1_3, __rev2_3, __p3_3); \ + __ret_3 = __builtin_shufflevector(__ret_3, __ret_3, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_3; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfms_lane_f16(__p0_4, __p1_4, __p2_4, __p3_4) __extension__ ({ \ + float16x4_t __s0_4 = __p0_4; \ + float16x4_t __s1_4 = __p1_4; \ + float16x4_t __s2_4 = __p2_4; \ + float16x4_t __ret_4; \ + __ret_4 = vfma_lane_f16(__s0_4, -__s1_4, __s2_4, __p3_4); \ + __ret_4; \ +}) +#else +#define vfms_lane_f16(__p0_5, __p1_5, __p2_5, __p3_5) __extension__ ({ \ + float16x4_t __s0_5 = __p0_5; \ + float16x4_t __s1_5 = __p1_5; \ + float16x4_t __s2_5 = __p2_5; \ + float16x4_t __rev0_5; __rev0_5 = __builtin_shufflevector(__s0_5, __s0_5, 3, 2, 1, 0); \ + float16x4_t __rev1_5; __rev1_5 = __builtin_shufflevector(__s1_5, __s1_5, 3, 2, 1, 0); \ + float16x4_t __rev2_5; __rev2_5 = __builtin_shufflevector(__s2_5, __s2_5, 3, 2, 1, 0); \ + float16x4_t __ret_5; \ + __ret_5 = __noswap_vfma_lane_f16(__rev0_5, -__rev1_5, __rev2_5, __p3_5); \ + __ret_5 = __builtin_shufflevector(__ret_5, __ret_5, 3, 2, 1, 0); \ + __ret_5; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmsh_laneq_f16(__p0_6, __p1_6, __p2_6, __p3_6) __extension__ ({ \ + float16_t __s0_6 = __p0_6; \ + float16_t __s1_6 = __p1_6; \ + float16x8_t __s2_6 = __p2_6; \ + float16_t __ret_6; \ + __ret_6 = vfmah_laneq_f16(__s0_6, -__s1_6, __s2_6, __p3_6); \ + __ret_6; \ +}) +#else +#define vfmsh_laneq_f16(__p0_7, __p1_7, __p2_7, __p3_7) __extension__ ({ \ + float16_t __s0_7 = __p0_7; \ + float16_t __s1_7 = __p1_7; \ + float16x8_t __s2_7 = __p2_7; \ + float16x8_t __rev2_7; __rev2_7 = __builtin_shufflevector(__s2_7, __s2_7, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16_t __ret_7; \ + __ret_7 = __noswap_vfmah_laneq_f16(__s0_7, -__s1_7, __rev2_7, __p3_7); \ + __ret_7; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmsq_laneq_f16(__p0_8, __p1_8, __p2_8, __p3_8) __extension__ ({ \ + float16x8_t __s0_8 = __p0_8; \ + float16x8_t __s1_8 = __p1_8; \ + float16x8_t __s2_8 = __p2_8; \ + float16x8_t __ret_8; \ + __ret_8 = vfmaq_laneq_f16(__s0_8, -__s1_8, __s2_8, __p3_8); \ + __ret_8; \ +}) +#else +#define vfmsq_laneq_f16(__p0_9, __p1_9, __p2_9, __p3_9) __extension__ ({ \ + float16x8_t __s0_9 = __p0_9; \ + float16x8_t __s1_9 = __p1_9; \ + float16x8_t __s2_9 = __p2_9; \ + float16x8_t __rev0_9; __rev0_9 = __builtin_shufflevector(__s0_9, __s0_9, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev1_9; __rev1_9 = __builtin_shufflevector(__s1_9, __s1_9, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev2_9; __rev2_9 = __builtin_shufflevector(__s2_9, __s2_9, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __ret_9; \ + __ret_9 = __noswap_vfmaq_laneq_f16(__rev0_9, -__rev1_9, __rev2_9, __p3_9); \ + __ret_9 = __builtin_shufflevector(__ret_9, __ret_9, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_9; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfms_laneq_f16(__p0_10, __p1_10, __p2_10, __p3_10) __extension__ ({ \ + float16x4_t __s0_10 = __p0_10; \ + float16x4_t __s1_10 = __p1_10; \ + float16x8_t __s2_10 = __p2_10; \ + float16x4_t __ret_10; \ + __ret_10 = vfma_laneq_f16(__s0_10, -__s1_10, __s2_10, __p3_10); \ + __ret_10; \ +}) +#else +#define vfms_laneq_f16(__p0_11, __p1_11, __p2_11, __p3_11) __extension__ ({ \ + float16x4_t __s0_11 = __p0_11; \ + float16x4_t __s1_11 = __p1_11; \ + float16x8_t __s2_11 = __p2_11; \ + float16x4_t __rev0_11; __rev0_11 = __builtin_shufflevector(__s0_11, __s0_11, 3, 2, 1, 0); \ + float16x4_t __rev1_11; __rev1_11 = __builtin_shufflevector(__s1_11, __s1_11, 3, 2, 
1, 0); \ + float16x8_t __rev2_11; __rev2_11 = __builtin_shufflevector(__s2_11, __s2_11, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __ret_11; \ + __ret_11 = __noswap_vfma_laneq_f16(__rev0_11, -__rev1_11, __rev2_11, __p3_11); \ + __ret_11 = __builtin_shufflevector(__ret_11, __ret_11, 3, 2, 1, 0); \ + __ret_11; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmsq_n_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16_t __s2 = __p2; \ + float16x8_t __ret; \ + __ret = vfmaq_f16(__s0, -__s1, (float16x8_t) {__s2, __s2, __s2, __s2, __s2, __s2, __s2, __s2}); \ + __ret; \ +}) +#else +#define vfmsq_n_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16_t __s2 = __p2; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __ret; \ + __ret = __noswap_vfmaq_f16(__rev0, -__rev1, (float16x8_t) {__s2, __s2, __s2, __s2, __s2, __s2, __s2, __s2}); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfms_n_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16_t __s2 = __p2; \ + float16x4_t __ret; \ + __ret = vfma_f16(__s0, -__s1, (float16x4_t) {__s2, __s2, __s2, __s2}); \ + __ret; \ +}) +#else +#define vfms_n_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16_t __s2 = __p2; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \ + float16x4_t __ret; \ + __ret = __noswap_vfma_f16(__rev0, -__rev1, (float16x4_t) {__s2, __s2, __s2, __s2}); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vmaxq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai float16x8_t vmaxq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vmax_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vmax_v((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4_t vmax_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vmaxnmq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vmaxnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} 
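/*
 * Illustrative note, not part of the generated intrinsics: vmaxnmq_f16 above
 * (and the vminnm* variants further down) follow IEEE 754-2008 maxNum/minNum
 * semantics, so a quiet NaN in one operand yields the other, numeric operand,
 * whereas vmaxq_f16/vminq_f16 propagate the NaN. A hypothetical caller,
 * assuming the usual vdupq_n_f16 helper from this header:
 *
 *   float16x8_t a = vdupq_n_f16(1.0);
 *   float16x8_t b = vdupq_n_f16(2.0);
 *   float16x8_t m = vmaxnmq_f16(a, b);   // every lane holds 2.0
 *
 * The #else branches throughout this header show the common big-endian
 * pattern: reverse the lanes with __builtin_shufflevector, call the builtin,
 * then reverse the result so lane indices keep their little-endian meaning.
 */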
+#else +__ai float16x8_t vmaxnmq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vmaxnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vmaxnm_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vmaxnm_v((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4_t vmaxnm_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vmaxnm_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmaxnmvq_f16(__p0) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vmaxnmvq_f16((int8x16_t)__s0); \ + __ret; \ +}) +#else +#define vmaxnmvq_f16(__p0) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vmaxnmvq_f16((int8x16_t)__rev0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmaxnmv_f16(__p0) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vmaxnmv_f16((int8x8_t)__s0); \ + __ret; \ +}) +#else +#define vmaxnmv_f16(__p0) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vmaxnmv_f16((int8x8_t)__rev0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmaxvq_f16(__p0) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vmaxvq_f16((int8x16_t)__s0); \ + __ret; \ +}) +#else +#define vmaxvq_f16(__p0) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vmaxvq_f16((int8x16_t)__rev0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmaxv_f16(__p0) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vmaxv_f16((int8x8_t)__s0); \ + __ret; \ +}) +#else +#define vmaxv_f16(__p0) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vmaxv_f16((int8x8_t)__rev0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vminq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vminq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai float16x8_t vminq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 
0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vmin_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vmin_v((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4_t vmin_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vminnmq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vminnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai float16x8_t vminnmq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vminnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vminnm_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vminnm_v((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4_t vminnm_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vminnm_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vminnmvq_f16(__p0) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vminnmvq_f16((int8x16_t)__s0); \ + __ret; \ +}) +#else +#define vminnmvq_f16(__p0) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vminnmvq_f16((int8x16_t)__rev0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vminnmv_f16(__p0) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vminnmv_f16((int8x8_t)__s0); \ + __ret; \ +}) +#else +#define vminnmv_f16(__p0) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vminnmv_f16((int8x8_t)__rev0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vminvq_f16(__p0) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vminvq_f16((int8x16_t)__s0); \ + __ret; \ +}) +#else +#define vminvq_f16(__p0) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 
4, 3, 2, 1, 0); \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vminvq_f16((int8x16_t)__rev0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vminv_f16(__p0) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vminv_f16((int8x8_t)__s0); \ + __ret; \ +}) +#else +#define vminv_f16(__p0) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vminv_f16((int8x8_t)__rev0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vmulq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = __p0 * __p1; + return __ret; +} +#else +__ai float16x8_t vmulq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = __rev0 * __rev1; + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vmul_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = __p0 * __p1; + return __ret; +} +#else +__ai float16x4_t vmul_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = __rev0 * __rev1; + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmulq_lane_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x8_t __ret; \ + __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2); \ + __ret; \ +}) +#else +#define vmulq_lane_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \ + float16x8_t __ret; \ + __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmul_lane_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x4_t __ret; \ + __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2); \ + __ret; \ +}) +#else +#define vmul_lane_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \ + float16x4_t __ret; \ + __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmulq_laneq_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16x8_t __ret; \ + __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2); \ + __ret; \ +}) 
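/*
 * A minimal usage sketch for the lane-broadcast multiplies defined above
 * (hypothetical values; assumes a target where these fp16 intrinsics are
 * available):
 *
 *   float16x8_t v   = vdupq_n_f16(3.0);
 *   float16x8_t c   = vdupq_n_f16(0.5);
 *   float16x8_t out = vmulq_laneq_f16(v, c, 0);   // out[i] = v[i] * c[0]
 *
 * Each macro splats lane __p2 of its second argument with
 * __builtin_shufflevector and performs an ordinary element-wise multiply, so
 * the lane index must be a compile-time constant.
 */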
+#else +#define vmulq_laneq_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __ret; \ + __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmul_laneq_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16x4_t __ret; \ + __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2); \ + __ret; \ +}) +#else +#define vmul_laneq_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __ret; \ + __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmulq_n_f16(__p0, __p1) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16x8_t __ret; \ + __ret = __s0 * (float16x8_t) {__s1, __s1, __s1, __s1, __s1, __s1, __s1, __s1}; \ + __ret; \ +}) +#else +#define vmulq_n_f16(__p0, __p1) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __ret; \ + __ret = __rev0 * (float16x8_t) {__s1, __s1, __s1, __s1, __s1, __s1, __s1, __s1}; \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmul_n_f16(__p0, __p1) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16x4_t __ret; \ + __ret = __s0 * (float16x4_t) {__s1, __s1, __s1, __s1}; \ + __ret; \ +}) +#else +#define vmul_n_f16(__p0, __p1) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16x4_t __ret; \ + __ret = __rev0 * (float16x4_t) {__s1, __s1, __s1, __s1}; \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vmulxq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vmulxq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai float16x8_t vmulxq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vmulxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +__ai float16x8_t __noswap_vmulxq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vmulxq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vmulx_f16(float16x4_t __p0, 
float16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vmulx_v((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4_t vmulx_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vmulx_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +__ai float16x4_t __noswap_vmulx_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vmulx_v((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmulxq_lane_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x8_t __ret; \ + __ret = vmulxq_f16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2)); \ + __ret; \ +}) +#else +#define vmulxq_lane_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \ + float16x8_t __ret; \ + __ret = __noswap_vmulxq_f16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2)); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmulx_lane_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x4_t __ret; \ + __ret = vmulx_f16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \ + __ret; \ +}) +#else +#define vmulx_lane_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \ + float16x4_t __ret; \ + __ret = __noswap_vmulx_f16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmulxq_laneq_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16x8_t __ret; \ + __ret = vmulxq_f16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2)); \ + __ret; \ +}) +#else +#define vmulxq_laneq_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __ret; \ + __ret = __noswap_vmulxq_f16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2)); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmulx_laneq_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16x4_t __ret; \ + __ret = vmulx_f16(__s0, __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2)); \ + __ret; \ +}) 
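/*
 * Note on the vmulx family above, sketched for illustration: these intrinsics
 * map to FMULX, which multiplies like vmul except that (+/-0) * (+/-Inf)
 * yields +/-2.0 instead of a NaN. The __noswap_* helpers let the big-endian
 * lane variants pass operands whose lanes were already reversed without the
 * callee reversing them a second time. Hypothetical use, given float16x4_t
 * values x and y:
 *
 *   float16x4_t r = vmulx_lane_f16(x, y, 1);   // r[i] = fmulx(x[i], y[1])
 */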
+#else +#define vmulx_laneq_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __ret; \ + __ret = __noswap_vmulx_f16(__rev0, __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2)); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmulxq_n_f16(__p0, __p1) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16x8_t __ret; \ + __ret = vmulxq_f16(__s0, (float16x8_t) {__s1, __s1, __s1, __s1, __s1, __s1, __s1, __s1}); \ + __ret; \ +}) +#else +#define vmulxq_n_f16(__p0, __p1) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __ret; \ + __ret = __noswap_vmulxq_f16(__rev0, (float16x8_t) {__s1, __s1, __s1, __s1, __s1, __s1, __s1, __s1}); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmulx_n_f16(__p0, __p1) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16x4_t __ret; \ + __ret = vmulx_f16(__s0, (float16x4_t) {__s1, __s1, __s1, __s1}); \ + __ret; \ +}) +#else +#define vmulx_n_f16(__p0, __p1) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16x4_t __ret; \ + __ret = __noswap_vmulx_f16(__rev0, (float16x4_t) {__s1, __s1, __s1, __s1}); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vnegq_f16(float16x8_t __p0) { + float16x8_t __ret; + __ret = -__p0; + return __ret; +} +#else +__ai float16x8_t vnegq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = -__rev0; + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vneg_f16(float16x4_t __p0) { + float16x4_t __ret; + __ret = -__p0; + return __ret; +} +#else +__ai float16x4_t vneg_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret = -__rev0; + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vpaddq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vpaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai float16x8_t vpaddq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vpaddq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vpadd_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vpadd_v((int8x8_t)__p0, (int8x8_t)__p1, 8); 
+ return __ret; +} +#else +__ai float16x4_t vpadd_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vpadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vpmaxq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vpmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai float16x8_t vpmaxq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vpmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vpmax_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vpmax_v((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4_t vpmax_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vpmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vpmaxnmq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vpmaxnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai float16x8_t vpmaxnmq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vpmaxnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vpmaxnm_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vpmaxnm_v((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4_t vpmaxnm_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vpmaxnm_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vpminq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vpminq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai float16x8_t vpminq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; 
__rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vpminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vpmin_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vpmin_v((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4_t vpmin_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vpmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vpminnmq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vpminnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai float16x8_t vpminnmq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vpminnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vpminnm_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vpminnm_v((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4_t vpminnm_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vpminnm_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vrecpeq_f16(float16x8_t __p0) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrecpeq_v((int8x16_t)__p0, 40); + return __ret; +} +#else +__ai float16x8_t vrecpeq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrecpeq_v((int8x16_t)__rev0, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vrecpe_f16(float16x4_t __p0) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrecpe_v((int8x8_t)__p0, 8); + return __ret; +} +#else +__ai float16x4_t vrecpe_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrecpe_v((int8x8_t)__rev0, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vrecpsq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrecpsq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai 
float16x8_t vrecpsq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrecpsq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vrecps_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrecps_v((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4_t vrecps_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrecps_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vrev64q_f16(float16x8_t __p0) { + float16x8_t __ret; + __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0, 7, 6, 5, 4); + return __ret; +} +#else +__ai float16x8_t vrev64q_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0, 7, 6, 5, 4); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vrev64_f16(float16x4_t __p0) { + float16x4_t __ret; + __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + return __ret; +} +#else +__ai float16x4_t vrev64_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vrndq_f16(float16x8_t __p0) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndq_v((int8x16_t)__p0, 40); + return __ret; +} +#else +__ai float16x8_t vrndq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndq_v((int8x16_t)__rev0, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vrnd_f16(float16x4_t __p0) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrnd_v((int8x8_t)__p0, 8); + return __ret; +} +#else +__ai float16x4_t vrnd_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrnd_v((int8x8_t)__rev0, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vrndaq_f16(float16x8_t __p0) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndaq_v((int8x16_t)__p0, 40); + return __ret; +} +#else +__ai float16x8_t vrndaq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) 
__builtin_neon_vrndaq_v((int8x16_t)__rev0, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vrnda_f16(float16x4_t __p0) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrnda_v((int8x8_t)__p0, 8); + return __ret; +} +#else +__ai float16x4_t vrnda_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrnda_v((int8x8_t)__rev0, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vrndiq_f16(float16x8_t __p0) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndiq_v((int8x16_t)__p0, 40); + return __ret; +} +#else +__ai float16x8_t vrndiq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndiq_v((int8x16_t)__rev0, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vrndi_f16(float16x4_t __p0) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrndi_v((int8x8_t)__p0, 8); + return __ret; +} +#else +__ai float16x4_t vrndi_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrndi_v((int8x8_t)__rev0, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vrndmq_f16(float16x8_t __p0) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndmq_v((int8x16_t)__p0, 40); + return __ret; +} +#else +__ai float16x8_t vrndmq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndmq_v((int8x16_t)__rev0, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vrndm_f16(float16x4_t __p0) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrndm_v((int8x8_t)__p0, 8); + return __ret; +} +#else +__ai float16x4_t vrndm_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrndm_v((int8x8_t)__rev0, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vrndnq_f16(float16x8_t __p0) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndnq_v((int8x16_t)__p0, 40); + return __ret; +} +#else +__ai float16x8_t vrndnq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndnq_v((int8x16_t)__rev0, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vrndn_f16(float16x4_t __p0) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrndn_v((int8x8_t)__p0, 8); + return __ret; +} +#else +__ai float16x4_t vrndn_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t 
__ret; + __ret = (float16x4_t) __builtin_neon_vrndn_v((int8x8_t)__rev0, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vrndpq_f16(float16x8_t __p0) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndpq_v((int8x16_t)__p0, 40); + return __ret; +} +#else +__ai float16x8_t vrndpq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndpq_v((int8x16_t)__rev0, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vrndp_f16(float16x4_t __p0) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrndp_v((int8x8_t)__p0, 8); + return __ret; +} +#else +__ai float16x4_t vrndp_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrndp_v((int8x8_t)__rev0, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vrndxq_f16(float16x8_t __p0) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndxq_v((int8x16_t)__p0, 40); + return __ret; +} +#else +__ai float16x8_t vrndxq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndxq_v((int8x16_t)__rev0, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vrndx_f16(float16x4_t __p0) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrndx_v((int8x8_t)__p0, 8); + return __ret; +} +#else +__ai float16x4_t vrndx_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrndx_v((int8x8_t)__rev0, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vrsqrteq_f16(float16x8_t __p0) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrsqrteq_v((int8x16_t)__p0, 40); + return __ret; +} +#else +__ai float16x8_t vrsqrteq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrsqrteq_v((int8x16_t)__rev0, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vrsqrte_f16(float16x4_t __p0) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrsqrte_v((int8x8_t)__p0, 8); + return __ret; +} +#else +__ai float16x4_t vrsqrte_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrsqrte_v((int8x8_t)__rev0, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vrsqrtsq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrsqrtsq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai float16x8_t vrsqrtsq_f16(float16x8_t __p0, float16x8_t __p1) { 
+ float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrsqrtsq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vrsqrts_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrsqrts_v((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4_t vrsqrts_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrsqrts_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vsqrtq_f16(float16x8_t __p0) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vsqrtq_v((int8x16_t)__p0, 40); + return __ret; +} +#else +__ai float16x8_t vsqrtq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vsqrtq_v((int8x16_t)__rev0, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vsqrt_f16(float16x4_t __p0) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vsqrt_v((int8x8_t)__p0, 8); + return __ret; +} +#else +__ai float16x4_t vsqrt_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vsqrt_v((int8x8_t)__rev0, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vsubq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = __p0 - __p1; + return __ret; +} +#else +__ai float16x8_t vsubq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = __rev0 - __rev1; + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vsub_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = __p0 - __p1; + return __ret; +} +#else +__ai float16x4_t vsub_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = __rev0 - __rev1; + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8x2_t vtrnq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8x2_t __ret; + __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai float16x8x2_t vtrnq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t 
__rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8x2_t __ret; + __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 40); + + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4x2_t vtrn_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4x2_t __ret; + __builtin_neon_vtrn_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4x2_t vtrn_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4x2_t __ret; + __builtin_neon_vtrn_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 8); + + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vtrn1q_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 2, 10, 4, 12, 6, 14); + return __ret; +} +#else +__ai float16x8_t vtrn1q_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 2, 10, 4, 12, 6, 14); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vtrn1_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 2, 6); + return __ret; +} +#else +__ai float16x4_t vtrn1_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 2, 6); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vtrn2q_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = __builtin_shufflevector(__p0, __p1, 1, 9, 3, 11, 5, 13, 7, 15); + return __ret; +} +#else +__ai float16x8_t vtrn2q_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = __builtin_shufflevector(__rev0, __rev1, 1, 9, 3, 11, 5, 13, 7, 15); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vtrn2_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = __builtin_shufflevector(__p0, __p1, 1, 5, 3, 7); + return __ret; +} +#else +__ai float16x4_t vtrn2_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = __builtin_shufflevector(__rev0, __rev1, 1, 5, 
3, 7); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8x2_t vuzpq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8x2_t __ret; + __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai float16x8x2_t vuzpq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8x2_t __ret; + __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 40); + + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4x2_t vuzp_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4x2_t __ret; + __builtin_neon_vuzp_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4x2_t vuzp_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4x2_t __ret; + __builtin_neon_vuzp_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 8); + + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vuzp1q_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6, 8, 10, 12, 14); + return __ret; +} +#else +__ai float16x8_t vuzp1q_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6, 8, 10, 12, 14); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vuzp1_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6); + return __ret; +} +#else +__ai float16x4_t vuzp1_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = __builtin_shufflevector(__rev0, __rev1, 0, 2, 4, 6); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vuzp2q_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7, 9, 11, 13, 15); + return __ret; +} +#else +__ai float16x8_t vuzp2q_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7, 9, 11, 13, 15); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef 
__LITTLE_ENDIAN__ +__ai float16x4_t vuzp2_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = __builtin_shufflevector(__p0, __p1, 1, 3, 5, 7); + return __ret; +} +#else +__ai float16x4_t vuzp2_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = __builtin_shufflevector(__rev0, __rev1, 1, 3, 5, 7); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8x2_t vzipq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8x2_t __ret; + __builtin_neon_vzipq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai float16x8x2_t vzipq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8x2_t __ret; + __builtin_neon_vzipq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 40); + + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4x2_t vzip_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4x2_t __ret; + __builtin_neon_vzip_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4x2_t vzip_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4x2_t __ret; + __builtin_neon_vzip_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 8); + + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vzip1q_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 1, 9, 2, 10, 3, 11); + return __ret; +} +#else +__ai float16x8_t vzip1q_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = __builtin_shufflevector(__rev0, __rev1, 0, 8, 1, 9, 2, 10, 3, 11); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vzip1_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = __builtin_shufflevector(__p0, __p1, 0, 4, 1, 5); + return __ret; +} +#else +__ai float16x4_t vzip1_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = __builtin_shufflevector(__rev0, __rev1, 0, 4, 1, 5); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vzip2q_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = __builtin_shufflevector(__p0, __p1, 4, 12, 5, 13, 6, 14, 7, 15); + 
return __ret; +} +#else +__ai float16x8_t vzip2q_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = __builtin_shufflevector(__rev0, __rev1, 4, 12, 5, 13, 6, 14, 7, 15); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vzip2_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = __builtin_shufflevector(__p0, __p1, 2, 6, 3, 7); + return __ret; +} +#else +__ai float16x4_t vzip2_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = __builtin_shufflevector(__rev0, __rev1, 2, 6, 3, 7); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#endif #if defined(__ARM_FEATURE_QRDMX) #ifdef __LITTLE_ENDIAN__ __ai int32x4_t vqrdmlahq_s32(int32x4_t __p0, int32x4_t __p1, int32x4_t __p2) { @@ -44220,918 +47532,918 @@ __ai float64x2_t vcombine_f64(float64x1_t __p0, float64x1_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_p8(__p0_0, __p1_0, __p2_0, __p3_0) __extension__ ({ \ - poly8x16_t __s0_0 = __p0_0; \ - poly8x8_t __s2_0 = __p2_0; \ - poly8x16_t __ret_0; \ - __ret_0 = vsetq_lane_p8(vget_lane_p8(__s2_0, __p3_0), __s0_0, __p1_0); \ - __ret_0; \ -}) -#else -#define vcopyq_lane_p8(__p0_1, __p1_1, __p2_1, __p3_1) __extension__ ({ \ - poly8x16_t __s0_1 = __p0_1; \ - poly8x8_t __s2_1 = __p2_1; \ - poly8x16_t __rev0_1; __rev0_1 = __builtin_shufflevector(__s0_1, __s0_1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly8x8_t __rev2_1; __rev2_1 = __builtin_shufflevector(__s2_1, __s2_1, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly8x16_t __ret_1; \ - __ret_1 = __noswap_vsetq_lane_p8(__noswap_vget_lane_p8(__rev2_1, __p3_1), __rev0_1, __p1_1); \ - __ret_1 = __builtin_shufflevector(__ret_1, __ret_1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_1; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_p16(__p0_2, __p1_2, __p2_2, __p3_2) __extension__ ({ \ - poly16x8_t __s0_2 = __p0_2; \ - poly16x4_t __s2_2 = __p2_2; \ - poly16x8_t __ret_2; \ - __ret_2 = vsetq_lane_p16(vget_lane_p16(__s2_2, __p3_2), __s0_2, __p1_2); \ - __ret_2; \ -}) -#else -#define vcopyq_lane_p16(__p0_3, __p1_3, __p2_3, __p3_3) __extension__ ({ \ - poly16x8_t __s0_3 = __p0_3; \ - poly16x4_t __s2_3 = __p2_3; \ - poly16x8_t __rev0_3; __rev0_3 = __builtin_shufflevector(__s0_3, __s0_3, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly16x4_t __rev2_3; __rev2_3 = __builtin_shufflevector(__s2_3, __s2_3, 3, 2, 1, 0); \ - poly16x8_t __ret_3; \ - __ret_3 = __noswap_vsetq_lane_p16(__noswap_vget_lane_p16(__rev2_3, __p3_3), __rev0_3, __p1_3); \ - __ret_3 = __builtin_shufflevector(__ret_3, __ret_3, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_3; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_u8(__p0_4, __p1_4, __p2_4, __p3_4) __extension__ ({ \ - uint8x16_t __s0_4 = __p0_4; \ - uint8x8_t __s2_4 = __p2_4; \ - uint8x16_t __ret_4; \ - __ret_4 = vsetq_lane_u8(vget_lane_u8(__s2_4, __p3_4), __s0_4, __p1_4); \ - __ret_4; \ -}) -#else -#define vcopyq_lane_u8(__p0_5, __p1_5, __p2_5, __p3_5) __extension__ ({ \ - uint8x16_t __s0_5 = __p0_5; \ - uint8x8_t __s2_5 = __p2_5; \ - uint8x16_t __rev0_5; __rev0_5 = 
__builtin_shufflevector(__s0_5, __s0_5, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x8_t __rev2_5; __rev2_5 = __builtin_shufflevector(__s2_5, __s2_5, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x16_t __ret_5; \ - __ret_5 = __noswap_vsetq_lane_u8(__noswap_vget_lane_u8(__rev2_5, __p3_5), __rev0_5, __p1_5); \ - __ret_5 = __builtin_shufflevector(__ret_5, __ret_5, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_5; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_u32(__p0_6, __p1_6, __p2_6, __p3_6) __extension__ ({ \ - uint32x4_t __s0_6 = __p0_6; \ - uint32x2_t __s2_6 = __p2_6; \ - uint32x4_t __ret_6; \ - __ret_6 = vsetq_lane_u32(vget_lane_u32(__s2_6, __p3_6), __s0_6, __p1_6); \ - __ret_6; \ -}) -#else -#define vcopyq_lane_u32(__p0_7, __p1_7, __p2_7, __p3_7) __extension__ ({ \ - uint32x4_t __s0_7 = __p0_7; \ - uint32x2_t __s2_7 = __p2_7; \ - uint32x4_t __rev0_7; __rev0_7 = __builtin_shufflevector(__s0_7, __s0_7, 3, 2, 1, 0); \ - uint32x2_t __rev2_7; __rev2_7 = __builtin_shufflevector(__s2_7, __s2_7, 1, 0); \ - uint32x4_t __ret_7; \ - __ret_7 = __noswap_vsetq_lane_u32(__noswap_vget_lane_u32(__rev2_7, __p3_7), __rev0_7, __p1_7); \ - __ret_7 = __builtin_shufflevector(__ret_7, __ret_7, 3, 2, 1, 0); \ - __ret_7; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_u64(__p0_8, __p1_8, __p2_8, __p3_8) __extension__ ({ \ - uint64x2_t __s0_8 = __p0_8; \ - uint64x1_t __s2_8 = __p2_8; \ - uint64x2_t __ret_8; \ - __ret_8 = vsetq_lane_u64(vget_lane_u64(__s2_8, __p3_8), __s0_8, __p1_8); \ - __ret_8; \ -}) -#else -#define vcopyq_lane_u64(__p0_9, __p1_9, __p2_9, __p3_9) __extension__ ({ \ - uint64x2_t __s0_9 = __p0_9; \ - uint64x1_t __s2_9 = __p2_9; \ - uint64x2_t __rev0_9; __rev0_9 = __builtin_shufflevector(__s0_9, __s0_9, 1, 0); \ - uint64x2_t __ret_9; \ - __ret_9 = __noswap_vsetq_lane_u64(__noswap_vget_lane_u64(__s2_9, __p3_9), __rev0_9, __p1_9); \ - __ret_9 = __builtin_shufflevector(__ret_9, __ret_9, 1, 0); \ - __ret_9; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_u16(__p0_10, __p1_10, __p2_10, __p3_10) __extension__ ({ \ - uint16x8_t __s0_10 = __p0_10; \ - uint16x4_t __s2_10 = __p2_10; \ - uint16x8_t __ret_10; \ - __ret_10 = vsetq_lane_u16(vget_lane_u16(__s2_10, __p3_10), __s0_10, __p1_10); \ - __ret_10; \ -}) -#else -#define vcopyq_lane_u16(__p0_11, __p1_11, __p2_11, __p3_11) __extension__ ({ \ - uint16x8_t __s0_11 = __p0_11; \ - uint16x4_t __s2_11 = __p2_11; \ - uint16x8_t __rev0_11; __rev0_11 = __builtin_shufflevector(__s0_11, __s0_11, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x4_t __rev2_11; __rev2_11 = __builtin_shufflevector(__s2_11, __s2_11, 3, 2, 1, 0); \ - uint16x8_t __ret_11; \ - __ret_11 = __noswap_vsetq_lane_u16(__noswap_vget_lane_u16(__rev2_11, __p3_11), __rev0_11, __p1_11); \ - __ret_11 = __builtin_shufflevector(__ret_11, __ret_11, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_11; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_s8(__p0_12, __p1_12, __p2_12, __p3_12) __extension__ ({ \ - int8x16_t __s0_12 = __p0_12; \ - int8x8_t __s2_12 = __p2_12; \ - int8x16_t __ret_12; \ - __ret_12 = vsetq_lane_s8(vget_lane_s8(__s2_12, __p3_12), __s0_12, __p1_12); \ +#define vcopyq_lane_p8(__p0_12, __p1_12, __p2_12, __p3_12) __extension__ ({ \ + poly8x16_t __s0_12 = __p0_12; \ + poly8x8_t __s2_12 = __p2_12; \ + poly8x16_t __ret_12; \ + __ret_12 = vsetq_lane_p8(vget_lane_p8(__s2_12, __p3_12), __s0_12, __p1_12); \ __ret_12; \ }) #else -#define vcopyq_lane_s8(__p0_13, __p1_13, __p2_13, __p3_13) __extension__ ({ \ - int8x16_t __s0_13 = __p0_13; \ 
- int8x8_t __s2_13 = __p2_13; \ - int8x16_t __rev0_13; __rev0_13 = __builtin_shufflevector(__s0_13, __s0_13, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x8_t __rev2_13; __rev2_13 = __builtin_shufflevector(__s2_13, __s2_13, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x16_t __ret_13; \ - __ret_13 = __noswap_vsetq_lane_s8(__noswap_vget_lane_s8(__rev2_13, __p3_13), __rev0_13, __p1_13); \ +#define vcopyq_lane_p8(__p0_13, __p1_13, __p2_13, __p3_13) __extension__ ({ \ + poly8x16_t __s0_13 = __p0_13; \ + poly8x8_t __s2_13 = __p2_13; \ + poly8x16_t __rev0_13; __rev0_13 = __builtin_shufflevector(__s0_13, __s0_13, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly8x8_t __rev2_13; __rev2_13 = __builtin_shufflevector(__s2_13, __s2_13, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly8x16_t __ret_13; \ + __ret_13 = __noswap_vsetq_lane_p8(__noswap_vget_lane_p8(__rev2_13, __p3_13), __rev0_13, __p1_13); \ __ret_13 = __builtin_shufflevector(__ret_13, __ret_13, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_13; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_f32(__p0_14, __p1_14, __p2_14, __p3_14) __extension__ ({ \ - float32x4_t __s0_14 = __p0_14; \ - float32x2_t __s2_14 = __p2_14; \ - float32x4_t __ret_14; \ - __ret_14 = vsetq_lane_f32(vget_lane_f32(__s2_14, __p3_14), __s0_14, __p1_14); \ +#define vcopyq_lane_p16(__p0_14, __p1_14, __p2_14, __p3_14) __extension__ ({ \ + poly16x8_t __s0_14 = __p0_14; \ + poly16x4_t __s2_14 = __p2_14; \ + poly16x8_t __ret_14; \ + __ret_14 = vsetq_lane_p16(vget_lane_p16(__s2_14, __p3_14), __s0_14, __p1_14); \ __ret_14; \ }) #else -#define vcopyq_lane_f32(__p0_15, __p1_15, __p2_15, __p3_15) __extension__ ({ \ - float32x4_t __s0_15 = __p0_15; \ - float32x2_t __s2_15 = __p2_15; \ - float32x4_t __rev0_15; __rev0_15 = __builtin_shufflevector(__s0_15, __s0_15, 3, 2, 1, 0); \ - float32x2_t __rev2_15; __rev2_15 = __builtin_shufflevector(__s2_15, __s2_15, 1, 0); \ - float32x4_t __ret_15; \ - __ret_15 = __noswap_vsetq_lane_f32(__noswap_vget_lane_f32(__rev2_15, __p3_15), __rev0_15, __p1_15); \ - __ret_15 = __builtin_shufflevector(__ret_15, __ret_15, 3, 2, 1, 0); \ +#define vcopyq_lane_p16(__p0_15, __p1_15, __p2_15, __p3_15) __extension__ ({ \ + poly16x8_t __s0_15 = __p0_15; \ + poly16x4_t __s2_15 = __p2_15; \ + poly16x8_t __rev0_15; __rev0_15 = __builtin_shufflevector(__s0_15, __s0_15, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly16x4_t __rev2_15; __rev2_15 = __builtin_shufflevector(__s2_15, __s2_15, 3, 2, 1, 0); \ + poly16x8_t __ret_15; \ + __ret_15 = __noswap_vsetq_lane_p16(__noswap_vget_lane_p16(__rev2_15, __p3_15), __rev0_15, __p1_15); \ + __ret_15 = __builtin_shufflevector(__ret_15, __ret_15, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_15; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_s32(__p0_16, __p1_16, __p2_16, __p3_16) __extension__ ({ \ - int32x4_t __s0_16 = __p0_16; \ - int32x2_t __s2_16 = __p2_16; \ - int32x4_t __ret_16; \ - __ret_16 = vsetq_lane_s32(vget_lane_s32(__s2_16, __p3_16), __s0_16, __p1_16); \ +#define vcopyq_lane_u8(__p0_16, __p1_16, __p2_16, __p3_16) __extension__ ({ \ + uint8x16_t __s0_16 = __p0_16; \ + uint8x8_t __s2_16 = __p2_16; \ + uint8x16_t __ret_16; \ + __ret_16 = vsetq_lane_u8(vget_lane_u8(__s2_16, __p3_16), __s0_16, __p1_16); \ __ret_16; \ }) #else -#define vcopyq_lane_s32(__p0_17, __p1_17, __p2_17, __p3_17) __extension__ ({ \ - int32x4_t __s0_17 = __p0_17; \ - int32x2_t __s2_17 = __p2_17; \ - int32x4_t __rev0_17; __rev0_17 = __builtin_shufflevector(__s0_17, __s0_17, 3, 2, 1, 0); \ - int32x2_t __rev2_17; __rev2_17 = 
__builtin_shufflevector(__s2_17, __s2_17, 1, 0); \ - int32x4_t __ret_17; \ - __ret_17 = __noswap_vsetq_lane_s32(__noswap_vget_lane_s32(__rev2_17, __p3_17), __rev0_17, __p1_17); \ - __ret_17 = __builtin_shufflevector(__ret_17, __ret_17, 3, 2, 1, 0); \ +#define vcopyq_lane_u8(__p0_17, __p1_17, __p2_17, __p3_17) __extension__ ({ \ + uint8x16_t __s0_17 = __p0_17; \ + uint8x8_t __s2_17 = __p2_17; \ + uint8x16_t __rev0_17; __rev0_17 = __builtin_shufflevector(__s0_17, __s0_17, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __rev2_17; __rev2_17 = __builtin_shufflevector(__s2_17, __s2_17, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __ret_17; \ + __ret_17 = __noswap_vsetq_lane_u8(__noswap_vget_lane_u8(__rev2_17, __p3_17), __rev0_17, __p1_17); \ + __ret_17 = __builtin_shufflevector(__ret_17, __ret_17, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_17; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_s64(__p0_18, __p1_18, __p2_18, __p3_18) __extension__ ({ \ - int64x2_t __s0_18 = __p0_18; \ - int64x1_t __s2_18 = __p2_18; \ - int64x2_t __ret_18; \ - __ret_18 = vsetq_lane_s64(vget_lane_s64(__s2_18, __p3_18), __s0_18, __p1_18); \ +#define vcopyq_lane_u32(__p0_18, __p1_18, __p2_18, __p3_18) __extension__ ({ \ + uint32x4_t __s0_18 = __p0_18; \ + uint32x2_t __s2_18 = __p2_18; \ + uint32x4_t __ret_18; \ + __ret_18 = vsetq_lane_u32(vget_lane_u32(__s2_18, __p3_18), __s0_18, __p1_18); \ __ret_18; \ }) #else -#define vcopyq_lane_s64(__p0_19, __p1_19, __p2_19, __p3_19) __extension__ ({ \ - int64x2_t __s0_19 = __p0_19; \ - int64x1_t __s2_19 = __p2_19; \ - int64x2_t __rev0_19; __rev0_19 = __builtin_shufflevector(__s0_19, __s0_19, 1, 0); \ - int64x2_t __ret_19; \ - __ret_19 = __noswap_vsetq_lane_s64(__noswap_vget_lane_s64(__s2_19, __p3_19), __rev0_19, __p1_19); \ - __ret_19 = __builtin_shufflevector(__ret_19, __ret_19, 1, 0); \ +#define vcopyq_lane_u32(__p0_19, __p1_19, __p2_19, __p3_19) __extension__ ({ \ + uint32x4_t __s0_19 = __p0_19; \ + uint32x2_t __s2_19 = __p2_19; \ + uint32x4_t __rev0_19; __rev0_19 = __builtin_shufflevector(__s0_19, __s0_19, 3, 2, 1, 0); \ + uint32x2_t __rev2_19; __rev2_19 = __builtin_shufflevector(__s2_19, __s2_19, 1, 0); \ + uint32x4_t __ret_19; \ + __ret_19 = __noswap_vsetq_lane_u32(__noswap_vget_lane_u32(__rev2_19, __p3_19), __rev0_19, __p1_19); \ + __ret_19 = __builtin_shufflevector(__ret_19, __ret_19, 3, 2, 1, 0); \ __ret_19; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_s16(__p0_20, __p1_20, __p2_20, __p3_20) __extension__ ({ \ - int16x8_t __s0_20 = __p0_20; \ - int16x4_t __s2_20 = __p2_20; \ - int16x8_t __ret_20; \ - __ret_20 = vsetq_lane_s16(vget_lane_s16(__s2_20, __p3_20), __s0_20, __p1_20); \ +#define vcopyq_lane_u64(__p0_20, __p1_20, __p2_20, __p3_20) __extension__ ({ \ + uint64x2_t __s0_20 = __p0_20; \ + uint64x1_t __s2_20 = __p2_20; \ + uint64x2_t __ret_20; \ + __ret_20 = vsetq_lane_u64(vget_lane_u64(__s2_20, __p3_20), __s0_20, __p1_20); \ __ret_20; \ }) #else -#define vcopyq_lane_s16(__p0_21, __p1_21, __p2_21, __p3_21) __extension__ ({ \ - int16x8_t __s0_21 = __p0_21; \ - int16x4_t __s2_21 = __p2_21; \ - int16x8_t __rev0_21; __rev0_21 = __builtin_shufflevector(__s0_21, __s0_21, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x4_t __rev2_21; __rev2_21 = __builtin_shufflevector(__s2_21, __s2_21, 3, 2, 1, 0); \ - int16x8_t __ret_21; \ - __ret_21 = __noswap_vsetq_lane_s16(__noswap_vget_lane_s16(__rev2_21, __p3_21), __rev0_21, __p1_21); \ - __ret_21 = __builtin_shufflevector(__ret_21, __ret_21, 7, 6, 5, 4, 3, 2, 1, 0); \ +#define 
vcopyq_lane_u64(__p0_21, __p1_21, __p2_21, __p3_21) __extension__ ({ \ + uint64x2_t __s0_21 = __p0_21; \ + uint64x1_t __s2_21 = __p2_21; \ + uint64x2_t __rev0_21; __rev0_21 = __builtin_shufflevector(__s0_21, __s0_21, 1, 0); \ + uint64x2_t __ret_21; \ + __ret_21 = __noswap_vsetq_lane_u64(__noswap_vget_lane_u64(__s2_21, __p3_21), __rev0_21, __p1_21); \ + __ret_21 = __builtin_shufflevector(__ret_21, __ret_21, 1, 0); \ __ret_21; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_lane_p8(__p0_22, __p1_22, __p2_22, __p3_22) __extension__ ({ \ - poly8x8_t __s0_22 = __p0_22; \ - poly8x8_t __s2_22 = __p2_22; \ - poly8x8_t __ret_22; \ - __ret_22 = vset_lane_p8(vget_lane_p8(__s2_22, __p3_22), __s0_22, __p1_22); \ +#define vcopyq_lane_u16(__p0_22, __p1_22, __p2_22, __p3_22) __extension__ ({ \ + uint16x8_t __s0_22 = __p0_22; \ + uint16x4_t __s2_22 = __p2_22; \ + uint16x8_t __ret_22; \ + __ret_22 = vsetq_lane_u16(vget_lane_u16(__s2_22, __p3_22), __s0_22, __p1_22); \ __ret_22; \ }) #else -#define vcopy_lane_p8(__p0_23, __p1_23, __p2_23, __p3_23) __extension__ ({ \ - poly8x8_t __s0_23 = __p0_23; \ - poly8x8_t __s2_23 = __p2_23; \ - poly8x8_t __rev0_23; __rev0_23 = __builtin_shufflevector(__s0_23, __s0_23, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly8x8_t __rev2_23; __rev2_23 = __builtin_shufflevector(__s2_23, __s2_23, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly8x8_t __ret_23; \ - __ret_23 = __noswap_vset_lane_p8(__noswap_vget_lane_p8(__rev2_23, __p3_23), __rev0_23, __p1_23); \ +#define vcopyq_lane_u16(__p0_23, __p1_23, __p2_23, __p3_23) __extension__ ({ \ + uint16x8_t __s0_23 = __p0_23; \ + uint16x4_t __s2_23 = __p2_23; \ + uint16x8_t __rev0_23; __rev0_23 = __builtin_shufflevector(__s0_23, __s0_23, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x4_t __rev2_23; __rev2_23 = __builtin_shufflevector(__s2_23, __s2_23, 3, 2, 1, 0); \ + uint16x8_t __ret_23; \ + __ret_23 = __noswap_vsetq_lane_u16(__noswap_vget_lane_u16(__rev2_23, __p3_23), __rev0_23, __p1_23); \ __ret_23 = __builtin_shufflevector(__ret_23, __ret_23, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_23; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_lane_p16(__p0_24, __p1_24, __p2_24, __p3_24) __extension__ ({ \ - poly16x4_t __s0_24 = __p0_24; \ - poly16x4_t __s2_24 = __p2_24; \ - poly16x4_t __ret_24; \ - __ret_24 = vset_lane_p16(vget_lane_p16(__s2_24, __p3_24), __s0_24, __p1_24); \ +#define vcopyq_lane_s8(__p0_24, __p1_24, __p2_24, __p3_24) __extension__ ({ \ + int8x16_t __s0_24 = __p0_24; \ + int8x8_t __s2_24 = __p2_24; \ + int8x16_t __ret_24; \ + __ret_24 = vsetq_lane_s8(vget_lane_s8(__s2_24, __p3_24), __s0_24, __p1_24); \ __ret_24; \ }) #else -#define vcopy_lane_p16(__p0_25, __p1_25, __p2_25, __p3_25) __extension__ ({ \ - poly16x4_t __s0_25 = __p0_25; \ - poly16x4_t __s2_25 = __p2_25; \ - poly16x4_t __rev0_25; __rev0_25 = __builtin_shufflevector(__s0_25, __s0_25, 3, 2, 1, 0); \ - poly16x4_t __rev2_25; __rev2_25 = __builtin_shufflevector(__s2_25, __s2_25, 3, 2, 1, 0); \ - poly16x4_t __ret_25; \ - __ret_25 = __noswap_vset_lane_p16(__noswap_vget_lane_p16(__rev2_25, __p3_25), __rev0_25, __p1_25); \ - __ret_25 = __builtin_shufflevector(__ret_25, __ret_25, 3, 2, 1, 0); \ +#define vcopyq_lane_s8(__p0_25, __p1_25, __p2_25, __p3_25) __extension__ ({ \ + int8x16_t __s0_25 = __p0_25; \ + int8x8_t __s2_25 = __p2_25; \ + int8x16_t __rev0_25; __rev0_25 = __builtin_shufflevector(__s0_25, __s0_25, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x8_t __rev2_25; __rev2_25 = __builtin_shufflevector(__s2_25, __s2_25, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x16_t __ret_25; \ + __ret_25 = 
__noswap_vsetq_lane_s8(__noswap_vget_lane_s8(__rev2_25, __p3_25), __rev0_25, __p1_25); \ + __ret_25 = __builtin_shufflevector(__ret_25, __ret_25, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_25; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_lane_u8(__p0_26, __p1_26, __p2_26, __p3_26) __extension__ ({ \ - uint8x8_t __s0_26 = __p0_26; \ - uint8x8_t __s2_26 = __p2_26; \ - uint8x8_t __ret_26; \ - __ret_26 = vset_lane_u8(vget_lane_u8(__s2_26, __p3_26), __s0_26, __p1_26); \ +#define vcopyq_lane_f32(__p0_26, __p1_26, __p2_26, __p3_26) __extension__ ({ \ + float32x4_t __s0_26 = __p0_26; \ + float32x2_t __s2_26 = __p2_26; \ + float32x4_t __ret_26; \ + __ret_26 = vsetq_lane_f32(vget_lane_f32(__s2_26, __p3_26), __s0_26, __p1_26); \ __ret_26; \ }) #else -#define vcopy_lane_u8(__p0_27, __p1_27, __p2_27, __p3_27) __extension__ ({ \ - uint8x8_t __s0_27 = __p0_27; \ - uint8x8_t __s2_27 = __p2_27; \ - uint8x8_t __rev0_27; __rev0_27 = __builtin_shufflevector(__s0_27, __s0_27, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x8_t __rev2_27; __rev2_27 = __builtin_shufflevector(__s2_27, __s2_27, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x8_t __ret_27; \ - __ret_27 = __noswap_vset_lane_u8(__noswap_vget_lane_u8(__rev2_27, __p3_27), __rev0_27, __p1_27); \ - __ret_27 = __builtin_shufflevector(__ret_27, __ret_27, 7, 6, 5, 4, 3, 2, 1, 0); \ +#define vcopyq_lane_f32(__p0_27, __p1_27, __p2_27, __p3_27) __extension__ ({ \ + float32x4_t __s0_27 = __p0_27; \ + float32x2_t __s2_27 = __p2_27; \ + float32x4_t __rev0_27; __rev0_27 = __builtin_shufflevector(__s0_27, __s0_27, 3, 2, 1, 0); \ + float32x2_t __rev2_27; __rev2_27 = __builtin_shufflevector(__s2_27, __s2_27, 1, 0); \ + float32x4_t __ret_27; \ + __ret_27 = __noswap_vsetq_lane_f32(__noswap_vget_lane_f32(__rev2_27, __p3_27), __rev0_27, __p1_27); \ + __ret_27 = __builtin_shufflevector(__ret_27, __ret_27, 3, 2, 1, 0); \ __ret_27; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_lane_u32(__p0_28, __p1_28, __p2_28, __p3_28) __extension__ ({ \ - uint32x2_t __s0_28 = __p0_28; \ - uint32x2_t __s2_28 = __p2_28; \ - uint32x2_t __ret_28; \ - __ret_28 = vset_lane_u32(vget_lane_u32(__s2_28, __p3_28), __s0_28, __p1_28); \ +#define vcopyq_lane_s32(__p0_28, __p1_28, __p2_28, __p3_28) __extension__ ({ \ + int32x4_t __s0_28 = __p0_28; \ + int32x2_t __s2_28 = __p2_28; \ + int32x4_t __ret_28; \ + __ret_28 = vsetq_lane_s32(vget_lane_s32(__s2_28, __p3_28), __s0_28, __p1_28); \ __ret_28; \ }) #else -#define vcopy_lane_u32(__p0_29, __p1_29, __p2_29, __p3_29) __extension__ ({ \ - uint32x2_t __s0_29 = __p0_29; \ - uint32x2_t __s2_29 = __p2_29; \ - uint32x2_t __rev0_29; __rev0_29 = __builtin_shufflevector(__s0_29, __s0_29, 1, 0); \ - uint32x2_t __rev2_29; __rev2_29 = __builtin_shufflevector(__s2_29, __s2_29, 1, 0); \ - uint32x2_t __ret_29; \ - __ret_29 = __noswap_vset_lane_u32(__noswap_vget_lane_u32(__rev2_29, __p3_29), __rev0_29, __p1_29); \ - __ret_29 = __builtin_shufflevector(__ret_29, __ret_29, 1, 0); \ +#define vcopyq_lane_s32(__p0_29, __p1_29, __p2_29, __p3_29) __extension__ ({ \ + int32x4_t __s0_29 = __p0_29; \ + int32x2_t __s2_29 = __p2_29; \ + int32x4_t __rev0_29; __rev0_29 = __builtin_shufflevector(__s0_29, __s0_29, 3, 2, 1, 0); \ + int32x2_t __rev2_29; __rev2_29 = __builtin_shufflevector(__s2_29, __s2_29, 1, 0); \ + int32x4_t __ret_29; \ + __ret_29 = __noswap_vsetq_lane_s32(__noswap_vget_lane_s32(__rev2_29, __p3_29), __rev0_29, __p1_29); \ + __ret_29 = __builtin_shufflevector(__ret_29, __ret_29, 3, 2, 1, 0); \ __ret_29; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define 
vcopy_lane_u64(__p0_30, __p1_30, __p2_30, __p3_30) __extension__ ({ \ - uint64x1_t __s0_30 = __p0_30; \ - uint64x1_t __s2_30 = __p2_30; \ - uint64x1_t __ret_30; \ - __ret_30 = vset_lane_u64(vget_lane_u64(__s2_30, __p3_30), __s0_30, __p1_30); \ +#define vcopyq_lane_s64(__p0_30, __p1_30, __p2_30, __p3_30) __extension__ ({ \ + int64x2_t __s0_30 = __p0_30; \ + int64x1_t __s2_30 = __p2_30; \ + int64x2_t __ret_30; \ + __ret_30 = vsetq_lane_s64(vget_lane_s64(__s2_30, __p3_30), __s0_30, __p1_30); \ __ret_30; \ }) #else -#define vcopy_lane_u64(__p0_31, __p1_31, __p2_31, __p3_31) __extension__ ({ \ - uint64x1_t __s0_31 = __p0_31; \ - uint64x1_t __s2_31 = __p2_31; \ - uint64x1_t __ret_31; \ - __ret_31 = __noswap_vset_lane_u64(__noswap_vget_lane_u64(__s2_31, __p3_31), __s0_31, __p1_31); \ +#define vcopyq_lane_s64(__p0_31, __p1_31, __p2_31, __p3_31) __extension__ ({ \ + int64x2_t __s0_31 = __p0_31; \ + int64x1_t __s2_31 = __p2_31; \ + int64x2_t __rev0_31; __rev0_31 = __builtin_shufflevector(__s0_31, __s0_31, 1, 0); \ + int64x2_t __ret_31; \ + __ret_31 = __noswap_vsetq_lane_s64(__noswap_vget_lane_s64(__s2_31, __p3_31), __rev0_31, __p1_31); \ + __ret_31 = __builtin_shufflevector(__ret_31, __ret_31, 1, 0); \ __ret_31; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_lane_u16(__p0_32, __p1_32, __p2_32, __p3_32) __extension__ ({ \ - uint16x4_t __s0_32 = __p0_32; \ - uint16x4_t __s2_32 = __p2_32; \ - uint16x4_t __ret_32; \ - __ret_32 = vset_lane_u16(vget_lane_u16(__s2_32, __p3_32), __s0_32, __p1_32); \ +#define vcopyq_lane_s16(__p0_32, __p1_32, __p2_32, __p3_32) __extension__ ({ \ + int16x8_t __s0_32 = __p0_32; \ + int16x4_t __s2_32 = __p2_32; \ + int16x8_t __ret_32; \ + __ret_32 = vsetq_lane_s16(vget_lane_s16(__s2_32, __p3_32), __s0_32, __p1_32); \ __ret_32; \ }) #else -#define vcopy_lane_u16(__p0_33, __p1_33, __p2_33, __p3_33) __extension__ ({ \ - uint16x4_t __s0_33 = __p0_33; \ - uint16x4_t __s2_33 = __p2_33; \ - uint16x4_t __rev0_33; __rev0_33 = __builtin_shufflevector(__s0_33, __s0_33, 3, 2, 1, 0); \ - uint16x4_t __rev2_33; __rev2_33 = __builtin_shufflevector(__s2_33, __s2_33, 3, 2, 1, 0); \ - uint16x4_t __ret_33; \ - __ret_33 = __noswap_vset_lane_u16(__noswap_vget_lane_u16(__rev2_33, __p3_33), __rev0_33, __p1_33); \ - __ret_33 = __builtin_shufflevector(__ret_33, __ret_33, 3, 2, 1, 0); \ +#define vcopyq_lane_s16(__p0_33, __p1_33, __p2_33, __p3_33) __extension__ ({ \ + int16x8_t __s0_33 = __p0_33; \ + int16x4_t __s2_33 = __p2_33; \ + int16x8_t __rev0_33; __rev0_33 = __builtin_shufflevector(__s0_33, __s0_33, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x4_t __rev2_33; __rev2_33 = __builtin_shufflevector(__s2_33, __s2_33, 3, 2, 1, 0); \ + int16x8_t __ret_33; \ + __ret_33 = __noswap_vsetq_lane_s16(__noswap_vget_lane_s16(__rev2_33, __p3_33), __rev0_33, __p1_33); \ + __ret_33 = __builtin_shufflevector(__ret_33, __ret_33, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_33; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_lane_s8(__p0_34, __p1_34, __p2_34, __p3_34) __extension__ ({ \ - int8x8_t __s0_34 = __p0_34; \ - int8x8_t __s2_34 = __p2_34; \ - int8x8_t __ret_34; \ - __ret_34 = vset_lane_s8(vget_lane_s8(__s2_34, __p3_34), __s0_34, __p1_34); \ +#define vcopy_lane_p8(__p0_34, __p1_34, __p2_34, __p3_34) __extension__ ({ \ + poly8x8_t __s0_34 = __p0_34; \ + poly8x8_t __s2_34 = __p2_34; \ + poly8x8_t __ret_34; \ + __ret_34 = vset_lane_p8(vget_lane_p8(__s2_34, __p3_34), __s0_34, __p1_34); \ __ret_34; \ }) #else -#define vcopy_lane_s8(__p0_35, __p1_35, __p2_35, __p3_35) __extension__ ({ \ - int8x8_t __s0_35 = __p0_35; \ - int8x8_t 
__s2_35 = __p2_35; \ - int8x8_t __rev0_35; __rev0_35 = __builtin_shufflevector(__s0_35, __s0_35, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x8_t __rev2_35; __rev2_35 = __builtin_shufflevector(__s2_35, __s2_35, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x8_t __ret_35; \ - __ret_35 = __noswap_vset_lane_s8(__noswap_vget_lane_s8(__rev2_35, __p3_35), __rev0_35, __p1_35); \ +#define vcopy_lane_p8(__p0_35, __p1_35, __p2_35, __p3_35) __extension__ ({ \ + poly8x8_t __s0_35 = __p0_35; \ + poly8x8_t __s2_35 = __p2_35; \ + poly8x8_t __rev0_35; __rev0_35 = __builtin_shufflevector(__s0_35, __s0_35, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly8x8_t __rev2_35; __rev2_35 = __builtin_shufflevector(__s2_35, __s2_35, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly8x8_t __ret_35; \ + __ret_35 = __noswap_vset_lane_p8(__noswap_vget_lane_p8(__rev2_35, __p3_35), __rev0_35, __p1_35); \ __ret_35 = __builtin_shufflevector(__ret_35, __ret_35, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_35; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_lane_f32(__p0_36, __p1_36, __p2_36, __p3_36) __extension__ ({ \ - float32x2_t __s0_36 = __p0_36; \ - float32x2_t __s2_36 = __p2_36; \ - float32x2_t __ret_36; \ - __ret_36 = vset_lane_f32(vget_lane_f32(__s2_36, __p3_36), __s0_36, __p1_36); \ +#define vcopy_lane_p16(__p0_36, __p1_36, __p2_36, __p3_36) __extension__ ({ \ + poly16x4_t __s0_36 = __p0_36; \ + poly16x4_t __s2_36 = __p2_36; \ + poly16x4_t __ret_36; \ + __ret_36 = vset_lane_p16(vget_lane_p16(__s2_36, __p3_36), __s0_36, __p1_36); \ __ret_36; \ }) #else -#define vcopy_lane_f32(__p0_37, __p1_37, __p2_37, __p3_37) __extension__ ({ \ - float32x2_t __s0_37 = __p0_37; \ - float32x2_t __s2_37 = __p2_37; \ - float32x2_t __rev0_37; __rev0_37 = __builtin_shufflevector(__s0_37, __s0_37, 1, 0); \ - float32x2_t __rev2_37; __rev2_37 = __builtin_shufflevector(__s2_37, __s2_37, 1, 0); \ - float32x2_t __ret_37; \ - __ret_37 = __noswap_vset_lane_f32(__noswap_vget_lane_f32(__rev2_37, __p3_37), __rev0_37, __p1_37); \ - __ret_37 = __builtin_shufflevector(__ret_37, __ret_37, 1, 0); \ +#define vcopy_lane_p16(__p0_37, __p1_37, __p2_37, __p3_37) __extension__ ({ \ + poly16x4_t __s0_37 = __p0_37; \ + poly16x4_t __s2_37 = __p2_37; \ + poly16x4_t __rev0_37; __rev0_37 = __builtin_shufflevector(__s0_37, __s0_37, 3, 2, 1, 0); \ + poly16x4_t __rev2_37; __rev2_37 = __builtin_shufflevector(__s2_37, __s2_37, 3, 2, 1, 0); \ + poly16x4_t __ret_37; \ + __ret_37 = __noswap_vset_lane_p16(__noswap_vget_lane_p16(__rev2_37, __p3_37), __rev0_37, __p1_37); \ + __ret_37 = __builtin_shufflevector(__ret_37, __ret_37, 3, 2, 1, 0); \ __ret_37; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_lane_s32(__p0_38, __p1_38, __p2_38, __p3_38) __extension__ ({ \ - int32x2_t __s0_38 = __p0_38; \ - int32x2_t __s2_38 = __p2_38; \ - int32x2_t __ret_38; \ - __ret_38 = vset_lane_s32(vget_lane_s32(__s2_38, __p3_38), __s0_38, __p1_38); \ +#define vcopy_lane_u8(__p0_38, __p1_38, __p2_38, __p3_38) __extension__ ({ \ + uint8x8_t __s0_38 = __p0_38; \ + uint8x8_t __s2_38 = __p2_38; \ + uint8x8_t __ret_38; \ + __ret_38 = vset_lane_u8(vget_lane_u8(__s2_38, __p3_38), __s0_38, __p1_38); \ __ret_38; \ }) #else -#define vcopy_lane_s32(__p0_39, __p1_39, __p2_39, __p3_39) __extension__ ({ \ - int32x2_t __s0_39 = __p0_39; \ - int32x2_t __s2_39 = __p2_39; \ - int32x2_t __rev0_39; __rev0_39 = __builtin_shufflevector(__s0_39, __s0_39, 1, 0); \ - int32x2_t __rev2_39; __rev2_39 = __builtin_shufflevector(__s2_39, __s2_39, 1, 0); \ - int32x2_t __ret_39; \ - __ret_39 = __noswap_vset_lane_s32(__noswap_vget_lane_s32(__rev2_39, __p3_39), __rev0_39, __p1_39); \ - 
__ret_39 = __builtin_shufflevector(__ret_39, __ret_39, 1, 0); \ +#define vcopy_lane_u8(__p0_39, __p1_39, __p2_39, __p3_39) __extension__ ({ \ + uint8x8_t __s0_39 = __p0_39; \ + uint8x8_t __s2_39 = __p2_39; \ + uint8x8_t __rev0_39; __rev0_39 = __builtin_shufflevector(__s0_39, __s0_39, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __rev2_39; __rev2_39 = __builtin_shufflevector(__s2_39, __s2_39, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __ret_39; \ + __ret_39 = __noswap_vset_lane_u8(__noswap_vget_lane_u8(__rev2_39, __p3_39), __rev0_39, __p1_39); \ + __ret_39 = __builtin_shufflevector(__ret_39, __ret_39, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_39; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_lane_s64(__p0_40, __p1_40, __p2_40, __p3_40) __extension__ ({ \ - int64x1_t __s0_40 = __p0_40; \ - int64x1_t __s2_40 = __p2_40; \ - int64x1_t __ret_40; \ - __ret_40 = vset_lane_s64(vget_lane_s64(__s2_40, __p3_40), __s0_40, __p1_40); \ +#define vcopy_lane_u32(__p0_40, __p1_40, __p2_40, __p3_40) __extension__ ({ \ + uint32x2_t __s0_40 = __p0_40; \ + uint32x2_t __s2_40 = __p2_40; \ + uint32x2_t __ret_40; \ + __ret_40 = vset_lane_u32(vget_lane_u32(__s2_40, __p3_40), __s0_40, __p1_40); \ __ret_40; \ }) #else -#define vcopy_lane_s64(__p0_41, __p1_41, __p2_41, __p3_41) __extension__ ({ \ - int64x1_t __s0_41 = __p0_41; \ - int64x1_t __s2_41 = __p2_41; \ - int64x1_t __ret_41; \ - __ret_41 = __noswap_vset_lane_s64(__noswap_vget_lane_s64(__s2_41, __p3_41), __s0_41, __p1_41); \ +#define vcopy_lane_u32(__p0_41, __p1_41, __p2_41, __p3_41) __extension__ ({ \ + uint32x2_t __s0_41 = __p0_41; \ + uint32x2_t __s2_41 = __p2_41; \ + uint32x2_t __rev0_41; __rev0_41 = __builtin_shufflevector(__s0_41, __s0_41, 1, 0); \ + uint32x2_t __rev2_41; __rev2_41 = __builtin_shufflevector(__s2_41, __s2_41, 1, 0); \ + uint32x2_t __ret_41; \ + __ret_41 = __noswap_vset_lane_u32(__noswap_vget_lane_u32(__rev2_41, __p3_41), __rev0_41, __p1_41); \ + __ret_41 = __builtin_shufflevector(__ret_41, __ret_41, 1, 0); \ __ret_41; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_lane_s16(__p0_42, __p1_42, __p2_42, __p3_42) __extension__ ({ \ - int16x4_t __s0_42 = __p0_42; \ - int16x4_t __s2_42 = __p2_42; \ - int16x4_t __ret_42; \ - __ret_42 = vset_lane_s16(vget_lane_s16(__s2_42, __p3_42), __s0_42, __p1_42); \ +#define vcopy_lane_u64(__p0_42, __p1_42, __p2_42, __p3_42) __extension__ ({ \ + uint64x1_t __s0_42 = __p0_42; \ + uint64x1_t __s2_42 = __p2_42; \ + uint64x1_t __ret_42; \ + __ret_42 = vset_lane_u64(vget_lane_u64(__s2_42, __p3_42), __s0_42, __p1_42); \ __ret_42; \ }) #else -#define vcopy_lane_s16(__p0_43, __p1_43, __p2_43, __p3_43) __extension__ ({ \ - int16x4_t __s0_43 = __p0_43; \ - int16x4_t __s2_43 = __p2_43; \ - int16x4_t __rev0_43; __rev0_43 = __builtin_shufflevector(__s0_43, __s0_43, 3, 2, 1, 0); \ - int16x4_t __rev2_43; __rev2_43 = __builtin_shufflevector(__s2_43, __s2_43, 3, 2, 1, 0); \ - int16x4_t __ret_43; \ - __ret_43 = __noswap_vset_lane_s16(__noswap_vget_lane_s16(__rev2_43, __p3_43), __rev0_43, __p1_43); \ - __ret_43 = __builtin_shufflevector(__ret_43, __ret_43, 3, 2, 1, 0); \ +#define vcopy_lane_u64(__p0_43, __p1_43, __p2_43, __p3_43) __extension__ ({ \ + uint64x1_t __s0_43 = __p0_43; \ + uint64x1_t __s2_43 = __p2_43; \ + uint64x1_t __ret_43; \ + __ret_43 = __noswap_vset_lane_u64(__noswap_vget_lane_u64(__s2_43, __p3_43), __s0_43, __p1_43); \ __ret_43; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_p8(__p0_44, __p1_44, __p2_44, __p3_44) __extension__ ({ \ - poly8x16_t __s0_44 = __p0_44; \ - poly8x16_t __s2_44 = __p2_44; \ - 
poly8x16_t __ret_44; \ - __ret_44 = vsetq_lane_p8(vgetq_lane_p8(__s2_44, __p3_44), __s0_44, __p1_44); \ +#define vcopy_lane_u16(__p0_44, __p1_44, __p2_44, __p3_44) __extension__ ({ \ + uint16x4_t __s0_44 = __p0_44; \ + uint16x4_t __s2_44 = __p2_44; \ + uint16x4_t __ret_44; \ + __ret_44 = vset_lane_u16(vget_lane_u16(__s2_44, __p3_44), __s0_44, __p1_44); \ __ret_44; \ }) #else -#define vcopyq_laneq_p8(__p0_45, __p1_45, __p2_45, __p3_45) __extension__ ({ \ - poly8x16_t __s0_45 = __p0_45; \ - poly8x16_t __s2_45 = __p2_45; \ - poly8x16_t __rev0_45; __rev0_45 = __builtin_shufflevector(__s0_45, __s0_45, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly8x16_t __rev2_45; __rev2_45 = __builtin_shufflevector(__s2_45, __s2_45, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly8x16_t __ret_45; \ - __ret_45 = __noswap_vsetq_lane_p8(__noswap_vgetq_lane_p8(__rev2_45, __p3_45), __rev0_45, __p1_45); \ - __ret_45 = __builtin_shufflevector(__ret_45, __ret_45, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ +#define vcopy_lane_u16(__p0_45, __p1_45, __p2_45, __p3_45) __extension__ ({ \ + uint16x4_t __s0_45 = __p0_45; \ + uint16x4_t __s2_45 = __p2_45; \ + uint16x4_t __rev0_45; __rev0_45 = __builtin_shufflevector(__s0_45, __s0_45, 3, 2, 1, 0); \ + uint16x4_t __rev2_45; __rev2_45 = __builtin_shufflevector(__s2_45, __s2_45, 3, 2, 1, 0); \ + uint16x4_t __ret_45; \ + __ret_45 = __noswap_vset_lane_u16(__noswap_vget_lane_u16(__rev2_45, __p3_45), __rev0_45, __p1_45); \ + __ret_45 = __builtin_shufflevector(__ret_45, __ret_45, 3, 2, 1, 0); \ __ret_45; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_p16(__p0_46, __p1_46, __p2_46, __p3_46) __extension__ ({ \ - poly16x8_t __s0_46 = __p0_46; \ - poly16x8_t __s2_46 = __p2_46; \ - poly16x8_t __ret_46; \ - __ret_46 = vsetq_lane_p16(vgetq_lane_p16(__s2_46, __p3_46), __s0_46, __p1_46); \ +#define vcopy_lane_s8(__p0_46, __p1_46, __p2_46, __p3_46) __extension__ ({ \ + int8x8_t __s0_46 = __p0_46; \ + int8x8_t __s2_46 = __p2_46; \ + int8x8_t __ret_46; \ + __ret_46 = vset_lane_s8(vget_lane_s8(__s2_46, __p3_46), __s0_46, __p1_46); \ __ret_46; \ }) #else -#define vcopyq_laneq_p16(__p0_47, __p1_47, __p2_47, __p3_47) __extension__ ({ \ - poly16x8_t __s0_47 = __p0_47; \ - poly16x8_t __s2_47 = __p2_47; \ - poly16x8_t __rev0_47; __rev0_47 = __builtin_shufflevector(__s0_47, __s0_47, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly16x8_t __rev2_47; __rev2_47 = __builtin_shufflevector(__s2_47, __s2_47, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly16x8_t __ret_47; \ - __ret_47 = __noswap_vsetq_lane_p16(__noswap_vgetq_lane_p16(__rev2_47, __p3_47), __rev0_47, __p1_47); \ +#define vcopy_lane_s8(__p0_47, __p1_47, __p2_47, __p3_47) __extension__ ({ \ + int8x8_t __s0_47 = __p0_47; \ + int8x8_t __s2_47 = __p2_47; \ + int8x8_t __rev0_47; __rev0_47 = __builtin_shufflevector(__s0_47, __s0_47, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x8_t __rev2_47; __rev2_47 = __builtin_shufflevector(__s2_47, __s2_47, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x8_t __ret_47; \ + __ret_47 = __noswap_vset_lane_s8(__noswap_vget_lane_s8(__rev2_47, __p3_47), __rev0_47, __p1_47); \ __ret_47 = __builtin_shufflevector(__ret_47, __ret_47, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_47; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_u8(__p0_48, __p1_48, __p2_48, __p3_48) __extension__ ({ \ - uint8x16_t __s0_48 = __p0_48; \ - uint8x16_t __s2_48 = __p2_48; \ - uint8x16_t __ret_48; \ - __ret_48 = vsetq_lane_u8(vgetq_lane_u8(__s2_48, __p3_48), __s0_48, __p1_48); \ +#define vcopy_lane_f32(__p0_48, __p1_48, __p2_48, __p3_48) 
__extension__ ({ \ + float32x2_t __s0_48 = __p0_48; \ + float32x2_t __s2_48 = __p2_48; \ + float32x2_t __ret_48; \ + __ret_48 = vset_lane_f32(vget_lane_f32(__s2_48, __p3_48), __s0_48, __p1_48); \ __ret_48; \ }) #else -#define vcopyq_laneq_u8(__p0_49, __p1_49, __p2_49, __p3_49) __extension__ ({ \ - uint8x16_t __s0_49 = __p0_49; \ - uint8x16_t __s2_49 = __p2_49; \ - uint8x16_t __rev0_49; __rev0_49 = __builtin_shufflevector(__s0_49, __s0_49, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x16_t __rev2_49; __rev2_49 = __builtin_shufflevector(__s2_49, __s2_49, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x16_t __ret_49; \ - __ret_49 = __noswap_vsetq_lane_u8(__noswap_vgetq_lane_u8(__rev2_49, __p3_49), __rev0_49, __p1_49); \ - __ret_49 = __builtin_shufflevector(__ret_49, __ret_49, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ +#define vcopy_lane_f32(__p0_49, __p1_49, __p2_49, __p3_49) __extension__ ({ \ + float32x2_t __s0_49 = __p0_49; \ + float32x2_t __s2_49 = __p2_49; \ + float32x2_t __rev0_49; __rev0_49 = __builtin_shufflevector(__s0_49, __s0_49, 1, 0); \ + float32x2_t __rev2_49; __rev2_49 = __builtin_shufflevector(__s2_49, __s2_49, 1, 0); \ + float32x2_t __ret_49; \ + __ret_49 = __noswap_vset_lane_f32(__noswap_vget_lane_f32(__rev2_49, __p3_49), __rev0_49, __p1_49); \ + __ret_49 = __builtin_shufflevector(__ret_49, __ret_49, 1, 0); \ __ret_49; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_u32(__p0_50, __p1_50, __p2_50, __p3_50) __extension__ ({ \ - uint32x4_t __s0_50 = __p0_50; \ - uint32x4_t __s2_50 = __p2_50; \ - uint32x4_t __ret_50; \ - __ret_50 = vsetq_lane_u32(vgetq_lane_u32(__s2_50, __p3_50), __s0_50, __p1_50); \ +#define vcopy_lane_s32(__p0_50, __p1_50, __p2_50, __p3_50) __extension__ ({ \ + int32x2_t __s0_50 = __p0_50; \ + int32x2_t __s2_50 = __p2_50; \ + int32x2_t __ret_50; \ + __ret_50 = vset_lane_s32(vget_lane_s32(__s2_50, __p3_50), __s0_50, __p1_50); \ __ret_50; \ }) #else -#define vcopyq_laneq_u32(__p0_51, __p1_51, __p2_51, __p3_51) __extension__ ({ \ - uint32x4_t __s0_51 = __p0_51; \ - uint32x4_t __s2_51 = __p2_51; \ - uint32x4_t __rev0_51; __rev0_51 = __builtin_shufflevector(__s0_51, __s0_51, 3, 2, 1, 0); \ - uint32x4_t __rev2_51; __rev2_51 = __builtin_shufflevector(__s2_51, __s2_51, 3, 2, 1, 0); \ - uint32x4_t __ret_51; \ - __ret_51 = __noswap_vsetq_lane_u32(__noswap_vgetq_lane_u32(__rev2_51, __p3_51), __rev0_51, __p1_51); \ - __ret_51 = __builtin_shufflevector(__ret_51, __ret_51, 3, 2, 1, 0); \ +#define vcopy_lane_s32(__p0_51, __p1_51, __p2_51, __p3_51) __extension__ ({ \ + int32x2_t __s0_51 = __p0_51; \ + int32x2_t __s2_51 = __p2_51; \ + int32x2_t __rev0_51; __rev0_51 = __builtin_shufflevector(__s0_51, __s0_51, 1, 0); \ + int32x2_t __rev2_51; __rev2_51 = __builtin_shufflevector(__s2_51, __s2_51, 1, 0); \ + int32x2_t __ret_51; \ + __ret_51 = __noswap_vset_lane_s32(__noswap_vget_lane_s32(__rev2_51, __p3_51), __rev0_51, __p1_51); \ + __ret_51 = __builtin_shufflevector(__ret_51, __ret_51, 1, 0); \ __ret_51; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_u64(__p0_52, __p1_52, __p2_52, __p3_52) __extension__ ({ \ - uint64x2_t __s0_52 = __p0_52; \ - uint64x2_t __s2_52 = __p2_52; \ - uint64x2_t __ret_52; \ - __ret_52 = vsetq_lane_u64(vgetq_lane_u64(__s2_52, __p3_52), __s0_52, __p1_52); \ +#define vcopy_lane_s64(__p0_52, __p1_52, __p2_52, __p3_52) __extension__ ({ \ + int64x1_t __s0_52 = __p0_52; \ + int64x1_t __s2_52 = __p2_52; \ + int64x1_t __ret_52; \ + __ret_52 = vset_lane_s64(vget_lane_s64(__s2_52, __p3_52), 
__s0_52, __p1_52); \ __ret_52; \ }) #else -#define vcopyq_laneq_u64(__p0_53, __p1_53, __p2_53, __p3_53) __extension__ ({ \ - uint64x2_t __s0_53 = __p0_53; \ - uint64x2_t __s2_53 = __p2_53; \ - uint64x2_t __rev0_53; __rev0_53 = __builtin_shufflevector(__s0_53, __s0_53, 1, 0); \ - uint64x2_t __rev2_53; __rev2_53 = __builtin_shufflevector(__s2_53, __s2_53, 1, 0); \ - uint64x2_t __ret_53; \ - __ret_53 = __noswap_vsetq_lane_u64(__noswap_vgetq_lane_u64(__rev2_53, __p3_53), __rev0_53, __p1_53); \ - __ret_53 = __builtin_shufflevector(__ret_53, __ret_53, 1, 0); \ +#define vcopy_lane_s64(__p0_53, __p1_53, __p2_53, __p3_53) __extension__ ({ \ + int64x1_t __s0_53 = __p0_53; \ + int64x1_t __s2_53 = __p2_53; \ + int64x1_t __ret_53; \ + __ret_53 = __noswap_vset_lane_s64(__noswap_vget_lane_s64(__s2_53, __p3_53), __s0_53, __p1_53); \ __ret_53; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_u16(__p0_54, __p1_54, __p2_54, __p3_54) __extension__ ({ \ - uint16x8_t __s0_54 = __p0_54; \ - uint16x8_t __s2_54 = __p2_54; \ - uint16x8_t __ret_54; \ - __ret_54 = vsetq_lane_u16(vgetq_lane_u16(__s2_54, __p3_54), __s0_54, __p1_54); \ +#define vcopy_lane_s16(__p0_54, __p1_54, __p2_54, __p3_54) __extension__ ({ \ + int16x4_t __s0_54 = __p0_54; \ + int16x4_t __s2_54 = __p2_54; \ + int16x4_t __ret_54; \ + __ret_54 = vset_lane_s16(vget_lane_s16(__s2_54, __p3_54), __s0_54, __p1_54); \ __ret_54; \ }) #else -#define vcopyq_laneq_u16(__p0_55, __p1_55, __p2_55, __p3_55) __extension__ ({ \ - uint16x8_t __s0_55 = __p0_55; \ - uint16x8_t __s2_55 = __p2_55; \ - uint16x8_t __rev0_55; __rev0_55 = __builtin_shufflevector(__s0_55, __s0_55, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x8_t __rev2_55; __rev2_55 = __builtin_shufflevector(__s2_55, __s2_55, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x8_t __ret_55; \ - __ret_55 = __noswap_vsetq_lane_u16(__noswap_vgetq_lane_u16(__rev2_55, __p3_55), __rev0_55, __p1_55); \ - __ret_55 = __builtin_shufflevector(__ret_55, __ret_55, 7, 6, 5, 4, 3, 2, 1, 0); \ +#define vcopy_lane_s16(__p0_55, __p1_55, __p2_55, __p3_55) __extension__ ({ \ + int16x4_t __s0_55 = __p0_55; \ + int16x4_t __s2_55 = __p2_55; \ + int16x4_t __rev0_55; __rev0_55 = __builtin_shufflevector(__s0_55, __s0_55, 3, 2, 1, 0); \ + int16x4_t __rev2_55; __rev2_55 = __builtin_shufflevector(__s2_55, __s2_55, 3, 2, 1, 0); \ + int16x4_t __ret_55; \ + __ret_55 = __noswap_vset_lane_s16(__noswap_vget_lane_s16(__rev2_55, __p3_55), __rev0_55, __p1_55); \ + __ret_55 = __builtin_shufflevector(__ret_55, __ret_55, 3, 2, 1, 0); \ __ret_55; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_s8(__p0_56, __p1_56, __p2_56, __p3_56) __extension__ ({ \ - int8x16_t __s0_56 = __p0_56; \ - int8x16_t __s2_56 = __p2_56; \ - int8x16_t __ret_56; \ - __ret_56 = vsetq_lane_s8(vgetq_lane_s8(__s2_56, __p3_56), __s0_56, __p1_56); \ +#define vcopyq_laneq_p8(__p0_56, __p1_56, __p2_56, __p3_56) __extension__ ({ \ + poly8x16_t __s0_56 = __p0_56; \ + poly8x16_t __s2_56 = __p2_56; \ + poly8x16_t __ret_56; \ + __ret_56 = vsetq_lane_p8(vgetq_lane_p8(__s2_56, __p3_56), __s0_56, __p1_56); \ __ret_56; \ }) #else -#define vcopyq_laneq_s8(__p0_57, __p1_57, __p2_57, __p3_57) __extension__ ({ \ - int8x16_t __s0_57 = __p0_57; \ - int8x16_t __s2_57 = __p2_57; \ - int8x16_t __rev0_57; __rev0_57 = __builtin_shufflevector(__s0_57, __s0_57, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x16_t __rev2_57; __rev2_57 = __builtin_shufflevector(__s2_57, __s2_57, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x16_t __ret_57; \ - __ret_57 = 
__noswap_vsetq_lane_s8(__noswap_vgetq_lane_s8(__rev2_57, __p3_57), __rev0_57, __p1_57); \ +#define vcopyq_laneq_p8(__p0_57, __p1_57, __p2_57, __p3_57) __extension__ ({ \ + poly8x16_t __s0_57 = __p0_57; \ + poly8x16_t __s2_57 = __p2_57; \ + poly8x16_t __rev0_57; __rev0_57 = __builtin_shufflevector(__s0_57, __s0_57, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly8x16_t __rev2_57; __rev2_57 = __builtin_shufflevector(__s2_57, __s2_57, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly8x16_t __ret_57; \ + __ret_57 = __noswap_vsetq_lane_p8(__noswap_vgetq_lane_p8(__rev2_57, __p3_57), __rev0_57, __p1_57); \ __ret_57 = __builtin_shufflevector(__ret_57, __ret_57, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_57; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_f32(__p0_58, __p1_58, __p2_58, __p3_58) __extension__ ({ \ - float32x4_t __s0_58 = __p0_58; \ - float32x4_t __s2_58 = __p2_58; \ - float32x4_t __ret_58; \ - __ret_58 = vsetq_lane_f32(vgetq_lane_f32(__s2_58, __p3_58), __s0_58, __p1_58); \ +#define vcopyq_laneq_p16(__p0_58, __p1_58, __p2_58, __p3_58) __extension__ ({ \ + poly16x8_t __s0_58 = __p0_58; \ + poly16x8_t __s2_58 = __p2_58; \ + poly16x8_t __ret_58; \ + __ret_58 = vsetq_lane_p16(vgetq_lane_p16(__s2_58, __p3_58), __s0_58, __p1_58); \ __ret_58; \ }) #else -#define vcopyq_laneq_f32(__p0_59, __p1_59, __p2_59, __p3_59) __extension__ ({ \ - float32x4_t __s0_59 = __p0_59; \ - float32x4_t __s2_59 = __p2_59; \ - float32x4_t __rev0_59; __rev0_59 = __builtin_shufflevector(__s0_59, __s0_59, 3, 2, 1, 0); \ - float32x4_t __rev2_59; __rev2_59 = __builtin_shufflevector(__s2_59, __s2_59, 3, 2, 1, 0); \ - float32x4_t __ret_59; \ - __ret_59 = __noswap_vsetq_lane_f32(__noswap_vgetq_lane_f32(__rev2_59, __p3_59), __rev0_59, __p1_59); \ - __ret_59 = __builtin_shufflevector(__ret_59, __ret_59, 3, 2, 1, 0); \ +#define vcopyq_laneq_p16(__p0_59, __p1_59, __p2_59, __p3_59) __extension__ ({ \ + poly16x8_t __s0_59 = __p0_59; \ + poly16x8_t __s2_59 = __p2_59; \ + poly16x8_t __rev0_59; __rev0_59 = __builtin_shufflevector(__s0_59, __s0_59, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly16x8_t __rev2_59; __rev2_59 = __builtin_shufflevector(__s2_59, __s2_59, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly16x8_t __ret_59; \ + __ret_59 = __noswap_vsetq_lane_p16(__noswap_vgetq_lane_p16(__rev2_59, __p3_59), __rev0_59, __p1_59); \ + __ret_59 = __builtin_shufflevector(__ret_59, __ret_59, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_59; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_s32(__p0_60, __p1_60, __p2_60, __p3_60) __extension__ ({ \ - int32x4_t __s0_60 = __p0_60; \ - int32x4_t __s2_60 = __p2_60; \ - int32x4_t __ret_60; \ - __ret_60 = vsetq_lane_s32(vgetq_lane_s32(__s2_60, __p3_60), __s0_60, __p1_60); \ +#define vcopyq_laneq_u8(__p0_60, __p1_60, __p2_60, __p3_60) __extension__ ({ \ + uint8x16_t __s0_60 = __p0_60; \ + uint8x16_t __s2_60 = __p2_60; \ + uint8x16_t __ret_60; \ + __ret_60 = vsetq_lane_u8(vgetq_lane_u8(__s2_60, __p3_60), __s0_60, __p1_60); \ __ret_60; \ }) #else -#define vcopyq_laneq_s32(__p0_61, __p1_61, __p2_61, __p3_61) __extension__ ({ \ - int32x4_t __s0_61 = __p0_61; \ - int32x4_t __s2_61 = __p2_61; \ - int32x4_t __rev0_61; __rev0_61 = __builtin_shufflevector(__s0_61, __s0_61, 3, 2, 1, 0); \ - int32x4_t __rev2_61; __rev2_61 = __builtin_shufflevector(__s2_61, __s2_61, 3, 2, 1, 0); \ - int32x4_t __ret_61; \ - __ret_61 = __noswap_vsetq_lane_s32(__noswap_vgetq_lane_s32(__rev2_61, __p3_61), __rev0_61, __p1_61); \ - __ret_61 = __builtin_shufflevector(__ret_61, __ret_61, 3, 2, 1, 0); 
\ +#define vcopyq_laneq_u8(__p0_61, __p1_61, __p2_61, __p3_61) __extension__ ({ \ + uint8x16_t __s0_61 = __p0_61; \ + uint8x16_t __s2_61 = __p2_61; \ + uint8x16_t __rev0_61; __rev0_61 = __builtin_shufflevector(__s0_61, __s0_61, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __rev2_61; __rev2_61 = __builtin_shufflevector(__s2_61, __s2_61, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __ret_61; \ + __ret_61 = __noswap_vsetq_lane_u8(__noswap_vgetq_lane_u8(__rev2_61, __p3_61), __rev0_61, __p1_61); \ + __ret_61 = __builtin_shufflevector(__ret_61, __ret_61, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_61; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_s64(__p0_62, __p1_62, __p2_62, __p3_62) __extension__ ({ \ - int64x2_t __s0_62 = __p0_62; \ - int64x2_t __s2_62 = __p2_62; \ - int64x2_t __ret_62; \ - __ret_62 = vsetq_lane_s64(vgetq_lane_s64(__s2_62, __p3_62), __s0_62, __p1_62); \ +#define vcopyq_laneq_u32(__p0_62, __p1_62, __p2_62, __p3_62) __extension__ ({ \ + uint32x4_t __s0_62 = __p0_62; \ + uint32x4_t __s2_62 = __p2_62; \ + uint32x4_t __ret_62; \ + __ret_62 = vsetq_lane_u32(vgetq_lane_u32(__s2_62, __p3_62), __s0_62, __p1_62); \ __ret_62; \ }) #else -#define vcopyq_laneq_s64(__p0_63, __p1_63, __p2_63, __p3_63) __extension__ ({ \ - int64x2_t __s0_63 = __p0_63; \ - int64x2_t __s2_63 = __p2_63; \ - int64x2_t __rev0_63; __rev0_63 = __builtin_shufflevector(__s0_63, __s0_63, 1, 0); \ - int64x2_t __rev2_63; __rev2_63 = __builtin_shufflevector(__s2_63, __s2_63, 1, 0); \ - int64x2_t __ret_63; \ - __ret_63 = __noswap_vsetq_lane_s64(__noswap_vgetq_lane_s64(__rev2_63, __p3_63), __rev0_63, __p1_63); \ - __ret_63 = __builtin_shufflevector(__ret_63, __ret_63, 1, 0); \ +#define vcopyq_laneq_u32(__p0_63, __p1_63, __p2_63, __p3_63) __extension__ ({ \ + uint32x4_t __s0_63 = __p0_63; \ + uint32x4_t __s2_63 = __p2_63; \ + uint32x4_t __rev0_63; __rev0_63 = __builtin_shufflevector(__s0_63, __s0_63, 3, 2, 1, 0); \ + uint32x4_t __rev2_63; __rev2_63 = __builtin_shufflevector(__s2_63, __s2_63, 3, 2, 1, 0); \ + uint32x4_t __ret_63; \ + __ret_63 = __noswap_vsetq_lane_u32(__noswap_vgetq_lane_u32(__rev2_63, __p3_63), __rev0_63, __p1_63); \ + __ret_63 = __builtin_shufflevector(__ret_63, __ret_63, 3, 2, 1, 0); \ __ret_63; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_s16(__p0_64, __p1_64, __p2_64, __p3_64) __extension__ ({ \ - int16x8_t __s0_64 = __p0_64; \ - int16x8_t __s2_64 = __p2_64; \ - int16x8_t __ret_64; \ - __ret_64 = vsetq_lane_s16(vgetq_lane_s16(__s2_64, __p3_64), __s0_64, __p1_64); \ +#define vcopyq_laneq_u64(__p0_64, __p1_64, __p2_64, __p3_64) __extension__ ({ \ + uint64x2_t __s0_64 = __p0_64; \ + uint64x2_t __s2_64 = __p2_64; \ + uint64x2_t __ret_64; \ + __ret_64 = vsetq_lane_u64(vgetq_lane_u64(__s2_64, __p3_64), __s0_64, __p1_64); \ __ret_64; \ }) #else -#define vcopyq_laneq_s16(__p0_65, __p1_65, __p2_65, __p3_65) __extension__ ({ \ - int16x8_t __s0_65 = __p0_65; \ - int16x8_t __s2_65 = __p2_65; \ - int16x8_t __rev0_65; __rev0_65 = __builtin_shufflevector(__s0_65, __s0_65, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __rev2_65; __rev2_65 = __builtin_shufflevector(__s2_65, __s2_65, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __ret_65; \ - __ret_65 = __noswap_vsetq_lane_s16(__noswap_vgetq_lane_s16(__rev2_65, __p3_65), __rev0_65, __p1_65); \ - __ret_65 = __builtin_shufflevector(__ret_65, __ret_65, 7, 6, 5, 4, 3, 2, 1, 0); \ +#define vcopyq_laneq_u64(__p0_65, __p1_65, __p2_65, __p3_65) __extension__ ({ \ + uint64x2_t __s0_65 = __p0_65; \ 
+ uint64x2_t __s2_65 = __p2_65; \ + uint64x2_t __rev0_65; __rev0_65 = __builtin_shufflevector(__s0_65, __s0_65, 1, 0); \ + uint64x2_t __rev2_65; __rev2_65 = __builtin_shufflevector(__s2_65, __s2_65, 1, 0); \ + uint64x2_t __ret_65; \ + __ret_65 = __noswap_vsetq_lane_u64(__noswap_vgetq_lane_u64(__rev2_65, __p3_65), __rev0_65, __p1_65); \ + __ret_65 = __builtin_shufflevector(__ret_65, __ret_65, 1, 0); \ __ret_65; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_p8(__p0_66, __p1_66, __p2_66, __p3_66) __extension__ ({ \ - poly8x8_t __s0_66 = __p0_66; \ - poly8x16_t __s2_66 = __p2_66; \ - poly8x8_t __ret_66; \ - __ret_66 = vset_lane_p8(vgetq_lane_p8(__s2_66, __p3_66), __s0_66, __p1_66); \ +#define vcopyq_laneq_u16(__p0_66, __p1_66, __p2_66, __p3_66) __extension__ ({ \ + uint16x8_t __s0_66 = __p0_66; \ + uint16x8_t __s2_66 = __p2_66; \ + uint16x8_t __ret_66; \ + __ret_66 = vsetq_lane_u16(vgetq_lane_u16(__s2_66, __p3_66), __s0_66, __p1_66); \ __ret_66; \ }) #else -#define vcopy_laneq_p8(__p0_67, __p1_67, __p2_67, __p3_67) __extension__ ({ \ - poly8x8_t __s0_67 = __p0_67; \ - poly8x16_t __s2_67 = __p2_67; \ - poly8x8_t __rev0_67; __rev0_67 = __builtin_shufflevector(__s0_67, __s0_67, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly8x16_t __rev2_67; __rev2_67 = __builtin_shufflevector(__s2_67, __s2_67, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly8x8_t __ret_67; \ - __ret_67 = __noswap_vset_lane_p8(__noswap_vgetq_lane_p8(__rev2_67, __p3_67), __rev0_67, __p1_67); \ +#define vcopyq_laneq_u16(__p0_67, __p1_67, __p2_67, __p3_67) __extension__ ({ \ + uint16x8_t __s0_67 = __p0_67; \ + uint16x8_t __s2_67 = __p2_67; \ + uint16x8_t __rev0_67; __rev0_67 = __builtin_shufflevector(__s0_67, __s0_67, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x8_t __rev2_67; __rev2_67 = __builtin_shufflevector(__s2_67, __s2_67, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x8_t __ret_67; \ + __ret_67 = __noswap_vsetq_lane_u16(__noswap_vgetq_lane_u16(__rev2_67, __p3_67), __rev0_67, __p1_67); \ __ret_67 = __builtin_shufflevector(__ret_67, __ret_67, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_67; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_p16(__p0_68, __p1_68, __p2_68, __p3_68) __extension__ ({ \ - poly16x4_t __s0_68 = __p0_68; \ - poly16x8_t __s2_68 = __p2_68; \ - poly16x4_t __ret_68; \ - __ret_68 = vset_lane_p16(vgetq_lane_p16(__s2_68, __p3_68), __s0_68, __p1_68); \ +#define vcopyq_laneq_s8(__p0_68, __p1_68, __p2_68, __p3_68) __extension__ ({ \ + int8x16_t __s0_68 = __p0_68; \ + int8x16_t __s2_68 = __p2_68; \ + int8x16_t __ret_68; \ + __ret_68 = vsetq_lane_s8(vgetq_lane_s8(__s2_68, __p3_68), __s0_68, __p1_68); \ __ret_68; \ }) #else -#define vcopy_laneq_p16(__p0_69, __p1_69, __p2_69, __p3_69) __extension__ ({ \ - poly16x4_t __s0_69 = __p0_69; \ - poly16x8_t __s2_69 = __p2_69; \ - poly16x4_t __rev0_69; __rev0_69 = __builtin_shufflevector(__s0_69, __s0_69, 3, 2, 1, 0); \ - poly16x8_t __rev2_69; __rev2_69 = __builtin_shufflevector(__s2_69, __s2_69, 7, 6, 5, 4, 3, 2, 1, 0); \ - poly16x4_t __ret_69; \ - __ret_69 = __noswap_vset_lane_p16(__noswap_vgetq_lane_p16(__rev2_69, __p3_69), __rev0_69, __p1_69); \ - __ret_69 = __builtin_shufflevector(__ret_69, __ret_69, 3, 2, 1, 0); \ +#define vcopyq_laneq_s8(__p0_69, __p1_69, __p2_69, __p3_69) __extension__ ({ \ + int8x16_t __s0_69 = __p0_69; \ + int8x16_t __s2_69 = __p2_69; \ + int8x16_t __rev0_69; __rev0_69 = __builtin_shufflevector(__s0_69, __s0_69, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x16_t __rev2_69; __rev2_69 = __builtin_shufflevector(__s2_69, __s2_69, 15, 14, 13, 12, 
11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x16_t __ret_69; \ + __ret_69 = __noswap_vsetq_lane_s8(__noswap_vgetq_lane_s8(__rev2_69, __p3_69), __rev0_69, __p1_69); \ + __ret_69 = __builtin_shufflevector(__ret_69, __ret_69, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_69; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_u8(__p0_70, __p1_70, __p2_70, __p3_70) __extension__ ({ \ - uint8x8_t __s0_70 = __p0_70; \ - uint8x16_t __s2_70 = __p2_70; \ - uint8x8_t __ret_70; \ - __ret_70 = vset_lane_u8(vgetq_lane_u8(__s2_70, __p3_70), __s0_70, __p1_70); \ +#define vcopyq_laneq_f32(__p0_70, __p1_70, __p2_70, __p3_70) __extension__ ({ \ + float32x4_t __s0_70 = __p0_70; \ + float32x4_t __s2_70 = __p2_70; \ + float32x4_t __ret_70; \ + __ret_70 = vsetq_lane_f32(vgetq_lane_f32(__s2_70, __p3_70), __s0_70, __p1_70); \ __ret_70; \ }) #else -#define vcopy_laneq_u8(__p0_71, __p1_71, __p2_71, __p3_71) __extension__ ({ \ - uint8x8_t __s0_71 = __p0_71; \ - uint8x16_t __s2_71 = __p2_71; \ - uint8x8_t __rev0_71; __rev0_71 = __builtin_shufflevector(__s0_71, __s0_71, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x16_t __rev2_71; __rev2_71 = __builtin_shufflevector(__s2_71, __s2_71, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x8_t __ret_71; \ - __ret_71 = __noswap_vset_lane_u8(__noswap_vgetq_lane_u8(__rev2_71, __p3_71), __rev0_71, __p1_71); \ - __ret_71 = __builtin_shufflevector(__ret_71, __ret_71, 7, 6, 5, 4, 3, 2, 1, 0); \ +#define vcopyq_laneq_f32(__p0_71, __p1_71, __p2_71, __p3_71) __extension__ ({ \ + float32x4_t __s0_71 = __p0_71; \ + float32x4_t __s2_71 = __p2_71; \ + float32x4_t __rev0_71; __rev0_71 = __builtin_shufflevector(__s0_71, __s0_71, 3, 2, 1, 0); \ + float32x4_t __rev2_71; __rev2_71 = __builtin_shufflevector(__s2_71, __s2_71, 3, 2, 1, 0); \ + float32x4_t __ret_71; \ + __ret_71 = __noswap_vsetq_lane_f32(__noswap_vgetq_lane_f32(__rev2_71, __p3_71), __rev0_71, __p1_71); \ + __ret_71 = __builtin_shufflevector(__ret_71, __ret_71, 3, 2, 1, 0); \ __ret_71; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_u32(__p0_72, __p1_72, __p2_72, __p3_72) __extension__ ({ \ - uint32x2_t __s0_72 = __p0_72; \ - uint32x4_t __s2_72 = __p2_72; \ - uint32x2_t __ret_72; \ - __ret_72 = vset_lane_u32(vgetq_lane_u32(__s2_72, __p3_72), __s0_72, __p1_72); \ +#define vcopyq_laneq_s32(__p0_72, __p1_72, __p2_72, __p3_72) __extension__ ({ \ + int32x4_t __s0_72 = __p0_72; \ + int32x4_t __s2_72 = __p2_72; \ + int32x4_t __ret_72; \ + __ret_72 = vsetq_lane_s32(vgetq_lane_s32(__s2_72, __p3_72), __s0_72, __p1_72); \ __ret_72; \ }) #else -#define vcopy_laneq_u32(__p0_73, __p1_73, __p2_73, __p3_73) __extension__ ({ \ - uint32x2_t __s0_73 = __p0_73; \ - uint32x4_t __s2_73 = __p2_73; \ - uint32x2_t __rev0_73; __rev0_73 = __builtin_shufflevector(__s0_73, __s0_73, 1, 0); \ - uint32x4_t __rev2_73; __rev2_73 = __builtin_shufflevector(__s2_73, __s2_73, 3, 2, 1, 0); \ - uint32x2_t __ret_73; \ - __ret_73 = __noswap_vset_lane_u32(__noswap_vgetq_lane_u32(__rev2_73, __p3_73), __rev0_73, __p1_73); \ - __ret_73 = __builtin_shufflevector(__ret_73, __ret_73, 1, 0); \ +#define vcopyq_laneq_s32(__p0_73, __p1_73, __p2_73, __p3_73) __extension__ ({ \ + int32x4_t __s0_73 = __p0_73; \ + int32x4_t __s2_73 = __p2_73; \ + int32x4_t __rev0_73; __rev0_73 = __builtin_shufflevector(__s0_73, __s0_73, 3, 2, 1, 0); \ + int32x4_t __rev2_73; __rev2_73 = __builtin_shufflevector(__s2_73, __s2_73, 3, 2, 1, 0); \ + int32x4_t __ret_73; \ + __ret_73 = __noswap_vsetq_lane_s32(__noswap_vgetq_lane_s32(__rev2_73, __p3_73), __rev0_73, __p1_73); 
\ + __ret_73 = __builtin_shufflevector(__ret_73, __ret_73, 3, 2, 1, 0); \ __ret_73; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_u64(__p0_74, __p1_74, __p2_74, __p3_74) __extension__ ({ \ - uint64x1_t __s0_74 = __p0_74; \ - uint64x2_t __s2_74 = __p2_74; \ - uint64x1_t __ret_74; \ - __ret_74 = vset_lane_u64(vgetq_lane_u64(__s2_74, __p3_74), __s0_74, __p1_74); \ +#define vcopyq_laneq_s64(__p0_74, __p1_74, __p2_74, __p3_74) __extension__ ({ \ + int64x2_t __s0_74 = __p0_74; \ + int64x2_t __s2_74 = __p2_74; \ + int64x2_t __ret_74; \ + __ret_74 = vsetq_lane_s64(vgetq_lane_s64(__s2_74, __p3_74), __s0_74, __p1_74); \ __ret_74; \ }) #else -#define vcopy_laneq_u64(__p0_75, __p1_75, __p2_75, __p3_75) __extension__ ({ \ - uint64x1_t __s0_75 = __p0_75; \ - uint64x2_t __s2_75 = __p2_75; \ - uint64x2_t __rev2_75; __rev2_75 = __builtin_shufflevector(__s2_75, __s2_75, 1, 0); \ - uint64x1_t __ret_75; \ - __ret_75 = __noswap_vset_lane_u64(__noswap_vgetq_lane_u64(__rev2_75, __p3_75), __s0_75, __p1_75); \ +#define vcopyq_laneq_s64(__p0_75, __p1_75, __p2_75, __p3_75) __extension__ ({ \ + int64x2_t __s0_75 = __p0_75; \ + int64x2_t __s2_75 = __p2_75; \ + int64x2_t __rev0_75; __rev0_75 = __builtin_shufflevector(__s0_75, __s0_75, 1, 0); \ + int64x2_t __rev2_75; __rev2_75 = __builtin_shufflevector(__s2_75, __s2_75, 1, 0); \ + int64x2_t __ret_75; \ + __ret_75 = __noswap_vsetq_lane_s64(__noswap_vgetq_lane_s64(__rev2_75, __p3_75), __rev0_75, __p1_75); \ + __ret_75 = __builtin_shufflevector(__ret_75, __ret_75, 1, 0); \ __ret_75; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_u16(__p0_76, __p1_76, __p2_76, __p3_76) __extension__ ({ \ - uint16x4_t __s0_76 = __p0_76; \ - uint16x8_t __s2_76 = __p2_76; \ - uint16x4_t __ret_76; \ - __ret_76 = vset_lane_u16(vgetq_lane_u16(__s2_76, __p3_76), __s0_76, __p1_76); \ +#define vcopyq_laneq_s16(__p0_76, __p1_76, __p2_76, __p3_76) __extension__ ({ \ + int16x8_t __s0_76 = __p0_76; \ + int16x8_t __s2_76 = __p2_76; \ + int16x8_t __ret_76; \ + __ret_76 = vsetq_lane_s16(vgetq_lane_s16(__s2_76, __p3_76), __s0_76, __p1_76); \ __ret_76; \ }) #else -#define vcopy_laneq_u16(__p0_77, __p1_77, __p2_77, __p3_77) __extension__ ({ \ - uint16x4_t __s0_77 = __p0_77; \ - uint16x8_t __s2_77 = __p2_77; \ - uint16x4_t __rev0_77; __rev0_77 = __builtin_shufflevector(__s0_77, __s0_77, 3, 2, 1, 0); \ - uint16x8_t __rev2_77; __rev2_77 = __builtin_shufflevector(__s2_77, __s2_77, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x4_t __ret_77; \ - __ret_77 = __noswap_vset_lane_u16(__noswap_vgetq_lane_u16(__rev2_77, __p3_77), __rev0_77, __p1_77); \ - __ret_77 = __builtin_shufflevector(__ret_77, __ret_77, 3, 2, 1, 0); \ +#define vcopyq_laneq_s16(__p0_77, __p1_77, __p2_77, __p3_77) __extension__ ({ \ + int16x8_t __s0_77 = __p0_77; \ + int16x8_t __s2_77 = __p2_77; \ + int16x8_t __rev0_77; __rev0_77 = __builtin_shufflevector(__s0_77, __s0_77, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __rev2_77; __rev2_77 = __builtin_shufflevector(__s2_77, __s2_77, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __ret_77; \ + __ret_77 = __noswap_vsetq_lane_s16(__noswap_vgetq_lane_s16(__rev2_77, __p3_77), __rev0_77, __p1_77); \ + __ret_77 = __builtin_shufflevector(__ret_77, __ret_77, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_77; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_s8(__p0_78, __p1_78, __p2_78, __p3_78) __extension__ ({ \ - int8x8_t __s0_78 = __p0_78; \ - int8x16_t __s2_78 = __p2_78; \ - int8x8_t __ret_78; \ - __ret_78 = vset_lane_s8(vgetq_lane_s8(__s2_78, __p3_78), __s0_78, __p1_78); \ +#define vcopy_laneq_p8(__p0_78, __p1_78, 
__p2_78, __p3_78) __extension__ ({ \ + poly8x8_t __s0_78 = __p0_78; \ + poly8x16_t __s2_78 = __p2_78; \ + poly8x8_t __ret_78; \ + __ret_78 = vset_lane_p8(vgetq_lane_p8(__s2_78, __p3_78), __s0_78, __p1_78); \ __ret_78; \ }) #else -#define vcopy_laneq_s8(__p0_79, __p1_79, __p2_79, __p3_79) __extension__ ({ \ - int8x8_t __s0_79 = __p0_79; \ - int8x16_t __s2_79 = __p2_79; \ - int8x8_t __rev0_79; __rev0_79 = __builtin_shufflevector(__s0_79, __s0_79, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x16_t __rev2_79; __rev2_79 = __builtin_shufflevector(__s2_79, __s2_79, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x8_t __ret_79; \ - __ret_79 = __noswap_vset_lane_s8(__noswap_vgetq_lane_s8(__rev2_79, __p3_79), __rev0_79, __p1_79); \ +#define vcopy_laneq_p8(__p0_79, __p1_79, __p2_79, __p3_79) __extension__ ({ \ + poly8x8_t __s0_79 = __p0_79; \ + poly8x16_t __s2_79 = __p2_79; \ + poly8x8_t __rev0_79; __rev0_79 = __builtin_shufflevector(__s0_79, __s0_79, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly8x16_t __rev2_79; __rev2_79 = __builtin_shufflevector(__s2_79, __s2_79, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly8x8_t __ret_79; \ + __ret_79 = __noswap_vset_lane_p8(__noswap_vgetq_lane_p8(__rev2_79, __p3_79), __rev0_79, __p1_79); \ __ret_79 = __builtin_shufflevector(__ret_79, __ret_79, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_79; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_f32(__p0_80, __p1_80, __p2_80, __p3_80) __extension__ ({ \ - float32x2_t __s0_80 = __p0_80; \ - float32x4_t __s2_80 = __p2_80; \ - float32x2_t __ret_80; \ - __ret_80 = vset_lane_f32(vgetq_lane_f32(__s2_80, __p3_80), __s0_80, __p1_80); \ +#define vcopy_laneq_p16(__p0_80, __p1_80, __p2_80, __p3_80) __extension__ ({ \ + poly16x4_t __s0_80 = __p0_80; \ + poly16x8_t __s2_80 = __p2_80; \ + poly16x4_t __ret_80; \ + __ret_80 = vset_lane_p16(vgetq_lane_p16(__s2_80, __p3_80), __s0_80, __p1_80); \ __ret_80; \ }) #else -#define vcopy_laneq_f32(__p0_81, __p1_81, __p2_81, __p3_81) __extension__ ({ \ - float32x2_t __s0_81 = __p0_81; \ - float32x4_t __s2_81 = __p2_81; \ - float32x2_t __rev0_81; __rev0_81 = __builtin_shufflevector(__s0_81, __s0_81, 1, 0); \ - float32x4_t __rev2_81; __rev2_81 = __builtin_shufflevector(__s2_81, __s2_81, 3, 2, 1, 0); \ - float32x2_t __ret_81; \ - __ret_81 = __noswap_vset_lane_f32(__noswap_vgetq_lane_f32(__rev2_81, __p3_81), __rev0_81, __p1_81); \ - __ret_81 = __builtin_shufflevector(__ret_81, __ret_81, 1, 0); \ +#define vcopy_laneq_p16(__p0_81, __p1_81, __p2_81, __p3_81) __extension__ ({ \ + poly16x4_t __s0_81 = __p0_81; \ + poly16x8_t __s2_81 = __p2_81; \ + poly16x4_t __rev0_81; __rev0_81 = __builtin_shufflevector(__s0_81, __s0_81, 3, 2, 1, 0); \ + poly16x8_t __rev2_81; __rev2_81 = __builtin_shufflevector(__s2_81, __s2_81, 7, 6, 5, 4, 3, 2, 1, 0); \ + poly16x4_t __ret_81; \ + __ret_81 = __noswap_vset_lane_p16(__noswap_vgetq_lane_p16(__rev2_81, __p3_81), __rev0_81, __p1_81); \ + __ret_81 = __builtin_shufflevector(__ret_81, __ret_81, 3, 2, 1, 0); \ __ret_81; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_s32(__p0_82, __p1_82, __p2_82, __p3_82) __extension__ ({ \ - int32x2_t __s0_82 = __p0_82; \ - int32x4_t __s2_82 = __p2_82; \ - int32x2_t __ret_82; \ - __ret_82 = vset_lane_s32(vgetq_lane_s32(__s2_82, __p3_82), __s0_82, __p1_82); \ +#define vcopy_laneq_u8(__p0_82, __p1_82, __p2_82, __p3_82) __extension__ ({ \ + uint8x8_t __s0_82 = __p0_82; \ + uint8x16_t __s2_82 = __p2_82; \ + uint8x8_t __ret_82; \ + __ret_82 = vset_lane_u8(vgetq_lane_u8(__s2_82, __p3_82), __s0_82, __p1_82); \ __ret_82; \ }) #else 
-#define vcopy_laneq_s32(__p0_83, __p1_83, __p2_83, __p3_83) __extension__ ({ \ - int32x2_t __s0_83 = __p0_83; \ - int32x4_t __s2_83 = __p2_83; \ - int32x2_t __rev0_83; __rev0_83 = __builtin_shufflevector(__s0_83, __s0_83, 1, 0); \ - int32x4_t __rev2_83; __rev2_83 = __builtin_shufflevector(__s2_83, __s2_83, 3, 2, 1, 0); \ - int32x2_t __ret_83; \ - __ret_83 = __noswap_vset_lane_s32(__noswap_vgetq_lane_s32(__rev2_83, __p3_83), __rev0_83, __p1_83); \ - __ret_83 = __builtin_shufflevector(__ret_83, __ret_83, 1, 0); \ +#define vcopy_laneq_u8(__p0_83, __p1_83, __p2_83, __p3_83) __extension__ ({ \ + uint8x8_t __s0_83 = __p0_83; \ + uint8x16_t __s2_83 = __p2_83; \ + uint8x8_t __rev0_83; __rev0_83 = __builtin_shufflevector(__s0_83, __s0_83, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __rev2_83; __rev2_83 = __builtin_shufflevector(__s2_83, __s2_83, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __ret_83; \ + __ret_83 = __noswap_vset_lane_u8(__noswap_vgetq_lane_u8(__rev2_83, __p3_83), __rev0_83, __p1_83); \ + __ret_83 = __builtin_shufflevector(__ret_83, __ret_83, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret_83; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_s64(__p0_84, __p1_84, __p2_84, __p3_84) __extension__ ({ \ - int64x1_t __s0_84 = __p0_84; \ - int64x2_t __s2_84 = __p2_84; \ - int64x1_t __ret_84; \ - __ret_84 = vset_lane_s64(vgetq_lane_s64(__s2_84, __p3_84), __s0_84, __p1_84); \ +#define vcopy_laneq_u32(__p0_84, __p1_84, __p2_84, __p3_84) __extension__ ({ \ + uint32x2_t __s0_84 = __p0_84; \ + uint32x4_t __s2_84 = __p2_84; \ + uint32x2_t __ret_84; \ + __ret_84 = vset_lane_u32(vgetq_lane_u32(__s2_84, __p3_84), __s0_84, __p1_84); \ __ret_84; \ }) #else -#define vcopy_laneq_s64(__p0_85, __p1_85, __p2_85, __p3_85) __extension__ ({ \ - int64x1_t __s0_85 = __p0_85; \ - int64x2_t __s2_85 = __p2_85; \ - int64x2_t __rev2_85; __rev2_85 = __builtin_shufflevector(__s2_85, __s2_85, 1, 0); \ - int64x1_t __ret_85; \ - __ret_85 = __noswap_vset_lane_s64(__noswap_vgetq_lane_s64(__rev2_85, __p3_85), __s0_85, __p1_85); \ +#define vcopy_laneq_u32(__p0_85, __p1_85, __p2_85, __p3_85) __extension__ ({ \ + uint32x2_t __s0_85 = __p0_85; \ + uint32x4_t __s2_85 = __p2_85; \ + uint32x2_t __rev0_85; __rev0_85 = __builtin_shufflevector(__s0_85, __s0_85, 1, 0); \ + uint32x4_t __rev2_85; __rev2_85 = __builtin_shufflevector(__s2_85, __s2_85, 3, 2, 1, 0); \ + uint32x2_t __ret_85; \ + __ret_85 = __noswap_vset_lane_u32(__noswap_vgetq_lane_u32(__rev2_85, __p3_85), __rev0_85, __p1_85); \ + __ret_85 = __builtin_shufflevector(__ret_85, __ret_85, 1, 0); \ __ret_85; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_s16(__p0_86, __p1_86, __p2_86, __p3_86) __extension__ ({ \ - int16x4_t __s0_86 = __p0_86; \ - int16x8_t __s2_86 = __p2_86; \ - int16x4_t __ret_86; \ - __ret_86 = vset_lane_s16(vgetq_lane_s16(__s2_86, __p3_86), __s0_86, __p1_86); \ +#define vcopy_laneq_u64(__p0_86, __p1_86, __p2_86, __p3_86) __extension__ ({ \ + uint64x1_t __s0_86 = __p0_86; \ + uint64x2_t __s2_86 = __p2_86; \ + uint64x1_t __ret_86; \ + __ret_86 = vset_lane_u64(vgetq_lane_u64(__s2_86, __p3_86), __s0_86, __p1_86); \ __ret_86; \ }) #else -#define vcopy_laneq_s16(__p0_87, __p1_87, __p2_87, __p3_87) __extension__ ({ \ - int16x4_t __s0_87 = __p0_87; \ - int16x8_t __s2_87 = __p2_87; \ - int16x4_t __rev0_87; __rev0_87 = __builtin_shufflevector(__s0_87, __s0_87, 3, 2, 1, 0); \ - int16x8_t __rev2_87; __rev2_87 = __builtin_shufflevector(__s2_87, __s2_87, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x4_t __ret_87; \ - __ret_87 = 
__noswap_vset_lane_s16(__noswap_vgetq_lane_s16(__rev2_87, __p3_87), __rev0_87, __p1_87); \ - __ret_87 = __builtin_shufflevector(__ret_87, __ret_87, 3, 2, 1, 0); \ +#define vcopy_laneq_u64(__p0_87, __p1_87, __p2_87, __p3_87) __extension__ ({ \ + uint64x1_t __s0_87 = __p0_87; \ + uint64x2_t __s2_87 = __p2_87; \ + uint64x2_t __rev2_87; __rev2_87 = __builtin_shufflevector(__s2_87, __s2_87, 1, 0); \ + uint64x1_t __ret_87; \ + __ret_87 = __noswap_vset_lane_u64(__noswap_vgetq_lane_u64(__rev2_87, __p3_87), __s0_87, __p1_87); \ __ret_87; \ }) #endif #ifdef __LITTLE_ENDIAN__ +#define vcopy_laneq_u16(__p0_88, __p1_88, __p2_88, __p3_88) __extension__ ({ \ + uint16x4_t __s0_88 = __p0_88; \ + uint16x8_t __s2_88 = __p2_88; \ + uint16x4_t __ret_88; \ + __ret_88 = vset_lane_u16(vgetq_lane_u16(__s2_88, __p3_88), __s0_88, __p1_88); \ + __ret_88; \ +}) +#else +#define vcopy_laneq_u16(__p0_89, __p1_89, __p2_89, __p3_89) __extension__ ({ \ + uint16x4_t __s0_89 = __p0_89; \ + uint16x8_t __s2_89 = __p2_89; \ + uint16x4_t __rev0_89; __rev0_89 = __builtin_shufflevector(__s0_89, __s0_89, 3, 2, 1, 0); \ + uint16x8_t __rev2_89; __rev2_89 = __builtin_shufflevector(__s2_89, __s2_89, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x4_t __ret_89; \ + __ret_89 = __noswap_vset_lane_u16(__noswap_vgetq_lane_u16(__rev2_89, __p3_89), __rev0_89, __p1_89); \ + __ret_89 = __builtin_shufflevector(__ret_89, __ret_89, 3, 2, 1, 0); \ + __ret_89; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcopy_laneq_s8(__p0_90, __p1_90, __p2_90, __p3_90) __extension__ ({ \ + int8x8_t __s0_90 = __p0_90; \ + int8x16_t __s2_90 = __p2_90; \ + int8x8_t __ret_90; \ + __ret_90 = vset_lane_s8(vgetq_lane_s8(__s2_90, __p3_90), __s0_90, __p1_90); \ + __ret_90; \ +}) +#else +#define vcopy_laneq_s8(__p0_91, __p1_91, __p2_91, __p3_91) __extension__ ({ \ + int8x8_t __s0_91 = __p0_91; \ + int8x16_t __s2_91 = __p2_91; \ + int8x8_t __rev0_91; __rev0_91 = __builtin_shufflevector(__s0_91, __s0_91, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x16_t __rev2_91; __rev2_91 = __builtin_shufflevector(__s2_91, __s2_91, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x8_t __ret_91; \ + __ret_91 = __noswap_vset_lane_s8(__noswap_vgetq_lane_s8(__rev2_91, __p3_91), __rev0_91, __p1_91); \ + __ret_91 = __builtin_shufflevector(__ret_91, __ret_91, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_91; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcopy_laneq_f32(__p0_92, __p1_92, __p2_92, __p3_92) __extension__ ({ \ + float32x2_t __s0_92 = __p0_92; \ + float32x4_t __s2_92 = __p2_92; \ + float32x2_t __ret_92; \ + __ret_92 = vset_lane_f32(vgetq_lane_f32(__s2_92, __p3_92), __s0_92, __p1_92); \ + __ret_92; \ +}) +#else +#define vcopy_laneq_f32(__p0_93, __p1_93, __p2_93, __p3_93) __extension__ ({ \ + float32x2_t __s0_93 = __p0_93; \ + float32x4_t __s2_93 = __p2_93; \ + float32x2_t __rev0_93; __rev0_93 = __builtin_shufflevector(__s0_93, __s0_93, 1, 0); \ + float32x4_t __rev2_93; __rev2_93 = __builtin_shufflevector(__s2_93, __s2_93, 3, 2, 1, 0); \ + float32x2_t __ret_93; \ + __ret_93 = __noswap_vset_lane_f32(__noswap_vgetq_lane_f32(__rev2_93, __p3_93), __rev0_93, __p1_93); \ + __ret_93 = __builtin_shufflevector(__ret_93, __ret_93, 1, 0); \ + __ret_93; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcopy_laneq_s32(__p0_94, __p1_94, __p2_94, __p3_94) __extension__ ({ \ + int32x2_t __s0_94 = __p0_94; \ + int32x4_t __s2_94 = __p2_94; \ + int32x2_t __ret_94; \ + __ret_94 = vset_lane_s32(vgetq_lane_s32(__s2_94, __p3_94), __s0_94, __p1_94); \ + __ret_94; \ +}) +#else +#define vcopy_laneq_s32(__p0_95, __p1_95, 
__p2_95, __p3_95) __extension__ ({ \ + int32x2_t __s0_95 = __p0_95; \ + int32x4_t __s2_95 = __p2_95; \ + int32x2_t __rev0_95; __rev0_95 = __builtin_shufflevector(__s0_95, __s0_95, 1, 0); \ + int32x4_t __rev2_95; __rev2_95 = __builtin_shufflevector(__s2_95, __s2_95, 3, 2, 1, 0); \ + int32x2_t __ret_95; \ + __ret_95 = __noswap_vset_lane_s32(__noswap_vgetq_lane_s32(__rev2_95, __p3_95), __rev0_95, __p1_95); \ + __ret_95 = __builtin_shufflevector(__ret_95, __ret_95, 1, 0); \ + __ret_95; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcopy_laneq_s64(__p0_96, __p1_96, __p2_96, __p3_96) __extension__ ({ \ + int64x1_t __s0_96 = __p0_96; \ + int64x2_t __s2_96 = __p2_96; \ + int64x1_t __ret_96; \ + __ret_96 = vset_lane_s64(vgetq_lane_s64(__s2_96, __p3_96), __s0_96, __p1_96); \ + __ret_96; \ +}) +#else +#define vcopy_laneq_s64(__p0_97, __p1_97, __p2_97, __p3_97) __extension__ ({ \ + int64x1_t __s0_97 = __p0_97; \ + int64x2_t __s2_97 = __p2_97; \ + int64x2_t __rev2_97; __rev2_97 = __builtin_shufflevector(__s2_97, __s2_97, 1, 0); \ + int64x1_t __ret_97; \ + __ret_97 = __noswap_vset_lane_s64(__noswap_vgetq_lane_s64(__rev2_97, __p3_97), __s0_97, __p1_97); \ + __ret_97; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcopy_laneq_s16(__p0_98, __p1_98, __p2_98, __p3_98) __extension__ ({ \ + int16x4_t __s0_98 = __p0_98; \ + int16x8_t __s2_98 = __p2_98; \ + int16x4_t __ret_98; \ + __ret_98 = vset_lane_s16(vgetq_lane_s16(__s2_98, __p3_98), __s0_98, __p1_98); \ + __ret_98; \ +}) +#else +#define vcopy_laneq_s16(__p0_99, __p1_99, __p2_99, __p3_99) __extension__ ({ \ + int16x4_t __s0_99 = __p0_99; \ + int16x8_t __s2_99 = __p2_99; \ + int16x4_t __rev0_99; __rev0_99 = __builtin_shufflevector(__s0_99, __s0_99, 3, 2, 1, 0); \ + int16x8_t __rev2_99; __rev2_99 = __builtin_shufflevector(__s2_99, __s2_99, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x4_t __ret_99; \ + __ret_99 = __noswap_vset_lane_s16(__noswap_vgetq_lane_s16(__rev2_99, __p3_99), __rev0_99, __p1_99); \ + __ret_99 = __builtin_shufflevector(__ret_99, __ret_99, 3, 2, 1, 0); \ + __ret_99; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ __ai poly64x1_t vcreate_p64(uint64_t __p0) { poly64x1_t __ret; __ret = (poly64x1_t)(__p0); @@ -47736,273 +51048,273 @@ __ai float64x1_t vfms_f64(float64x1_t __p0, float64x1_t __p1, float64x1_t __p2) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmsd_lane_f64(__p0_88, __p1_88, __p2_88, __p3_88) __extension__ ({ \ - float64_t __s0_88 = __p0_88; \ - float64_t __s1_88 = __p1_88; \ - float64x1_t __s2_88 = __p2_88; \ - float64_t __ret_88; \ - __ret_88 = vfmad_lane_f64(__s0_88, -__s1_88, __s2_88, __p3_88); \ - __ret_88; \ -}) -#else -#define vfmsd_lane_f64(__p0_89, __p1_89, __p2_89, __p3_89) __extension__ ({ \ - float64_t __s0_89 = __p0_89; \ - float64_t __s1_89 = __p1_89; \ - float64x1_t __s2_89 = __p2_89; \ - float64_t __ret_89; \ - __ret_89 = __noswap_vfmad_lane_f64(__s0_89, -__s1_89, __s2_89, __p3_89); \ - __ret_89; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vfmss_lane_f32(__p0_90, __p1_90, __p2_90, __p3_90) __extension__ ({ \ - float32_t __s0_90 = __p0_90; \ - float32_t __s1_90 = __p1_90; \ - float32x2_t __s2_90 = __p2_90; \ - float32_t __ret_90; \ - __ret_90 = vfmas_lane_f32(__s0_90, -__s1_90, __s2_90, __p3_90); \ - __ret_90; \ -}) -#else -#define vfmss_lane_f32(__p0_91, __p1_91, __p2_91, __p3_91) __extension__ ({ \ - float32_t __s0_91 = __p0_91; \ - float32_t __s1_91 = __p1_91; \ - float32x2_t __s2_91 = __p2_91; \ - float32x2_t __rev2_91; __rev2_91 = __builtin_shufflevector(__s2_91, __s2_91, 1, 0); \ - float32_t __ret_91; \ - __ret_91 
= __noswap_vfmas_lane_f32(__s0_91, -__s1_91, __rev2_91, __p3_91); \ - __ret_91; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vfmsq_lane_f64(__p0_92, __p1_92, __p2_92, __p3_92) __extension__ ({ \ - float64x2_t __s0_92 = __p0_92; \ - float64x2_t __s1_92 = __p1_92; \ - float64x1_t __s2_92 = __p2_92; \ - float64x2_t __ret_92; \ - __ret_92 = vfmaq_lane_f64(__s0_92, -__s1_92, __s2_92, __p3_92); \ - __ret_92; \ -}) -#else -#define vfmsq_lane_f64(__p0_93, __p1_93, __p2_93, __p3_93) __extension__ ({ \ - float64x2_t __s0_93 = __p0_93; \ - float64x2_t __s1_93 = __p1_93; \ - float64x1_t __s2_93 = __p2_93; \ - float64x2_t __rev0_93; __rev0_93 = __builtin_shufflevector(__s0_93, __s0_93, 1, 0); \ - float64x2_t __rev1_93; __rev1_93 = __builtin_shufflevector(__s1_93, __s1_93, 1, 0); \ - float64x2_t __ret_93; \ - __ret_93 = __noswap_vfmaq_lane_f64(__rev0_93, -__rev1_93, __s2_93, __p3_93); \ - __ret_93 = __builtin_shufflevector(__ret_93, __ret_93, 1, 0); \ - __ret_93; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vfmsq_lane_f32(__p0_94, __p1_94, __p2_94, __p3_94) __extension__ ({ \ - float32x4_t __s0_94 = __p0_94; \ - float32x4_t __s1_94 = __p1_94; \ - float32x2_t __s2_94 = __p2_94; \ - float32x4_t __ret_94; \ - __ret_94 = vfmaq_lane_f32(__s0_94, -__s1_94, __s2_94, __p3_94); \ - __ret_94; \ -}) -#else -#define vfmsq_lane_f32(__p0_95, __p1_95, __p2_95, __p3_95) __extension__ ({ \ - float32x4_t __s0_95 = __p0_95; \ - float32x4_t __s1_95 = __p1_95; \ - float32x2_t __s2_95 = __p2_95; \ - float32x4_t __rev0_95; __rev0_95 = __builtin_shufflevector(__s0_95, __s0_95, 3, 2, 1, 0); \ - float32x4_t __rev1_95; __rev1_95 = __builtin_shufflevector(__s1_95, __s1_95, 3, 2, 1, 0); \ - float32x2_t __rev2_95; __rev2_95 = __builtin_shufflevector(__s2_95, __s2_95, 1, 0); \ - float32x4_t __ret_95; \ - __ret_95 = __noswap_vfmaq_lane_f32(__rev0_95, -__rev1_95, __rev2_95, __p3_95); \ - __ret_95 = __builtin_shufflevector(__ret_95, __ret_95, 3, 2, 1, 0); \ - __ret_95; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vfms_lane_f64(__p0_96, __p1_96, __p2_96, __p3_96) __extension__ ({ \ - float64x1_t __s0_96 = __p0_96; \ - float64x1_t __s1_96 = __p1_96; \ - float64x1_t __s2_96 = __p2_96; \ - float64x1_t __ret_96; \ - __ret_96 = vfma_lane_f64(__s0_96, -__s1_96, __s2_96, __p3_96); \ - __ret_96; \ -}) -#else -#define vfms_lane_f64(__p0_97, __p1_97, __p2_97, __p3_97) __extension__ ({ \ - float64x1_t __s0_97 = __p0_97; \ - float64x1_t __s1_97 = __p1_97; \ - float64x1_t __s2_97 = __p2_97; \ - float64x1_t __ret_97; \ - __ret_97 = __noswap_vfma_lane_f64(__s0_97, -__s1_97, __s2_97, __p3_97); \ - __ret_97; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vfms_lane_f32(__p0_98, __p1_98, __p2_98, __p3_98) __extension__ ({ \ - float32x2_t __s0_98 = __p0_98; \ - float32x2_t __s1_98 = __p1_98; \ - float32x2_t __s2_98 = __p2_98; \ - float32x2_t __ret_98; \ - __ret_98 = vfma_lane_f32(__s0_98, -__s1_98, __s2_98, __p3_98); \ - __ret_98; \ -}) -#else -#define vfms_lane_f32(__p0_99, __p1_99, __p2_99, __p3_99) __extension__ ({ \ - float32x2_t __s0_99 = __p0_99; \ - float32x2_t __s1_99 = __p1_99; \ - float32x2_t __s2_99 = __p2_99; \ - float32x2_t __rev0_99; __rev0_99 = __builtin_shufflevector(__s0_99, __s0_99, 1, 0); \ - float32x2_t __rev1_99; __rev1_99 = __builtin_shufflevector(__s1_99, __s1_99, 1, 0); \ - float32x2_t __rev2_99; __rev2_99 = __builtin_shufflevector(__s2_99, __s2_99, 1, 0); \ - float32x2_t __ret_99; \ - __ret_99 = __noswap_vfma_lane_f32(__rev0_99, -__rev1_99, __rev2_99, __p3_99); \ - __ret_99 = __builtin_shufflevector(__ret_99, 
__ret_99, 1, 0); \ - __ret_99; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vfmsd_laneq_f64(__p0_100, __p1_100, __p2_100, __p3_100) __extension__ ({ \ +#define vfmsd_lane_f64(__p0_100, __p1_100, __p2_100, __p3_100) __extension__ ({ \ float64_t __s0_100 = __p0_100; \ float64_t __s1_100 = __p1_100; \ - float64x2_t __s2_100 = __p2_100; \ + float64x1_t __s2_100 = __p2_100; \ float64_t __ret_100; \ - __ret_100 = vfmad_laneq_f64(__s0_100, -__s1_100, __s2_100, __p3_100); \ + __ret_100 = vfmad_lane_f64(__s0_100, -__s1_100, __s2_100, __p3_100); \ __ret_100; \ }) #else -#define vfmsd_laneq_f64(__p0_101, __p1_101, __p2_101, __p3_101) __extension__ ({ \ +#define vfmsd_lane_f64(__p0_101, __p1_101, __p2_101, __p3_101) __extension__ ({ \ float64_t __s0_101 = __p0_101; \ float64_t __s1_101 = __p1_101; \ - float64x2_t __s2_101 = __p2_101; \ - float64x2_t __rev2_101; __rev2_101 = __builtin_shufflevector(__s2_101, __s2_101, 1, 0); \ + float64x1_t __s2_101 = __p2_101; \ float64_t __ret_101; \ - __ret_101 = __noswap_vfmad_laneq_f64(__s0_101, -__s1_101, __rev2_101, __p3_101); \ + __ret_101 = __noswap_vfmad_lane_f64(__s0_101, -__s1_101, __s2_101, __p3_101); \ __ret_101; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmss_laneq_f32(__p0_102, __p1_102, __p2_102, __p3_102) __extension__ ({ \ +#define vfmss_lane_f32(__p0_102, __p1_102, __p2_102, __p3_102) __extension__ ({ \ float32_t __s0_102 = __p0_102; \ float32_t __s1_102 = __p1_102; \ - float32x4_t __s2_102 = __p2_102; \ + float32x2_t __s2_102 = __p2_102; \ float32_t __ret_102; \ - __ret_102 = vfmas_laneq_f32(__s0_102, -__s1_102, __s2_102, __p3_102); \ + __ret_102 = vfmas_lane_f32(__s0_102, -__s1_102, __s2_102, __p3_102); \ __ret_102; \ }) #else -#define vfmss_laneq_f32(__p0_103, __p1_103, __p2_103, __p3_103) __extension__ ({ \ +#define vfmss_lane_f32(__p0_103, __p1_103, __p2_103, __p3_103) __extension__ ({ \ float32_t __s0_103 = __p0_103; \ float32_t __s1_103 = __p1_103; \ - float32x4_t __s2_103 = __p2_103; \ - float32x4_t __rev2_103; __rev2_103 = __builtin_shufflevector(__s2_103, __s2_103, 3, 2, 1, 0); \ + float32x2_t __s2_103 = __p2_103; \ + float32x2_t __rev2_103; __rev2_103 = __builtin_shufflevector(__s2_103, __s2_103, 1, 0); \ float32_t __ret_103; \ - __ret_103 = __noswap_vfmas_laneq_f32(__s0_103, -__s1_103, __rev2_103, __p3_103); \ + __ret_103 = __noswap_vfmas_lane_f32(__s0_103, -__s1_103, __rev2_103, __p3_103); \ __ret_103; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmsq_laneq_f64(__p0_104, __p1_104, __p2_104, __p3_104) __extension__ ({ \ +#define vfmsq_lane_f64(__p0_104, __p1_104, __p2_104, __p3_104) __extension__ ({ \ float64x2_t __s0_104 = __p0_104; \ float64x2_t __s1_104 = __p1_104; \ - float64x2_t __s2_104 = __p2_104; \ + float64x1_t __s2_104 = __p2_104; \ float64x2_t __ret_104; \ - __ret_104 = vfmaq_laneq_f64(__s0_104, -__s1_104, __s2_104, __p3_104); \ + __ret_104 = vfmaq_lane_f64(__s0_104, -__s1_104, __s2_104, __p3_104); \ __ret_104; \ }) #else -#define vfmsq_laneq_f64(__p0_105, __p1_105, __p2_105, __p3_105) __extension__ ({ \ +#define vfmsq_lane_f64(__p0_105, __p1_105, __p2_105, __p3_105) __extension__ ({ \ float64x2_t __s0_105 = __p0_105; \ float64x2_t __s1_105 = __p1_105; \ - float64x2_t __s2_105 = __p2_105; \ + float64x1_t __s2_105 = __p2_105; \ float64x2_t __rev0_105; __rev0_105 = __builtin_shufflevector(__s0_105, __s0_105, 1, 0); \ float64x2_t __rev1_105; __rev1_105 = __builtin_shufflevector(__s1_105, __s1_105, 1, 0); \ - float64x2_t __rev2_105; __rev2_105 = __builtin_shufflevector(__s2_105, __s2_105, 1, 0); \ float64x2_t 
__ret_105; \ - __ret_105 = __noswap_vfmaq_laneq_f64(__rev0_105, -__rev1_105, __rev2_105, __p3_105); \ + __ret_105 = __noswap_vfmaq_lane_f64(__rev0_105, -__rev1_105, __s2_105, __p3_105); \ __ret_105 = __builtin_shufflevector(__ret_105, __ret_105, 1, 0); \ __ret_105; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfmsq_laneq_f32(__p0_106, __p1_106, __p2_106, __p3_106) __extension__ ({ \ +#define vfmsq_lane_f32(__p0_106, __p1_106, __p2_106, __p3_106) __extension__ ({ \ float32x4_t __s0_106 = __p0_106; \ float32x4_t __s1_106 = __p1_106; \ - float32x4_t __s2_106 = __p2_106; \ + float32x2_t __s2_106 = __p2_106; \ float32x4_t __ret_106; \ - __ret_106 = vfmaq_laneq_f32(__s0_106, -__s1_106, __s2_106, __p3_106); \ + __ret_106 = vfmaq_lane_f32(__s0_106, -__s1_106, __s2_106, __p3_106); \ __ret_106; \ }) #else -#define vfmsq_laneq_f32(__p0_107, __p1_107, __p2_107, __p3_107) __extension__ ({ \ +#define vfmsq_lane_f32(__p0_107, __p1_107, __p2_107, __p3_107) __extension__ ({ \ float32x4_t __s0_107 = __p0_107; \ float32x4_t __s1_107 = __p1_107; \ - float32x4_t __s2_107 = __p2_107; \ + float32x2_t __s2_107 = __p2_107; \ float32x4_t __rev0_107; __rev0_107 = __builtin_shufflevector(__s0_107, __s0_107, 3, 2, 1, 0); \ float32x4_t __rev1_107; __rev1_107 = __builtin_shufflevector(__s1_107, __s1_107, 3, 2, 1, 0); \ - float32x4_t __rev2_107; __rev2_107 = __builtin_shufflevector(__s2_107, __s2_107, 3, 2, 1, 0); \ + float32x2_t __rev2_107; __rev2_107 = __builtin_shufflevector(__s2_107, __s2_107, 1, 0); \ float32x4_t __ret_107; \ - __ret_107 = __noswap_vfmaq_laneq_f32(__rev0_107, -__rev1_107, __rev2_107, __p3_107); \ + __ret_107 = __noswap_vfmaq_lane_f32(__rev0_107, -__rev1_107, __rev2_107, __p3_107); \ __ret_107 = __builtin_shufflevector(__ret_107, __ret_107, 3, 2, 1, 0); \ __ret_107; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfms_laneq_f64(__p0_108, __p1_108, __p2_108, __p3_108) __extension__ ({ \ +#define vfms_lane_f64(__p0_108, __p1_108, __p2_108, __p3_108) __extension__ ({ \ float64x1_t __s0_108 = __p0_108; \ float64x1_t __s1_108 = __p1_108; \ - float64x2_t __s2_108 = __p2_108; \ + float64x1_t __s2_108 = __p2_108; \ float64x1_t __ret_108; \ - __ret_108 = vfma_laneq_f64(__s0_108, -__s1_108, __s2_108, __p3_108); \ + __ret_108 = vfma_lane_f64(__s0_108, -__s1_108, __s2_108, __p3_108); \ __ret_108; \ }) #else -#define vfms_laneq_f64(__p0_109, __p1_109, __p2_109, __p3_109) __extension__ ({ \ +#define vfms_lane_f64(__p0_109, __p1_109, __p2_109, __p3_109) __extension__ ({ \ float64x1_t __s0_109 = __p0_109; \ float64x1_t __s1_109 = __p1_109; \ - float64x2_t __s2_109 = __p2_109; \ - float64x2_t __rev2_109; __rev2_109 = __builtin_shufflevector(__s2_109, __s2_109, 1, 0); \ + float64x1_t __s2_109 = __p2_109; \ float64x1_t __ret_109; \ - __ret_109 = __noswap_vfma_laneq_f64(__s0_109, -__s1_109, __rev2_109, __p3_109); \ + __ret_109 = __noswap_vfma_lane_f64(__s0_109, -__s1_109, __s2_109, __p3_109); \ __ret_109; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vfms_laneq_f32(__p0_110, __p1_110, __p2_110, __p3_110) __extension__ ({ \ +#define vfms_lane_f32(__p0_110, __p1_110, __p2_110, __p3_110) __extension__ ({ \ float32x2_t __s0_110 = __p0_110; \ float32x2_t __s1_110 = __p1_110; \ - float32x4_t __s2_110 = __p2_110; \ + float32x2_t __s2_110 = __p2_110; \ float32x2_t __ret_110; \ - __ret_110 = vfma_laneq_f32(__s0_110, -__s1_110, __s2_110, __p3_110); \ + __ret_110 = vfma_lane_f32(__s0_110, -__s1_110, __s2_110, __p3_110); \ __ret_110; \ }) #else -#define vfms_laneq_f32(__p0_111, __p1_111, __p2_111, __p3_111) __extension__ ({ \ 
+#define vfms_lane_f32(__p0_111, __p1_111, __p2_111, __p3_111) __extension__ ({ \ float32x2_t __s0_111 = __p0_111; \ float32x2_t __s1_111 = __p1_111; \ - float32x4_t __s2_111 = __p2_111; \ + float32x2_t __s2_111 = __p2_111; \ float32x2_t __rev0_111; __rev0_111 = __builtin_shufflevector(__s0_111, __s0_111, 1, 0); \ float32x2_t __rev1_111; __rev1_111 = __builtin_shufflevector(__s1_111, __s1_111, 1, 0); \ - float32x4_t __rev2_111; __rev2_111 = __builtin_shufflevector(__s2_111, __s2_111, 3, 2, 1, 0); \ + float32x2_t __rev2_111; __rev2_111 = __builtin_shufflevector(__s2_111, __s2_111, 1, 0); \ float32x2_t __ret_111; \ - __ret_111 = __noswap_vfma_laneq_f32(__rev0_111, -__rev1_111, __rev2_111, __p3_111); \ + __ret_111 = __noswap_vfma_lane_f32(__rev0_111, -__rev1_111, __rev2_111, __p3_111); \ __ret_111 = __builtin_shufflevector(__ret_111, __ret_111, 1, 0); \ __ret_111; \ }) #endif #ifdef __LITTLE_ENDIAN__ +#define vfmsd_laneq_f64(__p0_112, __p1_112, __p2_112, __p3_112) __extension__ ({ \ + float64_t __s0_112 = __p0_112; \ + float64_t __s1_112 = __p1_112; \ + float64x2_t __s2_112 = __p2_112; \ + float64_t __ret_112; \ + __ret_112 = vfmad_laneq_f64(__s0_112, -__s1_112, __s2_112, __p3_112); \ + __ret_112; \ +}) +#else +#define vfmsd_laneq_f64(__p0_113, __p1_113, __p2_113, __p3_113) __extension__ ({ \ + float64_t __s0_113 = __p0_113; \ + float64_t __s1_113 = __p1_113; \ + float64x2_t __s2_113 = __p2_113; \ + float64x2_t __rev2_113; __rev2_113 = __builtin_shufflevector(__s2_113, __s2_113, 1, 0); \ + float64_t __ret_113; \ + __ret_113 = __noswap_vfmad_laneq_f64(__s0_113, -__s1_113, __rev2_113, __p3_113); \ + __ret_113; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmss_laneq_f32(__p0_114, __p1_114, __p2_114, __p3_114) __extension__ ({ \ + float32_t __s0_114 = __p0_114; \ + float32_t __s1_114 = __p1_114; \ + float32x4_t __s2_114 = __p2_114; \ + float32_t __ret_114; \ + __ret_114 = vfmas_laneq_f32(__s0_114, -__s1_114, __s2_114, __p3_114); \ + __ret_114; \ +}) +#else +#define vfmss_laneq_f32(__p0_115, __p1_115, __p2_115, __p3_115) __extension__ ({ \ + float32_t __s0_115 = __p0_115; \ + float32_t __s1_115 = __p1_115; \ + float32x4_t __s2_115 = __p2_115; \ + float32x4_t __rev2_115; __rev2_115 = __builtin_shufflevector(__s2_115, __s2_115, 3, 2, 1, 0); \ + float32_t __ret_115; \ + __ret_115 = __noswap_vfmas_laneq_f32(__s0_115, -__s1_115, __rev2_115, __p3_115); \ + __ret_115; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmsq_laneq_f64(__p0_116, __p1_116, __p2_116, __p3_116) __extension__ ({ \ + float64x2_t __s0_116 = __p0_116; \ + float64x2_t __s1_116 = __p1_116; \ + float64x2_t __s2_116 = __p2_116; \ + float64x2_t __ret_116; \ + __ret_116 = vfmaq_laneq_f64(__s0_116, -__s1_116, __s2_116, __p3_116); \ + __ret_116; \ +}) +#else +#define vfmsq_laneq_f64(__p0_117, __p1_117, __p2_117, __p3_117) __extension__ ({ \ + float64x2_t __s0_117 = __p0_117; \ + float64x2_t __s1_117 = __p1_117; \ + float64x2_t __s2_117 = __p2_117; \ + float64x2_t __rev0_117; __rev0_117 = __builtin_shufflevector(__s0_117, __s0_117, 1, 0); \ + float64x2_t __rev1_117; __rev1_117 = __builtin_shufflevector(__s1_117, __s1_117, 1, 0); \ + float64x2_t __rev2_117; __rev2_117 = __builtin_shufflevector(__s2_117, __s2_117, 1, 0); \ + float64x2_t __ret_117; \ + __ret_117 = __noswap_vfmaq_laneq_f64(__rev0_117, -__rev1_117, __rev2_117, __p3_117); \ + __ret_117 = __builtin_shufflevector(__ret_117, __ret_117, 1, 0); \ + __ret_117; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmsq_laneq_f32(__p0_118, __p1_118, __p2_118, __p3_118) 
__extension__ ({ \ + float32x4_t __s0_118 = __p0_118; \ + float32x4_t __s1_118 = __p1_118; \ + float32x4_t __s2_118 = __p2_118; \ + float32x4_t __ret_118; \ + __ret_118 = vfmaq_laneq_f32(__s0_118, -__s1_118, __s2_118, __p3_118); \ + __ret_118; \ +}) +#else +#define vfmsq_laneq_f32(__p0_119, __p1_119, __p2_119, __p3_119) __extension__ ({ \ + float32x4_t __s0_119 = __p0_119; \ + float32x4_t __s1_119 = __p1_119; \ + float32x4_t __s2_119 = __p2_119; \ + float32x4_t __rev0_119; __rev0_119 = __builtin_shufflevector(__s0_119, __s0_119, 3, 2, 1, 0); \ + float32x4_t __rev1_119; __rev1_119 = __builtin_shufflevector(__s1_119, __s1_119, 3, 2, 1, 0); \ + float32x4_t __rev2_119; __rev2_119 = __builtin_shufflevector(__s2_119, __s2_119, 3, 2, 1, 0); \ + float32x4_t __ret_119; \ + __ret_119 = __noswap_vfmaq_laneq_f32(__rev0_119, -__rev1_119, __rev2_119, __p3_119); \ + __ret_119 = __builtin_shufflevector(__ret_119, __ret_119, 3, 2, 1, 0); \ + __ret_119; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfms_laneq_f64(__p0_120, __p1_120, __p2_120, __p3_120) __extension__ ({ \ + float64x1_t __s0_120 = __p0_120; \ + float64x1_t __s1_120 = __p1_120; \ + float64x2_t __s2_120 = __p2_120; \ + float64x1_t __ret_120; \ + __ret_120 = vfma_laneq_f64(__s0_120, -__s1_120, __s2_120, __p3_120); \ + __ret_120; \ +}) +#else +#define vfms_laneq_f64(__p0_121, __p1_121, __p2_121, __p3_121) __extension__ ({ \ + float64x1_t __s0_121 = __p0_121; \ + float64x1_t __s1_121 = __p1_121; \ + float64x2_t __s2_121 = __p2_121; \ + float64x2_t __rev2_121; __rev2_121 = __builtin_shufflevector(__s2_121, __s2_121, 1, 0); \ + float64x1_t __ret_121; \ + __ret_121 = __noswap_vfma_laneq_f64(__s0_121, -__s1_121, __rev2_121, __p3_121); \ + __ret_121; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfms_laneq_f32(__p0_122, __p1_122, __p2_122, __p3_122) __extension__ ({ \ + float32x2_t __s0_122 = __p0_122; \ + float32x2_t __s1_122 = __p1_122; \ + float32x4_t __s2_122 = __p2_122; \ + float32x2_t __ret_122; \ + __ret_122 = vfma_laneq_f32(__s0_122, -__s1_122, __s2_122, __p3_122); \ + __ret_122; \ +}) +#else +#define vfms_laneq_f32(__p0_123, __p1_123, __p2_123, __p3_123) __extension__ ({ \ + float32x2_t __s0_123 = __p0_123; \ + float32x2_t __s1_123 = __p1_123; \ + float32x4_t __s2_123 = __p2_123; \ + float32x2_t __rev0_123; __rev0_123 = __builtin_shufflevector(__s0_123, __s0_123, 1, 0); \ + float32x2_t __rev1_123; __rev1_123 = __builtin_shufflevector(__s1_123, __s1_123, 1, 0); \ + float32x4_t __rev2_123; __rev2_123 = __builtin_shufflevector(__s2_123, __s2_123, 3, 2, 1, 0); \ + float32x2_t __ret_123; \ + __ret_123 = __noswap_vfma_laneq_f32(__rev0_123, -__rev1_123, __rev2_123, __p3_123); \ + __ret_123 = __builtin_shufflevector(__ret_123, __ret_123, 1, 0); \ + __ret_123; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ __ai float64x2_t vfmsq_n_f64(float64x2_t __p0, float64x2_t __p1, float64_t __p2) { float64x2_t __ret; __ret = vfmaq_f64(__p0, -__p1, (float64x2_t) {__p2, __p2}); @@ -53521,146 +56833,146 @@ __ai float64x1_t vmov_n_f64(float64_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -__ai uint16x8_t vmovl_high_u8(uint8x16_t __p0_112) { - uint16x8_t __ret_112; - uint8x8_t __a1_112 = vget_high_u8(__p0_112); - __ret_112 = (uint16x8_t)(vshll_n_u8(__a1_112, 0)); - return __ret_112; +__ai uint16x8_t vmovl_high_u8(uint8x16_t __p0_124) { + uint16x8_t __ret_124; + uint8x8_t __a1_124 = vget_high_u8(__p0_124); + __ret_124 = (uint16x8_t)(vshll_n_u8(__a1_124, 0)); + return __ret_124; } #else -__ai uint16x8_t vmovl_high_u8(uint8x16_t __p0_113) { - uint8x16_t __rev0_113; 
__rev0_113 = __builtin_shufflevector(__p0_113, __p0_113, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - uint16x8_t __ret_113; - uint8x8_t __a1_113 = __noswap_vget_high_u8(__rev0_113); - __ret_113 = (uint16x8_t)(__noswap_vshll_n_u8(__a1_113, 0)); - __ret_113 = __builtin_shufflevector(__ret_113, __ret_113, 7, 6, 5, 4, 3, 2, 1, 0); - return __ret_113; +__ai uint16x8_t vmovl_high_u8(uint8x16_t __p0_125) { + uint8x16_t __rev0_125; __rev0_125 = __builtin_shufflevector(__p0_125, __p0_125, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + uint16x8_t __ret_125; + uint8x8_t __a1_125 = __noswap_vget_high_u8(__rev0_125); + __ret_125 = (uint16x8_t)(__noswap_vshll_n_u8(__a1_125, 0)); + __ret_125 = __builtin_shufflevector(__ret_125, __ret_125, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret_125; } -__ai uint16x8_t __noswap_vmovl_high_u8(uint8x16_t __p0_114) { - uint16x8_t __ret_114; - uint8x8_t __a1_114 = __noswap_vget_high_u8(__p0_114); - __ret_114 = (uint16x8_t)(__noswap_vshll_n_u8(__a1_114, 0)); - return __ret_114; +__ai uint16x8_t __noswap_vmovl_high_u8(uint8x16_t __p0_126) { + uint16x8_t __ret_126; + uint8x8_t __a1_126 = __noswap_vget_high_u8(__p0_126); + __ret_126 = (uint16x8_t)(__noswap_vshll_n_u8(__a1_126, 0)); + return __ret_126; } #endif #ifdef __LITTLE_ENDIAN__ -__ai uint64x2_t vmovl_high_u32(uint32x4_t __p0_115) { - uint64x2_t __ret_115; - uint32x2_t __a1_115 = vget_high_u32(__p0_115); - __ret_115 = (uint64x2_t)(vshll_n_u32(__a1_115, 0)); - return __ret_115; +__ai uint64x2_t vmovl_high_u32(uint32x4_t __p0_127) { + uint64x2_t __ret_127; + uint32x2_t __a1_127 = vget_high_u32(__p0_127); + __ret_127 = (uint64x2_t)(vshll_n_u32(__a1_127, 0)); + return __ret_127; } #else -__ai uint64x2_t vmovl_high_u32(uint32x4_t __p0_116) { - uint32x4_t __rev0_116; __rev0_116 = __builtin_shufflevector(__p0_116, __p0_116, 3, 2, 1, 0); - uint64x2_t __ret_116; - uint32x2_t __a1_116 = __noswap_vget_high_u32(__rev0_116); - __ret_116 = (uint64x2_t)(__noswap_vshll_n_u32(__a1_116, 0)); - __ret_116 = __builtin_shufflevector(__ret_116, __ret_116, 1, 0); - return __ret_116; +__ai uint64x2_t vmovl_high_u32(uint32x4_t __p0_128) { + uint32x4_t __rev0_128; __rev0_128 = __builtin_shufflevector(__p0_128, __p0_128, 3, 2, 1, 0); + uint64x2_t __ret_128; + uint32x2_t __a1_128 = __noswap_vget_high_u32(__rev0_128); + __ret_128 = (uint64x2_t)(__noswap_vshll_n_u32(__a1_128, 0)); + __ret_128 = __builtin_shufflevector(__ret_128, __ret_128, 1, 0); + return __ret_128; } -__ai uint64x2_t __noswap_vmovl_high_u32(uint32x4_t __p0_117) { - uint64x2_t __ret_117; - uint32x2_t __a1_117 = __noswap_vget_high_u32(__p0_117); - __ret_117 = (uint64x2_t)(__noswap_vshll_n_u32(__a1_117, 0)); - return __ret_117; +__ai uint64x2_t __noswap_vmovl_high_u32(uint32x4_t __p0_129) { + uint64x2_t __ret_129; + uint32x2_t __a1_129 = __noswap_vget_high_u32(__p0_129); + __ret_129 = (uint64x2_t)(__noswap_vshll_n_u32(__a1_129, 0)); + return __ret_129; } #endif #ifdef __LITTLE_ENDIAN__ -__ai uint32x4_t vmovl_high_u16(uint16x8_t __p0_118) { - uint32x4_t __ret_118; - uint16x4_t __a1_118 = vget_high_u16(__p0_118); - __ret_118 = (uint32x4_t)(vshll_n_u16(__a1_118, 0)); - return __ret_118; +__ai uint32x4_t vmovl_high_u16(uint16x8_t __p0_130) { + uint32x4_t __ret_130; + uint16x4_t __a1_130 = vget_high_u16(__p0_130); + __ret_130 = (uint32x4_t)(vshll_n_u16(__a1_130, 0)); + return __ret_130; } #else -__ai uint32x4_t vmovl_high_u16(uint16x8_t __p0_119) { - uint16x8_t __rev0_119; __rev0_119 = __builtin_shufflevector(__p0_119, __p0_119, 7, 6, 5, 4, 3, 2, 1, 0); - uint32x4_t 
__ret_119; - uint16x4_t __a1_119 = __noswap_vget_high_u16(__rev0_119); - __ret_119 = (uint32x4_t)(__noswap_vshll_n_u16(__a1_119, 0)); - __ret_119 = __builtin_shufflevector(__ret_119, __ret_119, 3, 2, 1, 0); - return __ret_119; +__ai uint32x4_t vmovl_high_u16(uint16x8_t __p0_131) { + uint16x8_t __rev0_131; __rev0_131 = __builtin_shufflevector(__p0_131, __p0_131, 7, 6, 5, 4, 3, 2, 1, 0); + uint32x4_t __ret_131; + uint16x4_t __a1_131 = __noswap_vget_high_u16(__rev0_131); + __ret_131 = (uint32x4_t)(__noswap_vshll_n_u16(__a1_131, 0)); + __ret_131 = __builtin_shufflevector(__ret_131, __ret_131, 3, 2, 1, 0); + return __ret_131; } -__ai uint32x4_t __noswap_vmovl_high_u16(uint16x8_t __p0_120) { - uint32x4_t __ret_120; - uint16x4_t __a1_120 = __noswap_vget_high_u16(__p0_120); - __ret_120 = (uint32x4_t)(__noswap_vshll_n_u16(__a1_120, 0)); - return __ret_120; +__ai uint32x4_t __noswap_vmovl_high_u16(uint16x8_t __p0_132) { + uint32x4_t __ret_132; + uint16x4_t __a1_132 = __noswap_vget_high_u16(__p0_132); + __ret_132 = (uint32x4_t)(__noswap_vshll_n_u16(__a1_132, 0)); + return __ret_132; } #endif #ifdef __LITTLE_ENDIAN__ -__ai int16x8_t vmovl_high_s8(int8x16_t __p0_121) { - int16x8_t __ret_121; - int8x8_t __a1_121 = vget_high_s8(__p0_121); - __ret_121 = (int16x8_t)(vshll_n_s8(__a1_121, 0)); - return __ret_121; +__ai int16x8_t vmovl_high_s8(int8x16_t __p0_133) { + int16x8_t __ret_133; + int8x8_t __a1_133 = vget_high_s8(__p0_133); + __ret_133 = (int16x8_t)(vshll_n_s8(__a1_133, 0)); + return __ret_133; } #else -__ai int16x8_t vmovl_high_s8(int8x16_t __p0_122) { - int8x16_t __rev0_122; __rev0_122 = __builtin_shufflevector(__p0_122, __p0_122, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - int16x8_t __ret_122; - int8x8_t __a1_122 = __noswap_vget_high_s8(__rev0_122); - __ret_122 = (int16x8_t)(__noswap_vshll_n_s8(__a1_122, 0)); - __ret_122 = __builtin_shufflevector(__ret_122, __ret_122, 7, 6, 5, 4, 3, 2, 1, 0); - return __ret_122; +__ai int16x8_t vmovl_high_s8(int8x16_t __p0_134) { + int8x16_t __rev0_134; __rev0_134 = __builtin_shufflevector(__p0_134, __p0_134, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + int16x8_t __ret_134; + int8x8_t __a1_134 = __noswap_vget_high_s8(__rev0_134); + __ret_134 = (int16x8_t)(__noswap_vshll_n_s8(__a1_134, 0)); + __ret_134 = __builtin_shufflevector(__ret_134, __ret_134, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret_134; } -__ai int16x8_t __noswap_vmovl_high_s8(int8x16_t __p0_123) { - int16x8_t __ret_123; - int8x8_t __a1_123 = __noswap_vget_high_s8(__p0_123); - __ret_123 = (int16x8_t)(__noswap_vshll_n_s8(__a1_123, 0)); - return __ret_123; +__ai int16x8_t __noswap_vmovl_high_s8(int8x16_t __p0_135) { + int16x8_t __ret_135; + int8x8_t __a1_135 = __noswap_vget_high_s8(__p0_135); + __ret_135 = (int16x8_t)(__noswap_vshll_n_s8(__a1_135, 0)); + return __ret_135; } #endif #ifdef __LITTLE_ENDIAN__ -__ai int64x2_t vmovl_high_s32(int32x4_t __p0_124) { - int64x2_t __ret_124; - int32x2_t __a1_124 = vget_high_s32(__p0_124); - __ret_124 = (int64x2_t)(vshll_n_s32(__a1_124, 0)); - return __ret_124; +__ai int64x2_t vmovl_high_s32(int32x4_t __p0_136) { + int64x2_t __ret_136; + int32x2_t __a1_136 = vget_high_s32(__p0_136); + __ret_136 = (int64x2_t)(vshll_n_s32(__a1_136, 0)); + return __ret_136; } #else -__ai int64x2_t vmovl_high_s32(int32x4_t __p0_125) { - int32x4_t __rev0_125; __rev0_125 = __builtin_shufflevector(__p0_125, __p0_125, 3, 2, 1, 0); - int64x2_t __ret_125; - int32x2_t __a1_125 = __noswap_vget_high_s32(__rev0_125); - __ret_125 = (int64x2_t)(__noswap_vshll_n_s32(__a1_125, 0)); 
- __ret_125 = __builtin_shufflevector(__ret_125, __ret_125, 1, 0); - return __ret_125; +__ai int64x2_t vmovl_high_s32(int32x4_t __p0_137) { + int32x4_t __rev0_137; __rev0_137 = __builtin_shufflevector(__p0_137, __p0_137, 3, 2, 1, 0); + int64x2_t __ret_137; + int32x2_t __a1_137 = __noswap_vget_high_s32(__rev0_137); + __ret_137 = (int64x2_t)(__noswap_vshll_n_s32(__a1_137, 0)); + __ret_137 = __builtin_shufflevector(__ret_137, __ret_137, 1, 0); + return __ret_137; } -__ai int64x2_t __noswap_vmovl_high_s32(int32x4_t __p0_126) { - int64x2_t __ret_126; - int32x2_t __a1_126 = __noswap_vget_high_s32(__p0_126); - __ret_126 = (int64x2_t)(__noswap_vshll_n_s32(__a1_126, 0)); - return __ret_126; +__ai int64x2_t __noswap_vmovl_high_s32(int32x4_t __p0_138) { + int64x2_t __ret_138; + int32x2_t __a1_138 = __noswap_vget_high_s32(__p0_138); + __ret_138 = (int64x2_t)(__noswap_vshll_n_s32(__a1_138, 0)); + return __ret_138; } #endif #ifdef __LITTLE_ENDIAN__ -__ai int32x4_t vmovl_high_s16(int16x8_t __p0_127) { - int32x4_t __ret_127; - int16x4_t __a1_127 = vget_high_s16(__p0_127); - __ret_127 = (int32x4_t)(vshll_n_s16(__a1_127, 0)); - return __ret_127; +__ai int32x4_t vmovl_high_s16(int16x8_t __p0_139) { + int32x4_t __ret_139; + int16x4_t __a1_139 = vget_high_s16(__p0_139); + __ret_139 = (int32x4_t)(vshll_n_s16(__a1_139, 0)); + return __ret_139; } #else -__ai int32x4_t vmovl_high_s16(int16x8_t __p0_128) { - int16x8_t __rev0_128; __rev0_128 = __builtin_shufflevector(__p0_128, __p0_128, 7, 6, 5, 4, 3, 2, 1, 0); - int32x4_t __ret_128; - int16x4_t __a1_128 = __noswap_vget_high_s16(__rev0_128); - __ret_128 = (int32x4_t)(__noswap_vshll_n_s16(__a1_128, 0)); - __ret_128 = __builtin_shufflevector(__ret_128, __ret_128, 3, 2, 1, 0); - return __ret_128; +__ai int32x4_t vmovl_high_s16(int16x8_t __p0_140) { + int16x8_t __rev0_140; __rev0_140 = __builtin_shufflevector(__p0_140, __p0_140, 7, 6, 5, 4, 3, 2, 1, 0); + int32x4_t __ret_140; + int16x4_t __a1_140 = __noswap_vget_high_s16(__rev0_140); + __ret_140 = (int32x4_t)(__noswap_vshll_n_s16(__a1_140, 0)); + __ret_140 = __builtin_shufflevector(__ret_140, __ret_140, 3, 2, 1, 0); + return __ret_140; } -__ai int32x4_t __noswap_vmovl_high_s16(int16x8_t __p0_129) { - int32x4_t __ret_129; - int16x4_t __a1_129 = __noswap_vget_high_s16(__p0_129); - __ret_129 = (int32x4_t)(__noswap_vshll_n_s16(__a1_129, 0)); - return __ret_129; +__ai int32x4_t __noswap_vmovl_high_s16(int16x8_t __p0_141) { + int32x4_t __ret_141; + int16x4_t __a1_141 = __noswap_vget_high_s16(__p0_141); + __ret_141 = (int32x4_t)(__noswap_vshll_n_s16(__a1_141, 0)); + return __ret_141; } #endif @@ -53798,39 +57110,39 @@ __ai float64x1_t vmul_f64(float64x1_t __p0, float64x1_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vmuld_lane_f64(__p0_130, __p1_130, __p2_130) __extension__ ({ \ - float64_t __s0_130 = __p0_130; \ - float64x1_t __s1_130 = __p1_130; \ - float64_t __ret_130; \ - __ret_130 = __s0_130 * vget_lane_f64(__s1_130, __p2_130); \ - __ret_130; \ +#define vmuld_lane_f64(__p0_142, __p1_142, __p2_142) __extension__ ({ \ + float64_t __s0_142 = __p0_142; \ + float64x1_t __s1_142 = __p1_142; \ + float64_t __ret_142; \ + __ret_142 = __s0_142 * vget_lane_f64(__s1_142, __p2_142); \ + __ret_142; \ }) #else -#define vmuld_lane_f64(__p0_131, __p1_131, __p2_131) __extension__ ({ \ - float64_t __s0_131 = __p0_131; \ - float64x1_t __s1_131 = __p1_131; \ - float64_t __ret_131; \ - __ret_131 = __s0_131 * __noswap_vget_lane_f64(__s1_131, __p2_131); \ - __ret_131; \ +#define vmuld_lane_f64(__p0_143, __p1_143, __p2_143) __extension__ ({ 
\ + float64_t __s0_143 = __p0_143; \ + float64x1_t __s1_143 = __p1_143; \ + float64_t __ret_143; \ + __ret_143 = __s0_143 * __noswap_vget_lane_f64(__s1_143, __p2_143); \ + __ret_143; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmuls_lane_f32(__p0_132, __p1_132, __p2_132) __extension__ ({ \ - float32_t __s0_132 = __p0_132; \ - float32x2_t __s1_132 = __p1_132; \ - float32_t __ret_132; \ - __ret_132 = __s0_132 * vget_lane_f32(__s1_132, __p2_132); \ - __ret_132; \ +#define vmuls_lane_f32(__p0_144, __p1_144, __p2_144) __extension__ ({ \ + float32_t __s0_144 = __p0_144; \ + float32x2_t __s1_144 = __p1_144; \ + float32_t __ret_144; \ + __ret_144 = __s0_144 * vget_lane_f32(__s1_144, __p2_144); \ + __ret_144; \ }) #else -#define vmuls_lane_f32(__p0_133, __p1_133, __p2_133) __extension__ ({ \ - float32_t __s0_133 = __p0_133; \ - float32x2_t __s1_133 = __p1_133; \ - float32x2_t __rev1_133; __rev1_133 = __builtin_shufflevector(__s1_133, __s1_133, 1, 0); \ - float32_t __ret_133; \ - __ret_133 = __s0_133 * __noswap_vget_lane_f32(__rev1_133, __p2_133); \ - __ret_133; \ +#define vmuls_lane_f32(__p0_145, __p1_145, __p2_145) __extension__ ({ \ + float32_t __s0_145 = __p0_145; \ + float32x2_t __s1_145 = __p1_145; \ + float32x2_t __rev1_145; __rev1_145 = __builtin_shufflevector(__s1_145, __s1_145, 1, 0); \ + float32_t __ret_145; \ + __ret_145 = __s0_145 * __noswap_vget_lane_f32(__rev1_145, __p2_145); \ + __ret_145; \ }) #endif @@ -53873,40 +57185,40 @@ __ai float64x1_t vmul_f64(float64x1_t __p0, float64x1_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vmuld_laneq_f64(__p0_134, __p1_134, __p2_134) __extension__ ({ \ - float64_t __s0_134 = __p0_134; \ - float64x2_t __s1_134 = __p1_134; \ - float64_t __ret_134; \ - __ret_134 = __s0_134 * vgetq_lane_f64(__s1_134, __p2_134); \ - __ret_134; \ +#define vmuld_laneq_f64(__p0_146, __p1_146, __p2_146) __extension__ ({ \ + float64_t __s0_146 = __p0_146; \ + float64x2_t __s1_146 = __p1_146; \ + float64_t __ret_146; \ + __ret_146 = __s0_146 * vgetq_lane_f64(__s1_146, __p2_146); \ + __ret_146; \ }) #else -#define vmuld_laneq_f64(__p0_135, __p1_135, __p2_135) __extension__ ({ \ - float64_t __s0_135 = __p0_135; \ - float64x2_t __s1_135 = __p1_135; \ - float64x2_t __rev1_135; __rev1_135 = __builtin_shufflevector(__s1_135, __s1_135, 1, 0); \ - float64_t __ret_135; \ - __ret_135 = __s0_135 * __noswap_vgetq_lane_f64(__rev1_135, __p2_135); \ - __ret_135; \ +#define vmuld_laneq_f64(__p0_147, __p1_147, __p2_147) __extension__ ({ \ + float64_t __s0_147 = __p0_147; \ + float64x2_t __s1_147 = __p1_147; \ + float64x2_t __rev1_147; __rev1_147 = __builtin_shufflevector(__s1_147, __s1_147, 1, 0); \ + float64_t __ret_147; \ + __ret_147 = __s0_147 * __noswap_vgetq_lane_f64(__rev1_147, __p2_147); \ + __ret_147; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmuls_laneq_f32(__p0_136, __p1_136, __p2_136) __extension__ ({ \ - float32_t __s0_136 = __p0_136; \ - float32x4_t __s1_136 = __p1_136; \ - float32_t __ret_136; \ - __ret_136 = __s0_136 * vgetq_lane_f32(__s1_136, __p2_136); \ - __ret_136; \ +#define vmuls_laneq_f32(__p0_148, __p1_148, __p2_148) __extension__ ({ \ + float32_t __s0_148 = __p0_148; \ + float32x4_t __s1_148 = __p1_148; \ + float32_t __ret_148; \ + __ret_148 = __s0_148 * vgetq_lane_f32(__s1_148, __p2_148); \ + __ret_148; \ }) #else -#define vmuls_laneq_f32(__p0_137, __p1_137, __p2_137) __extension__ ({ \ - float32_t __s0_137 = __p0_137; \ - float32x4_t __s1_137 = __p1_137; \ - float32x4_t __rev1_137; __rev1_137 = __builtin_shufflevector(__s1_137, __s1_137, 3, 2, 1, 0); \ - 
float32_t __ret_137; \ - __ret_137 = __s0_137 * __noswap_vgetq_lane_f32(__rev1_137, __p2_137); \ - __ret_137; \ +#define vmuls_laneq_f32(__p0_149, __p1_149, __p2_149) __extension__ ({ \ + float32_t __s0_149 = __p0_149; \ + float32x4_t __s1_149 = __p1_149; \ + float32x4_t __rev1_149; __rev1_149 = __builtin_shufflevector(__s1_149, __s1_149, 3, 2, 1, 0); \ + float32_t __ret_149; \ + __ret_149 = __s0_149 * __noswap_vgetq_lane_f32(__rev1_149, __p2_149); \ + __ret_149; \ }) #endif @@ -54779,39 +58091,39 @@ __ai float32_t __noswap_vmulxs_f32(float32_t __p0, float32_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vmulxd_lane_f64(__p0_138, __p1_138, __p2_138) __extension__ ({ \ - float64_t __s0_138 = __p0_138; \ - float64x1_t __s1_138 = __p1_138; \ - float64_t __ret_138; \ - __ret_138 = vmulxd_f64(__s0_138, vget_lane_f64(__s1_138, __p2_138)); \ - __ret_138; \ +#define vmulxd_lane_f64(__p0_150, __p1_150, __p2_150) __extension__ ({ \ + float64_t __s0_150 = __p0_150; \ + float64x1_t __s1_150 = __p1_150; \ + float64_t __ret_150; \ + __ret_150 = vmulxd_f64(__s0_150, vget_lane_f64(__s1_150, __p2_150)); \ + __ret_150; \ }) #else -#define vmulxd_lane_f64(__p0_139, __p1_139, __p2_139) __extension__ ({ \ - float64_t __s0_139 = __p0_139; \ - float64x1_t __s1_139 = __p1_139; \ - float64_t __ret_139; \ - __ret_139 = __noswap_vmulxd_f64(__s0_139, __noswap_vget_lane_f64(__s1_139, __p2_139)); \ - __ret_139; \ +#define vmulxd_lane_f64(__p0_151, __p1_151, __p2_151) __extension__ ({ \ + float64_t __s0_151 = __p0_151; \ + float64x1_t __s1_151 = __p1_151; \ + float64_t __ret_151; \ + __ret_151 = __noswap_vmulxd_f64(__s0_151, __noswap_vget_lane_f64(__s1_151, __p2_151)); \ + __ret_151; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmulxs_lane_f32(__p0_140, __p1_140, __p2_140) __extension__ ({ \ - float32_t __s0_140 = __p0_140; \ - float32x2_t __s1_140 = __p1_140; \ - float32_t __ret_140; \ - __ret_140 = vmulxs_f32(__s0_140, vget_lane_f32(__s1_140, __p2_140)); \ - __ret_140; \ +#define vmulxs_lane_f32(__p0_152, __p1_152, __p2_152) __extension__ ({ \ + float32_t __s0_152 = __p0_152; \ + float32x2_t __s1_152 = __p1_152; \ + float32_t __ret_152; \ + __ret_152 = vmulxs_f32(__s0_152, vget_lane_f32(__s1_152, __p2_152)); \ + __ret_152; \ }) #else -#define vmulxs_lane_f32(__p0_141, __p1_141, __p2_141) __extension__ ({ \ - float32_t __s0_141 = __p0_141; \ - float32x2_t __s1_141 = __p1_141; \ - float32x2_t __rev1_141; __rev1_141 = __builtin_shufflevector(__s1_141, __s1_141, 1, 0); \ - float32_t __ret_141; \ - __ret_141 = __noswap_vmulxs_f32(__s0_141, __noswap_vget_lane_f32(__rev1_141, __p2_141)); \ - __ret_141; \ +#define vmulxs_lane_f32(__p0_153, __p1_153, __p2_153) __extension__ ({ \ + float32_t __s0_153 = __p0_153; \ + float32x2_t __s1_153 = __p1_153; \ + float32x2_t __rev1_153; __rev1_153 = __builtin_shufflevector(__s1_153, __s1_153, 1, 0); \ + float32_t __ret_153; \ + __ret_153 = __noswap_vmulxs_f32(__s0_153, __noswap_vget_lane_f32(__rev1_153, __p2_153)); \ + __ret_153; \ }) #endif @@ -54878,40 +58190,40 @@ __ai float32_t __noswap_vmulxs_f32(float32_t __p0, float32_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vmulxd_laneq_f64(__p0_142, __p1_142, __p2_142) __extension__ ({ \ - float64_t __s0_142 = __p0_142; \ - float64x2_t __s1_142 = __p1_142; \ - float64_t __ret_142; \ - __ret_142 = vmulxd_f64(__s0_142, vgetq_lane_f64(__s1_142, __p2_142)); \ - __ret_142; \ +#define vmulxd_laneq_f64(__p0_154, __p1_154, __p2_154) __extension__ ({ \ + float64_t __s0_154 = __p0_154; \ + float64x2_t __s1_154 = __p1_154; \ + float64_t 
__ret_154; \ + __ret_154 = vmulxd_f64(__s0_154, vgetq_lane_f64(__s1_154, __p2_154)); \ + __ret_154; \ }) #else -#define vmulxd_laneq_f64(__p0_143, __p1_143, __p2_143) __extension__ ({ \ - float64_t __s0_143 = __p0_143; \ - float64x2_t __s1_143 = __p1_143; \ - float64x2_t __rev1_143; __rev1_143 = __builtin_shufflevector(__s1_143, __s1_143, 1, 0); \ - float64_t __ret_143; \ - __ret_143 = __noswap_vmulxd_f64(__s0_143, __noswap_vgetq_lane_f64(__rev1_143, __p2_143)); \ - __ret_143; \ +#define vmulxd_laneq_f64(__p0_155, __p1_155, __p2_155) __extension__ ({ \ + float64_t __s0_155 = __p0_155; \ + float64x2_t __s1_155 = __p1_155; \ + float64x2_t __rev1_155; __rev1_155 = __builtin_shufflevector(__s1_155, __s1_155, 1, 0); \ + float64_t __ret_155; \ + __ret_155 = __noswap_vmulxd_f64(__s0_155, __noswap_vgetq_lane_f64(__rev1_155, __p2_155)); \ + __ret_155; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmulxs_laneq_f32(__p0_144, __p1_144, __p2_144) __extension__ ({ \ - float32_t __s0_144 = __p0_144; \ - float32x4_t __s1_144 = __p1_144; \ - float32_t __ret_144; \ - __ret_144 = vmulxs_f32(__s0_144, vgetq_lane_f32(__s1_144, __p2_144)); \ - __ret_144; \ +#define vmulxs_laneq_f32(__p0_156, __p1_156, __p2_156) __extension__ ({ \ + float32_t __s0_156 = __p0_156; \ + float32x4_t __s1_156 = __p1_156; \ + float32_t __ret_156; \ + __ret_156 = vmulxs_f32(__s0_156, vgetq_lane_f32(__s1_156, __p2_156)); \ + __ret_156; \ }) #else -#define vmulxs_laneq_f32(__p0_145, __p1_145, __p2_145) __extension__ ({ \ - float32_t __s0_145 = __p0_145; \ - float32x4_t __s1_145 = __p1_145; \ - float32x4_t __rev1_145; __rev1_145 = __builtin_shufflevector(__s1_145, __s1_145, 3, 2, 1, 0); \ - float32_t __ret_145; \ - __ret_145 = __noswap_vmulxs_f32(__s0_145, __noswap_vgetq_lane_f32(__rev1_145, __p2_145)); \ - __ret_145; \ +#define vmulxs_laneq_f32(__p0_157, __p1_157, __p2_157) __extension__ ({ \ + float32_t __s0_157 = __p0_157; \ + float32x4_t __s1_157 = __p1_157; \ + float32x4_t __rev1_157; __rev1_157 = __builtin_shufflevector(__s1_157, __s1_157, 3, 2, 1, 0); \ + float32_t __ret_157; \ + __ret_157 = __noswap_vmulxs_f32(__s0_157, __noswap_vgetq_lane_f32(__rev1_157, __p2_157)); \ + __ret_157; \ }) #endif @@ -56675,78 +59987,78 @@ __ai int16_t __noswap_vqdmulhh_s16(int16_t __p0, int16_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmulhs_lane_s32(__p0_146, __p1_146, __p2_146) __extension__ ({ \ - int32_t __s0_146 = __p0_146; \ - int32x2_t __s1_146 = __p1_146; \ - int32_t __ret_146; \ - __ret_146 = vqdmulhs_s32(__s0_146, vget_lane_s32(__s1_146, __p2_146)); \ - __ret_146; \ +#define vqdmulhs_lane_s32(__p0_158, __p1_158, __p2_158) __extension__ ({ \ + int32_t __s0_158 = __p0_158; \ + int32x2_t __s1_158 = __p1_158; \ + int32_t __ret_158; \ + __ret_158 = vqdmulhs_s32(__s0_158, vget_lane_s32(__s1_158, __p2_158)); \ + __ret_158; \ }) #else -#define vqdmulhs_lane_s32(__p0_147, __p1_147, __p2_147) __extension__ ({ \ - int32_t __s0_147 = __p0_147; \ - int32x2_t __s1_147 = __p1_147; \ - int32x2_t __rev1_147; __rev1_147 = __builtin_shufflevector(__s1_147, __s1_147, 1, 0); \ - int32_t __ret_147; \ - __ret_147 = __noswap_vqdmulhs_s32(__s0_147, __noswap_vget_lane_s32(__rev1_147, __p2_147)); \ - __ret_147; \ +#define vqdmulhs_lane_s32(__p0_159, __p1_159, __p2_159) __extension__ ({ \ + int32_t __s0_159 = __p0_159; \ + int32x2_t __s1_159 = __p1_159; \ + int32x2_t __rev1_159; __rev1_159 = __builtin_shufflevector(__s1_159, __s1_159, 1, 0); \ + int32_t __ret_159; \ + __ret_159 = __noswap_vqdmulhs_s32(__s0_159, __noswap_vget_lane_s32(__rev1_159, __p2_159)); 
\ + __ret_159; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmulhh_lane_s16(__p0_148, __p1_148, __p2_148) __extension__ ({ \ - int16_t __s0_148 = __p0_148; \ - int16x4_t __s1_148 = __p1_148; \ - int16_t __ret_148; \ - __ret_148 = vqdmulhh_s16(__s0_148, vget_lane_s16(__s1_148, __p2_148)); \ - __ret_148; \ +#define vqdmulhh_lane_s16(__p0_160, __p1_160, __p2_160) __extension__ ({ \ + int16_t __s0_160 = __p0_160; \ + int16x4_t __s1_160 = __p1_160; \ + int16_t __ret_160; \ + __ret_160 = vqdmulhh_s16(__s0_160, vget_lane_s16(__s1_160, __p2_160)); \ + __ret_160; \ }) #else -#define vqdmulhh_lane_s16(__p0_149, __p1_149, __p2_149) __extension__ ({ \ - int16_t __s0_149 = __p0_149; \ - int16x4_t __s1_149 = __p1_149; \ - int16x4_t __rev1_149; __rev1_149 = __builtin_shufflevector(__s1_149, __s1_149, 3, 2, 1, 0); \ - int16_t __ret_149; \ - __ret_149 = __noswap_vqdmulhh_s16(__s0_149, __noswap_vget_lane_s16(__rev1_149, __p2_149)); \ - __ret_149; \ +#define vqdmulhh_lane_s16(__p0_161, __p1_161, __p2_161) __extension__ ({ \ + int16_t __s0_161 = __p0_161; \ + int16x4_t __s1_161 = __p1_161; \ + int16x4_t __rev1_161; __rev1_161 = __builtin_shufflevector(__s1_161, __s1_161, 3, 2, 1, 0); \ + int16_t __ret_161; \ + __ret_161 = __noswap_vqdmulhh_s16(__s0_161, __noswap_vget_lane_s16(__rev1_161, __p2_161)); \ + __ret_161; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmulhs_laneq_s32(__p0_150, __p1_150, __p2_150) __extension__ ({ \ - int32_t __s0_150 = __p0_150; \ - int32x4_t __s1_150 = __p1_150; \ - int32_t __ret_150; \ - __ret_150 = vqdmulhs_s32(__s0_150, vgetq_lane_s32(__s1_150, __p2_150)); \ - __ret_150; \ +#define vqdmulhs_laneq_s32(__p0_162, __p1_162, __p2_162) __extension__ ({ \ + int32_t __s0_162 = __p0_162; \ + int32x4_t __s1_162 = __p1_162; \ + int32_t __ret_162; \ + __ret_162 = vqdmulhs_s32(__s0_162, vgetq_lane_s32(__s1_162, __p2_162)); \ + __ret_162; \ }) #else -#define vqdmulhs_laneq_s32(__p0_151, __p1_151, __p2_151) __extension__ ({ \ - int32_t __s0_151 = __p0_151; \ - int32x4_t __s1_151 = __p1_151; \ - int32x4_t __rev1_151; __rev1_151 = __builtin_shufflevector(__s1_151, __s1_151, 3, 2, 1, 0); \ - int32_t __ret_151; \ - __ret_151 = __noswap_vqdmulhs_s32(__s0_151, __noswap_vgetq_lane_s32(__rev1_151, __p2_151)); \ - __ret_151; \ +#define vqdmulhs_laneq_s32(__p0_163, __p1_163, __p2_163) __extension__ ({ \ + int32_t __s0_163 = __p0_163; \ + int32x4_t __s1_163 = __p1_163; \ + int32x4_t __rev1_163; __rev1_163 = __builtin_shufflevector(__s1_163, __s1_163, 3, 2, 1, 0); \ + int32_t __ret_163; \ + __ret_163 = __noswap_vqdmulhs_s32(__s0_163, __noswap_vgetq_lane_s32(__rev1_163, __p2_163)); \ + __ret_163; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmulhh_laneq_s16(__p0_152, __p1_152, __p2_152) __extension__ ({ \ - int16_t __s0_152 = __p0_152; \ - int16x8_t __s1_152 = __p1_152; \ - int16_t __ret_152; \ - __ret_152 = vqdmulhh_s16(__s0_152, vgetq_lane_s16(__s1_152, __p2_152)); \ - __ret_152; \ +#define vqdmulhh_laneq_s16(__p0_164, __p1_164, __p2_164) __extension__ ({ \ + int16_t __s0_164 = __p0_164; \ + int16x8_t __s1_164 = __p1_164; \ + int16_t __ret_164; \ + __ret_164 = vqdmulhh_s16(__s0_164, vgetq_lane_s16(__s1_164, __p2_164)); \ + __ret_164; \ }) #else -#define vqdmulhh_laneq_s16(__p0_153, __p1_153, __p2_153) __extension__ ({ \ - int16_t __s0_153 = __p0_153; \ - int16x8_t __s1_153 = __p1_153; \ - int16x8_t __rev1_153; __rev1_153 = __builtin_shufflevector(__s1_153, __s1_153, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16_t __ret_153; \ - __ret_153 = __noswap_vqdmulhh_s16(__s0_153, __noswap_vgetq_lane_s16(__rev1_153, 
__p2_153)); \ - __ret_153; \ +#define vqdmulhh_laneq_s16(__p0_165, __p1_165, __p2_165) __extension__ ({ \ + int16_t __s0_165 = __p0_165; \ + int16x8_t __s1_165 = __p1_165; \ + int16x8_t __rev1_165; __rev1_165 = __builtin_shufflevector(__s1_165, __s1_165, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16_t __ret_165; \ + __ret_165 = __noswap_vqdmulhh_s16(__s0_165, __noswap_vgetq_lane_s16(__rev1_165, __p2_165)); \ + __ret_165; \ }) #endif @@ -57023,78 +60335,78 @@ __ai int32x4_t vqdmull_high_n_s16(int16x8_t __p0, int16_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmulls_lane_s32(__p0_154, __p1_154, __p2_154) __extension__ ({ \ - int32_t __s0_154 = __p0_154; \ - int32x2_t __s1_154 = __p1_154; \ - int64_t __ret_154; \ - __ret_154 = vqdmulls_s32(__s0_154, vget_lane_s32(__s1_154, __p2_154)); \ - __ret_154; \ +#define vqdmulls_lane_s32(__p0_166, __p1_166, __p2_166) __extension__ ({ \ + int32_t __s0_166 = __p0_166; \ + int32x2_t __s1_166 = __p1_166; \ + int64_t __ret_166; \ + __ret_166 = vqdmulls_s32(__s0_166, vget_lane_s32(__s1_166, __p2_166)); \ + __ret_166; \ }) #else -#define vqdmulls_lane_s32(__p0_155, __p1_155, __p2_155) __extension__ ({ \ - int32_t __s0_155 = __p0_155; \ - int32x2_t __s1_155 = __p1_155; \ - int32x2_t __rev1_155; __rev1_155 = __builtin_shufflevector(__s1_155, __s1_155, 1, 0); \ - int64_t __ret_155; \ - __ret_155 = __noswap_vqdmulls_s32(__s0_155, __noswap_vget_lane_s32(__rev1_155, __p2_155)); \ - __ret_155; \ +#define vqdmulls_lane_s32(__p0_167, __p1_167, __p2_167) __extension__ ({ \ + int32_t __s0_167 = __p0_167; \ + int32x2_t __s1_167 = __p1_167; \ + int32x2_t __rev1_167; __rev1_167 = __builtin_shufflevector(__s1_167, __s1_167, 1, 0); \ + int64_t __ret_167; \ + __ret_167 = __noswap_vqdmulls_s32(__s0_167, __noswap_vget_lane_s32(__rev1_167, __p2_167)); \ + __ret_167; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmullh_lane_s16(__p0_156, __p1_156, __p2_156) __extension__ ({ \ - int16_t __s0_156 = __p0_156; \ - int16x4_t __s1_156 = __p1_156; \ - int32_t __ret_156; \ - __ret_156 = vqdmullh_s16(__s0_156, vget_lane_s16(__s1_156, __p2_156)); \ - __ret_156; \ +#define vqdmullh_lane_s16(__p0_168, __p1_168, __p2_168) __extension__ ({ \ + int16_t __s0_168 = __p0_168; \ + int16x4_t __s1_168 = __p1_168; \ + int32_t __ret_168; \ + __ret_168 = vqdmullh_s16(__s0_168, vget_lane_s16(__s1_168, __p2_168)); \ + __ret_168; \ }) #else -#define vqdmullh_lane_s16(__p0_157, __p1_157, __p2_157) __extension__ ({ \ - int16_t __s0_157 = __p0_157; \ - int16x4_t __s1_157 = __p1_157; \ - int16x4_t __rev1_157; __rev1_157 = __builtin_shufflevector(__s1_157, __s1_157, 3, 2, 1, 0); \ - int32_t __ret_157; \ - __ret_157 = __noswap_vqdmullh_s16(__s0_157, __noswap_vget_lane_s16(__rev1_157, __p2_157)); \ - __ret_157; \ +#define vqdmullh_lane_s16(__p0_169, __p1_169, __p2_169) __extension__ ({ \ + int16_t __s0_169 = __p0_169; \ + int16x4_t __s1_169 = __p1_169; \ + int16x4_t __rev1_169; __rev1_169 = __builtin_shufflevector(__s1_169, __s1_169, 3, 2, 1, 0); \ + int32_t __ret_169; \ + __ret_169 = __noswap_vqdmullh_s16(__s0_169, __noswap_vget_lane_s16(__rev1_169, __p2_169)); \ + __ret_169; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmulls_laneq_s32(__p0_158, __p1_158, __p2_158) __extension__ ({ \ - int32_t __s0_158 = __p0_158; \ - int32x4_t __s1_158 = __p1_158; \ - int64_t __ret_158; \ - __ret_158 = vqdmulls_s32(__s0_158, vgetq_lane_s32(__s1_158, __p2_158)); \ - __ret_158; \ +#define vqdmulls_laneq_s32(__p0_170, __p1_170, __p2_170) __extension__ ({ \ + int32_t __s0_170 = __p0_170; \ + int32x4_t __s1_170 = __p1_170; \ + 
int64_t __ret_170; \ + __ret_170 = vqdmulls_s32(__s0_170, vgetq_lane_s32(__s1_170, __p2_170)); \ + __ret_170; \ }) #else -#define vqdmulls_laneq_s32(__p0_159, __p1_159, __p2_159) __extension__ ({ \ - int32_t __s0_159 = __p0_159; \ - int32x4_t __s1_159 = __p1_159; \ - int32x4_t __rev1_159; __rev1_159 = __builtin_shufflevector(__s1_159, __s1_159, 3, 2, 1, 0); \ - int64_t __ret_159; \ - __ret_159 = __noswap_vqdmulls_s32(__s0_159, __noswap_vgetq_lane_s32(__rev1_159, __p2_159)); \ - __ret_159; \ +#define vqdmulls_laneq_s32(__p0_171, __p1_171, __p2_171) __extension__ ({ \ + int32_t __s0_171 = __p0_171; \ + int32x4_t __s1_171 = __p1_171; \ + int32x4_t __rev1_171; __rev1_171 = __builtin_shufflevector(__s1_171, __s1_171, 3, 2, 1, 0); \ + int64_t __ret_171; \ + __ret_171 = __noswap_vqdmulls_s32(__s0_171, __noswap_vgetq_lane_s32(__rev1_171, __p2_171)); \ + __ret_171; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqdmullh_laneq_s16(__p0_160, __p1_160, __p2_160) __extension__ ({ \ - int16_t __s0_160 = __p0_160; \ - int16x8_t __s1_160 = __p1_160; \ - int32_t __ret_160; \ - __ret_160 = vqdmullh_s16(__s0_160, vgetq_lane_s16(__s1_160, __p2_160)); \ - __ret_160; \ +#define vqdmullh_laneq_s16(__p0_172, __p1_172, __p2_172) __extension__ ({ \ + int16_t __s0_172 = __p0_172; \ + int16x8_t __s1_172 = __p1_172; \ + int32_t __ret_172; \ + __ret_172 = vqdmullh_s16(__s0_172, vgetq_lane_s16(__s1_172, __p2_172)); \ + __ret_172; \ }) #else -#define vqdmullh_laneq_s16(__p0_161, __p1_161, __p2_161) __extension__ ({ \ - int16_t __s0_161 = __p0_161; \ - int16x8_t __s1_161 = __p1_161; \ - int16x8_t __rev1_161; __rev1_161 = __builtin_shufflevector(__s1_161, __s1_161, 7, 6, 5, 4, 3, 2, 1, 0); \ - int32_t __ret_161; \ - __ret_161 = __noswap_vqdmullh_s16(__s0_161, __noswap_vgetq_lane_s16(__rev1_161, __p2_161)); \ - __ret_161; \ +#define vqdmullh_laneq_s16(__p0_173, __p1_173, __p2_173) __extension__ ({ \ + int16_t __s0_173 = __p0_173; \ + int16x8_t __s1_173 = __p1_173; \ + int16x8_t __rev1_173; __rev1_173 = __builtin_shufflevector(__s1_173, __s1_173, 7, 6, 5, 4, 3, 2, 1, 0); \ + int32_t __ret_173; \ + __ret_173 = __noswap_vqdmullh_s16(__s0_173, __noswap_vgetq_lane_s16(__rev1_173, __p2_173)); \ + __ret_173; \ }) #endif @@ -57544,78 +60856,78 @@ __ai int16_t __noswap_vqrdmulhh_s16(int16_t __p0, int16_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmulhs_lane_s32(__p0_162, __p1_162, __p2_162) __extension__ ({ \ - int32_t __s0_162 = __p0_162; \ - int32x2_t __s1_162 = __p1_162; \ - int32_t __ret_162; \ - __ret_162 = vqrdmulhs_s32(__s0_162, vget_lane_s32(__s1_162, __p2_162)); \ - __ret_162; \ +#define vqrdmulhs_lane_s32(__p0_174, __p1_174, __p2_174) __extension__ ({ \ + int32_t __s0_174 = __p0_174; \ + int32x2_t __s1_174 = __p1_174; \ + int32_t __ret_174; \ + __ret_174 = vqrdmulhs_s32(__s0_174, vget_lane_s32(__s1_174, __p2_174)); \ + __ret_174; \ }) #else -#define vqrdmulhs_lane_s32(__p0_163, __p1_163, __p2_163) __extension__ ({ \ - int32_t __s0_163 = __p0_163; \ - int32x2_t __s1_163 = __p1_163; \ - int32x2_t __rev1_163; __rev1_163 = __builtin_shufflevector(__s1_163, __s1_163, 1, 0); \ - int32_t __ret_163; \ - __ret_163 = __noswap_vqrdmulhs_s32(__s0_163, __noswap_vget_lane_s32(__rev1_163, __p2_163)); \ - __ret_163; \ +#define vqrdmulhs_lane_s32(__p0_175, __p1_175, __p2_175) __extension__ ({ \ + int32_t __s0_175 = __p0_175; \ + int32x2_t __s1_175 = __p1_175; \ + int32x2_t __rev1_175; __rev1_175 = __builtin_shufflevector(__s1_175, __s1_175, 1, 0); \ + int32_t __ret_175; \ + __ret_175 = __noswap_vqrdmulhs_s32(__s0_175, 
__noswap_vget_lane_s32(__rev1_175, __p2_175)); \ + __ret_175; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmulhh_lane_s16(__p0_164, __p1_164, __p2_164) __extension__ ({ \ - int16_t __s0_164 = __p0_164; \ - int16x4_t __s1_164 = __p1_164; \ - int16_t __ret_164; \ - __ret_164 = vqrdmulhh_s16(__s0_164, vget_lane_s16(__s1_164, __p2_164)); \ - __ret_164; \ +#define vqrdmulhh_lane_s16(__p0_176, __p1_176, __p2_176) __extension__ ({ \ + int16_t __s0_176 = __p0_176; \ + int16x4_t __s1_176 = __p1_176; \ + int16_t __ret_176; \ + __ret_176 = vqrdmulhh_s16(__s0_176, vget_lane_s16(__s1_176, __p2_176)); \ + __ret_176; \ }) #else -#define vqrdmulhh_lane_s16(__p0_165, __p1_165, __p2_165) __extension__ ({ \ - int16_t __s0_165 = __p0_165; \ - int16x4_t __s1_165 = __p1_165; \ - int16x4_t __rev1_165; __rev1_165 = __builtin_shufflevector(__s1_165, __s1_165, 3, 2, 1, 0); \ - int16_t __ret_165; \ - __ret_165 = __noswap_vqrdmulhh_s16(__s0_165, __noswap_vget_lane_s16(__rev1_165, __p2_165)); \ - __ret_165; \ +#define vqrdmulhh_lane_s16(__p0_177, __p1_177, __p2_177) __extension__ ({ \ + int16_t __s0_177 = __p0_177; \ + int16x4_t __s1_177 = __p1_177; \ + int16x4_t __rev1_177; __rev1_177 = __builtin_shufflevector(__s1_177, __s1_177, 3, 2, 1, 0); \ + int16_t __ret_177; \ + __ret_177 = __noswap_vqrdmulhh_s16(__s0_177, __noswap_vget_lane_s16(__rev1_177, __p2_177)); \ + __ret_177; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmulhs_laneq_s32(__p0_166, __p1_166, __p2_166) __extension__ ({ \ - int32_t __s0_166 = __p0_166; \ - int32x4_t __s1_166 = __p1_166; \ - int32_t __ret_166; \ - __ret_166 = vqrdmulhs_s32(__s0_166, vgetq_lane_s32(__s1_166, __p2_166)); \ - __ret_166; \ +#define vqrdmulhs_laneq_s32(__p0_178, __p1_178, __p2_178) __extension__ ({ \ + int32_t __s0_178 = __p0_178; \ + int32x4_t __s1_178 = __p1_178; \ + int32_t __ret_178; \ + __ret_178 = vqrdmulhs_s32(__s0_178, vgetq_lane_s32(__s1_178, __p2_178)); \ + __ret_178; \ }) #else -#define vqrdmulhs_laneq_s32(__p0_167, __p1_167, __p2_167) __extension__ ({ \ - int32_t __s0_167 = __p0_167; \ - int32x4_t __s1_167 = __p1_167; \ - int32x4_t __rev1_167; __rev1_167 = __builtin_shufflevector(__s1_167, __s1_167, 3, 2, 1, 0); \ - int32_t __ret_167; \ - __ret_167 = __noswap_vqrdmulhs_s32(__s0_167, __noswap_vgetq_lane_s32(__rev1_167, __p2_167)); \ - __ret_167; \ +#define vqrdmulhs_laneq_s32(__p0_179, __p1_179, __p2_179) __extension__ ({ \ + int32_t __s0_179 = __p0_179; \ + int32x4_t __s1_179 = __p1_179; \ + int32x4_t __rev1_179; __rev1_179 = __builtin_shufflevector(__s1_179, __s1_179, 3, 2, 1, 0); \ + int32_t __ret_179; \ + __ret_179 = __noswap_vqrdmulhs_s32(__s0_179, __noswap_vgetq_lane_s32(__rev1_179, __p2_179)); \ + __ret_179; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmulhh_laneq_s16(__p0_168, __p1_168, __p2_168) __extension__ ({ \ - int16_t __s0_168 = __p0_168; \ - int16x8_t __s1_168 = __p1_168; \ - int16_t __ret_168; \ - __ret_168 = vqrdmulhh_s16(__s0_168, vgetq_lane_s16(__s1_168, __p2_168)); \ - __ret_168; \ +#define vqrdmulhh_laneq_s16(__p0_180, __p1_180, __p2_180) __extension__ ({ \ + int16_t __s0_180 = __p0_180; \ + int16x8_t __s1_180 = __p1_180; \ + int16_t __ret_180; \ + __ret_180 = vqrdmulhh_s16(__s0_180, vgetq_lane_s16(__s1_180, __p2_180)); \ + __ret_180; \ }) #else -#define vqrdmulhh_laneq_s16(__p0_169, __p1_169, __p2_169) __extension__ ({ \ - int16_t __s0_169 = __p0_169; \ - int16x8_t __s1_169 = __p1_169; \ - int16x8_t __rev1_169; __rev1_169 = __builtin_shufflevector(__s1_169, __s1_169, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16_t __ret_169; \ - __ret_169 = 
__noswap_vqrdmulhh_s16(__s0_169, __noswap_vgetq_lane_s16(__rev1_169, __p2_169)); \ - __ret_169; \ +#define vqrdmulhh_laneq_s16(__p0_181, __p1_181, __p2_181) __extension__ ({ \ + int16_t __s0_181 = __p0_181; \ + int16x8_t __s1_181 = __p1_181; \ + int16x8_t __rev1_181; __rev1_181 = __builtin_shufflevector(__s1_181, __s1_181, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16_t __ret_181; \ + __ret_181 = __noswap_vqrdmulhh_s16(__s0_181, __noswap_vgetq_lane_s16(__rev1_181, __p2_181)); \ + __ret_181; \ }) #endif @@ -57816,128 +61128,128 @@ __ai int16_t vqrshlh_s16(int16_t __p0, int16_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vqrshrn_high_n_u32(__p0_170, __p1_170, __p2_170) __extension__ ({ \ - uint16x4_t __s0_170 = __p0_170; \ - uint32x4_t __s1_170 = __p1_170; \ - uint16x8_t __ret_170; \ - __ret_170 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_170), (uint16x4_t)(vqrshrn_n_u32(__s1_170, __p2_170)))); \ - __ret_170; \ +#define vqrshrn_high_n_u32(__p0_182, __p1_182, __p2_182) __extension__ ({ \ + uint16x4_t __s0_182 = __p0_182; \ + uint32x4_t __s1_182 = __p1_182; \ + uint16x8_t __ret_182; \ + __ret_182 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_182), (uint16x4_t)(vqrshrn_n_u32(__s1_182, __p2_182)))); \ + __ret_182; \ }) #else -#define vqrshrn_high_n_u32(__p0_171, __p1_171, __p2_171) __extension__ ({ \ - uint16x4_t __s0_171 = __p0_171; \ - uint32x4_t __s1_171 = __p1_171; \ - uint16x4_t __rev0_171; __rev0_171 = __builtin_shufflevector(__s0_171, __s0_171, 3, 2, 1, 0); \ - uint32x4_t __rev1_171; __rev1_171 = __builtin_shufflevector(__s1_171, __s1_171, 3, 2, 1, 0); \ - uint16x8_t __ret_171; \ - __ret_171 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_171), (uint16x4_t)(__noswap_vqrshrn_n_u32(__rev1_171, __p2_171)))); \ - __ret_171 = __builtin_shufflevector(__ret_171, __ret_171, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_171; \ +#define vqrshrn_high_n_u32(__p0_183, __p1_183, __p2_183) __extension__ ({ \ + uint16x4_t __s0_183 = __p0_183; \ + uint32x4_t __s1_183 = __p1_183; \ + uint16x4_t __rev0_183; __rev0_183 = __builtin_shufflevector(__s0_183, __s0_183, 3, 2, 1, 0); \ + uint32x4_t __rev1_183; __rev1_183 = __builtin_shufflevector(__s1_183, __s1_183, 3, 2, 1, 0); \ + uint16x8_t __ret_183; \ + __ret_183 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_183), (uint16x4_t)(__noswap_vqrshrn_n_u32(__rev1_183, __p2_183)))); \ + __ret_183 = __builtin_shufflevector(__ret_183, __ret_183, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_183; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrshrn_high_n_u64(__p0_172, __p1_172, __p2_172) __extension__ ({ \ - uint32x2_t __s0_172 = __p0_172; \ - uint64x2_t __s1_172 = __p1_172; \ - uint32x4_t __ret_172; \ - __ret_172 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_172), (uint32x2_t)(vqrshrn_n_u64(__s1_172, __p2_172)))); \ - __ret_172; \ +#define vqrshrn_high_n_u64(__p0_184, __p1_184, __p2_184) __extension__ ({ \ + uint32x2_t __s0_184 = __p0_184; \ + uint64x2_t __s1_184 = __p1_184; \ + uint32x4_t __ret_184; \ + __ret_184 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_184), (uint32x2_t)(vqrshrn_n_u64(__s1_184, __p2_184)))); \ + __ret_184; \ }) #else -#define vqrshrn_high_n_u64(__p0_173, __p1_173, __p2_173) __extension__ ({ \ - uint32x2_t __s0_173 = __p0_173; \ - uint64x2_t __s1_173 = __p1_173; \ - uint32x2_t __rev0_173; __rev0_173 = __builtin_shufflevector(__s0_173, __s0_173, 1, 0); \ - uint64x2_t __rev1_173; __rev1_173 = __builtin_shufflevector(__s1_173, __s1_173, 1, 0); \ - uint32x4_t __ret_173; \ - __ret_173 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_173), 
(uint32x2_t)(__noswap_vqrshrn_n_u64(__rev1_173, __p2_173)))); \ - __ret_173 = __builtin_shufflevector(__ret_173, __ret_173, 3, 2, 1, 0); \ - __ret_173; \ +#define vqrshrn_high_n_u64(__p0_185, __p1_185, __p2_185) __extension__ ({ \ + uint32x2_t __s0_185 = __p0_185; \ + uint64x2_t __s1_185 = __p1_185; \ + uint32x2_t __rev0_185; __rev0_185 = __builtin_shufflevector(__s0_185, __s0_185, 1, 0); \ + uint64x2_t __rev1_185; __rev1_185 = __builtin_shufflevector(__s1_185, __s1_185, 1, 0); \ + uint32x4_t __ret_185; \ + __ret_185 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_185), (uint32x2_t)(__noswap_vqrshrn_n_u64(__rev1_185, __p2_185)))); \ + __ret_185 = __builtin_shufflevector(__ret_185, __ret_185, 3, 2, 1, 0); \ + __ret_185; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrshrn_high_n_u16(__p0_174, __p1_174, __p2_174) __extension__ ({ \ - uint8x8_t __s0_174 = __p0_174; \ - uint16x8_t __s1_174 = __p1_174; \ - uint8x16_t __ret_174; \ - __ret_174 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_174), (uint8x8_t)(vqrshrn_n_u16(__s1_174, __p2_174)))); \ - __ret_174; \ +#define vqrshrn_high_n_u16(__p0_186, __p1_186, __p2_186) __extension__ ({ \ + uint8x8_t __s0_186 = __p0_186; \ + uint16x8_t __s1_186 = __p1_186; \ + uint8x16_t __ret_186; \ + __ret_186 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_186), (uint8x8_t)(vqrshrn_n_u16(__s1_186, __p2_186)))); \ + __ret_186; \ }) #else -#define vqrshrn_high_n_u16(__p0_175, __p1_175, __p2_175) __extension__ ({ \ - uint8x8_t __s0_175 = __p0_175; \ - uint16x8_t __s1_175 = __p1_175; \ - uint8x8_t __rev0_175; __rev0_175 = __builtin_shufflevector(__s0_175, __s0_175, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x8_t __rev1_175; __rev1_175 = __builtin_shufflevector(__s1_175, __s1_175, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x16_t __ret_175; \ - __ret_175 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_175), (uint8x8_t)(__noswap_vqrshrn_n_u16(__rev1_175, __p2_175)))); \ - __ret_175 = __builtin_shufflevector(__ret_175, __ret_175, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_175; \ +#define vqrshrn_high_n_u16(__p0_187, __p1_187, __p2_187) __extension__ ({ \ + uint8x8_t __s0_187 = __p0_187; \ + uint16x8_t __s1_187 = __p1_187; \ + uint8x8_t __rev0_187; __rev0_187 = __builtin_shufflevector(__s0_187, __s0_187, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x8_t __rev1_187; __rev1_187 = __builtin_shufflevector(__s1_187, __s1_187, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __ret_187; \ + __ret_187 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_187), (uint8x8_t)(__noswap_vqrshrn_n_u16(__rev1_187, __p2_187)))); \ + __ret_187 = __builtin_shufflevector(__ret_187, __ret_187, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_187; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrshrn_high_n_s32(__p0_176, __p1_176, __p2_176) __extension__ ({ \ - int16x4_t __s0_176 = __p0_176; \ - int32x4_t __s1_176 = __p1_176; \ - int16x8_t __ret_176; \ - __ret_176 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_176), (int16x4_t)(vqrshrn_n_s32(__s1_176, __p2_176)))); \ - __ret_176; \ +#define vqrshrn_high_n_s32(__p0_188, __p1_188, __p2_188) __extension__ ({ \ + int16x4_t __s0_188 = __p0_188; \ + int32x4_t __s1_188 = __p1_188; \ + int16x8_t __ret_188; \ + __ret_188 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_188), (int16x4_t)(vqrshrn_n_s32(__s1_188, __p2_188)))); \ + __ret_188; \ }) #else -#define vqrshrn_high_n_s32(__p0_177, __p1_177, __p2_177) __extension__ ({ \ - int16x4_t __s0_177 = __p0_177; \ - int32x4_t __s1_177 = __p1_177; \ - int16x4_t __rev0_177; __rev0_177 = 
__builtin_shufflevector(__s0_177, __s0_177, 3, 2, 1, 0); \ - int32x4_t __rev1_177; __rev1_177 = __builtin_shufflevector(__s1_177, __s1_177, 3, 2, 1, 0); \ - int16x8_t __ret_177; \ - __ret_177 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_177), (int16x4_t)(__noswap_vqrshrn_n_s32(__rev1_177, __p2_177)))); \ - __ret_177 = __builtin_shufflevector(__ret_177, __ret_177, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_177; \ +#define vqrshrn_high_n_s32(__p0_189, __p1_189, __p2_189) __extension__ ({ \ + int16x4_t __s0_189 = __p0_189; \ + int32x4_t __s1_189 = __p1_189; \ + int16x4_t __rev0_189; __rev0_189 = __builtin_shufflevector(__s0_189, __s0_189, 3, 2, 1, 0); \ + int32x4_t __rev1_189; __rev1_189 = __builtin_shufflevector(__s1_189, __s1_189, 3, 2, 1, 0); \ + int16x8_t __ret_189; \ + __ret_189 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_189), (int16x4_t)(__noswap_vqrshrn_n_s32(__rev1_189, __p2_189)))); \ + __ret_189 = __builtin_shufflevector(__ret_189, __ret_189, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_189; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrshrn_high_n_s64(__p0_178, __p1_178, __p2_178) __extension__ ({ \ - int32x2_t __s0_178 = __p0_178; \ - int64x2_t __s1_178 = __p1_178; \ - int32x4_t __ret_178; \ - __ret_178 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_178), (int32x2_t)(vqrshrn_n_s64(__s1_178, __p2_178)))); \ - __ret_178; \ +#define vqrshrn_high_n_s64(__p0_190, __p1_190, __p2_190) __extension__ ({ \ + int32x2_t __s0_190 = __p0_190; \ + int64x2_t __s1_190 = __p1_190; \ + int32x4_t __ret_190; \ + __ret_190 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_190), (int32x2_t)(vqrshrn_n_s64(__s1_190, __p2_190)))); \ + __ret_190; \ }) #else -#define vqrshrn_high_n_s64(__p0_179, __p1_179, __p2_179) __extension__ ({ \ - int32x2_t __s0_179 = __p0_179; \ - int64x2_t __s1_179 = __p1_179; \ - int32x2_t __rev0_179; __rev0_179 = __builtin_shufflevector(__s0_179, __s0_179, 1, 0); \ - int64x2_t __rev1_179; __rev1_179 = __builtin_shufflevector(__s1_179, __s1_179, 1, 0); \ - int32x4_t __ret_179; \ - __ret_179 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_179), (int32x2_t)(__noswap_vqrshrn_n_s64(__rev1_179, __p2_179)))); \ - __ret_179 = __builtin_shufflevector(__ret_179, __ret_179, 3, 2, 1, 0); \ - __ret_179; \ +#define vqrshrn_high_n_s64(__p0_191, __p1_191, __p2_191) __extension__ ({ \ + int32x2_t __s0_191 = __p0_191; \ + int64x2_t __s1_191 = __p1_191; \ + int32x2_t __rev0_191; __rev0_191 = __builtin_shufflevector(__s0_191, __s0_191, 1, 0); \ + int64x2_t __rev1_191; __rev1_191 = __builtin_shufflevector(__s1_191, __s1_191, 1, 0); \ + int32x4_t __ret_191; \ + __ret_191 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_191), (int32x2_t)(__noswap_vqrshrn_n_s64(__rev1_191, __p2_191)))); \ + __ret_191 = __builtin_shufflevector(__ret_191, __ret_191, 3, 2, 1, 0); \ + __ret_191; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrshrn_high_n_s16(__p0_180, __p1_180, __p2_180) __extension__ ({ \ - int8x8_t __s0_180 = __p0_180; \ - int16x8_t __s1_180 = __p1_180; \ - int8x16_t __ret_180; \ - __ret_180 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_180), (int8x8_t)(vqrshrn_n_s16(__s1_180, __p2_180)))); \ - __ret_180; \ +#define vqrshrn_high_n_s16(__p0_192, __p1_192, __p2_192) __extension__ ({ \ + int8x8_t __s0_192 = __p0_192; \ + int16x8_t __s1_192 = __p1_192; \ + int8x16_t __ret_192; \ + __ret_192 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_192), (int8x8_t)(vqrshrn_n_s16(__s1_192, __p2_192)))); \ + __ret_192; \ }) #else -#define vqrshrn_high_n_s16(__p0_181, __p1_181, __p2_181) __extension__ ({ \ - int8x8_t 
__s0_181 = __p0_181; \ - int16x8_t __s1_181 = __p1_181; \ - int8x8_t __rev0_181; __rev0_181 = __builtin_shufflevector(__s0_181, __s0_181, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __rev1_181; __rev1_181 = __builtin_shufflevector(__s1_181, __s1_181, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x16_t __ret_181; \ - __ret_181 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_181), (int8x8_t)(__noswap_vqrshrn_n_s16(__rev1_181, __p2_181)))); \ - __ret_181 = __builtin_shufflevector(__ret_181, __ret_181, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_181; \ +#define vqrshrn_high_n_s16(__p0_193, __p1_193, __p2_193) __extension__ ({ \ + int8x8_t __s0_193 = __p0_193; \ + int16x8_t __s1_193 = __p1_193; \ + int8x8_t __rev0_193; __rev0_193 = __builtin_shufflevector(__s0_193, __s0_193, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __rev1_193; __rev1_193 = __builtin_shufflevector(__s1_193, __s1_193, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x16_t __ret_193; \ + __ret_193 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_193), (int8x8_t)(__noswap_vqrshrn_n_s16(__rev1_193, __p2_193)))); \ + __ret_193 = __builtin_shufflevector(__ret_193, __ret_193, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_193; \ }) #endif @@ -58038,65 +61350,65 @@ __ai int16_t vqrshlh_s16(int16_t __p0, int16_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vqrshrun_high_n_s32(__p0_182, __p1_182, __p2_182) __extension__ ({ \ - int16x4_t __s0_182 = __p0_182; \ - int32x4_t __s1_182 = __p1_182; \ - int16x8_t __ret_182; \ - __ret_182 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_182), (int16x4_t)(vqrshrun_n_s32(__s1_182, __p2_182)))); \ - __ret_182; \ +#define vqrshrun_high_n_s32(__p0_194, __p1_194, __p2_194) __extension__ ({ \ + int16x4_t __s0_194 = __p0_194; \ + int32x4_t __s1_194 = __p1_194; \ + int16x8_t __ret_194; \ + __ret_194 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_194), (int16x4_t)(vqrshrun_n_s32(__s1_194, __p2_194)))); \ + __ret_194; \ }) #else -#define vqrshrun_high_n_s32(__p0_183, __p1_183, __p2_183) __extension__ ({ \ - int16x4_t __s0_183 = __p0_183; \ - int32x4_t __s1_183 = __p1_183; \ - int16x4_t __rev0_183; __rev0_183 = __builtin_shufflevector(__s0_183, __s0_183, 3, 2, 1, 0); \ - int32x4_t __rev1_183; __rev1_183 = __builtin_shufflevector(__s1_183, __s1_183, 3, 2, 1, 0); \ - int16x8_t __ret_183; \ - __ret_183 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_183), (int16x4_t)(__noswap_vqrshrun_n_s32(__rev1_183, __p2_183)))); \ - __ret_183 = __builtin_shufflevector(__ret_183, __ret_183, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_183; \ +#define vqrshrun_high_n_s32(__p0_195, __p1_195, __p2_195) __extension__ ({ \ + int16x4_t __s0_195 = __p0_195; \ + int32x4_t __s1_195 = __p1_195; \ + int16x4_t __rev0_195; __rev0_195 = __builtin_shufflevector(__s0_195, __s0_195, 3, 2, 1, 0); \ + int32x4_t __rev1_195; __rev1_195 = __builtin_shufflevector(__s1_195, __s1_195, 3, 2, 1, 0); \ + int16x8_t __ret_195; \ + __ret_195 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_195), (int16x4_t)(__noswap_vqrshrun_n_s32(__rev1_195, __p2_195)))); \ + __ret_195 = __builtin_shufflevector(__ret_195, __ret_195, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_195; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrshrun_high_n_s64(__p0_184, __p1_184, __p2_184) __extension__ ({ \ - int32x2_t __s0_184 = __p0_184; \ - int64x2_t __s1_184 = __p1_184; \ - int32x4_t __ret_184; \ - __ret_184 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_184), (int32x2_t)(vqrshrun_n_s64(__s1_184, __p2_184)))); \ - __ret_184; \ +#define vqrshrun_high_n_s64(__p0_196, __p1_196, 
__p2_196) __extension__ ({ \ + int32x2_t __s0_196 = __p0_196; \ + int64x2_t __s1_196 = __p1_196; \ + int32x4_t __ret_196; \ + __ret_196 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_196), (int32x2_t)(vqrshrun_n_s64(__s1_196, __p2_196)))); \ + __ret_196; \ }) #else -#define vqrshrun_high_n_s64(__p0_185, __p1_185, __p2_185) __extension__ ({ \ - int32x2_t __s0_185 = __p0_185; \ - int64x2_t __s1_185 = __p1_185; \ - int32x2_t __rev0_185; __rev0_185 = __builtin_shufflevector(__s0_185, __s0_185, 1, 0); \ - int64x2_t __rev1_185; __rev1_185 = __builtin_shufflevector(__s1_185, __s1_185, 1, 0); \ - int32x4_t __ret_185; \ - __ret_185 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_185), (int32x2_t)(__noswap_vqrshrun_n_s64(__rev1_185, __p2_185)))); \ - __ret_185 = __builtin_shufflevector(__ret_185, __ret_185, 3, 2, 1, 0); \ - __ret_185; \ +#define vqrshrun_high_n_s64(__p0_197, __p1_197, __p2_197) __extension__ ({ \ + int32x2_t __s0_197 = __p0_197; \ + int64x2_t __s1_197 = __p1_197; \ + int32x2_t __rev0_197; __rev0_197 = __builtin_shufflevector(__s0_197, __s0_197, 1, 0); \ + int64x2_t __rev1_197; __rev1_197 = __builtin_shufflevector(__s1_197, __s1_197, 1, 0); \ + int32x4_t __ret_197; \ + __ret_197 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_197), (int32x2_t)(__noswap_vqrshrun_n_s64(__rev1_197, __p2_197)))); \ + __ret_197 = __builtin_shufflevector(__ret_197, __ret_197, 3, 2, 1, 0); \ + __ret_197; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrshrun_high_n_s16(__p0_186, __p1_186, __p2_186) __extension__ ({ \ - int8x8_t __s0_186 = __p0_186; \ - int16x8_t __s1_186 = __p1_186; \ - int8x16_t __ret_186; \ - __ret_186 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_186), (int8x8_t)(vqrshrun_n_s16(__s1_186, __p2_186)))); \ - __ret_186; \ +#define vqrshrun_high_n_s16(__p0_198, __p1_198, __p2_198) __extension__ ({ \ + int8x8_t __s0_198 = __p0_198; \ + int16x8_t __s1_198 = __p1_198; \ + int8x16_t __ret_198; \ + __ret_198 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_198), (int8x8_t)(vqrshrun_n_s16(__s1_198, __p2_198)))); \ + __ret_198; \ }) #else -#define vqrshrun_high_n_s16(__p0_187, __p1_187, __p2_187) __extension__ ({ \ - int8x8_t __s0_187 = __p0_187; \ - int16x8_t __s1_187 = __p1_187; \ - int8x8_t __rev0_187; __rev0_187 = __builtin_shufflevector(__s0_187, __s0_187, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __rev1_187; __rev1_187 = __builtin_shufflevector(__s1_187, __s1_187, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x16_t __ret_187; \ - __ret_187 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_187), (int8x8_t)(__noswap_vqrshrun_n_s16(__rev1_187, __p2_187)))); \ - __ret_187 = __builtin_shufflevector(__ret_187, __ret_187, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_187; \ +#define vqrshrun_high_n_s16(__p0_199, __p1_199, __p2_199) __extension__ ({ \ + int8x8_t __s0_199 = __p0_199; \ + int16x8_t __s1_199 = __p1_199; \ + int8x8_t __rev0_199; __rev0_199 = __builtin_shufflevector(__s0_199, __s0_199, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __rev1_199; __rev1_199 = __builtin_shufflevector(__s1_199, __s1_199, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x16_t __ret_199; \ + __ret_199 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_199), (int8x8_t)(__noswap_vqrshrun_n_s16(__rev1_199, __p2_199)))); \ + __ret_199 = __builtin_shufflevector(__ret_199, __ret_199, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_199; \ }) #endif @@ -58453,128 +61765,128 @@ __ai int16_t vqshlh_s16(int16_t __p0, int16_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vqshrn_high_n_u32(__p0_188, __p1_188, __p2_188) 
__extension__ ({ \ - uint16x4_t __s0_188 = __p0_188; \ - uint32x4_t __s1_188 = __p1_188; \ - uint16x8_t __ret_188; \ - __ret_188 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_188), (uint16x4_t)(vqshrn_n_u32(__s1_188, __p2_188)))); \ - __ret_188; \ +#define vqshrn_high_n_u32(__p0_200, __p1_200, __p2_200) __extension__ ({ \ + uint16x4_t __s0_200 = __p0_200; \ + uint32x4_t __s1_200 = __p1_200; \ + uint16x8_t __ret_200; \ + __ret_200 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_200), (uint16x4_t)(vqshrn_n_u32(__s1_200, __p2_200)))); \ + __ret_200; \ }) #else -#define vqshrn_high_n_u32(__p0_189, __p1_189, __p2_189) __extension__ ({ \ - uint16x4_t __s0_189 = __p0_189; \ - uint32x4_t __s1_189 = __p1_189; \ - uint16x4_t __rev0_189; __rev0_189 = __builtin_shufflevector(__s0_189, __s0_189, 3, 2, 1, 0); \ - uint32x4_t __rev1_189; __rev1_189 = __builtin_shufflevector(__s1_189, __s1_189, 3, 2, 1, 0); \ - uint16x8_t __ret_189; \ - __ret_189 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_189), (uint16x4_t)(__noswap_vqshrn_n_u32(__rev1_189, __p2_189)))); \ - __ret_189 = __builtin_shufflevector(__ret_189, __ret_189, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_189; \ +#define vqshrn_high_n_u32(__p0_201, __p1_201, __p2_201) __extension__ ({ \ + uint16x4_t __s0_201 = __p0_201; \ + uint32x4_t __s1_201 = __p1_201; \ + uint16x4_t __rev0_201; __rev0_201 = __builtin_shufflevector(__s0_201, __s0_201, 3, 2, 1, 0); \ + uint32x4_t __rev1_201; __rev1_201 = __builtin_shufflevector(__s1_201, __s1_201, 3, 2, 1, 0); \ + uint16x8_t __ret_201; \ + __ret_201 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_201), (uint16x4_t)(__noswap_vqshrn_n_u32(__rev1_201, __p2_201)))); \ + __ret_201 = __builtin_shufflevector(__ret_201, __ret_201, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_201; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqshrn_high_n_u64(__p0_190, __p1_190, __p2_190) __extension__ ({ \ - uint32x2_t __s0_190 = __p0_190; \ - uint64x2_t __s1_190 = __p1_190; \ - uint32x4_t __ret_190; \ - __ret_190 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_190), (uint32x2_t)(vqshrn_n_u64(__s1_190, __p2_190)))); \ - __ret_190; \ +#define vqshrn_high_n_u64(__p0_202, __p1_202, __p2_202) __extension__ ({ \ + uint32x2_t __s0_202 = __p0_202; \ + uint64x2_t __s1_202 = __p1_202; \ + uint32x4_t __ret_202; \ + __ret_202 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_202), (uint32x2_t)(vqshrn_n_u64(__s1_202, __p2_202)))); \ + __ret_202; \ }) #else -#define vqshrn_high_n_u64(__p0_191, __p1_191, __p2_191) __extension__ ({ \ - uint32x2_t __s0_191 = __p0_191; \ - uint64x2_t __s1_191 = __p1_191; \ - uint32x2_t __rev0_191; __rev0_191 = __builtin_shufflevector(__s0_191, __s0_191, 1, 0); \ - uint64x2_t __rev1_191; __rev1_191 = __builtin_shufflevector(__s1_191, __s1_191, 1, 0); \ - uint32x4_t __ret_191; \ - __ret_191 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_191), (uint32x2_t)(__noswap_vqshrn_n_u64(__rev1_191, __p2_191)))); \ - __ret_191 = __builtin_shufflevector(__ret_191, __ret_191, 3, 2, 1, 0); \ - __ret_191; \ +#define vqshrn_high_n_u64(__p0_203, __p1_203, __p2_203) __extension__ ({ \ + uint32x2_t __s0_203 = __p0_203; \ + uint64x2_t __s1_203 = __p1_203; \ + uint32x2_t __rev0_203; __rev0_203 = __builtin_shufflevector(__s0_203, __s0_203, 1, 0); \ + uint64x2_t __rev1_203; __rev1_203 = __builtin_shufflevector(__s1_203, __s1_203, 1, 0); \ + uint32x4_t __ret_203; \ + __ret_203 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_203), (uint32x2_t)(__noswap_vqshrn_n_u64(__rev1_203, __p2_203)))); \ + __ret_203 = 
__builtin_shufflevector(__ret_203, __ret_203, 3, 2, 1, 0); \ + __ret_203; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqshrn_high_n_u16(__p0_192, __p1_192, __p2_192) __extension__ ({ \ - uint8x8_t __s0_192 = __p0_192; \ - uint16x8_t __s1_192 = __p1_192; \ - uint8x16_t __ret_192; \ - __ret_192 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_192), (uint8x8_t)(vqshrn_n_u16(__s1_192, __p2_192)))); \ - __ret_192; \ +#define vqshrn_high_n_u16(__p0_204, __p1_204, __p2_204) __extension__ ({ \ + uint8x8_t __s0_204 = __p0_204; \ + uint16x8_t __s1_204 = __p1_204; \ + uint8x16_t __ret_204; \ + __ret_204 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_204), (uint8x8_t)(vqshrn_n_u16(__s1_204, __p2_204)))); \ + __ret_204; \ }) #else -#define vqshrn_high_n_u16(__p0_193, __p1_193, __p2_193) __extension__ ({ \ - uint8x8_t __s0_193 = __p0_193; \ - uint16x8_t __s1_193 = __p1_193; \ - uint8x8_t __rev0_193; __rev0_193 = __builtin_shufflevector(__s0_193, __s0_193, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x8_t __rev1_193; __rev1_193 = __builtin_shufflevector(__s1_193, __s1_193, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x16_t __ret_193; \ - __ret_193 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_193), (uint8x8_t)(__noswap_vqshrn_n_u16(__rev1_193, __p2_193)))); \ - __ret_193 = __builtin_shufflevector(__ret_193, __ret_193, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_193; \ +#define vqshrn_high_n_u16(__p0_205, __p1_205, __p2_205) __extension__ ({ \ + uint8x8_t __s0_205 = __p0_205; \ + uint16x8_t __s1_205 = __p1_205; \ + uint8x8_t __rev0_205; __rev0_205 = __builtin_shufflevector(__s0_205, __s0_205, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x8_t __rev1_205; __rev1_205 = __builtin_shufflevector(__s1_205, __s1_205, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __ret_205; \ + __ret_205 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_205), (uint8x8_t)(__noswap_vqshrn_n_u16(__rev1_205, __p2_205)))); \ + __ret_205 = __builtin_shufflevector(__ret_205, __ret_205, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_205; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqshrn_high_n_s32(__p0_194, __p1_194, __p2_194) __extension__ ({ \ - int16x4_t __s0_194 = __p0_194; \ - int32x4_t __s1_194 = __p1_194; \ - int16x8_t __ret_194; \ - __ret_194 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_194), (int16x4_t)(vqshrn_n_s32(__s1_194, __p2_194)))); \ - __ret_194; \ +#define vqshrn_high_n_s32(__p0_206, __p1_206, __p2_206) __extension__ ({ \ + int16x4_t __s0_206 = __p0_206; \ + int32x4_t __s1_206 = __p1_206; \ + int16x8_t __ret_206; \ + __ret_206 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_206), (int16x4_t)(vqshrn_n_s32(__s1_206, __p2_206)))); \ + __ret_206; \ }) #else -#define vqshrn_high_n_s32(__p0_195, __p1_195, __p2_195) __extension__ ({ \ - int16x4_t __s0_195 = __p0_195; \ - int32x4_t __s1_195 = __p1_195; \ - int16x4_t __rev0_195; __rev0_195 = __builtin_shufflevector(__s0_195, __s0_195, 3, 2, 1, 0); \ - int32x4_t __rev1_195; __rev1_195 = __builtin_shufflevector(__s1_195, __s1_195, 3, 2, 1, 0); \ - int16x8_t __ret_195; \ - __ret_195 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_195), (int16x4_t)(__noswap_vqshrn_n_s32(__rev1_195, __p2_195)))); \ - __ret_195 = __builtin_shufflevector(__ret_195, __ret_195, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_195; \ +#define vqshrn_high_n_s32(__p0_207, __p1_207, __p2_207) __extension__ ({ \ + int16x4_t __s0_207 = __p0_207; \ + int32x4_t __s1_207 = __p1_207; \ + int16x4_t __rev0_207; __rev0_207 = __builtin_shufflevector(__s0_207, __s0_207, 3, 2, 1, 0); \ + int32x4_t __rev1_207; __rev1_207 
= __builtin_shufflevector(__s1_207, __s1_207, 3, 2, 1, 0); \ + int16x8_t __ret_207; \ + __ret_207 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_207), (int16x4_t)(__noswap_vqshrn_n_s32(__rev1_207, __p2_207)))); \ + __ret_207 = __builtin_shufflevector(__ret_207, __ret_207, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_207; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqshrn_high_n_s64(__p0_196, __p1_196, __p2_196) __extension__ ({ \ - int32x2_t __s0_196 = __p0_196; \ - int64x2_t __s1_196 = __p1_196; \ - int32x4_t __ret_196; \ - __ret_196 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_196), (int32x2_t)(vqshrn_n_s64(__s1_196, __p2_196)))); \ - __ret_196; \ +#define vqshrn_high_n_s64(__p0_208, __p1_208, __p2_208) __extension__ ({ \ + int32x2_t __s0_208 = __p0_208; \ + int64x2_t __s1_208 = __p1_208; \ + int32x4_t __ret_208; \ + __ret_208 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_208), (int32x2_t)(vqshrn_n_s64(__s1_208, __p2_208)))); \ + __ret_208; \ }) #else -#define vqshrn_high_n_s64(__p0_197, __p1_197, __p2_197) __extension__ ({ \ - int32x2_t __s0_197 = __p0_197; \ - int64x2_t __s1_197 = __p1_197; \ - int32x2_t __rev0_197; __rev0_197 = __builtin_shufflevector(__s0_197, __s0_197, 1, 0); \ - int64x2_t __rev1_197; __rev1_197 = __builtin_shufflevector(__s1_197, __s1_197, 1, 0); \ - int32x4_t __ret_197; \ - __ret_197 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_197), (int32x2_t)(__noswap_vqshrn_n_s64(__rev1_197, __p2_197)))); \ - __ret_197 = __builtin_shufflevector(__ret_197, __ret_197, 3, 2, 1, 0); \ - __ret_197; \ +#define vqshrn_high_n_s64(__p0_209, __p1_209, __p2_209) __extension__ ({ \ + int32x2_t __s0_209 = __p0_209; \ + int64x2_t __s1_209 = __p1_209; \ + int32x2_t __rev0_209; __rev0_209 = __builtin_shufflevector(__s0_209, __s0_209, 1, 0); \ + int64x2_t __rev1_209; __rev1_209 = __builtin_shufflevector(__s1_209, __s1_209, 1, 0); \ + int32x4_t __ret_209; \ + __ret_209 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_209), (int32x2_t)(__noswap_vqshrn_n_s64(__rev1_209, __p2_209)))); \ + __ret_209 = __builtin_shufflevector(__ret_209, __ret_209, 3, 2, 1, 0); \ + __ret_209; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqshrn_high_n_s16(__p0_198, __p1_198, __p2_198) __extension__ ({ \ - int8x8_t __s0_198 = __p0_198; \ - int16x8_t __s1_198 = __p1_198; \ - int8x16_t __ret_198; \ - __ret_198 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_198), (int8x8_t)(vqshrn_n_s16(__s1_198, __p2_198)))); \ - __ret_198; \ +#define vqshrn_high_n_s16(__p0_210, __p1_210, __p2_210) __extension__ ({ \ + int8x8_t __s0_210 = __p0_210; \ + int16x8_t __s1_210 = __p1_210; \ + int8x16_t __ret_210; \ + __ret_210 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_210), (int8x8_t)(vqshrn_n_s16(__s1_210, __p2_210)))); \ + __ret_210; \ }) #else -#define vqshrn_high_n_s16(__p0_199, __p1_199, __p2_199) __extension__ ({ \ - int8x8_t __s0_199 = __p0_199; \ - int16x8_t __s1_199 = __p1_199; \ - int8x8_t __rev0_199; __rev0_199 = __builtin_shufflevector(__s0_199, __s0_199, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __rev1_199; __rev1_199 = __builtin_shufflevector(__s1_199, __s1_199, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x16_t __ret_199; \ - __ret_199 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_199), (int8x8_t)(__noswap_vqshrn_n_s16(__rev1_199, __p2_199)))); \ - __ret_199 = __builtin_shufflevector(__ret_199, __ret_199, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_199; \ +#define vqshrn_high_n_s16(__p0_211, __p1_211, __p2_211) __extension__ ({ \ + int8x8_t __s0_211 = __p0_211; \ + int16x8_t __s1_211 = __p1_211; \ + 
int8x8_t __rev0_211; __rev0_211 = __builtin_shufflevector(__s0_211, __s0_211, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __rev1_211; __rev1_211 = __builtin_shufflevector(__s1_211, __s1_211, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x16_t __ret_211; \ + __ret_211 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_211), (int8x8_t)(__noswap_vqshrn_n_s16(__rev1_211, __p2_211)))); \ + __ret_211 = __builtin_shufflevector(__ret_211, __ret_211, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_211; \ }) #endif @@ -58675,65 +61987,65 @@ __ai int16_t vqshlh_s16(int16_t __p0, int16_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vqshrun_high_n_s32(__p0_200, __p1_200, __p2_200) __extension__ ({ \ - int16x4_t __s0_200 = __p0_200; \ - int32x4_t __s1_200 = __p1_200; \ - int16x8_t __ret_200; \ - __ret_200 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_200), (int16x4_t)(vqshrun_n_s32(__s1_200, __p2_200)))); \ - __ret_200; \ +#define vqshrun_high_n_s32(__p0_212, __p1_212, __p2_212) __extension__ ({ \ + int16x4_t __s0_212 = __p0_212; \ + int32x4_t __s1_212 = __p1_212; \ + int16x8_t __ret_212; \ + __ret_212 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_212), (int16x4_t)(vqshrun_n_s32(__s1_212, __p2_212)))); \ + __ret_212; \ }) #else -#define vqshrun_high_n_s32(__p0_201, __p1_201, __p2_201) __extension__ ({ \ - int16x4_t __s0_201 = __p0_201; \ - int32x4_t __s1_201 = __p1_201; \ - int16x4_t __rev0_201; __rev0_201 = __builtin_shufflevector(__s0_201, __s0_201, 3, 2, 1, 0); \ - int32x4_t __rev1_201; __rev1_201 = __builtin_shufflevector(__s1_201, __s1_201, 3, 2, 1, 0); \ - int16x8_t __ret_201; \ - __ret_201 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_201), (int16x4_t)(__noswap_vqshrun_n_s32(__rev1_201, __p2_201)))); \ - __ret_201 = __builtin_shufflevector(__ret_201, __ret_201, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_201; \ +#define vqshrun_high_n_s32(__p0_213, __p1_213, __p2_213) __extension__ ({ \ + int16x4_t __s0_213 = __p0_213; \ + int32x4_t __s1_213 = __p1_213; \ + int16x4_t __rev0_213; __rev0_213 = __builtin_shufflevector(__s0_213, __s0_213, 3, 2, 1, 0); \ + int32x4_t __rev1_213; __rev1_213 = __builtin_shufflevector(__s1_213, __s1_213, 3, 2, 1, 0); \ + int16x8_t __ret_213; \ + __ret_213 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_213), (int16x4_t)(__noswap_vqshrun_n_s32(__rev1_213, __p2_213)))); \ + __ret_213 = __builtin_shufflevector(__ret_213, __ret_213, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_213; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqshrun_high_n_s64(__p0_202, __p1_202, __p2_202) __extension__ ({ \ - int32x2_t __s0_202 = __p0_202; \ - int64x2_t __s1_202 = __p1_202; \ - int32x4_t __ret_202; \ - __ret_202 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_202), (int32x2_t)(vqshrun_n_s64(__s1_202, __p2_202)))); \ - __ret_202; \ +#define vqshrun_high_n_s64(__p0_214, __p1_214, __p2_214) __extension__ ({ \ + int32x2_t __s0_214 = __p0_214; \ + int64x2_t __s1_214 = __p1_214; \ + int32x4_t __ret_214; \ + __ret_214 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_214), (int32x2_t)(vqshrun_n_s64(__s1_214, __p2_214)))); \ + __ret_214; \ }) #else -#define vqshrun_high_n_s64(__p0_203, __p1_203, __p2_203) __extension__ ({ \ - int32x2_t __s0_203 = __p0_203; \ - int64x2_t __s1_203 = __p1_203; \ - int32x2_t __rev0_203; __rev0_203 = __builtin_shufflevector(__s0_203, __s0_203, 1, 0); \ - int64x2_t __rev1_203; __rev1_203 = __builtin_shufflevector(__s1_203, __s1_203, 1, 0); \ - int32x4_t __ret_203; \ - __ret_203 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_203), 
(int32x2_t)(__noswap_vqshrun_n_s64(__rev1_203, __p2_203)))); \ - __ret_203 = __builtin_shufflevector(__ret_203, __ret_203, 3, 2, 1, 0); \ - __ret_203; \ +#define vqshrun_high_n_s64(__p0_215, __p1_215, __p2_215) __extension__ ({ \ + int32x2_t __s0_215 = __p0_215; \ + int64x2_t __s1_215 = __p1_215; \ + int32x2_t __rev0_215; __rev0_215 = __builtin_shufflevector(__s0_215, __s0_215, 1, 0); \ + int64x2_t __rev1_215; __rev1_215 = __builtin_shufflevector(__s1_215, __s1_215, 1, 0); \ + int32x4_t __ret_215; \ + __ret_215 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_215), (int32x2_t)(__noswap_vqshrun_n_s64(__rev1_215, __p2_215)))); \ + __ret_215 = __builtin_shufflevector(__ret_215, __ret_215, 3, 2, 1, 0); \ + __ret_215; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqshrun_high_n_s16(__p0_204, __p1_204, __p2_204) __extension__ ({ \ - int8x8_t __s0_204 = __p0_204; \ - int16x8_t __s1_204 = __p1_204; \ - int8x16_t __ret_204; \ - __ret_204 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_204), (int8x8_t)(vqshrun_n_s16(__s1_204, __p2_204)))); \ - __ret_204; \ +#define vqshrun_high_n_s16(__p0_216, __p1_216, __p2_216) __extension__ ({ \ + int8x8_t __s0_216 = __p0_216; \ + int16x8_t __s1_216 = __p1_216; \ + int8x16_t __ret_216; \ + __ret_216 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_216), (int8x8_t)(vqshrun_n_s16(__s1_216, __p2_216)))); \ + __ret_216; \ }) #else -#define vqshrun_high_n_s16(__p0_205, __p1_205, __p2_205) __extension__ ({ \ - int8x8_t __s0_205 = __p0_205; \ - int16x8_t __s1_205 = __p1_205; \ - int8x8_t __rev0_205; __rev0_205 = __builtin_shufflevector(__s0_205, __s0_205, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __rev1_205; __rev1_205 = __builtin_shufflevector(__s1_205, __s1_205, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x16_t __ret_205; \ - __ret_205 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_205), (int8x8_t)(__noswap_vqshrun_n_s16(__rev1_205, __p2_205)))); \ - __ret_205 = __builtin_shufflevector(__ret_205, __ret_205, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_205; \ +#define vqshrun_high_n_s16(__p0_217, __p1_217, __p2_217) __extension__ ({ \ + int8x8_t __s0_217 = __p0_217; \ + int16x8_t __s1_217 = __p1_217; \ + int8x8_t __rev0_217; __rev0_217 = __builtin_shufflevector(__s0_217, __s0_217, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __rev1_217; __rev1_217 = __builtin_shufflevector(__s1_217, __s1_217, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x16_t __ret_217; \ + __ret_217 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_217), (int8x8_t)(__noswap_vqshrun_n_s16(__rev1_217, __p2_217)))); \ + __ret_217 = __builtin_shufflevector(__ret_217, __ret_217, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_217; \ }) #endif @@ -60265,128 +63577,128 @@ __ai int64_t vrshld_s64(int64_t __p0, int64_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vrshrn_high_n_u32(__p0_206, __p1_206, __p2_206) __extension__ ({ \ - uint16x4_t __s0_206 = __p0_206; \ - uint32x4_t __s1_206 = __p1_206; \ - uint16x8_t __ret_206; \ - __ret_206 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_206), (uint16x4_t)(vrshrn_n_u32(__s1_206, __p2_206)))); \ - __ret_206; \ +#define vrshrn_high_n_u32(__p0_218, __p1_218, __p2_218) __extension__ ({ \ + uint16x4_t __s0_218 = __p0_218; \ + uint32x4_t __s1_218 = __p1_218; \ + uint16x8_t __ret_218; \ + __ret_218 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_218), (uint16x4_t)(vrshrn_n_u32(__s1_218, __p2_218)))); \ + __ret_218; \ }) #else -#define vrshrn_high_n_u32(__p0_207, __p1_207, __p2_207) __extension__ ({ \ - uint16x4_t __s0_207 = __p0_207; \ - uint32x4_t __s1_207 = 
__p1_207; \ - uint16x4_t __rev0_207; __rev0_207 = __builtin_shufflevector(__s0_207, __s0_207, 3, 2, 1, 0); \ - uint32x4_t __rev1_207; __rev1_207 = __builtin_shufflevector(__s1_207, __s1_207, 3, 2, 1, 0); \ - uint16x8_t __ret_207; \ - __ret_207 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_207), (uint16x4_t)(__noswap_vrshrn_n_u32(__rev1_207, __p2_207)))); \ - __ret_207 = __builtin_shufflevector(__ret_207, __ret_207, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_207; \ +#define vrshrn_high_n_u32(__p0_219, __p1_219, __p2_219) __extension__ ({ \ + uint16x4_t __s0_219 = __p0_219; \ + uint32x4_t __s1_219 = __p1_219; \ + uint16x4_t __rev0_219; __rev0_219 = __builtin_shufflevector(__s0_219, __s0_219, 3, 2, 1, 0); \ + uint32x4_t __rev1_219; __rev1_219 = __builtin_shufflevector(__s1_219, __s1_219, 3, 2, 1, 0); \ + uint16x8_t __ret_219; \ + __ret_219 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_219), (uint16x4_t)(__noswap_vrshrn_n_u32(__rev1_219, __p2_219)))); \ + __ret_219 = __builtin_shufflevector(__ret_219, __ret_219, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_219; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vrshrn_high_n_u64(__p0_208, __p1_208, __p2_208) __extension__ ({ \ - uint32x2_t __s0_208 = __p0_208; \ - uint64x2_t __s1_208 = __p1_208; \ - uint32x4_t __ret_208; \ - __ret_208 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_208), (uint32x2_t)(vrshrn_n_u64(__s1_208, __p2_208)))); \ - __ret_208; \ +#define vrshrn_high_n_u64(__p0_220, __p1_220, __p2_220) __extension__ ({ \ + uint32x2_t __s0_220 = __p0_220; \ + uint64x2_t __s1_220 = __p1_220; \ + uint32x4_t __ret_220; \ + __ret_220 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_220), (uint32x2_t)(vrshrn_n_u64(__s1_220, __p2_220)))); \ + __ret_220; \ }) #else -#define vrshrn_high_n_u64(__p0_209, __p1_209, __p2_209) __extension__ ({ \ - uint32x2_t __s0_209 = __p0_209; \ - uint64x2_t __s1_209 = __p1_209; \ - uint32x2_t __rev0_209; __rev0_209 = __builtin_shufflevector(__s0_209, __s0_209, 1, 0); \ - uint64x2_t __rev1_209; __rev1_209 = __builtin_shufflevector(__s1_209, __s1_209, 1, 0); \ - uint32x4_t __ret_209; \ - __ret_209 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_209), (uint32x2_t)(__noswap_vrshrn_n_u64(__rev1_209, __p2_209)))); \ - __ret_209 = __builtin_shufflevector(__ret_209, __ret_209, 3, 2, 1, 0); \ - __ret_209; \ +#define vrshrn_high_n_u64(__p0_221, __p1_221, __p2_221) __extension__ ({ \ + uint32x2_t __s0_221 = __p0_221; \ + uint64x2_t __s1_221 = __p1_221; \ + uint32x2_t __rev0_221; __rev0_221 = __builtin_shufflevector(__s0_221, __s0_221, 1, 0); \ + uint64x2_t __rev1_221; __rev1_221 = __builtin_shufflevector(__s1_221, __s1_221, 1, 0); \ + uint32x4_t __ret_221; \ + __ret_221 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_221), (uint32x2_t)(__noswap_vrshrn_n_u64(__rev1_221, __p2_221)))); \ + __ret_221 = __builtin_shufflevector(__ret_221, __ret_221, 3, 2, 1, 0); \ + __ret_221; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vrshrn_high_n_u16(__p0_210, __p1_210, __p2_210) __extension__ ({ \ - uint8x8_t __s0_210 = __p0_210; \ - uint16x8_t __s1_210 = __p1_210; \ - uint8x16_t __ret_210; \ - __ret_210 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_210), (uint8x8_t)(vrshrn_n_u16(__s1_210, __p2_210)))); \ - __ret_210; \ +#define vrshrn_high_n_u16(__p0_222, __p1_222, __p2_222) __extension__ ({ \ + uint8x8_t __s0_222 = __p0_222; \ + uint16x8_t __s1_222 = __p1_222; \ + uint8x16_t __ret_222; \ + __ret_222 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_222), (uint8x8_t)(vrshrn_n_u16(__s1_222, __p2_222)))); \ + __ret_222; \ }) #else 
-#define vrshrn_high_n_u16(__p0_211, __p1_211, __p2_211) __extension__ ({ \ - uint8x8_t __s0_211 = __p0_211; \ - uint16x8_t __s1_211 = __p1_211; \ - uint8x8_t __rev0_211; __rev0_211 = __builtin_shufflevector(__s0_211, __s0_211, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x8_t __rev1_211; __rev1_211 = __builtin_shufflevector(__s1_211, __s1_211, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x16_t __ret_211; \ - __ret_211 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_211), (uint8x8_t)(__noswap_vrshrn_n_u16(__rev1_211, __p2_211)))); \ - __ret_211 = __builtin_shufflevector(__ret_211, __ret_211, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_211; \ +#define vrshrn_high_n_u16(__p0_223, __p1_223, __p2_223) __extension__ ({ \ + uint8x8_t __s0_223 = __p0_223; \ + uint16x8_t __s1_223 = __p1_223; \ + uint8x8_t __rev0_223; __rev0_223 = __builtin_shufflevector(__s0_223, __s0_223, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x8_t __rev1_223; __rev1_223 = __builtin_shufflevector(__s1_223, __s1_223, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __ret_223; \ + __ret_223 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_223), (uint8x8_t)(__noswap_vrshrn_n_u16(__rev1_223, __p2_223)))); \ + __ret_223 = __builtin_shufflevector(__ret_223, __ret_223, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_223; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vrshrn_high_n_s32(__p0_212, __p1_212, __p2_212) __extension__ ({ \ - int16x4_t __s0_212 = __p0_212; \ - int32x4_t __s1_212 = __p1_212; \ - int16x8_t __ret_212; \ - __ret_212 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_212), (int16x4_t)(vrshrn_n_s32(__s1_212, __p2_212)))); \ - __ret_212; \ +#define vrshrn_high_n_s32(__p0_224, __p1_224, __p2_224) __extension__ ({ \ + int16x4_t __s0_224 = __p0_224; \ + int32x4_t __s1_224 = __p1_224; \ + int16x8_t __ret_224; \ + __ret_224 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_224), (int16x4_t)(vrshrn_n_s32(__s1_224, __p2_224)))); \ + __ret_224; \ }) #else -#define vrshrn_high_n_s32(__p0_213, __p1_213, __p2_213) __extension__ ({ \ - int16x4_t __s0_213 = __p0_213; \ - int32x4_t __s1_213 = __p1_213; \ - int16x4_t __rev0_213; __rev0_213 = __builtin_shufflevector(__s0_213, __s0_213, 3, 2, 1, 0); \ - int32x4_t __rev1_213; __rev1_213 = __builtin_shufflevector(__s1_213, __s1_213, 3, 2, 1, 0); \ - int16x8_t __ret_213; \ - __ret_213 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_213), (int16x4_t)(__noswap_vrshrn_n_s32(__rev1_213, __p2_213)))); \ - __ret_213 = __builtin_shufflevector(__ret_213, __ret_213, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_213; \ +#define vrshrn_high_n_s32(__p0_225, __p1_225, __p2_225) __extension__ ({ \ + int16x4_t __s0_225 = __p0_225; \ + int32x4_t __s1_225 = __p1_225; \ + int16x4_t __rev0_225; __rev0_225 = __builtin_shufflevector(__s0_225, __s0_225, 3, 2, 1, 0); \ + int32x4_t __rev1_225; __rev1_225 = __builtin_shufflevector(__s1_225, __s1_225, 3, 2, 1, 0); \ + int16x8_t __ret_225; \ + __ret_225 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_225), (int16x4_t)(__noswap_vrshrn_n_s32(__rev1_225, __p2_225)))); \ + __ret_225 = __builtin_shufflevector(__ret_225, __ret_225, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_225; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vrshrn_high_n_s64(__p0_214, __p1_214, __p2_214) __extension__ ({ \ - int32x2_t __s0_214 = __p0_214; \ - int64x2_t __s1_214 = __p1_214; \ - int32x4_t __ret_214; \ - __ret_214 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_214), (int32x2_t)(vrshrn_n_s64(__s1_214, __p2_214)))); \ - __ret_214; \ +#define vrshrn_high_n_s64(__p0_226, __p1_226, __p2_226) 
__extension__ ({ \ + int32x2_t __s0_226 = __p0_226; \ + int64x2_t __s1_226 = __p1_226; \ + int32x4_t __ret_226; \ + __ret_226 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_226), (int32x2_t)(vrshrn_n_s64(__s1_226, __p2_226)))); \ + __ret_226; \ }) #else -#define vrshrn_high_n_s64(__p0_215, __p1_215, __p2_215) __extension__ ({ \ - int32x2_t __s0_215 = __p0_215; \ - int64x2_t __s1_215 = __p1_215; \ - int32x2_t __rev0_215; __rev0_215 = __builtin_shufflevector(__s0_215, __s0_215, 1, 0); \ - int64x2_t __rev1_215; __rev1_215 = __builtin_shufflevector(__s1_215, __s1_215, 1, 0); \ - int32x4_t __ret_215; \ - __ret_215 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_215), (int32x2_t)(__noswap_vrshrn_n_s64(__rev1_215, __p2_215)))); \ - __ret_215 = __builtin_shufflevector(__ret_215, __ret_215, 3, 2, 1, 0); \ - __ret_215; \ +#define vrshrn_high_n_s64(__p0_227, __p1_227, __p2_227) __extension__ ({ \ + int32x2_t __s0_227 = __p0_227; \ + int64x2_t __s1_227 = __p1_227; \ + int32x2_t __rev0_227; __rev0_227 = __builtin_shufflevector(__s0_227, __s0_227, 1, 0); \ + int64x2_t __rev1_227; __rev1_227 = __builtin_shufflevector(__s1_227, __s1_227, 1, 0); \ + int32x4_t __ret_227; \ + __ret_227 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_227), (int32x2_t)(__noswap_vrshrn_n_s64(__rev1_227, __p2_227)))); \ + __ret_227 = __builtin_shufflevector(__ret_227, __ret_227, 3, 2, 1, 0); \ + __ret_227; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vrshrn_high_n_s16(__p0_216, __p1_216, __p2_216) __extension__ ({ \ - int8x8_t __s0_216 = __p0_216; \ - int16x8_t __s1_216 = __p1_216; \ - int8x16_t __ret_216; \ - __ret_216 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_216), (int8x8_t)(vrshrn_n_s16(__s1_216, __p2_216)))); \ - __ret_216; \ +#define vrshrn_high_n_s16(__p0_228, __p1_228, __p2_228) __extension__ ({ \ + int8x8_t __s0_228 = __p0_228; \ + int16x8_t __s1_228 = __p1_228; \ + int8x16_t __ret_228; \ + __ret_228 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_228), (int8x8_t)(vrshrn_n_s16(__s1_228, __p2_228)))); \ + __ret_228; \ }) #else -#define vrshrn_high_n_s16(__p0_217, __p1_217, __p2_217) __extension__ ({ \ - int8x8_t __s0_217 = __p0_217; \ - int16x8_t __s1_217 = __p1_217; \ - int8x8_t __rev0_217; __rev0_217 = __builtin_shufflevector(__s0_217, __s0_217, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __rev1_217; __rev1_217 = __builtin_shufflevector(__s1_217, __s1_217, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x16_t __ret_217; \ - __ret_217 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_217), (int8x8_t)(__noswap_vrshrn_n_s16(__rev1_217, __p2_217)))); \ - __ret_217 = __builtin_shufflevector(__ret_217, __ret_217, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_217; \ +#define vrshrn_high_n_s16(__p0_229, __p1_229, __p2_229) __extension__ ({ \ + int8x8_t __s0_229 = __p0_229; \ + int16x8_t __s1_229 = __p1_229; \ + int8x8_t __rev0_229; __rev0_229 = __builtin_shufflevector(__s0_229, __s0_229, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __rev1_229; __rev1_229 = __builtin_shufflevector(__s1_229, __s1_229, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x16_t __ret_229; \ + __ret_229 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_229), (int8x8_t)(__noswap_vrshrn_n_s16(__rev1_229, __p2_229)))); \ + __ret_229 = __builtin_shufflevector(__ret_229, __ret_229, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_229; \ }) #endif @@ -60816,110 +64128,110 @@ __ai int64_t vshld_s64(int64_t __p0, int64_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vshll_high_n_u8(__p0_218, __p1_218) __extension__ ({ \ - uint8x16_t __s0_218 = 
__p0_218; \ - uint16x8_t __ret_218; \ - __ret_218 = (uint16x8_t)(vshll_n_u8(vget_high_u8(__s0_218), __p1_218)); \ - __ret_218; \ +#define vshll_high_n_u8(__p0_230, __p1_230) __extension__ ({ \ + uint8x16_t __s0_230 = __p0_230; \ + uint16x8_t __ret_230; \ + __ret_230 = (uint16x8_t)(vshll_n_u8(vget_high_u8(__s0_230), __p1_230)); \ + __ret_230; \ }) #else -#define vshll_high_n_u8(__p0_219, __p1_219) __extension__ ({ \ - uint8x16_t __s0_219 = __p0_219; \ - uint8x16_t __rev0_219; __rev0_219 = __builtin_shufflevector(__s0_219, __s0_219, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x8_t __ret_219; \ - __ret_219 = (uint16x8_t)(__noswap_vshll_n_u8(__noswap_vget_high_u8(__rev0_219), __p1_219)); \ - __ret_219 = __builtin_shufflevector(__ret_219, __ret_219, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_219; \ +#define vshll_high_n_u8(__p0_231, __p1_231) __extension__ ({ \ + uint8x16_t __s0_231 = __p0_231; \ + uint8x16_t __rev0_231; __rev0_231 = __builtin_shufflevector(__s0_231, __s0_231, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x8_t __ret_231; \ + __ret_231 = (uint16x8_t)(__noswap_vshll_n_u8(__noswap_vget_high_u8(__rev0_231), __p1_231)); \ + __ret_231 = __builtin_shufflevector(__ret_231, __ret_231, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_231; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vshll_high_n_u32(__p0_220, __p1_220) __extension__ ({ \ - uint32x4_t __s0_220 = __p0_220; \ - uint64x2_t __ret_220; \ - __ret_220 = (uint64x2_t)(vshll_n_u32(vget_high_u32(__s0_220), __p1_220)); \ - __ret_220; \ +#define vshll_high_n_u32(__p0_232, __p1_232) __extension__ ({ \ + uint32x4_t __s0_232 = __p0_232; \ + uint64x2_t __ret_232; \ + __ret_232 = (uint64x2_t)(vshll_n_u32(vget_high_u32(__s0_232), __p1_232)); \ + __ret_232; \ }) #else -#define vshll_high_n_u32(__p0_221, __p1_221) __extension__ ({ \ - uint32x4_t __s0_221 = __p0_221; \ - uint32x4_t __rev0_221; __rev0_221 = __builtin_shufflevector(__s0_221, __s0_221, 3, 2, 1, 0); \ - uint64x2_t __ret_221; \ - __ret_221 = (uint64x2_t)(__noswap_vshll_n_u32(__noswap_vget_high_u32(__rev0_221), __p1_221)); \ - __ret_221 = __builtin_shufflevector(__ret_221, __ret_221, 1, 0); \ - __ret_221; \ +#define vshll_high_n_u32(__p0_233, __p1_233) __extension__ ({ \ + uint32x4_t __s0_233 = __p0_233; \ + uint32x4_t __rev0_233; __rev0_233 = __builtin_shufflevector(__s0_233, __s0_233, 3, 2, 1, 0); \ + uint64x2_t __ret_233; \ + __ret_233 = (uint64x2_t)(__noswap_vshll_n_u32(__noswap_vget_high_u32(__rev0_233), __p1_233)); \ + __ret_233 = __builtin_shufflevector(__ret_233, __ret_233, 1, 0); \ + __ret_233; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vshll_high_n_u16(__p0_222, __p1_222) __extension__ ({ \ - uint16x8_t __s0_222 = __p0_222; \ - uint32x4_t __ret_222; \ - __ret_222 = (uint32x4_t)(vshll_n_u16(vget_high_u16(__s0_222), __p1_222)); \ - __ret_222; \ +#define vshll_high_n_u16(__p0_234, __p1_234) __extension__ ({ \ + uint16x8_t __s0_234 = __p0_234; \ + uint32x4_t __ret_234; \ + __ret_234 = (uint32x4_t)(vshll_n_u16(vget_high_u16(__s0_234), __p1_234)); \ + __ret_234; \ }) #else -#define vshll_high_n_u16(__p0_223, __p1_223) __extension__ ({ \ - uint16x8_t __s0_223 = __p0_223; \ - uint16x8_t __rev0_223; __rev0_223 = __builtin_shufflevector(__s0_223, __s0_223, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint32x4_t __ret_223; \ - __ret_223 = (uint32x4_t)(__noswap_vshll_n_u16(__noswap_vget_high_u16(__rev0_223), __p1_223)); \ - __ret_223 = __builtin_shufflevector(__ret_223, __ret_223, 3, 2, 1, 0); \ - __ret_223; \ +#define vshll_high_n_u16(__p0_235, __p1_235) __extension__ ({ \ + 
uint16x8_t __s0_235 = __p0_235; \ + uint16x8_t __rev0_235; __rev0_235 = __builtin_shufflevector(__s0_235, __s0_235, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint32x4_t __ret_235; \ + __ret_235 = (uint32x4_t)(__noswap_vshll_n_u16(__noswap_vget_high_u16(__rev0_235), __p1_235)); \ + __ret_235 = __builtin_shufflevector(__ret_235, __ret_235, 3, 2, 1, 0); \ + __ret_235; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vshll_high_n_s8(__p0_224, __p1_224) __extension__ ({ \ - int8x16_t __s0_224 = __p0_224; \ - int16x8_t __ret_224; \ - __ret_224 = (int16x8_t)(vshll_n_s8(vget_high_s8(__s0_224), __p1_224)); \ - __ret_224; \ +#define vshll_high_n_s8(__p0_236, __p1_236) __extension__ ({ \ + int8x16_t __s0_236 = __p0_236; \ + int16x8_t __ret_236; \ + __ret_236 = (int16x8_t)(vshll_n_s8(vget_high_s8(__s0_236), __p1_236)); \ + __ret_236; \ }) #else -#define vshll_high_n_s8(__p0_225, __p1_225) __extension__ ({ \ - int8x16_t __s0_225 = __p0_225; \ - int8x16_t __rev0_225; __rev0_225 = __builtin_shufflevector(__s0_225, __s0_225, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __ret_225; \ - __ret_225 = (int16x8_t)(__noswap_vshll_n_s8(__noswap_vget_high_s8(__rev0_225), __p1_225)); \ - __ret_225 = __builtin_shufflevector(__ret_225, __ret_225, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_225; \ +#define vshll_high_n_s8(__p0_237, __p1_237) __extension__ ({ \ + int8x16_t __s0_237 = __p0_237; \ + int8x16_t __rev0_237; __rev0_237 = __builtin_shufflevector(__s0_237, __s0_237, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __ret_237; \ + __ret_237 = (int16x8_t)(__noswap_vshll_n_s8(__noswap_vget_high_s8(__rev0_237), __p1_237)); \ + __ret_237 = __builtin_shufflevector(__ret_237, __ret_237, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_237; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vshll_high_n_s32(__p0_226, __p1_226) __extension__ ({ \ - int32x4_t __s0_226 = __p0_226; \ - int64x2_t __ret_226; \ - __ret_226 = (int64x2_t)(vshll_n_s32(vget_high_s32(__s0_226), __p1_226)); \ - __ret_226; \ +#define vshll_high_n_s32(__p0_238, __p1_238) __extension__ ({ \ + int32x4_t __s0_238 = __p0_238; \ + int64x2_t __ret_238; \ + __ret_238 = (int64x2_t)(vshll_n_s32(vget_high_s32(__s0_238), __p1_238)); \ + __ret_238; \ }) #else -#define vshll_high_n_s32(__p0_227, __p1_227) __extension__ ({ \ - int32x4_t __s0_227 = __p0_227; \ - int32x4_t __rev0_227; __rev0_227 = __builtin_shufflevector(__s0_227, __s0_227, 3, 2, 1, 0); \ - int64x2_t __ret_227; \ - __ret_227 = (int64x2_t)(__noswap_vshll_n_s32(__noswap_vget_high_s32(__rev0_227), __p1_227)); \ - __ret_227 = __builtin_shufflevector(__ret_227, __ret_227, 1, 0); \ - __ret_227; \ +#define vshll_high_n_s32(__p0_239, __p1_239) __extension__ ({ \ + int32x4_t __s0_239 = __p0_239; \ + int32x4_t __rev0_239; __rev0_239 = __builtin_shufflevector(__s0_239, __s0_239, 3, 2, 1, 0); \ + int64x2_t __ret_239; \ + __ret_239 = (int64x2_t)(__noswap_vshll_n_s32(__noswap_vget_high_s32(__rev0_239), __p1_239)); \ + __ret_239 = __builtin_shufflevector(__ret_239, __ret_239, 1, 0); \ + __ret_239; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vshll_high_n_s16(__p0_228, __p1_228) __extension__ ({ \ - int16x8_t __s0_228 = __p0_228; \ - int32x4_t __ret_228; \ - __ret_228 = (int32x4_t)(vshll_n_s16(vget_high_s16(__s0_228), __p1_228)); \ - __ret_228; \ +#define vshll_high_n_s16(__p0_240, __p1_240) __extension__ ({ \ + int16x8_t __s0_240 = __p0_240; \ + int32x4_t __ret_240; \ + __ret_240 = (int32x4_t)(vshll_n_s16(vget_high_s16(__s0_240), __p1_240)); \ + __ret_240; \ }) #else -#define vshll_high_n_s16(__p0_229, __p1_229) 
__extension__ ({ \ - int16x8_t __s0_229 = __p0_229; \ - int16x8_t __rev0_229; __rev0_229 = __builtin_shufflevector(__s0_229, __s0_229, 7, 6, 5, 4, 3, 2, 1, 0); \ - int32x4_t __ret_229; \ - __ret_229 = (int32x4_t)(__noswap_vshll_n_s16(__noswap_vget_high_s16(__rev0_229), __p1_229)); \ - __ret_229 = __builtin_shufflevector(__ret_229, __ret_229, 3, 2, 1, 0); \ - __ret_229; \ +#define vshll_high_n_s16(__p0_241, __p1_241) __extension__ ({ \ + int16x8_t __s0_241 = __p0_241; \ + int16x8_t __rev0_241; __rev0_241 = __builtin_shufflevector(__s0_241, __s0_241, 7, 6, 5, 4, 3, 2, 1, 0); \ + int32x4_t __ret_241; \ + __ret_241 = (int32x4_t)(__noswap_vshll_n_s16(__noswap_vget_high_s16(__rev0_241), __p1_241)); \ + __ret_241 = __builtin_shufflevector(__ret_241, __ret_241, 3, 2, 1, 0); \ + __ret_241; \ }) #endif @@ -60956,128 +64268,128 @@ __ai int64_t vshld_s64(int64_t __p0, int64_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vshrn_high_n_u32(__p0_230, __p1_230, __p2_230) __extension__ ({ \ - uint16x4_t __s0_230 = __p0_230; \ - uint32x4_t __s1_230 = __p1_230; \ - uint16x8_t __ret_230; \ - __ret_230 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_230), (uint16x4_t)(vshrn_n_u32(__s1_230, __p2_230)))); \ - __ret_230; \ +#define vshrn_high_n_u32(__p0_242, __p1_242, __p2_242) __extension__ ({ \ + uint16x4_t __s0_242 = __p0_242; \ + uint32x4_t __s1_242 = __p1_242; \ + uint16x8_t __ret_242; \ + __ret_242 = (uint16x8_t)(vcombine_u16((uint16x4_t)(__s0_242), (uint16x4_t)(vshrn_n_u32(__s1_242, __p2_242)))); \ + __ret_242; \ }) #else -#define vshrn_high_n_u32(__p0_231, __p1_231, __p2_231) __extension__ ({ \ - uint16x4_t __s0_231 = __p0_231; \ - uint32x4_t __s1_231 = __p1_231; \ - uint16x4_t __rev0_231; __rev0_231 = __builtin_shufflevector(__s0_231, __s0_231, 3, 2, 1, 0); \ - uint32x4_t __rev1_231; __rev1_231 = __builtin_shufflevector(__s1_231, __s1_231, 3, 2, 1, 0); \ - uint16x8_t __ret_231; \ - __ret_231 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_231), (uint16x4_t)(__noswap_vshrn_n_u32(__rev1_231, __p2_231)))); \ - __ret_231 = __builtin_shufflevector(__ret_231, __ret_231, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_231; \ +#define vshrn_high_n_u32(__p0_243, __p1_243, __p2_243) __extension__ ({ \ + uint16x4_t __s0_243 = __p0_243; \ + uint32x4_t __s1_243 = __p1_243; \ + uint16x4_t __rev0_243; __rev0_243 = __builtin_shufflevector(__s0_243, __s0_243, 3, 2, 1, 0); \ + uint32x4_t __rev1_243; __rev1_243 = __builtin_shufflevector(__s1_243, __s1_243, 3, 2, 1, 0); \ + uint16x8_t __ret_243; \ + __ret_243 = (uint16x8_t)(__noswap_vcombine_u16((uint16x4_t)(__rev0_243), (uint16x4_t)(__noswap_vshrn_n_u32(__rev1_243, __p2_243)))); \ + __ret_243 = __builtin_shufflevector(__ret_243, __ret_243, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_243; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vshrn_high_n_u64(__p0_232, __p1_232, __p2_232) __extension__ ({ \ - uint32x2_t __s0_232 = __p0_232; \ - uint64x2_t __s1_232 = __p1_232; \ - uint32x4_t __ret_232; \ - __ret_232 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_232), (uint32x2_t)(vshrn_n_u64(__s1_232, __p2_232)))); \ - __ret_232; \ +#define vshrn_high_n_u64(__p0_244, __p1_244, __p2_244) __extension__ ({ \ + uint32x2_t __s0_244 = __p0_244; \ + uint64x2_t __s1_244 = __p1_244; \ + uint32x4_t __ret_244; \ + __ret_244 = (uint32x4_t)(vcombine_u32((uint32x2_t)(__s0_244), (uint32x2_t)(vshrn_n_u64(__s1_244, __p2_244)))); \ + __ret_244; \ }) #else -#define vshrn_high_n_u64(__p0_233, __p1_233, __p2_233) __extension__ ({ \ - uint32x2_t __s0_233 = __p0_233; \ - uint64x2_t __s1_233 = __p1_233; \ - uint32x2_t 
__rev0_233; __rev0_233 = __builtin_shufflevector(__s0_233, __s0_233, 1, 0); \ - uint64x2_t __rev1_233; __rev1_233 = __builtin_shufflevector(__s1_233, __s1_233, 1, 0); \ - uint32x4_t __ret_233; \ - __ret_233 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_233), (uint32x2_t)(__noswap_vshrn_n_u64(__rev1_233, __p2_233)))); \ - __ret_233 = __builtin_shufflevector(__ret_233, __ret_233, 3, 2, 1, 0); \ - __ret_233; \ +#define vshrn_high_n_u64(__p0_245, __p1_245, __p2_245) __extension__ ({ \ + uint32x2_t __s0_245 = __p0_245; \ + uint64x2_t __s1_245 = __p1_245; \ + uint32x2_t __rev0_245; __rev0_245 = __builtin_shufflevector(__s0_245, __s0_245, 1, 0); \ + uint64x2_t __rev1_245; __rev1_245 = __builtin_shufflevector(__s1_245, __s1_245, 1, 0); \ + uint32x4_t __ret_245; \ + __ret_245 = (uint32x4_t)(__noswap_vcombine_u32((uint32x2_t)(__rev0_245), (uint32x2_t)(__noswap_vshrn_n_u64(__rev1_245, __p2_245)))); \ + __ret_245 = __builtin_shufflevector(__ret_245, __ret_245, 3, 2, 1, 0); \ + __ret_245; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vshrn_high_n_u16(__p0_234, __p1_234, __p2_234) __extension__ ({ \ - uint8x8_t __s0_234 = __p0_234; \ - uint16x8_t __s1_234 = __p1_234; \ - uint8x16_t __ret_234; \ - __ret_234 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_234), (uint8x8_t)(vshrn_n_u16(__s1_234, __p2_234)))); \ - __ret_234; \ +#define vshrn_high_n_u16(__p0_246, __p1_246, __p2_246) __extension__ ({ \ + uint8x8_t __s0_246 = __p0_246; \ + uint16x8_t __s1_246 = __p1_246; \ + uint8x16_t __ret_246; \ + __ret_246 = (uint8x16_t)(vcombine_u8((uint8x8_t)(__s0_246), (uint8x8_t)(vshrn_n_u16(__s1_246, __p2_246)))); \ + __ret_246; \ }) #else -#define vshrn_high_n_u16(__p0_235, __p1_235, __p2_235) __extension__ ({ \ - uint8x8_t __s0_235 = __p0_235; \ - uint16x8_t __s1_235 = __p1_235; \ - uint8x8_t __rev0_235; __rev0_235 = __builtin_shufflevector(__s0_235, __s0_235, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint16x8_t __rev1_235; __rev1_235 = __builtin_shufflevector(__s1_235, __s1_235, 7, 6, 5, 4, 3, 2, 1, 0); \ - uint8x16_t __ret_235; \ - __ret_235 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_235), (uint8x8_t)(__noswap_vshrn_n_u16(__rev1_235, __p2_235)))); \ - __ret_235 = __builtin_shufflevector(__ret_235, __ret_235, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_235; \ +#define vshrn_high_n_u16(__p0_247, __p1_247, __p2_247) __extension__ ({ \ + uint8x8_t __s0_247 = __p0_247; \ + uint16x8_t __s1_247 = __p1_247; \ + uint8x8_t __rev0_247; __rev0_247 = __builtin_shufflevector(__s0_247, __s0_247, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint16x8_t __rev1_247; __rev1_247 = __builtin_shufflevector(__s1_247, __s1_247, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __ret_247; \ + __ret_247 = (uint8x16_t)(__noswap_vcombine_u8((uint8x8_t)(__rev0_247), (uint8x8_t)(__noswap_vshrn_n_u16(__rev1_247, __p2_247)))); \ + __ret_247 = __builtin_shufflevector(__ret_247, __ret_247, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_247; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vshrn_high_n_s32(__p0_236, __p1_236, __p2_236) __extension__ ({ \ - int16x4_t __s0_236 = __p0_236; \ - int32x4_t __s1_236 = __p1_236; \ - int16x8_t __ret_236; \ - __ret_236 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_236), (int16x4_t)(vshrn_n_s32(__s1_236, __p2_236)))); \ - __ret_236; \ +#define vshrn_high_n_s32(__p0_248, __p1_248, __p2_248) __extension__ ({ \ + int16x4_t __s0_248 = __p0_248; \ + int32x4_t __s1_248 = __p1_248; \ + int16x8_t __ret_248; \ + __ret_248 = (int16x8_t)(vcombine_s16((int16x4_t)(__s0_248), (int16x4_t)(vshrn_n_s32(__s1_248, 
__p2_248)))); \ + __ret_248; \ }) #else -#define vshrn_high_n_s32(__p0_237, __p1_237, __p2_237) __extension__ ({ \ - int16x4_t __s0_237 = __p0_237; \ - int32x4_t __s1_237 = __p1_237; \ - int16x4_t __rev0_237; __rev0_237 = __builtin_shufflevector(__s0_237, __s0_237, 3, 2, 1, 0); \ - int32x4_t __rev1_237; __rev1_237 = __builtin_shufflevector(__s1_237, __s1_237, 3, 2, 1, 0); \ - int16x8_t __ret_237; \ - __ret_237 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_237), (int16x4_t)(__noswap_vshrn_n_s32(__rev1_237, __p2_237)))); \ - __ret_237 = __builtin_shufflevector(__ret_237, __ret_237, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_237; \ +#define vshrn_high_n_s32(__p0_249, __p1_249, __p2_249) __extension__ ({ \ + int16x4_t __s0_249 = __p0_249; \ + int32x4_t __s1_249 = __p1_249; \ + int16x4_t __rev0_249; __rev0_249 = __builtin_shufflevector(__s0_249, __s0_249, 3, 2, 1, 0); \ + int32x4_t __rev1_249; __rev1_249 = __builtin_shufflevector(__s1_249, __s1_249, 3, 2, 1, 0); \ + int16x8_t __ret_249; \ + __ret_249 = (int16x8_t)(__noswap_vcombine_s16((int16x4_t)(__rev0_249), (int16x4_t)(__noswap_vshrn_n_s32(__rev1_249, __p2_249)))); \ + __ret_249 = __builtin_shufflevector(__ret_249, __ret_249, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_249; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vshrn_high_n_s64(__p0_238, __p1_238, __p2_238) __extension__ ({ \ - int32x2_t __s0_238 = __p0_238; \ - int64x2_t __s1_238 = __p1_238; \ - int32x4_t __ret_238; \ - __ret_238 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_238), (int32x2_t)(vshrn_n_s64(__s1_238, __p2_238)))); \ - __ret_238; \ +#define vshrn_high_n_s64(__p0_250, __p1_250, __p2_250) __extension__ ({ \ + int32x2_t __s0_250 = __p0_250; \ + int64x2_t __s1_250 = __p1_250; \ + int32x4_t __ret_250; \ + __ret_250 = (int32x4_t)(vcombine_s32((int32x2_t)(__s0_250), (int32x2_t)(vshrn_n_s64(__s1_250, __p2_250)))); \ + __ret_250; \ }) #else -#define vshrn_high_n_s64(__p0_239, __p1_239, __p2_239) __extension__ ({ \ - int32x2_t __s0_239 = __p0_239; \ - int64x2_t __s1_239 = __p1_239; \ - int32x2_t __rev0_239; __rev0_239 = __builtin_shufflevector(__s0_239, __s0_239, 1, 0); \ - int64x2_t __rev1_239; __rev1_239 = __builtin_shufflevector(__s1_239, __s1_239, 1, 0); \ - int32x4_t __ret_239; \ - __ret_239 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_239), (int32x2_t)(__noswap_vshrn_n_s64(__rev1_239, __p2_239)))); \ - __ret_239 = __builtin_shufflevector(__ret_239, __ret_239, 3, 2, 1, 0); \ - __ret_239; \ +#define vshrn_high_n_s64(__p0_251, __p1_251, __p2_251) __extension__ ({ \ + int32x2_t __s0_251 = __p0_251; \ + int64x2_t __s1_251 = __p1_251; \ + int32x2_t __rev0_251; __rev0_251 = __builtin_shufflevector(__s0_251, __s0_251, 1, 0); \ + int64x2_t __rev1_251; __rev1_251 = __builtin_shufflevector(__s1_251, __s1_251, 1, 0); \ + int32x4_t __ret_251; \ + __ret_251 = (int32x4_t)(__noswap_vcombine_s32((int32x2_t)(__rev0_251), (int32x2_t)(__noswap_vshrn_n_s64(__rev1_251, __p2_251)))); \ + __ret_251 = __builtin_shufflevector(__ret_251, __ret_251, 3, 2, 1, 0); \ + __ret_251; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vshrn_high_n_s16(__p0_240, __p1_240, __p2_240) __extension__ ({ \ - int8x8_t __s0_240 = __p0_240; \ - int16x8_t __s1_240 = __p1_240; \ - int8x16_t __ret_240; \ - __ret_240 = (int8x16_t)(vcombine_s8((int8x8_t)(__s0_240), (int8x8_t)(vshrn_n_s16(__s1_240, __p2_240)))); \ - __ret_240; \ +#define vshrn_high_n_s16(__p0_252, __p1_252, __p2_252) __extension__ ({ \ + int8x8_t __s0_252 = __p0_252; \ + int16x8_t __s1_252 = __p1_252; \ + int8x16_t __ret_252; \ + __ret_252 = 
(int8x16_t)(vcombine_s8((int8x8_t)(__s0_252), (int8x8_t)(vshrn_n_s16(__s1_252, __p2_252)))); \ + __ret_252; \ }) #else -#define vshrn_high_n_s16(__p0_241, __p1_241, __p2_241) __extension__ ({ \ - int8x8_t __s0_241 = __p0_241; \ - int16x8_t __s1_241 = __p1_241; \ - int8x8_t __rev0_241; __rev0_241 = __builtin_shufflevector(__s0_241, __s0_241, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16x8_t __rev1_241; __rev1_241 = __builtin_shufflevector(__s1_241, __s1_241, 7, 6, 5, 4, 3, 2, 1, 0); \ - int8x16_t __ret_241; \ - __ret_241 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_241), (int8x8_t)(__noswap_vshrn_n_s16(__rev1_241, __p2_241)))); \ - __ret_241 = __builtin_shufflevector(__ret_241, __ret_241, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_241; \ +#define vshrn_high_n_s16(__p0_253, __p1_253, __p2_253) __extension__ ({ \ + int8x8_t __s0_253 = __p0_253; \ + int16x8_t __s1_253 = __p1_253; \ + int8x8_t __rev0_253; __rev0_253 = __builtin_shufflevector(__s0_253, __s0_253, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16x8_t __rev1_253; __rev1_253 = __builtin_shufflevector(__s1_253, __s1_253, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x16_t __ret_253; \ + __ret_253 = (int8x16_t)(__noswap_vcombine_s8((int8x8_t)(__rev0_253), (int8x8_t)(__noswap_vshrn_n_s16(__rev1_253, __p2_253)))); \ + __ret_253 = __builtin_shufflevector(__ret_253, __ret_253, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_253; \ }) #endif @@ -67149,44 +70461,60 @@ __ai int32x4_t vaddw_s16(int32x4_t __p0, int16x4_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vget_lane_f16(__p0_242, __p1_242) __extension__ ({ \ - float16x4_t __s0_242 = __p0_242; \ - float16_t __ret_242; \ -float16x4_t __reint_242 = __s0_242; \ -int16_t __reint1_242 = vget_lane_s16(*(int16x4_t *) &__reint_242, __p1_242); \ - __ret_242 = *(float16_t *) &__reint1_242; \ - __ret_242; \ +#define vget_lane_f16(__p0_254, __p1_254) __extension__ ({ \ + float16x4_t __s0_254 = __p0_254; \ + float16_t __ret_254; \ +float16x4_t __reint_254 = __s0_254; \ +int16_t __reint1_254 = vget_lane_s16(*(int16x4_t *) &__reint_254, __p1_254); \ + __ret_254 = *(float16_t *) &__reint1_254; \ + __ret_254; \ }) #else -#define vget_lane_f16(__p0_243, __p1_243) __extension__ ({ \ - float16x4_t __s0_243 = __p0_243; \ - float16x4_t __rev0_243; __rev0_243 = __builtin_shufflevector(__s0_243, __s0_243, 3, 2, 1, 0); \ - float16_t __ret_243; \ -float16x4_t __reint_243 = __rev0_243; \ -int16_t __reint1_243 = __noswap_vget_lane_s16(*(int16x4_t *) &__reint_243, __p1_243); \ - __ret_243 = *(float16_t *) &__reint1_243; \ - __ret_243; \ +#define vget_lane_f16(__p0_255, __p1_255) __extension__ ({ \ + float16x4_t __s0_255 = __p0_255; \ + float16x4_t __rev0_255; __rev0_255 = __builtin_shufflevector(__s0_255, __s0_255, 3, 2, 1, 0); \ + float16_t __ret_255; \ +float16x4_t __reint_255 = __rev0_255; \ +int16_t __reint1_255 = __noswap_vget_lane_s16(*(int16x4_t *) &__reint_255, __p1_255); \ + __ret_255 = *(float16_t *) &__reint1_255; \ + __ret_255; \ +}) +#define __noswap_vget_lane_f16(__p0_256, __p1_256) __extension__ ({ \ + float16x4_t __s0_256 = __p0_256; \ + float16_t __ret_256; \ +float16x4_t __reint_256 = __s0_256; \ +int16_t __reint1_256 = __noswap_vget_lane_s16(*(int16x4_t *) &__reint_256, __p1_256); \ + __ret_256 = *(float16_t *) &__reint1_256; \ + __ret_256; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vgetq_lane_f16(__p0_244, __p1_244) __extension__ ({ \ - float16x8_t __s0_244 = __p0_244; \ - float16_t __ret_244; \ -float16x8_t __reint_244 = __s0_244; \ -int16_t __reint1_244 = 
vgetq_lane_s16(*(int16x8_t *) &__reint_244, __p1_244); \ - __ret_244 = *(float16_t *) &__reint1_244; \ - __ret_244; \ +#define vgetq_lane_f16(__p0_257, __p1_257) __extension__ ({ \ + float16x8_t __s0_257 = __p0_257; \ + float16_t __ret_257; \ +float16x8_t __reint_257 = __s0_257; \ +int16_t __reint1_257 = vgetq_lane_s16(*(int16x8_t *) &__reint_257, __p1_257); \ + __ret_257 = *(float16_t *) &__reint1_257; \ + __ret_257; \ }) #else -#define vgetq_lane_f16(__p0_245, __p1_245) __extension__ ({ \ - float16x8_t __s0_245 = __p0_245; \ - float16x8_t __rev0_245; __rev0_245 = __builtin_shufflevector(__s0_245, __s0_245, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16_t __ret_245; \ -float16x8_t __reint_245 = __rev0_245; \ -int16_t __reint1_245 = __noswap_vgetq_lane_s16(*(int16x8_t *) &__reint_245, __p1_245); \ - __ret_245 = *(float16_t *) &__reint1_245; \ - __ret_245; \ +#define vgetq_lane_f16(__p0_258, __p1_258) __extension__ ({ \ + float16x8_t __s0_258 = __p0_258; \ + float16x8_t __rev0_258; __rev0_258 = __builtin_shufflevector(__s0_258, __s0_258, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16_t __ret_258; \ +float16x8_t __reint_258 = __rev0_258; \ +int16_t __reint1_258 = __noswap_vgetq_lane_s16(*(int16x8_t *) &__reint_258, __p1_258); \ + __ret_258 = *(float16_t *) &__reint1_258; \ + __ret_258; \ +}) +#define __noswap_vgetq_lane_f16(__p0_259, __p1_259) __extension__ ({ \ + float16x8_t __s0_259 = __p0_259; \ + float16_t __ret_259; \ +float16x8_t __reint_259 = __s0_259; \ +int16_t __reint1_259 = __noswap_vgetq_lane_s16(*(int16x8_t *) &__reint_259, __p1_259); \ + __ret_259 = *(float16_t *) &__reint1_259; \ + __ret_259; \ }) #endif @@ -67835,57 +71163,97 @@ __ai int32x4_t __noswap_vmlsl_n_s16(int32x4_t __p0, int16x4_t __p1, int16_t __p2 #endif #ifdef __LITTLE_ENDIAN__ -#define vset_lane_f16(__p0_246, __p1_246, __p2_246) __extension__ ({ \ - float16_t __s0_246 = __p0_246; \ - float16x4_t __s1_246 = __p1_246; \ - float16x4_t __ret_246; \ -float16_t __reint_246 = __s0_246; \ -float16x4_t __reint1_246 = __s1_246; \ -int16x4_t __reint2_246 = vset_lane_s16(*(int16_t *) &__reint_246, *(int16x4_t *) &__reint1_246, __p2_246); \ - __ret_246 = *(float16x4_t *) &__reint2_246; \ - __ret_246; \ +#define vset_lane_f16(__p0_260, __p1_260, __p2_260) __extension__ ({ \ + float16_t __s0_260 = __p0_260; \ + float16x4_t __s1_260 = __p1_260; \ + float16x4_t __ret_260; \ +float16_t __reint_260 = __s0_260; \ +float16x4_t __reint1_260 = __s1_260; \ +int16x4_t __reint2_260 = vset_lane_s16(*(int16_t *) &__reint_260, *(int16x4_t *) &__reint1_260, __p2_260); \ + __ret_260 = *(float16x4_t *) &__reint2_260; \ + __ret_260; \ }) #else -#define vset_lane_f16(__p0_247, __p1_247, __p2_247) __extension__ ({ \ - float16_t __s0_247 = __p0_247; \ - float16x4_t __s1_247 = __p1_247; \ - float16x4_t __rev1_247; __rev1_247 = __builtin_shufflevector(__s1_247, __s1_247, 3, 2, 1, 0); \ - float16x4_t __ret_247; \ -float16_t __reint_247 = __s0_247; \ -float16x4_t __reint1_247 = __rev1_247; \ -int16x4_t __reint2_247 = __noswap_vset_lane_s16(*(int16_t *) &__reint_247, *(int16x4_t *) &__reint1_247, __p2_247); \ - __ret_247 = *(float16x4_t *) &__reint2_247; \ - __ret_247 = __builtin_shufflevector(__ret_247, __ret_247, 3, 2, 1, 0); \ - __ret_247; \ +#define vset_lane_f16(__p0_261, __p1_261, __p2_261) __extension__ ({ \ + float16_t __s0_261 = __p0_261; \ + float16x4_t __s1_261 = __p1_261; \ + float16x4_t __rev1_261; __rev1_261 = __builtin_shufflevector(__s1_261, __s1_261, 3, 2, 1, 0); \ + float16x4_t __ret_261; \ +float16_t __reint_261 = __s0_261; \ +float16x4_t __reint1_261 = 
__rev1_261; \ +int16x4_t __reint2_261 = __noswap_vset_lane_s16(*(int16_t *) &__reint_261, *(int16x4_t *) &__reint1_261, __p2_261); \ + __ret_261 = *(float16x4_t *) &__reint2_261; \ + __ret_261 = __builtin_shufflevector(__ret_261, __ret_261, 3, 2, 1, 0); \ + __ret_261; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vsetq_lane_f16(__p0_248, __p1_248, __p2_248) __extension__ ({ \ - float16_t __s0_248 = __p0_248; \ - float16x8_t __s1_248 = __p1_248; \ - float16x8_t __ret_248; \ -float16_t __reint_248 = __s0_248; \ -float16x8_t __reint1_248 = __s1_248; \ -int16x8_t __reint2_248 = vsetq_lane_s16(*(int16_t *) &__reint_248, *(int16x8_t *) &__reint1_248, __p2_248); \ - __ret_248 = *(float16x8_t *) &__reint2_248; \ - __ret_248; \ +#define vsetq_lane_f16(__p0_262, __p1_262, __p2_262) __extension__ ({ \ + float16_t __s0_262 = __p0_262; \ + float16x8_t __s1_262 = __p1_262; \ + float16x8_t __ret_262; \ +float16_t __reint_262 = __s0_262; \ +float16x8_t __reint1_262 = __s1_262; \ +int16x8_t __reint2_262 = vsetq_lane_s16(*(int16_t *) &__reint_262, *(int16x8_t *) &__reint1_262, __p2_262); \ + __ret_262 = *(float16x8_t *) &__reint2_262; \ + __ret_262; \ }) #else -#define vsetq_lane_f16(__p0_249, __p1_249, __p2_249) __extension__ ({ \ - float16_t __s0_249 = __p0_249; \ - float16x8_t __s1_249 = __p1_249; \ - float16x8_t __rev1_249; __rev1_249 = __builtin_shufflevector(__s1_249, __s1_249, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16x8_t __ret_249; \ -float16_t __reint_249 = __s0_249; \ -float16x8_t __reint1_249 = __rev1_249; \ -int16x8_t __reint2_249 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_249, *(int16x8_t *) &__reint1_249, __p2_249); \ - __ret_249 = *(float16x8_t *) &__reint2_249; \ - __ret_249 = __builtin_shufflevector(__ret_249, __ret_249, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret_249; \ +#define vsetq_lane_f16(__p0_263, __p1_263, __p2_263) __extension__ ({ \ + float16_t __s0_263 = __p0_263; \ + float16x8_t __s1_263 = __p1_263; \ + float16x8_t __rev1_263; __rev1_263 = __builtin_shufflevector(__s1_263, __s1_263, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __ret_263; \ +float16_t __reint_263 = __s0_263; \ +float16x8_t __reint1_263 = __rev1_263; \ +int16x8_t __reint2_263 = __noswap_vsetq_lane_s16(*(int16_t *) &__reint_263, *(int16x8_t *) &__reint1_263, __p2_263); \ + __ret_263 = *(float16x8_t *) &__reint2_263; \ + __ret_263 = __builtin_shufflevector(__ret_263, __ret_263, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret_263; \ }) #endif +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(__aarch64__) +#ifdef __LITTLE_ENDIAN__ +#define vmulh_lane_f16(__p0_264, __p1_264, __p2_264) __extension__ ({ \ + float16_t __s0_264 = __p0_264; \ + float16x4_t __s1_264 = __p1_264; \ + float16_t __ret_264; \ + __ret_264 = __s0_264 * vget_lane_f16(__s1_264, __p2_264); \ + __ret_264; \ +}) +#else +#define vmulh_lane_f16(__p0_265, __p1_265, __p2_265) __extension__ ({ \ + float16_t __s0_265 = __p0_265; \ + float16x4_t __s1_265 = __p1_265; \ + float16x4_t __rev1_265; __rev1_265 = __builtin_shufflevector(__s1_265, __s1_265, 3, 2, 1, 0); \ + float16_t __ret_265; \ + __ret_265 = __s0_265 * __noswap_vget_lane_f16(__rev1_265, __p2_265); \ + __ret_265; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmulh_laneq_f16(__p0_266, __p1_266, __p2_266) __extension__ ({ \ + float16_t __s0_266 = __p0_266; \ + float16x8_t __s1_266 = __p1_266; \ + float16_t __ret_266; \ + __ret_266 = __s0_266 * vgetq_lane_f16(__s1_266, __p2_266); \ + __ret_266; \ +}) +#else +#define vmulh_laneq_f16(__p0_267, __p1_267, __p2_267) __extension__ ({ \ + float16_t __s0_267 = __p0_267; 
\ + float16x8_t __s1_267 = __p1_267; \ + float16x8_t __rev1_267; __rev1_267 = __builtin_shufflevector(__s1_267, __s1_267, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16_t __ret_267; \ + __ret_267 = __s0_267 * __noswap_vgetq_lane_f16(__rev1_267, __p2_267); \ + __ret_267; \ +}) +#endif + +#endif #if defined(__ARM_FEATURE_QRDMX) && defined(__aarch64__) #ifdef __LITTLE_ENDIAN__ __ai int32_t vqrdmlahs_s32(int32_t __p0, int32_t __p1, int32_t __p2) { @@ -67916,86 +71284,86 @@ __ai int16_t vqrdmlahh_s16(int16_t __p0, int16_t __p1, int16_t __p2) { #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmlahs_lane_s32(__p0_250, __p1_250, __p2_250, __p3_250) __extension__ ({ \ - int32_t __s0_250 = __p0_250; \ - int32_t __s1_250 = __p1_250; \ - int32x2_t __s2_250 = __p2_250; \ - int32_t __ret_250; \ - __ret_250 = vqadds_s32(__s0_250, vqrdmulhs_s32(__s1_250, vget_lane_s32(__s2_250, __p3_250))); \ - __ret_250; \ +#define vqrdmlahs_lane_s32(__p0_268, __p1_268, __p2_268, __p3_268) __extension__ ({ \ + int32_t __s0_268 = __p0_268; \ + int32_t __s1_268 = __p1_268; \ + int32x2_t __s2_268 = __p2_268; \ + int32_t __ret_268; \ + __ret_268 = vqadds_s32(__s0_268, vqrdmulhs_s32(__s1_268, vget_lane_s32(__s2_268, __p3_268))); \ + __ret_268; \ }) #else -#define vqrdmlahs_lane_s32(__p0_251, __p1_251, __p2_251, __p3_251) __extension__ ({ \ - int32_t __s0_251 = __p0_251; \ - int32_t __s1_251 = __p1_251; \ - int32x2_t __s2_251 = __p2_251; \ - int32x2_t __rev2_251; __rev2_251 = __builtin_shufflevector(__s2_251, __s2_251, 1, 0); \ - int32_t __ret_251; \ - __ret_251 = __noswap_vqadds_s32(__s0_251, __noswap_vqrdmulhs_s32(__s1_251, __noswap_vget_lane_s32(__rev2_251, __p3_251))); \ - __ret_251; \ +#define vqrdmlahs_lane_s32(__p0_269, __p1_269, __p2_269, __p3_269) __extension__ ({ \ + int32_t __s0_269 = __p0_269; \ + int32_t __s1_269 = __p1_269; \ + int32x2_t __s2_269 = __p2_269; \ + int32x2_t __rev2_269; __rev2_269 = __builtin_shufflevector(__s2_269, __s2_269, 1, 0); \ + int32_t __ret_269; \ + __ret_269 = __noswap_vqadds_s32(__s0_269, __noswap_vqrdmulhs_s32(__s1_269, __noswap_vget_lane_s32(__rev2_269, __p3_269))); \ + __ret_269; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmlahh_lane_s16(__p0_252, __p1_252, __p2_252, __p3_252) __extension__ ({ \ - int16_t __s0_252 = __p0_252; \ - int16_t __s1_252 = __p1_252; \ - int16x4_t __s2_252 = __p2_252; \ - int16_t __ret_252; \ - __ret_252 = vqaddh_s16(__s0_252, vqrdmulhh_s16(__s1_252, vget_lane_s16(__s2_252, __p3_252))); \ - __ret_252; \ +#define vqrdmlahh_lane_s16(__p0_270, __p1_270, __p2_270, __p3_270) __extension__ ({ \ + int16_t __s0_270 = __p0_270; \ + int16_t __s1_270 = __p1_270; \ + int16x4_t __s2_270 = __p2_270; \ + int16_t __ret_270; \ + __ret_270 = vqaddh_s16(__s0_270, vqrdmulhh_s16(__s1_270, vget_lane_s16(__s2_270, __p3_270))); \ + __ret_270; \ }) #else -#define vqrdmlahh_lane_s16(__p0_253, __p1_253, __p2_253, __p3_253) __extension__ ({ \ - int16_t __s0_253 = __p0_253; \ - int16_t __s1_253 = __p1_253; \ - int16x4_t __s2_253 = __p2_253; \ - int16x4_t __rev2_253; __rev2_253 = __builtin_shufflevector(__s2_253, __s2_253, 3, 2, 1, 0); \ - int16_t __ret_253; \ - __ret_253 = __noswap_vqaddh_s16(__s0_253, __noswap_vqrdmulhh_s16(__s1_253, __noswap_vget_lane_s16(__rev2_253, __p3_253))); \ - __ret_253; \ +#define vqrdmlahh_lane_s16(__p0_271, __p1_271, __p2_271, __p3_271) __extension__ ({ \ + int16_t __s0_271 = __p0_271; \ + int16_t __s1_271 = __p1_271; \ + int16x4_t __s2_271 = __p2_271; \ + int16x4_t __rev2_271; __rev2_271 = __builtin_shufflevector(__s2_271, __s2_271, 3, 2, 1, 0); \ + int16_t 
__ret_271; \ + __ret_271 = __noswap_vqaddh_s16(__s0_271, __noswap_vqrdmulhh_s16(__s1_271, __noswap_vget_lane_s16(__rev2_271, __p3_271))); \ + __ret_271; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmlahs_laneq_s32(__p0_254, __p1_254, __p2_254, __p3_254) __extension__ ({ \ - int32_t __s0_254 = __p0_254; \ - int32_t __s1_254 = __p1_254; \ - int32x4_t __s2_254 = __p2_254; \ - int32_t __ret_254; \ - __ret_254 = vqadds_s32(__s0_254, vqrdmulhs_s32(__s1_254, vgetq_lane_s32(__s2_254, __p3_254))); \ - __ret_254; \ +#define vqrdmlahs_laneq_s32(__p0_272, __p1_272, __p2_272, __p3_272) __extension__ ({ \ + int32_t __s0_272 = __p0_272; \ + int32_t __s1_272 = __p1_272; \ + int32x4_t __s2_272 = __p2_272; \ + int32_t __ret_272; \ + __ret_272 = vqadds_s32(__s0_272, vqrdmulhs_s32(__s1_272, vgetq_lane_s32(__s2_272, __p3_272))); \ + __ret_272; \ }) #else -#define vqrdmlahs_laneq_s32(__p0_255, __p1_255, __p2_255, __p3_255) __extension__ ({ \ - int32_t __s0_255 = __p0_255; \ - int32_t __s1_255 = __p1_255; \ - int32x4_t __s2_255 = __p2_255; \ - int32x4_t __rev2_255; __rev2_255 = __builtin_shufflevector(__s2_255, __s2_255, 3, 2, 1, 0); \ - int32_t __ret_255; \ - __ret_255 = __noswap_vqadds_s32(__s0_255, __noswap_vqrdmulhs_s32(__s1_255, __noswap_vgetq_lane_s32(__rev2_255, __p3_255))); \ - __ret_255; \ +#define vqrdmlahs_laneq_s32(__p0_273, __p1_273, __p2_273, __p3_273) __extension__ ({ \ + int32_t __s0_273 = __p0_273; \ + int32_t __s1_273 = __p1_273; \ + int32x4_t __s2_273 = __p2_273; \ + int32x4_t __rev2_273; __rev2_273 = __builtin_shufflevector(__s2_273, __s2_273, 3, 2, 1, 0); \ + int32_t __ret_273; \ + __ret_273 = __noswap_vqadds_s32(__s0_273, __noswap_vqrdmulhs_s32(__s1_273, __noswap_vgetq_lane_s32(__rev2_273, __p3_273))); \ + __ret_273; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmlahh_laneq_s16(__p0_256, __p1_256, __p2_256, __p3_256) __extension__ ({ \ - int16_t __s0_256 = __p0_256; \ - int16_t __s1_256 = __p1_256; \ - int16x8_t __s2_256 = __p2_256; \ - int16_t __ret_256; \ - __ret_256 = vqaddh_s16(__s0_256, vqrdmulhh_s16(__s1_256, vgetq_lane_s16(__s2_256, __p3_256))); \ - __ret_256; \ +#define vqrdmlahh_laneq_s16(__p0_274, __p1_274, __p2_274, __p3_274) __extension__ ({ \ + int16_t __s0_274 = __p0_274; \ + int16_t __s1_274 = __p1_274; \ + int16x8_t __s2_274 = __p2_274; \ + int16_t __ret_274; \ + __ret_274 = vqaddh_s16(__s0_274, vqrdmulhh_s16(__s1_274, vgetq_lane_s16(__s2_274, __p3_274))); \ + __ret_274; \ }) #else -#define vqrdmlahh_laneq_s16(__p0_257, __p1_257, __p2_257, __p3_257) __extension__ ({ \ - int16_t __s0_257 = __p0_257; \ - int16_t __s1_257 = __p1_257; \ - int16x8_t __s2_257 = __p2_257; \ - int16x8_t __rev2_257; __rev2_257 = __builtin_shufflevector(__s2_257, __s2_257, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16_t __ret_257; \ - __ret_257 = __noswap_vqaddh_s16(__s0_257, __noswap_vqrdmulhh_s16(__s1_257, __noswap_vgetq_lane_s16(__rev2_257, __p3_257))); \ - __ret_257; \ +#define vqrdmlahh_laneq_s16(__p0_275, __p1_275, __p2_275, __p3_275) __extension__ ({ \ + int16_t __s0_275 = __p0_275; \ + int16_t __s1_275 = __p1_275; \ + int16x8_t __s2_275 = __p2_275; \ + int16x8_t __rev2_275; __rev2_275 = __builtin_shufflevector(__s2_275, __s2_275, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16_t __ret_275; \ + __ret_275 = __noswap_vqaddh_s16(__s0_275, __noswap_vqrdmulhh_s16(__s1_275, __noswap_vgetq_lane_s16(__rev2_275, __p3_275))); \ + __ret_275; \ }) #endif @@ -68028,86 +71396,86 @@ __ai int16_t vqrdmlshh_s16(int16_t __p0, int16_t __p1, int16_t __p2) { #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmlshs_lane_s32(__p0_258, 
__p1_258, __p2_258, __p3_258) __extension__ ({ \ - int32_t __s0_258 = __p0_258; \ - int32_t __s1_258 = __p1_258; \ - int32x2_t __s2_258 = __p2_258; \ - int32_t __ret_258; \ - __ret_258 = vqsubs_s32(__s0_258, vqrdmulhs_s32(__s1_258, vget_lane_s32(__s2_258, __p3_258))); \ - __ret_258; \ +#define vqrdmlshs_lane_s32(__p0_276, __p1_276, __p2_276, __p3_276) __extension__ ({ \ + int32_t __s0_276 = __p0_276; \ + int32_t __s1_276 = __p1_276; \ + int32x2_t __s2_276 = __p2_276; \ + int32_t __ret_276; \ + __ret_276 = vqsubs_s32(__s0_276, vqrdmulhs_s32(__s1_276, vget_lane_s32(__s2_276, __p3_276))); \ + __ret_276; \ }) #else -#define vqrdmlshs_lane_s32(__p0_259, __p1_259, __p2_259, __p3_259) __extension__ ({ \ - int32_t __s0_259 = __p0_259; \ - int32_t __s1_259 = __p1_259; \ - int32x2_t __s2_259 = __p2_259; \ - int32x2_t __rev2_259; __rev2_259 = __builtin_shufflevector(__s2_259, __s2_259, 1, 0); \ - int32_t __ret_259; \ - __ret_259 = __noswap_vqsubs_s32(__s0_259, __noswap_vqrdmulhs_s32(__s1_259, __noswap_vget_lane_s32(__rev2_259, __p3_259))); \ - __ret_259; \ +#define vqrdmlshs_lane_s32(__p0_277, __p1_277, __p2_277, __p3_277) __extension__ ({ \ + int32_t __s0_277 = __p0_277; \ + int32_t __s1_277 = __p1_277; \ + int32x2_t __s2_277 = __p2_277; \ + int32x2_t __rev2_277; __rev2_277 = __builtin_shufflevector(__s2_277, __s2_277, 1, 0); \ + int32_t __ret_277; \ + __ret_277 = __noswap_vqsubs_s32(__s0_277, __noswap_vqrdmulhs_s32(__s1_277, __noswap_vget_lane_s32(__rev2_277, __p3_277))); \ + __ret_277; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmlshh_lane_s16(__p0_260, __p1_260, __p2_260, __p3_260) __extension__ ({ \ - int16_t __s0_260 = __p0_260; \ - int16_t __s1_260 = __p1_260; \ - int16x4_t __s2_260 = __p2_260; \ - int16_t __ret_260; \ - __ret_260 = vqsubh_s16(__s0_260, vqrdmulhh_s16(__s1_260, vget_lane_s16(__s2_260, __p3_260))); \ - __ret_260; \ +#define vqrdmlshh_lane_s16(__p0_278, __p1_278, __p2_278, __p3_278) __extension__ ({ \ + int16_t __s0_278 = __p0_278; \ + int16_t __s1_278 = __p1_278; \ + int16x4_t __s2_278 = __p2_278; \ + int16_t __ret_278; \ + __ret_278 = vqsubh_s16(__s0_278, vqrdmulhh_s16(__s1_278, vget_lane_s16(__s2_278, __p3_278))); \ + __ret_278; \ }) #else -#define vqrdmlshh_lane_s16(__p0_261, __p1_261, __p2_261, __p3_261) __extension__ ({ \ - int16_t __s0_261 = __p0_261; \ - int16_t __s1_261 = __p1_261; \ - int16x4_t __s2_261 = __p2_261; \ - int16x4_t __rev2_261; __rev2_261 = __builtin_shufflevector(__s2_261, __s2_261, 3, 2, 1, 0); \ - int16_t __ret_261; \ - __ret_261 = __noswap_vqsubh_s16(__s0_261, __noswap_vqrdmulhh_s16(__s1_261, __noswap_vget_lane_s16(__rev2_261, __p3_261))); \ - __ret_261; \ +#define vqrdmlshh_lane_s16(__p0_279, __p1_279, __p2_279, __p3_279) __extension__ ({ \ + int16_t __s0_279 = __p0_279; \ + int16_t __s1_279 = __p1_279; \ + int16x4_t __s2_279 = __p2_279; \ + int16x4_t __rev2_279; __rev2_279 = __builtin_shufflevector(__s2_279, __s2_279, 3, 2, 1, 0); \ + int16_t __ret_279; \ + __ret_279 = __noswap_vqsubh_s16(__s0_279, __noswap_vqrdmulhh_s16(__s1_279, __noswap_vget_lane_s16(__rev2_279, __p3_279))); \ + __ret_279; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmlshs_laneq_s32(__p0_262, __p1_262, __p2_262, __p3_262) __extension__ ({ \ - int32_t __s0_262 = __p0_262; \ - int32_t __s1_262 = __p1_262; \ - int32x4_t __s2_262 = __p2_262; \ - int32_t __ret_262; \ - __ret_262 = vqsubs_s32(__s0_262, vqrdmulhs_s32(__s1_262, vgetq_lane_s32(__s2_262, __p3_262))); \ - __ret_262; \ +#define vqrdmlshs_laneq_s32(__p0_280, __p1_280, __p2_280, __p3_280) __extension__ ({ \ + int32_t 
__s0_280 = __p0_280; \ + int32_t __s1_280 = __p1_280; \ + int32x4_t __s2_280 = __p2_280; \ + int32_t __ret_280; \ + __ret_280 = vqsubs_s32(__s0_280, vqrdmulhs_s32(__s1_280, vgetq_lane_s32(__s2_280, __p3_280))); \ + __ret_280; \ }) #else -#define vqrdmlshs_laneq_s32(__p0_263, __p1_263, __p2_263, __p3_263) __extension__ ({ \ - int32_t __s0_263 = __p0_263; \ - int32_t __s1_263 = __p1_263; \ - int32x4_t __s2_263 = __p2_263; \ - int32x4_t __rev2_263; __rev2_263 = __builtin_shufflevector(__s2_263, __s2_263, 3, 2, 1, 0); \ - int32_t __ret_263; \ - __ret_263 = __noswap_vqsubs_s32(__s0_263, __noswap_vqrdmulhs_s32(__s1_263, __noswap_vgetq_lane_s32(__rev2_263, __p3_263))); \ - __ret_263; \ +#define vqrdmlshs_laneq_s32(__p0_281, __p1_281, __p2_281, __p3_281) __extension__ ({ \ + int32_t __s0_281 = __p0_281; \ + int32_t __s1_281 = __p1_281; \ + int32x4_t __s2_281 = __p2_281; \ + int32x4_t __rev2_281; __rev2_281 = __builtin_shufflevector(__s2_281, __s2_281, 3, 2, 1, 0); \ + int32_t __ret_281; \ + __ret_281 = __noswap_vqsubs_s32(__s0_281, __noswap_vqrdmulhs_s32(__s1_281, __noswap_vgetq_lane_s32(__rev2_281, __p3_281))); \ + __ret_281; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vqrdmlshh_laneq_s16(__p0_264, __p1_264, __p2_264, __p3_264) __extension__ ({ \ - int16_t __s0_264 = __p0_264; \ - int16_t __s1_264 = __p1_264; \ - int16x8_t __s2_264 = __p2_264; \ - int16_t __ret_264; \ - __ret_264 = vqsubh_s16(__s0_264, vqrdmulhh_s16(__s1_264, vgetq_lane_s16(__s2_264, __p3_264))); \ - __ret_264; \ +#define vqrdmlshh_laneq_s16(__p0_282, __p1_282, __p2_282, __p3_282) __extension__ ({ \ + int16_t __s0_282 = __p0_282; \ + int16_t __s1_282 = __p1_282; \ + int16x8_t __s2_282 = __p2_282; \ + int16_t __ret_282; \ + __ret_282 = vqsubh_s16(__s0_282, vqrdmulhh_s16(__s1_282, vgetq_lane_s16(__s2_282, __p3_282))); \ + __ret_282; \ }) #else -#define vqrdmlshh_laneq_s16(__p0_265, __p1_265, __p2_265, __p3_265) __extension__ ({ \ - int16_t __s0_265 = __p0_265; \ - int16_t __s1_265 = __p1_265; \ - int16x8_t __s2_265 = __p2_265; \ - int16x8_t __rev2_265; __rev2_265 = __builtin_shufflevector(__s2_265, __s2_265, 7, 6, 5, 4, 3, 2, 1, 0); \ - int16_t __ret_265; \ - __ret_265 = __noswap_vqsubh_s16(__s0_265, __noswap_vqrdmulhh_s16(__s1_265, __noswap_vgetq_lane_s16(__rev2_265, __p3_265))); \ - __ret_265; \ +#define vqrdmlshh_laneq_s16(__p0_283, __p1_283, __p2_283, __p3_283) __extension__ ({ \ + int16_t __s0_283 = __p0_283; \ + int16_t __s1_283 = __p1_283; \ + int16x8_t __s2_283 = __p2_283; \ + int16x8_t __rev2_283; __rev2_283 = __builtin_shufflevector(__s2_283, __s2_283, 7, 6, 5, 4, 3, 2, 1, 0); \ + int16_t __ret_283; \ + __ret_283 = __noswap_vqsubh_s16(__s0_283, __noswap_vqrdmulhh_s16(__s1_283, __noswap_vgetq_lane_s16(__rev2_283, __p3_283))); \ + __ret_283; \ }) #endif @@ -68420,158 +71788,158 @@ __ai int32x4_t vaddw_high_s16(int32x4_t __p0, int16x8_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_p64(__p0_266, __p1_266, __p2_266, __p3_266) __extension__ ({ \ - poly64x2_t __s0_266 = __p0_266; \ - poly64x1_t __s2_266 = __p2_266; \ - poly64x2_t __ret_266; \ - __ret_266 = vsetq_lane_p64(vget_lane_p64(__s2_266, __p3_266), __s0_266, __p1_266); \ - __ret_266; \ +#define vcopyq_lane_p64(__p0_284, __p1_284, __p2_284, __p3_284) __extension__ ({ \ + poly64x2_t __s0_284 = __p0_284; \ + poly64x1_t __s2_284 = __p2_284; \ + poly64x2_t __ret_284; \ + __ret_284 = vsetq_lane_p64(vget_lane_p64(__s2_284, __p3_284), __s0_284, __p1_284); \ + __ret_284; \ }) #else -#define vcopyq_lane_p64(__p0_267, __p1_267, __p2_267, __p3_267) __extension__ ({ 
\ - poly64x2_t __s0_267 = __p0_267; \ - poly64x1_t __s2_267 = __p2_267; \ - poly64x2_t __rev0_267; __rev0_267 = __builtin_shufflevector(__s0_267, __s0_267, 1, 0); \ - poly64x2_t __ret_267; \ - __ret_267 = __noswap_vsetq_lane_p64(__noswap_vget_lane_p64(__s2_267, __p3_267), __rev0_267, __p1_267); \ - __ret_267 = __builtin_shufflevector(__ret_267, __ret_267, 1, 0); \ - __ret_267; \ +#define vcopyq_lane_p64(__p0_285, __p1_285, __p2_285, __p3_285) __extension__ ({ \ + poly64x2_t __s0_285 = __p0_285; \ + poly64x1_t __s2_285 = __p2_285; \ + poly64x2_t __rev0_285; __rev0_285 = __builtin_shufflevector(__s0_285, __s0_285, 1, 0); \ + poly64x2_t __ret_285; \ + __ret_285 = __noswap_vsetq_lane_p64(__noswap_vget_lane_p64(__s2_285, __p3_285), __rev0_285, __p1_285); \ + __ret_285 = __builtin_shufflevector(__ret_285, __ret_285, 1, 0); \ + __ret_285; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_lane_f64(__p0_268, __p1_268, __p2_268, __p3_268) __extension__ ({ \ - float64x2_t __s0_268 = __p0_268; \ - float64x1_t __s2_268 = __p2_268; \ - float64x2_t __ret_268; \ - __ret_268 = vsetq_lane_f64(vget_lane_f64(__s2_268, __p3_268), __s0_268, __p1_268); \ - __ret_268; \ +#define vcopyq_lane_f64(__p0_286, __p1_286, __p2_286, __p3_286) __extension__ ({ \ + float64x2_t __s0_286 = __p0_286; \ + float64x1_t __s2_286 = __p2_286; \ + float64x2_t __ret_286; \ + __ret_286 = vsetq_lane_f64(vget_lane_f64(__s2_286, __p3_286), __s0_286, __p1_286); \ + __ret_286; \ }) #else -#define vcopyq_lane_f64(__p0_269, __p1_269, __p2_269, __p3_269) __extension__ ({ \ - float64x2_t __s0_269 = __p0_269; \ - float64x1_t __s2_269 = __p2_269; \ - float64x2_t __rev0_269; __rev0_269 = __builtin_shufflevector(__s0_269, __s0_269, 1, 0); \ - float64x2_t __ret_269; \ - __ret_269 = __noswap_vsetq_lane_f64(__noswap_vget_lane_f64(__s2_269, __p3_269), __rev0_269, __p1_269); \ - __ret_269 = __builtin_shufflevector(__ret_269, __ret_269, 1, 0); \ - __ret_269; \ +#define vcopyq_lane_f64(__p0_287, __p1_287, __p2_287, __p3_287) __extension__ ({ \ + float64x2_t __s0_287 = __p0_287; \ + float64x1_t __s2_287 = __p2_287; \ + float64x2_t __rev0_287; __rev0_287 = __builtin_shufflevector(__s0_287, __s0_287, 1, 0); \ + float64x2_t __ret_287; \ + __ret_287 = __noswap_vsetq_lane_f64(__noswap_vget_lane_f64(__s2_287, __p3_287), __rev0_287, __p1_287); \ + __ret_287 = __builtin_shufflevector(__ret_287, __ret_287, 1, 0); \ + __ret_287; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_lane_p64(__p0_270, __p1_270, __p2_270, __p3_270) __extension__ ({ \ - poly64x1_t __s0_270 = __p0_270; \ - poly64x1_t __s2_270 = __p2_270; \ - poly64x1_t __ret_270; \ - __ret_270 = vset_lane_p64(vget_lane_p64(__s2_270, __p3_270), __s0_270, __p1_270); \ - __ret_270; \ +#define vcopy_lane_p64(__p0_288, __p1_288, __p2_288, __p3_288) __extension__ ({ \ + poly64x1_t __s0_288 = __p0_288; \ + poly64x1_t __s2_288 = __p2_288; \ + poly64x1_t __ret_288; \ + __ret_288 = vset_lane_p64(vget_lane_p64(__s2_288, __p3_288), __s0_288, __p1_288); \ + __ret_288; \ }) #else -#define vcopy_lane_p64(__p0_271, __p1_271, __p2_271, __p3_271) __extension__ ({ \ - poly64x1_t __s0_271 = __p0_271; \ - poly64x1_t __s2_271 = __p2_271; \ - poly64x1_t __ret_271; \ - __ret_271 = __noswap_vset_lane_p64(__noswap_vget_lane_p64(__s2_271, __p3_271), __s0_271, __p1_271); \ - __ret_271; \ +#define vcopy_lane_p64(__p0_289, __p1_289, __p2_289, __p3_289) __extension__ ({ \ + poly64x1_t __s0_289 = __p0_289; \ + poly64x1_t __s2_289 = __p2_289; \ + poly64x1_t __ret_289; \ + __ret_289 = 
__noswap_vset_lane_p64(__noswap_vget_lane_p64(__s2_289, __p3_289), __s0_289, __p1_289); \ + __ret_289; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_lane_f64(__p0_272, __p1_272, __p2_272, __p3_272) __extension__ ({ \ - float64x1_t __s0_272 = __p0_272; \ - float64x1_t __s2_272 = __p2_272; \ - float64x1_t __ret_272; \ - __ret_272 = vset_lane_f64(vget_lane_f64(__s2_272, __p3_272), __s0_272, __p1_272); \ - __ret_272; \ +#define vcopy_lane_f64(__p0_290, __p1_290, __p2_290, __p3_290) __extension__ ({ \ + float64x1_t __s0_290 = __p0_290; \ + float64x1_t __s2_290 = __p2_290; \ + float64x1_t __ret_290; \ + __ret_290 = vset_lane_f64(vget_lane_f64(__s2_290, __p3_290), __s0_290, __p1_290); \ + __ret_290; \ }) #else -#define vcopy_lane_f64(__p0_273, __p1_273, __p2_273, __p3_273) __extension__ ({ \ - float64x1_t __s0_273 = __p0_273; \ - float64x1_t __s2_273 = __p2_273; \ - float64x1_t __ret_273; \ - __ret_273 = __noswap_vset_lane_f64(__noswap_vget_lane_f64(__s2_273, __p3_273), __s0_273, __p1_273); \ - __ret_273; \ +#define vcopy_lane_f64(__p0_291, __p1_291, __p2_291, __p3_291) __extension__ ({ \ + float64x1_t __s0_291 = __p0_291; \ + float64x1_t __s2_291 = __p2_291; \ + float64x1_t __ret_291; \ + __ret_291 = __noswap_vset_lane_f64(__noswap_vget_lane_f64(__s2_291, __p3_291), __s0_291, __p1_291); \ + __ret_291; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_p64(__p0_274, __p1_274, __p2_274, __p3_274) __extension__ ({ \ - poly64x2_t __s0_274 = __p0_274; \ - poly64x2_t __s2_274 = __p2_274; \ - poly64x2_t __ret_274; \ - __ret_274 = vsetq_lane_p64(vgetq_lane_p64(__s2_274, __p3_274), __s0_274, __p1_274); \ - __ret_274; \ +#define vcopyq_laneq_p64(__p0_292, __p1_292, __p2_292, __p3_292) __extension__ ({ \ + poly64x2_t __s0_292 = __p0_292; \ + poly64x2_t __s2_292 = __p2_292; \ + poly64x2_t __ret_292; \ + __ret_292 = vsetq_lane_p64(vgetq_lane_p64(__s2_292, __p3_292), __s0_292, __p1_292); \ + __ret_292; \ }) #else -#define vcopyq_laneq_p64(__p0_275, __p1_275, __p2_275, __p3_275) __extension__ ({ \ - poly64x2_t __s0_275 = __p0_275; \ - poly64x2_t __s2_275 = __p2_275; \ - poly64x2_t __rev0_275; __rev0_275 = __builtin_shufflevector(__s0_275, __s0_275, 1, 0); \ - poly64x2_t __rev2_275; __rev2_275 = __builtin_shufflevector(__s2_275, __s2_275, 1, 0); \ - poly64x2_t __ret_275; \ - __ret_275 = __noswap_vsetq_lane_p64(__noswap_vgetq_lane_p64(__rev2_275, __p3_275), __rev0_275, __p1_275); \ - __ret_275 = __builtin_shufflevector(__ret_275, __ret_275, 1, 0); \ - __ret_275; \ +#define vcopyq_laneq_p64(__p0_293, __p1_293, __p2_293, __p3_293) __extension__ ({ \ + poly64x2_t __s0_293 = __p0_293; \ + poly64x2_t __s2_293 = __p2_293; \ + poly64x2_t __rev0_293; __rev0_293 = __builtin_shufflevector(__s0_293, __s0_293, 1, 0); \ + poly64x2_t __rev2_293; __rev2_293 = __builtin_shufflevector(__s2_293, __s2_293, 1, 0); \ + poly64x2_t __ret_293; \ + __ret_293 = __noswap_vsetq_lane_p64(__noswap_vgetq_lane_p64(__rev2_293, __p3_293), __rev0_293, __p1_293); \ + __ret_293 = __builtin_shufflevector(__ret_293, __ret_293, 1, 0); \ + __ret_293; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopyq_laneq_f64(__p0_276, __p1_276, __p2_276, __p3_276) __extension__ ({ \ - float64x2_t __s0_276 = __p0_276; \ - float64x2_t __s2_276 = __p2_276; \ - float64x2_t __ret_276; \ - __ret_276 = vsetq_lane_f64(vgetq_lane_f64(__s2_276, __p3_276), __s0_276, __p1_276); \ - __ret_276; \ +#define vcopyq_laneq_f64(__p0_294, __p1_294, __p2_294, __p3_294) __extension__ ({ \ + float64x2_t __s0_294 = __p0_294; \ + float64x2_t __s2_294 = __p2_294; \ + 
float64x2_t __ret_294; \ + __ret_294 = vsetq_lane_f64(vgetq_lane_f64(__s2_294, __p3_294), __s0_294, __p1_294); \ + __ret_294; \ }) #else -#define vcopyq_laneq_f64(__p0_277, __p1_277, __p2_277, __p3_277) __extension__ ({ \ - float64x2_t __s0_277 = __p0_277; \ - float64x2_t __s2_277 = __p2_277; \ - float64x2_t __rev0_277; __rev0_277 = __builtin_shufflevector(__s0_277, __s0_277, 1, 0); \ - float64x2_t __rev2_277; __rev2_277 = __builtin_shufflevector(__s2_277, __s2_277, 1, 0); \ - float64x2_t __ret_277; \ - __ret_277 = __noswap_vsetq_lane_f64(__noswap_vgetq_lane_f64(__rev2_277, __p3_277), __rev0_277, __p1_277); \ - __ret_277 = __builtin_shufflevector(__ret_277, __ret_277, 1, 0); \ - __ret_277; \ +#define vcopyq_laneq_f64(__p0_295, __p1_295, __p2_295, __p3_295) __extension__ ({ \ + float64x2_t __s0_295 = __p0_295; \ + float64x2_t __s2_295 = __p2_295; \ + float64x2_t __rev0_295; __rev0_295 = __builtin_shufflevector(__s0_295, __s0_295, 1, 0); \ + float64x2_t __rev2_295; __rev2_295 = __builtin_shufflevector(__s2_295, __s2_295, 1, 0); \ + float64x2_t __ret_295; \ + __ret_295 = __noswap_vsetq_lane_f64(__noswap_vgetq_lane_f64(__rev2_295, __p3_295), __rev0_295, __p1_295); \ + __ret_295 = __builtin_shufflevector(__ret_295, __ret_295, 1, 0); \ + __ret_295; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_p64(__p0_278, __p1_278, __p2_278, __p3_278) __extension__ ({ \ - poly64x1_t __s0_278 = __p0_278; \ - poly64x2_t __s2_278 = __p2_278; \ - poly64x1_t __ret_278; \ - __ret_278 = vset_lane_p64(vgetq_lane_p64(__s2_278, __p3_278), __s0_278, __p1_278); \ - __ret_278; \ +#define vcopy_laneq_p64(__p0_296, __p1_296, __p2_296, __p3_296) __extension__ ({ \ + poly64x1_t __s0_296 = __p0_296; \ + poly64x2_t __s2_296 = __p2_296; \ + poly64x1_t __ret_296; \ + __ret_296 = vset_lane_p64(vgetq_lane_p64(__s2_296, __p3_296), __s0_296, __p1_296); \ + __ret_296; \ }) #else -#define vcopy_laneq_p64(__p0_279, __p1_279, __p2_279, __p3_279) __extension__ ({ \ - poly64x1_t __s0_279 = __p0_279; \ - poly64x2_t __s2_279 = __p2_279; \ - poly64x2_t __rev2_279; __rev2_279 = __builtin_shufflevector(__s2_279, __s2_279, 1, 0); \ - poly64x1_t __ret_279; \ - __ret_279 = __noswap_vset_lane_p64(__noswap_vgetq_lane_p64(__rev2_279, __p3_279), __s0_279, __p1_279); \ - __ret_279; \ +#define vcopy_laneq_p64(__p0_297, __p1_297, __p2_297, __p3_297) __extension__ ({ \ + poly64x1_t __s0_297 = __p0_297; \ + poly64x2_t __s2_297 = __p2_297; \ + poly64x2_t __rev2_297; __rev2_297 = __builtin_shufflevector(__s2_297, __s2_297, 1, 0); \ + poly64x1_t __ret_297; \ + __ret_297 = __noswap_vset_lane_p64(__noswap_vgetq_lane_p64(__rev2_297, __p3_297), __s0_297, __p1_297); \ + __ret_297; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vcopy_laneq_f64(__p0_280, __p1_280, __p2_280, __p3_280) __extension__ ({ \ - float64x1_t __s0_280 = __p0_280; \ - float64x2_t __s2_280 = __p2_280; \ - float64x1_t __ret_280; \ - __ret_280 = vset_lane_f64(vgetq_lane_f64(__s2_280, __p3_280), __s0_280, __p1_280); \ - __ret_280; \ +#define vcopy_laneq_f64(__p0_298, __p1_298, __p2_298, __p3_298) __extension__ ({ \ + float64x1_t __s0_298 = __p0_298; \ + float64x2_t __s2_298 = __p2_298; \ + float64x1_t __ret_298; \ + __ret_298 = vset_lane_f64(vgetq_lane_f64(__s2_298, __p3_298), __s0_298, __p1_298); \ + __ret_298; \ }) #else -#define vcopy_laneq_f64(__p0_281, __p1_281, __p2_281, __p3_281) __extension__ ({ \ - float64x1_t __s0_281 = __p0_281; \ - float64x2_t __s2_281 = __p2_281; \ - float64x2_t __rev2_281; __rev2_281 = __builtin_shufflevector(__s2_281, __s2_281, 1, 0); \ - float64x1_t 
__ret_281; \ - __ret_281 = __noswap_vset_lane_f64(__noswap_vgetq_lane_f64(__rev2_281, __p3_281), __s0_281, __p1_281); \ - __ret_281; \ +#define vcopy_laneq_f64(__p0_299, __p1_299, __p2_299, __p3_299) __extension__ ({ \ + float64x1_t __s0_299 = __p0_299; \ + float64x2_t __s2_299 = __p2_299; \ + float64x2_t __rev2_299; __rev2_299 = __builtin_shufflevector(__s2_299, __s2_299, 1, 0); \ + float64x1_t __ret_299; \ + __ret_299 = __noswap_vset_lane_f64(__noswap_vgetq_lane_f64(__rev2_299, __p3_299), __s0_299, __p1_299); \ + __ret_299; \ }) #endif @@ -68928,51 +72296,51 @@ __ai int32x4_t vmlsl_high_n_s16(int32x4_t __p0, int16x8_t __p1, int16_t __p2) { #endif #ifdef __LITTLE_ENDIAN__ -#define vmulx_lane_f64(__p0_282, __p1_282, __p2_282) __extension__ ({ \ - float64x1_t __s0_282 = __p0_282; \ - float64x1_t __s1_282 = __p1_282; \ - float64x1_t __ret_282; \ - float64_t __x_282 = vget_lane_f64(__s0_282, 0); \ - float64_t __y_282 = vget_lane_f64(__s1_282, __p2_282); \ - float64_t __z_282 = vmulxd_f64(__x_282, __y_282); \ - __ret_282 = vset_lane_f64(__z_282, __s0_282, __p2_282); \ - __ret_282; \ +#define vmulx_lane_f64(__p0_300, __p1_300, __p2_300) __extension__ ({ \ + float64x1_t __s0_300 = __p0_300; \ + float64x1_t __s1_300 = __p1_300; \ + float64x1_t __ret_300; \ + float64_t __x_300 = vget_lane_f64(__s0_300, 0); \ + float64_t __y_300 = vget_lane_f64(__s1_300, __p2_300); \ + float64_t __z_300 = vmulxd_f64(__x_300, __y_300); \ + __ret_300 = vset_lane_f64(__z_300, __s0_300, __p2_300); \ + __ret_300; \ }) #else -#define vmulx_lane_f64(__p0_283, __p1_283, __p2_283) __extension__ ({ \ - float64x1_t __s0_283 = __p0_283; \ - float64x1_t __s1_283 = __p1_283; \ - float64x1_t __ret_283; \ - float64_t __x_283 = __noswap_vget_lane_f64(__s0_283, 0); \ - float64_t __y_283 = __noswap_vget_lane_f64(__s1_283, __p2_283); \ - float64_t __z_283 = __noswap_vmulxd_f64(__x_283, __y_283); \ - __ret_283 = __noswap_vset_lane_f64(__z_283, __s0_283, __p2_283); \ - __ret_283; \ +#define vmulx_lane_f64(__p0_301, __p1_301, __p2_301) __extension__ ({ \ + float64x1_t __s0_301 = __p0_301; \ + float64x1_t __s1_301 = __p1_301; \ + float64x1_t __ret_301; \ + float64_t __x_301 = __noswap_vget_lane_f64(__s0_301, 0); \ + float64_t __y_301 = __noswap_vget_lane_f64(__s1_301, __p2_301); \ + float64_t __z_301 = __noswap_vmulxd_f64(__x_301, __y_301); \ + __ret_301 = __noswap_vset_lane_f64(__z_301, __s0_301, __p2_301); \ + __ret_301; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vmulx_laneq_f64(__p0_284, __p1_284, __p2_284) __extension__ ({ \ - float64x1_t __s0_284 = __p0_284; \ - float64x2_t __s1_284 = __p1_284; \ - float64x1_t __ret_284; \ - float64_t __x_284 = vget_lane_f64(__s0_284, 0); \ - float64_t __y_284 = vgetq_lane_f64(__s1_284, __p2_284); \ - float64_t __z_284 = vmulxd_f64(__x_284, __y_284); \ - __ret_284 = vset_lane_f64(__z_284, __s0_284, 0); \ - __ret_284; \ +#define vmulx_laneq_f64(__p0_302, __p1_302, __p2_302) __extension__ ({ \ + float64x1_t __s0_302 = __p0_302; \ + float64x2_t __s1_302 = __p1_302; \ + float64x1_t __ret_302; \ + float64_t __x_302 = vget_lane_f64(__s0_302, 0); \ + float64_t __y_302 = vgetq_lane_f64(__s1_302, __p2_302); \ + float64_t __z_302 = vmulxd_f64(__x_302, __y_302); \ + __ret_302 = vset_lane_f64(__z_302, __s0_302, 0); \ + __ret_302; \ }) #else -#define vmulx_laneq_f64(__p0_285, __p1_285, __p2_285) __extension__ ({ \ - float64x1_t __s0_285 = __p0_285; \ - float64x2_t __s1_285 = __p1_285; \ - float64x2_t __rev1_285; __rev1_285 = __builtin_shufflevector(__s1_285, __s1_285, 1, 0); \ - float64x1_t __ret_285; \ - 
float64_t __x_285 = __noswap_vget_lane_f64(__s0_285, 0); \ - float64_t __y_285 = __noswap_vgetq_lane_f64(__rev1_285, __p2_285); \ - float64_t __z_285 = __noswap_vmulxd_f64(__x_285, __y_285); \ - __ret_285 = __noswap_vset_lane_f64(__z_285, __s0_285, 0); \ - __ret_285; \ +#define vmulx_laneq_f64(__p0_303, __p1_303, __p2_303) __extension__ ({ \ + float64x1_t __s0_303 = __p0_303; \ + float64x2_t __s1_303 = __p1_303; \ + float64x2_t __rev1_303; __rev1_303 = __builtin_shufflevector(__s1_303, __s1_303, 1, 0); \ + float64x1_t __ret_303; \ + float64_t __x_303 = __noswap_vget_lane_f64(__s0_303, 0); \ + float64_t __y_303 = __noswap_vgetq_lane_f64(__rev1_303, __p2_303); \ + float64_t __z_303 = __noswap_vmulxd_f64(__x_303, __y_303); \ + __ret_303 = __noswap_vset_lane_f64(__z_303, __s0_303, 0); \ + __ret_303; \ }) #endif diff --git a/c_headers/avx512bitalgintrin.h b/c_headers/avx512bitalgintrin.h new file mode 100644 index 0000000000..2dd1471d2f --- /dev/null +++ b/c_headers/avx512bitalgintrin.h @@ -0,0 +1,97 @@ +/*===------------- avx512bitalgintrin.h - BITALG intrinsics ------------------=== + * + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use <avx512bitalgintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef __AVX512BITALGINTRIN_H +#define __AVX512BITALGINTRIN_H + +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bitalg"))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_popcnt_epi16(__m512i __A) +{ + return (__m512i) __builtin_ia32_vpopcntw_512((__v32hi) __A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_popcnt_epi16(__m512i __A, __mmask32 __U, __m512i __B) +{ + return (__m512i) __builtin_ia32_selectw_512((__mmask32) __U, + (__v32hi) _mm512_popcnt_epi16(__B), + (__v32hi) __A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __B) +{ + return _mm512_mask_popcnt_epi16((__m512i) _mm512_setzero_hi(), + __U, + __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_popcnt_epi8(__m512i __A) +{ + return (__m512i) __builtin_ia32_vpopcntb_512((__v64qi) __A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_popcnt_epi8(__m512i __A, __mmask64 __U, __m512i __B) +{ + return (__m512i) __builtin_ia32_selectb_512((__mmask64) __U, + (__v64qi) _mm512_popcnt_epi8(__B), + (__v64qi) __A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_popcnt_epi8(__mmask64 __U, __m512i __B) +{ + return _mm512_mask_popcnt_epi8((__m512i) _mm512_setzero_qi(), + __U, + __B); +} + +static __inline__ __mmask64 __DEFAULT_FN_ATTRS +_mm512_mask_bitshuffle_epi64_mask(__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__mmask64) __builtin_ia32_vpshufbitqmb512_mask((__v64qi) __A, + (__v64qi) __B, + __U); +} + +static __inline__ __mmask64 __DEFAULT_FN_ATTRS +_mm512_bitshuffle_epi64_mask(__m512i __A, __m512i __B) +{ + return _mm512_mask_bitshuffle_epi64_mask((__mmask64) -1, + __A, + __B); +} + + +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/c_headers/avx512bwintrin.h b/c_headers/avx512bwintrin.h index 53da5869d3..3ff0e3aafd 100644 --- a/c_headers/avx512bwintrin.h +++ b/c_headers/avx512bwintrin.h @@ -56,293 +56,145 @@ _mm512_setzero_hi(void) { /* Integer compare */ -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_cmpeq_epi8_mask(__m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_pcmpeqb512_mask((__v64qi)__a, (__v64qi)__b, - (__mmask64)-1); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_mask_cmpeq_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_pcmpeqb512_mask((__v64qi)__a, (__v64qi)__b, - __u); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_cmpeq_epu8_mask(__m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 0, - (__mmask64)-1); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_mask_cmpeq_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 0, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_cmpeq_epi16_mask(__m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_pcmpeqw512_mask((__v32hi)__a, (__v32hi)__b, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_mask_cmpeq_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_pcmpeqw512_mask((__v32hi)__a, (__v32hi)__b, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_cmpeq_epu16_mask(__m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 0, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_mask_cmpeq_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) { - return 
(__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 0, - __u); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_cmpge_epi8_mask(__m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 5, - (__mmask64)-1); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_mask_cmpge_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 5, - __u); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_cmpge_epu8_mask(__m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 5, - (__mmask64)-1); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_mask_cmpge_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 5, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_cmpge_epi16_mask(__m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 5, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_mask_cmpge_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 5, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_cmpge_epu16_mask(__m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 5, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_mask_cmpge_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 5, - __u); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_cmpgt_epi8_mask(__m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_pcmpgtb512_mask((__v64qi)__a, (__v64qi)__b, - (__mmask64)-1); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_mask_cmpgt_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_pcmpgtb512_mask((__v64qi)__a, (__v64qi)__b, - __u); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_cmpgt_epu8_mask(__m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 6, - (__mmask64)-1); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_mask_cmpgt_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 6, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_cmpgt_epi16_mask(__m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_pcmpgtw512_mask((__v32hi)__a, (__v32hi)__b, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_mask_cmpgt_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_pcmpgtw512_mask((__v32hi)__a, (__v32hi)__b, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_cmpgt_epu16_mask(__m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 6, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_mask_cmpgt_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 6, - __u); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_cmple_epi8_mask(__m512i __a, __m512i __b) { - return 
(__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 2, - (__mmask64)-1); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_mask_cmple_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 2, - __u); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_cmple_epu8_mask(__m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 2, - (__mmask64)-1); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_mask_cmple_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 2, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_cmple_epi16_mask(__m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 2, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_mask_cmple_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 2, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_cmple_epu16_mask(__m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 2, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_mask_cmple_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 2, - __u); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_cmplt_epi8_mask(__m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 1, - (__mmask64)-1); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_mask_cmplt_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 1, - __u); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_cmplt_epu8_mask(__m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 1, - (__mmask64)-1); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_mask_cmplt_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 1, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_cmplt_epi16_mask(__m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 1, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_mask_cmplt_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 1, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_cmplt_epu16_mask(__m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 1, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_mask_cmplt_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 1, - __u); -} +#define _mm512_cmp_epi8_mask(a, b, p) __extension__ ({ \ + (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \ + (__v64qi)(__m512i)(b), (int)(p), \ + (__mmask64)-1); }) -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_cmpneq_epi8_mask(__m512i __a, __m512i __b) { - return 
(__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 4, - (__mmask64)-1); -} +#define _mm512_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \ + (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \ + (__v64qi)(__m512i)(b), (int)(p), \ + (__mmask64)(m)); }) -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_mask_cmpneq_epi8_mask(__mmask64 __u, __m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)__a, (__v64qi)__b, 4, - __u); -} +#define _mm512_cmp_epu8_mask(a, b, p) __extension__ ({ \ + (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \ + (__v64qi)(__m512i)(b), (int)(p), \ + (__mmask64)-1); }) -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_cmpneq_epu8_mask(__m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 4, - (__mmask64)-1); -} +#define _mm512_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \ + (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \ + (__v64qi)(__m512i)(b), (int)(p), \ + (__mmask64)(m)); }) -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_mask_cmpneq_epu8_mask(__mmask64 __u, __m512i __a, __m512i __b) { - return (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)__a, (__v64qi)__b, 4, - __u); -} +#define _mm512_cmp_epi16_mask(a, b, p) __extension__ ({ \ + (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \ + (__v32hi)(__m512i)(b), (int)(p), \ + (__mmask32)-1); }) -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_cmpneq_epi16_mask(__m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 4, - (__mmask32)-1); -} +#define _mm512_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \ + (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \ + (__v32hi)(__m512i)(b), (int)(p), \ + (__mmask32)(m)); }) -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_mask_cmpneq_epi16_mask(__mmask32 __u, __m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)__a, (__v32hi)__b, 4, - __u); -} +#define _mm512_cmp_epu16_mask(a, b, p) __extension__ ({ \ + (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \ + (__v32hi)(__m512i)(b), (int)(p), \ + (__mmask32)-1); }) -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_cmpneq_epu16_mask(__m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 4, - (__mmask32)-1); -} +#define _mm512_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \ + (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \ + (__v32hi)(__m512i)(b), (int)(p), \ + (__mmask32)(m)); }) -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_mask_cmpneq_epu16_mask(__mmask32 __u, __m512i __a, __m512i __b) { - return (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)__a, (__v32hi)__b, 4, - __u); -} +#define _mm512_cmpeq_epi8_mask(A, B) \ + _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ) +#define _mm512_mask_cmpeq_epi8_mask(k, A, B) \ + _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm512_cmpge_epi8_mask(A, B) \ + _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_GE) +#define _mm512_mask_cmpge_epi8_mask(k, A, B) \ + _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm512_cmpgt_epi8_mask(A, B) \ + _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_GT) +#define _mm512_mask_cmpgt_epi8_mask(k, A, B) \ + _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm512_cmple_epi8_mask(A, B) \ + _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_LE) +#define 
_mm512_mask_cmple_epi8_mask(k, A, B) \ + _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm512_cmplt_epi8_mask(A, B) \ + _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_LT) +#define _mm512_mask_cmplt_epi8_mask(k, A, B) \ + _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm512_cmpneq_epi8_mask(A, B) \ + _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_NE) +#define _mm512_mask_cmpneq_epi8_mask(k, A, B) \ + _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm512_cmpeq_epu8_mask(A, B) \ + _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_EQ) +#define _mm512_mask_cmpeq_epu8_mask(k, A, B) \ + _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm512_cmpge_epu8_mask(A, B) \ + _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_GE) +#define _mm512_mask_cmpge_epu8_mask(k, A, B) \ + _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm512_cmpgt_epu8_mask(A, B) \ + _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_GT) +#define _mm512_mask_cmpgt_epu8_mask(k, A, B) \ + _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm512_cmple_epu8_mask(A, B) \ + _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_LE) +#define _mm512_mask_cmple_epu8_mask(k, A, B) \ + _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm512_cmplt_epu8_mask(A, B) \ + _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_LT) +#define _mm512_mask_cmplt_epu8_mask(k, A, B) \ + _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm512_cmpneq_epu8_mask(A, B) \ + _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_NE) +#define _mm512_mask_cmpneq_epu8_mask(k, A, B) \ + _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm512_cmpeq_epi16_mask(A, B) \ + _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_EQ) +#define _mm512_mask_cmpeq_epi16_mask(k, A, B) \ + _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm512_cmpge_epi16_mask(A, B) \ + _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_GE) +#define _mm512_mask_cmpge_epi16_mask(k, A, B) \ + _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm512_cmpgt_epi16_mask(A, B) \ + _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_GT) +#define _mm512_mask_cmpgt_epi16_mask(k, A, B) \ + _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm512_cmple_epi16_mask(A, B) \ + _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_LE) +#define _mm512_mask_cmple_epi16_mask(k, A, B) \ + _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm512_cmplt_epi16_mask(A, B) \ + _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_LT) +#define _mm512_mask_cmplt_epi16_mask(k, A, B) \ + _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm512_cmpneq_epi16_mask(A, B) \ + _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_NE) +#define _mm512_mask_cmpneq_epi16_mask(k, A, B) \ + _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm512_cmpeq_epu16_mask(A, B) \ + _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_EQ) +#define _mm512_mask_cmpeq_epu16_mask(k, A, B) \ + _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm512_cmpge_epu16_mask(A, B) \ + _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_GE) +#define _mm512_mask_cmpge_epu16_mask(k, A, B) \ + _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm512_cmpgt_epu16_mask(A, B) \ + _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_GT) +#define _mm512_mask_cmpgt_epu16_mask(k, A, B) \ + _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm512_cmple_epu16_mask(A, B) \ + _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_LE) +#define 
_mm512_mask_cmple_epu16_mask(k, A, B) \ + _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm512_cmplt_epu16_mask(A, B) \ + _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_LT) +#define _mm512_mask_cmplt_epu16_mask(k, A, B) \ + _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm512_cmpneq_epu16_mask(A, B) \ + _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_NE) +#define _mm512_mask_cmpneq_epu16_mask(k, A, B) \ + _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_NE) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_add_epi8 (__m512i __A, __m512i __B) { @@ -1541,46 +1393,6 @@ _mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A) } -#define _mm512_cmp_epi8_mask(a, b, p) __extension__ ({ \ - (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \ - (__v64qi)(__m512i)(b), (int)(p), \ - (__mmask64)-1); }) - -#define _mm512_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \ - (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \ - (__v64qi)(__m512i)(b), (int)(p), \ - (__mmask64)(m)); }) - -#define _mm512_cmp_epu8_mask(a, b, p) __extension__ ({ \ - (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \ - (__v64qi)(__m512i)(b), (int)(p), \ - (__mmask64)-1); }) - -#define _mm512_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \ - (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \ - (__v64qi)(__m512i)(b), (int)(p), \ - (__mmask64)(m)); }) - -#define _mm512_cmp_epi16_mask(a, b, p) __extension__ ({ \ - (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \ - (__v32hi)(__m512i)(b), (int)(p), \ - (__mmask32)-1); }) - -#define _mm512_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \ - (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \ - (__v32hi)(__m512i)(b), (int)(p), \ - (__mmask32)(m)); }) - -#define _mm512_cmp_epu16_mask(a, b, p) __extension__ ({ \ - (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \ - (__v32hi)(__m512i)(b), (int)(p), \ - (__mmask32)-1); }) - -#define _mm512_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \ - (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \ - (__v32hi)(__m512i)(b), (int)(p), \ - (__mmask32)(m)); }) - #define _mm512_shufflehi_epi16(A, imm) __extension__ ({ \ (__m512i)__builtin_shufflevector((__v32hi)(__m512i)(A), \ (__v32hi)_mm512_undefined_epi32(), \ @@ -2042,15 +1854,13 @@ _mm512_maskz_set1_epi8 (__mmask64 __M, char __A) static __inline__ __mmask64 __DEFAULT_FN_ATTRS _mm512_kunpackd (__mmask64 __A, __mmask64 __B) { - return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A, - (__mmask64) __B); + return (__mmask64) (( __A & 0xFFFFFFFF) | ( __B << 32)); } static __inline__ __mmask32 __DEFAULT_FN_ATTRS _mm512_kunpackw (__mmask32 __A, __mmask32 __B) { - return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A, - (__mmask32) __B); +return (__mmask32) (( __A & 0xFFFF) | ( __B << 16)); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -2105,61 +1915,56 @@ _mm512_mask_storeu_epi8 (void *__P, __mmask64 __U, __m512i __A) static __inline__ __mmask64 __DEFAULT_FN_ATTRS _mm512_test_epi8_mask (__m512i __A, __m512i __B) { - return (__mmask64) __builtin_ia32_ptestmb512 ((__v64qi) __A, - (__v64qi) __B, - (__mmask64) -1); + return _mm512_cmpneq_epi8_mask (_mm512_and_epi32 (__A, __B), + _mm512_setzero_qi()); } static __inline__ __mmask64 __DEFAULT_FN_ATTRS _mm512_mask_test_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B) { - return (__mmask64) __builtin_ia32_ptestmb512 ((__v64qi) __A, - (__v64qi) __B, __U); + return _mm512_mask_cmpneq_epi8_mask (__U, 
_mm512_and_epi32 (__A, __B), + _mm512_setzero_qi()); } static __inline__ __mmask32 __DEFAULT_FN_ATTRS _mm512_test_epi16_mask (__m512i __A, __m512i __B) { - return (__mmask32) __builtin_ia32_ptestmw512 ((__v32hi) __A, - (__v32hi) __B, - (__mmask32) -1); + return _mm512_cmpneq_epi16_mask (_mm512_and_epi32 (__A, __B), + _mm512_setzero_qi()); } static __inline__ __mmask32 __DEFAULT_FN_ATTRS _mm512_mask_test_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B) { - return (__mmask32) __builtin_ia32_ptestmw512 ((__v32hi) __A, - (__v32hi) __B, __U); + return _mm512_mask_cmpneq_epi16_mask (__U, _mm512_and_epi32 (__A, __B), + _mm512_setzero_qi()); } static __inline__ __mmask64 __DEFAULT_FN_ATTRS _mm512_testn_epi8_mask (__m512i __A, __m512i __B) { - return (__mmask64) __builtin_ia32_ptestnmb512 ((__v64qi) __A, - (__v64qi) __B, - (__mmask64) -1); + return _mm512_cmpeq_epi8_mask (_mm512_and_epi32 (__A, __B), _mm512_setzero_qi()); } static __inline__ __mmask64 __DEFAULT_FN_ATTRS _mm512_mask_testn_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B) { - return (__mmask64) __builtin_ia32_ptestnmb512 ((__v64qi) __A, - (__v64qi) __B, __U); + return _mm512_mask_cmpeq_epi8_mask (__U, _mm512_and_epi32 (__A, __B), + _mm512_setzero_qi()); } static __inline__ __mmask32 __DEFAULT_FN_ATTRS _mm512_testn_epi16_mask (__m512i __A, __m512i __B) { - return (__mmask32) __builtin_ia32_ptestnmw512 ((__v32hi) __A, - (__v32hi) __B, - (__mmask32) -1); + return _mm512_cmpeq_epi16_mask (_mm512_and_epi32 (__A, __B), + _mm512_setzero_qi()); } static __inline__ __mmask32 __DEFAULT_FN_ATTRS _mm512_mask_testn_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B) { - return (__mmask32) __builtin_ia32_ptestnmw512 ((__v32hi) __A, - (__v32hi) __B, __U); + return _mm512_mask_cmpeq_epi16_mask (__U, _mm512_and_epi32 (__A, __B), + _mm512_setzero_qi()); } static __inline__ __mmask64 __DEFAULT_FN_ATTRS diff --git a/c_headers/avx512cdintrin.h b/c_headers/avx512cdintrin.h index 23c423584a..ec7e0cd443 100644 --- a/c_headers/avx512cdintrin.h +++ b/c_headers/avx512cdintrin.h @@ -130,13 +130,14 @@ _mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_broadcastmb_epi64 (__mmask8 __A) { - return (__m512i) __builtin_ia32_broadcastmb512 (__A); + return (__m512i) _mm512_set1_epi64((long long) __A); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_broadcastmw_epi32 (__mmask16 __A) { - return (__m512i) __builtin_ia32_broadcastmw512 (__A); + return (__m512i) _mm512_set1_epi32((int) __A); + } #undef __DEFAULT_FN_ATTRS diff --git a/c_headers/avx512fintrin.h b/c_headers/avx512fintrin.h index f5137428ba..d34f0b1327 100644 --- a/c_headers/avx512fintrin.h +++ b/c_headers/avx512fintrin.h @@ -8787,7 +8787,7 @@ _mm512_kortestz (__mmask16 __A, __mmask16 __B) static __inline__ __mmask16 __DEFAULT_FN_ATTRS _mm512_kunpackb (__mmask16 __A, __mmask16 __B) { - return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B); + return (__mmask16) (( __A & 0xFF) | ( __B << 8)); } static __inline__ __mmask16 __DEFAULT_FN_ATTRS diff --git a/c_headers/avx512vbmi2intrin.h b/c_headers/avx512vbmi2intrin.h new file mode 100644 index 0000000000..43e97b40a0 --- /dev/null +++ b/c_headers/avx512vbmi2intrin.h @@ -0,0 +1,391 @@ +/*===------------- avx512vbmi2intrin.h - VBMI2 intrinsics ------------------=== + * + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, 
including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use <avx512vbmi2intrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef __AVX512VBMI2INTRIN_H +#define __AVX512VBMI2INTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi2"))) + + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_compress_epi16(__m512i __S, __mmask32 __U, __m512i __D) +{ + return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D, + (__v32hi) __S, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_compress_epi16(__mmask32 __U, __m512i __D) +{ + return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D, + (__v32hi) _mm512_setzero_hi(), + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_compress_epi8(__m512i __S, __mmask64 __U, __m512i __D) +{ + return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D, + (__v64qi) __S, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_compress_epi8(__mmask64 __U, __m512i __D) +{ + return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D, + (__v64qi) _mm512_setzero_qi(), + __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS +_mm512_mask_compressstoreu_epi16(void *__P, __mmask32 __U, __m512i __D) +{ + __builtin_ia32_compressstorehi512_mask ((__v32hi *) __P, (__v32hi) __D, + __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS +_mm512_mask_compressstoreu_epi8(void *__P, __mmask64 __U, __m512i __D) +{ + __builtin_ia32_compressstoreqi512_mask ((__v64qi *) __P, (__v64qi) __D, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_expand_epi16(__m512i __S, __mmask32 __U, __m512i __D) +{ + return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D, + (__v32hi) __S, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_expand_epi16(__mmask32 __U, __m512i __D) +{ + return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D, + (__v32hi) _mm512_setzero_hi(), + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_expand_epi8(__m512i __S, __mmask64 __U, __m512i __D) +{ + return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D, + (__v64qi) __S, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_expand_epi8(__mmask64 __U, __m512i __D) +{ + return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D, + (__v64qi) _mm512_setzero_qi(), + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS 
+_mm512_mask_expandloadu_epi16(__m512i __S, __mmask32 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P, + (__v32hi) __S, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_expandloadu_epi16(__mmask32 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P, + (__v32hi) _mm512_setzero_hi(), + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_expandloadu_epi8(__m512i __S, __mmask64 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P, + (__v64qi) __S, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P, + (__v64qi) _mm512_setzero_qi(), + __U); +} + +#define _mm512_mask_shldi_epi64(S, U, A, B, I) __extension__ ({ \ + (__m512i)__builtin_ia32_vpshldq512_mask((__v8di)(A), \ + (__v8di)(B), \ + (int)(I), \ + (__v8di)(S), \ + (__mmask8)(U)); }) + +#define _mm512_maskz_shldi_epi64(U, A, B, I) \ + _mm512_mask_shldi_epi64(_mm512_setzero_hi(), (U), (A), (B), (I)) + +#define _mm512_shldi_epi64(A, B, I) \ + _mm512_mask_shldi_epi64(_mm512_undefined(), (__mmask8)(-1), (A), (B), (I)) + +#define _mm512_mask_shldi_epi32(S, U, A, B, I) __extension__ ({ \ + (__m512i)__builtin_ia32_vpshldd512_mask((__v16si)(A), \ + (__v16si)(B), \ + (int)(I), \ + (__v16si)(S), \ + (__mmask16)(U)); }) + +#define _mm512_maskz_shldi_epi32(U, A, B, I) \ + _mm512_mask_shldi_epi32(_mm512_setzero_hi(), (U), (A), (B), (I)) + +#define _mm512_shldi_epi32(A, B, I) \ + _mm512_mask_shldi_epi32(_mm512_undefined(), (__mmask16)(-1), (A), (B), (I)) + +#define _mm512_mask_shldi_epi16(S, U, A, B, I) __extension__ ({ \ + (__m512i)__builtin_ia32_vpshldw512_mask((__v32hi)(A), \ + (__v32hi)(B), \ + (int)(I), \ + (__v32hi)(S), \ + (__mmask32)(U)); }) + +#define _mm512_maskz_shldi_epi16(U, A, B, I) \ + _mm512_mask_shldi_epi16(_mm512_setzero_hi(), (U), (A), (B), (I)) + +#define _mm512_shldi_epi16(A, B, I) \ + _mm512_mask_shldi_epi16(_mm512_undefined(), (__mmask32)(-1), (A), (B), (I)) + +#define _mm512_mask_shrdi_epi64(S, U, A, B, I) __extension__ ({ \ + (__m512i)__builtin_ia32_vpshrdq512_mask((__v8di)(A), \ + (__v8di)(B), \ + (int)(I), \ + (__v8di)(S), \ + (__mmask8)(U)); }) + +#define _mm512_maskz_shrdi_epi64(U, A, B, I) \ + _mm512_mask_shrdi_epi64(_mm512_setzero_hi(), (U), (A), (B), (I)) + +#define _mm512_shrdi_epi64(A, B, I) \ + _mm512_mask_shrdi_epi64(_mm512_undefined(), (__mmask8)(-1), (A), (B), (I)) + +#define _mm512_mask_shrdi_epi32(S, U, A, B, I) __extension__ ({ \ + (__m512i)__builtin_ia32_vpshrdd512_mask((__v16si)(A), \ + (__v16si)(B), \ + (int)(I), \ + (__v16si)(S), \ + (__mmask16)(U)); }) + +#define _mm512_maskz_shrdi_epi32(U, A, B, I) \ + _mm512_mask_shrdi_epi32(_mm512_setzero_hi(), (U), (A), (B), (I)) + +#define _mm512_shrdi_epi32(A, B, I) \ + _mm512_mask_shrdi_epi32(_mm512_undefined(), (__mmask16)(-1), (A), (B), (I)) + +#define _mm512_mask_shrdi_epi16(S, U, A, B, I) __extension__ ({ \ + (__m512i)__builtin_ia32_vpshrdw512_mask((__v32hi)(A), \ + (__v32hi)(B), \ + (int)(I), \ + (__v32hi)(S), \ + (__mmask32)(U)); }) + +#define _mm512_maskz_shrdi_epi16(U, A, B, I) \ + _mm512_mask_shrdi_epi16(_mm512_setzero_hi(), (U), (A), (B), (I)) + +#define _mm512_shrdi_epi16(A, B, I) \ + _mm512_mask_shrdi_epi16(_mm512_undefined(), (__mmask32)(-1), (A), (B), (I)) + +static __inline__ __m512i __DEFAULT_FN_ATTRS 
+_mm512_mask_shldv_epi64(__m512i __S, __mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpshldvq512_mask ((__v8di) __S, + (__v8di) __A, + (__v8di) __B, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_shldv_epi64(__mmask8 __U, __m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpshldvq512_maskz ((__v8di) __S, + (__v8di) __A, + (__v8di) __B, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_shldv_epi64(__m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpshldvq512_mask ((__v8di) __S, + (__v8di) __A, + (__v8di) __B, + (__mmask8) -1); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_shldv_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpshldvd512_mask ((__v16si) __S, + (__v16si) __A, + (__v16si) __B, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_shldv_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpshldvd512_maskz ((__v16si) __S, + (__v16si) __A, + (__v16si) __B, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_shldv_epi32(__m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpshldvd512_mask ((__v16si) __S, + (__v16si) __A, + (__v16si) __B, + (__mmask16) -1); +} + + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_shldv_epi16(__m512i __S, __mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpshldvw512_mask ((__v32hi) __S, + (__v32hi) __A, + (__v32hi) __B, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_shldv_epi16(__mmask32 __U, __m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpshldvw512_maskz ((__v32hi) __S, + (__v32hi) __A, + (__v32hi) __B, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_shldv_epi16(__m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpshldvw512_mask ((__v32hi) __S, + (__v32hi) __A, + (__v32hi) __B, + (__mmask32) -1); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_shrdv_epi64(__m512i __S, __mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpshrdvq512_mask ((__v8di) __S, + (__v8di) __A, + (__v8di) __B, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_shrdv_epi64(__mmask8 __U, __m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpshrdvq512_maskz ((__v8di) __S, + (__v8di) __A, + (__v8di) __B, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_shrdv_epi64(__m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpshrdvq512_mask ((__v8di) __S, + (__v8di) __A, + (__v8di) __B, + (__mmask8) -1); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_shrdv_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpshrdvd512_mask ((__v16si) __S, + (__v16si) __A, + (__v16si) __B, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_shrdv_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpshrdvd512_maskz ((__v16si) __S, + (__v16si) __A, + (__v16si) __B, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_shrdv_epi32(__m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpshrdvd512_mask ((__v16si) __S, + (__v16si) __A, + (__v16si) __B, + (__mmask16) -1); +} + + +static __inline__ 
__m512i __DEFAULT_FN_ATTRS +_mm512_mask_shrdv_epi16(__m512i __S, __mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpshrdvw512_mask ((__v32hi) __S, + (__v32hi) __A, + (__v32hi) __B, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_shrdv_epi16(__mmask32 __U, __m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpshrdvw512_maskz ((__v32hi) __S, + (__v32hi) __A, + (__v32hi) __B, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_shrdv_epi16(__m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpshrdvw512_mask ((__v32hi) __S, + (__v32hi) __A, + (__v32hi) __B, + (__mmask32) -1); +} + + +#undef __DEFAULT_FN_ATTRS + +#endif + diff --git a/c_headers/avx512vlbitalgintrin.h b/c_headers/avx512vlbitalgintrin.h new file mode 100644 index 0000000000..76eb87721b --- /dev/null +++ b/c_headers/avx512vlbitalgintrin.h @@ -0,0 +1,157 @@ +/*===------------- avx512vlbitalgintrin.h - BITALG intrinsics ------------------=== + * + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use <avx512vlbitalgintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef __AVX512VLBITALGINTRIN_H +#define __AVX512VLBITALGINTRIN_H + +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bitalg"))) + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_popcnt_epi16(__m256i __A) +{ + return (__m256i) __builtin_ia32_vpopcntw_256((__v16hi) __A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_popcnt_epi16(__m256i __A, __mmask16 __U, __m256i __B) +{ + return (__m256i) __builtin_ia32_selectw_256((__mmask16) __U, + (__v16hi) _mm256_popcnt_epi16(__B), + (__v16hi) __A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __B) +{ + return _mm256_mask_popcnt_epi16((__m256i) _mm256_setzero_si256(), + __U, + __B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_popcnt_epi16(__m128i __A) +{ + return (__m128i) __builtin_ia32_vpopcntw_128((__v8hi) __A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B) +{ + return (__m128i) __builtin_ia32_selectw_128((__mmask8) __U, + (__v8hi) _mm128_popcnt_epi16(__B), + (__v8hi) __A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_maskz_popcnt_epi16(__mmask8 __U, __m128i __B) +{ + return _mm128_mask_popcnt_epi16((__m128i) _mm_setzero_si128(), + __U, + __B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_popcnt_epi8(__m256i __A) +{ + return (__m256i) __builtin_ia32_vpopcntb_256((__v32qi) __A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_popcnt_epi8(__m256i __A, __mmask32 __U, __m256i __B) +{ + return (__m256i) __builtin_ia32_selectb_256((__mmask32) __U, + (__v32qi) _mm256_popcnt_epi8(__B), + (__v32qi) __A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __B) +{ + return _mm256_mask_popcnt_epi8((__m256i) _mm256_setzero_si256(), + __U, + __B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_popcnt_epi8(__m128i __A) +{ + return (__m128i) __builtin_ia32_vpopcntb_128((__v16qi) __A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B) +{ + return (__m128i) __builtin_ia32_selectb_128((__mmask16) __U, + (__v16qi) _mm128_popcnt_epi8(__B), + (__v16qi) __A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_maskz_popcnt_epi8(__mmask16 __U, __m128i __B) +{ + return _mm128_mask_popcnt_epi8((__m128i) _mm_setzero_si128(), + __U, + __B); +} + +static __inline__ __mmask32 __DEFAULT_FN_ATTRS +_mm256_mask_bitshuffle_epi32_mask(__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__mmask32) __builtin_ia32_vpshufbitqmb256_mask((__v32qi) __A, + (__v32qi) __B, + __U); +} + +static __inline__ __mmask32 __DEFAULT_FN_ATTRS +_mm256_bitshuffle_epi32_mask(__m256i __A, __m256i __B) +{ + return _mm256_mask_bitshuffle_epi32_mask((__mmask32) -1, + __A, + __B); +} + +static __inline__ __mmask16 __DEFAULT_FN_ATTRS +_mm128_mask_bitshuffle_epi16_mask(__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__mmask16) __builtin_ia32_vpshufbitqmb128_mask((__v16qi) __A, + (__v16qi) __B, + __U); +} + +static __inline__ __mmask16 __DEFAULT_FN_ATTRS +_mm128_bitshuffle_epi16_mask(__m128i __A, __m128i __B) +{ + return _mm128_mask_bitshuffle_epi16_mask((__mmask16) -1, + __A, + __B); +} + + +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/c_headers/avx512vlbwintrin.h b/c_headers/avx512vlbwintrin.h index 4ab785bdbb..e940e2b685 100644 --- a/c_headers/avx512vlbwintrin.h +++ b/c_headers/avx512vlbwintrin.h @@ -38,581 +38,285 @@ _mm_setzero_hi(void){ /* Integer compare */ -static 
__inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_cmpeq_epi8_mask(__m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_pcmpeqb128_mask((__v16qi)__a, (__v16qi)__b, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_mask_cmpeq_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_pcmpeqb128_mask((__v16qi)__a, (__v16qi)__b, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_cmpeq_epu8_mask(__m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 0, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_mask_cmpeq_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 0, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_cmpeq_epi8_mask(__m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_pcmpeqb256_mask((__v32qi)__a, (__v32qi)__b, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_mask_cmpeq_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_pcmpeqb256_mask((__v32qi)__a, (__v32qi)__b, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_cmpeq_epu8_mask(__m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 0, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_mask_cmpeq_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 0, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpeq_epi16_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_pcmpeqw128_mask((__v8hi)__a, (__v8hi)__b, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpeq_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_pcmpeqw128_mask((__v8hi)__a, (__v8hi)__b, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpeq_epu16_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 0, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpeq_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 0, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_cmpeq_epi16_mask(__m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_pcmpeqw256_mask((__v16hi)__a, (__v16hi)__b, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_mask_cmpeq_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_pcmpeqw256_mask((__v16hi)__a, (__v16hi)__b, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_cmpeq_epu16_mask(__m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 0, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_mask_cmpeq_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 0, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_cmpge_epi8_mask(__m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 5, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS 
-_mm_mask_cmpge_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 5, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_cmpge_epu8_mask(__m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 5, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_mask_cmpge_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 5, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_cmpge_epi8_mask(__m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 5, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_mask_cmpge_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 5, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_cmpge_epu8_mask(__m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 5, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_mask_cmpge_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 5, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpge_epi16_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 5, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpge_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 5, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpge_epu16_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 5, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpge_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 5, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_cmpge_epi16_mask(__m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 5, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_mask_cmpge_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 5, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_cmpge_epu16_mask(__m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 5, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_mask_cmpge_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 5, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_cmpgt_epi8_mask(__m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_pcmpgtb128_mask((__v16qi)__a, (__v16qi)__b, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_mask_cmpgt_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_pcmpgtb128_mask((__v16qi)__a, (__v16qi)__b, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_cmpgt_epu8_mask(__m128i __a, __m128i __b) { - return 
(__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 6, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_mask_cmpgt_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 6, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_cmpgt_epi8_mask(__m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_pcmpgtb256_mask((__v32qi)__a, (__v32qi)__b, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_mask_cmpgt_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_pcmpgtb256_mask((__v32qi)__a, (__v32qi)__b, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_cmpgt_epu8_mask(__m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 6, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_mask_cmpgt_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 6, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpgt_epi16_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_pcmpgtw128_mask((__v8hi)__a, (__v8hi)__b, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpgt_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_pcmpgtw128_mask((__v8hi)__a, (__v8hi)__b, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpgt_epu16_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 6, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpgt_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 6, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_cmpgt_epi16_mask(__m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_pcmpgtw256_mask((__v16hi)__a, (__v16hi)__b, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_mask_cmpgt_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_pcmpgtw256_mask((__v16hi)__a, (__v16hi)__b, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_cmpgt_epu16_mask(__m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 6, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_mask_cmpgt_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 6, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_cmple_epi8_mask(__m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 2, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_mask_cmple_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 2, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_cmple_epu8_mask(__m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 2, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_mask_cmple_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) { - return 
(__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 2, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_cmple_epi8_mask(__m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 2, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_mask_cmple_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 2, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_cmple_epu8_mask(__m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 2, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_mask_cmple_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 2, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmple_epi16_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 2, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmple_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 2, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmple_epu16_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 2, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmple_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 2, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_cmple_epi16_mask(__m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 2, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_mask_cmple_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 2, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_cmple_epu16_mask(__m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 2, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_mask_cmple_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 2, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_cmplt_epi8_mask(__m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 1, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_mask_cmplt_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 1, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_cmplt_epu8_mask(__m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 1, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_mask_cmplt_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 1, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_cmplt_epi8_mask(__m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 1, - 
(__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_mask_cmplt_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 1, - __u); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_cmplt_epu8_mask(__m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 1, - (__mmask32)-1); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_mask_cmplt_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 1, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmplt_epi16_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 1, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmplt_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 1, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmplt_epu16_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 1, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmplt_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 1, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_cmplt_epi16_mask(__m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 1, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_mask_cmplt_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 1, - __u); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_cmplt_epu16_mask(__m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 1, - (__mmask16)-1); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_mask_cmplt_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 1, - __u); -} +#define _mm_cmp_epi8_mask(a, b, p) __extension__ ({ \ + (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \ + (__v16qi)(__m128i)(b), (int)(p), \ + (__mmask16)-1); }) -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_cmpneq_epi8_mask(__m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 4, - (__mmask16)-1); -} +#define _mm_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \ + (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \ + (__v16qi)(__m128i)(b), (int)(p), \ + (__mmask16)(m)); }) -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_mask_cmpneq_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)__a, (__v16qi)__b, 4, - __u); -} +#define _mm_cmp_epu8_mask(a, b, p) __extension__ ({ \ + (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \ + (__v16qi)(__m128i)(b), (int)(p), \ + (__mmask16)-1); }) -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_cmpneq_epu8_mask(__m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 4, - (__mmask16)-1); -} +#define _mm_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \ + 
(__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \ + (__v16qi)(__m128i)(b), (int)(p), \ + (__mmask16)(m)); }) -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm_mask_cmpneq_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) { - return (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)__a, (__v16qi)__b, 4, - __u); -} +#define _mm256_cmp_epi8_mask(a, b, p) __extension__ ({ \ + (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \ + (__v32qi)(__m256i)(b), (int)(p), \ + (__mmask32)-1); }) -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_cmpneq_epi8_mask(__m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 4, - (__mmask32)-1); -} +#define _mm256_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \ + (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \ + (__v32qi)(__m256i)(b), (int)(p), \ + (__mmask32)(m)); }) -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_mask_cmpneq_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)__a, (__v32qi)__b, 4, - __u); -} +#define _mm256_cmp_epu8_mask(a, b, p) __extension__ ({ \ + (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \ + (__v32qi)(__m256i)(b), (int)(p), \ + (__mmask32)-1); }) -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_cmpneq_epu8_mask(__m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 4, - (__mmask32)-1); -} +#define _mm256_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \ + (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \ + (__v32qi)(__m256i)(b), (int)(p), \ + (__mmask32)(m)); }) -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_mask_cmpneq_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) { - return (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)__a, (__v32qi)__b, 4, - __u); -} +#define _mm_cmp_epi16_mask(a, b, p) __extension__ ({ \ + (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \ + (__v8hi)(__m128i)(b), (int)(p), \ + (__mmask8)-1); }) -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpneq_epi16_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 4, - (__mmask8)-1); -} +#define _mm_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \ + (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \ + (__v8hi)(__m128i)(b), (int)(p), \ + (__mmask8)(m)); }) -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpneq_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)__a, (__v8hi)__b, 4, - __u); -} +#define _mm_cmp_epu16_mask(a, b, p) __extension__ ({ \ + (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \ + (__v8hi)(__m128i)(b), (int)(p), \ + (__mmask8)-1); }) -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpneq_epu16_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 4, - (__mmask8)-1); -} +#define _mm_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \ + (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \ + (__v8hi)(__m128i)(b), (int)(p), \ + (__mmask8)(m)); }) -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpneq_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)__a, (__v8hi)__b, 4, - __u); -} +#define _mm256_cmp_epi16_mask(a, b, p) __extension__ ({ \ + (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \ 
+ (__v16hi)(__m256i)(b), (int)(p), \ + (__mmask16)-1); }) -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_cmpneq_epi16_mask(__m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 4, - (__mmask16)-1); -} +#define _mm256_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \ + (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \ + (__v16hi)(__m256i)(b), (int)(p), \ + (__mmask16)(m)); }) -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_mask_cmpneq_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)__a, (__v16hi)__b, 4, - __u); -} +#define _mm256_cmp_epu16_mask(a, b, p) __extension__ ({ \ + (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \ + (__v16hi)(__m256i)(b), (int)(p), \ + (__mmask16)-1); }) -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_cmpneq_epu16_mask(__m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 4, - (__mmask16)-1); -} +#define _mm256_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \ + (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \ + (__v16hi)(__m256i)(b), (int)(p), \ + (__mmask16)(m)); }) -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm256_mask_cmpneq_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) { - return (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)__a, (__v16hi)__b, 4, - __u); -} +#define _mm_cmpeq_epi8_mask(A, B) \ + _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ) +#define _mm_mask_cmpeq_epi8_mask(k, A, B) \ + _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm_cmpge_epi8_mask(A, B) \ + _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_GE) +#define _mm_mask_cmpge_epi8_mask(k, A, B) \ + _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm_cmpgt_epi8_mask(A, B) \ + _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_GT) +#define _mm_mask_cmpgt_epi8_mask(k, A, B) \ + _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm_cmple_epi8_mask(A, B) \ + _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_LE) +#define _mm_mask_cmple_epi8_mask(k, A, B) \ + _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm_cmplt_epi8_mask(A, B) \ + _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_LT) +#define _mm_mask_cmplt_epi8_mask(k, A, B) \ + _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm_cmpneq_epi8_mask(A, B) \ + _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_NE) +#define _mm_mask_cmpneq_epi8_mask(k, A, B) \ + _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm256_cmpeq_epi8_mask(A, B) \ + _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ) +#define _mm256_mask_cmpeq_epi8_mask(k, A, B) \ + _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm256_cmpge_epi8_mask(A, B) \ + _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_GE) +#define _mm256_mask_cmpge_epi8_mask(k, A, B) \ + _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm256_cmpgt_epi8_mask(A, B) \ + _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_GT) +#define _mm256_mask_cmpgt_epi8_mask(k, A, B) \ + _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm256_cmple_epi8_mask(A, B) \ + _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_LE) +#define _mm256_mask_cmple_epi8_mask(k, A, B) \ + _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm256_cmplt_epi8_mask(A, B) \ + _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_LT) +#define _mm256_mask_cmplt_epi8_mask(k, A, B) \ + _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LT) +#define 
_mm256_cmpneq_epi8_mask(A, B) \ + _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_NE) +#define _mm256_mask_cmpneq_epi8_mask(k, A, B) \ + _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm_cmpeq_epu8_mask(A, B) \ + _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_EQ) +#define _mm_mask_cmpeq_epu8_mask(k, A, B) \ + _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm_cmpge_epu8_mask(A, B) \ + _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_GE) +#define _mm_mask_cmpge_epu8_mask(k, A, B) \ + _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm_cmpgt_epu8_mask(A, B) \ + _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_GT) +#define _mm_mask_cmpgt_epu8_mask(k, A, B) \ + _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm_cmple_epu8_mask(A, B) \ + _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_LE) +#define _mm_mask_cmple_epu8_mask(k, A, B) \ + _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm_cmplt_epu8_mask(A, B) \ + _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_LT) +#define _mm_mask_cmplt_epu8_mask(k, A, B) \ + _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm_cmpneq_epu8_mask(A, B) \ + _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_NE) +#define _mm_mask_cmpneq_epu8_mask(k, A, B) \ + _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm256_cmpeq_epu8_mask(A, B) \ + _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_EQ) +#define _mm256_mask_cmpeq_epu8_mask(k, A, B) \ + _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm256_cmpge_epu8_mask(A, B) \ + _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_GE) +#define _mm256_mask_cmpge_epu8_mask(k, A, B) \ + _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm256_cmpgt_epu8_mask(A, B) \ + _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_GT) +#define _mm256_mask_cmpgt_epu8_mask(k, A, B) \ + _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm256_cmple_epu8_mask(A, B) \ + _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_LE) +#define _mm256_mask_cmple_epu8_mask(k, A, B) \ + _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm256_cmplt_epu8_mask(A, B) \ + _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_LT) +#define _mm256_mask_cmplt_epu8_mask(k, A, B) \ + _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm256_cmpneq_epu8_mask(A, B) \ + _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_NE) +#define _mm256_mask_cmpneq_epu8_mask(k, A, B) \ + _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm_cmpeq_epi16_mask(A, B) \ + _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_EQ) +#define _mm_mask_cmpeq_epi16_mask(k, A, B) \ + _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm_cmpge_epi16_mask(A, B) \ + _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_GE) +#define _mm_mask_cmpge_epi16_mask(k, A, B) \ + _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm_cmpgt_epi16_mask(A, B) \ + _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_GT) +#define _mm_mask_cmpgt_epi16_mask(k, A, B) \ + _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm_cmple_epi16_mask(A, B) \ + _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_LE) +#define _mm_mask_cmple_epi16_mask(k, A, B) \ + _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm_cmplt_epi16_mask(A, B) \ + _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_LT) +#define _mm_mask_cmplt_epi16_mask(k, A, B) \ + _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm_cmpneq_epi16_mask(A, B) \ + _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_NE) +#define _mm_mask_cmpneq_epi16_mask(k, A, B) \ + _mm_mask_cmp_epi16_mask((k), (A), 
(B), _MM_CMPINT_NE) + +#define _mm256_cmpeq_epi16_mask(A, B) \ + _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_EQ) +#define _mm256_mask_cmpeq_epi16_mask(k, A, B) \ + _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm256_cmpge_epi16_mask(A, B) \ + _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_GE) +#define _mm256_mask_cmpge_epi16_mask(k, A, B) \ + _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm256_cmpgt_epi16_mask(A, B) \ + _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_GT) +#define _mm256_mask_cmpgt_epi16_mask(k, A, B) \ + _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm256_cmple_epi16_mask(A, B) \ + _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_LE) +#define _mm256_mask_cmple_epi16_mask(k, A, B) \ + _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm256_cmplt_epi16_mask(A, B) \ + _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_LT) +#define _mm256_mask_cmplt_epi16_mask(k, A, B) \ + _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm256_cmpneq_epi16_mask(A, B) \ + _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_NE) +#define _mm256_mask_cmpneq_epi16_mask(k, A, B) \ + _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm_cmpeq_epu16_mask(A, B) \ + _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_EQ) +#define _mm_mask_cmpeq_epu16_mask(k, A, B) \ + _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm_cmpge_epu16_mask(A, B) \ + _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_GE) +#define _mm_mask_cmpge_epu16_mask(k, A, B) \ + _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm_cmpgt_epu16_mask(A, B) \ + _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_GT) +#define _mm_mask_cmpgt_epu16_mask(k, A, B) \ + _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm_cmple_epu16_mask(A, B) \ + _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_LE) +#define _mm_mask_cmple_epu16_mask(k, A, B) \ + _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm_cmplt_epu16_mask(A, B) \ + _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_LT) +#define _mm_mask_cmplt_epu16_mask(k, A, B) \ + _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm_cmpneq_epu16_mask(A, B) \ + _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_NE) +#define _mm_mask_cmpneq_epu16_mask(k, A, B) \ + _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm256_cmpeq_epu16_mask(A, B) \ + _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_EQ) +#define _mm256_mask_cmpeq_epu16_mask(k, A, B) \ + _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm256_cmpge_epu16_mask(A, B) \ + _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_GE) +#define _mm256_mask_cmpge_epu16_mask(k, A, B) \ + _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm256_cmpgt_epu16_mask(A, B) \ + _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_GT) +#define _mm256_mask_cmpgt_epu16_mask(k, A, B) \ + _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm256_cmple_epu16_mask(A, B) \ + _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_LE) +#define _mm256_mask_cmple_epu16_mask(k, A, B) \ + _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm256_cmplt_epu16_mask(A, B) \ + _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_LT) +#define _mm256_mask_cmplt_epu16_mask(k, A, B) \ + _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm256_cmpneq_epu16_mask(A, B) \ + _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_NE) +#define _mm256_mask_cmpneq_epu16_mask(k, A, B) \ + _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_NE) static __inline__ __m256i 
__DEFAULT_FN_ATTRS _mm256_mask_add_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B){ @@ -2146,86 +1850,6 @@ _mm256_maskz_cvtepu8_epi16 (__mmask16 __U, __m128i __A) } -#define _mm_cmp_epi8_mask(a, b, p) __extension__ ({ \ - (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \ - (__v16qi)(__m128i)(b), (int)(p), \ - (__mmask16)-1); }) - -#define _mm_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \ - (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \ - (__v16qi)(__m128i)(b), (int)(p), \ - (__mmask16)(m)); }) - -#define _mm_cmp_epu8_mask(a, b, p) __extension__ ({ \ - (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \ - (__v16qi)(__m128i)(b), (int)(p), \ - (__mmask16)-1); }) - -#define _mm_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \ - (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \ - (__v16qi)(__m128i)(b), (int)(p), \ - (__mmask16)(m)); }) - -#define _mm256_cmp_epi8_mask(a, b, p) __extension__ ({ \ - (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \ - (__v32qi)(__m256i)(b), (int)(p), \ - (__mmask32)-1); }) - -#define _mm256_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \ - (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \ - (__v32qi)(__m256i)(b), (int)(p), \ - (__mmask32)(m)); }) - -#define _mm256_cmp_epu8_mask(a, b, p) __extension__ ({ \ - (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \ - (__v32qi)(__m256i)(b), (int)(p), \ - (__mmask32)-1); }) - -#define _mm256_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \ - (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \ - (__v32qi)(__m256i)(b), (int)(p), \ - (__mmask32)(m)); }) - -#define _mm_cmp_epi16_mask(a, b, p) __extension__ ({ \ - (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \ - (__v8hi)(__m128i)(b), (int)(p), \ - (__mmask8)-1); }) - -#define _mm_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \ - (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \ - (__v8hi)(__m128i)(b), (int)(p), \ - (__mmask8)(m)); }) - -#define _mm_cmp_epu16_mask(a, b, p) __extension__ ({ \ - (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \ - (__v8hi)(__m128i)(b), (int)(p), \ - (__mmask8)-1); }) - -#define _mm_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \ - (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \ - (__v8hi)(__m128i)(b), (int)(p), \ - (__mmask8)(m)); }) - -#define _mm256_cmp_epi16_mask(a, b, p) __extension__ ({ \ - (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \ - (__v16hi)(__m256i)(b), (int)(p), \ - (__mmask16)-1); }) - -#define _mm256_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \ - (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \ - (__v16hi)(__m256i)(b), (int)(p), \ - (__mmask16)(m)); }) - -#define _mm256_cmp_epu16_mask(a, b, p) __extension__ ({ \ - (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \ - (__v16hi)(__m256i)(b), (int)(p), \ - (__mmask16)-1); }) - -#define _mm256_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \ - (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \ - (__v16hi)(__m256i)(b), (int)(p), \ - (__mmask16)(m)); }) - #define _mm_mask_shufflehi_epi16(W, U, A, imm) __extension__ ({ \ (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ (__v8hi)_mm_shufflehi_epi16((A), (imm)), \ @@ -2791,121 +2415,108 @@ _mm256_mask_storeu_epi8 (void *__P, __mmask32 __U, __m256i __A) static __inline__ __mmask16 __DEFAULT_FN_ATTRS _mm_test_epi8_mask (__m128i __A, __m128i __B) { - return (__mmask16) 
__builtin_ia32_ptestmb128 ((__v16qi) __A, - (__v16qi) __B, - (__mmask16) -1); + return _mm_cmpneq_epi8_mask (_mm_and_si128(__A, __B), _mm_setzero_hi()); } static __inline__ __mmask16 __DEFAULT_FN_ATTRS _mm_mask_test_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B) { - return (__mmask16) __builtin_ia32_ptestmb128 ((__v16qi) __A, - (__v16qi) __B, __U); + return _mm_mask_cmpneq_epi8_mask (__U, _mm_and_si128 (__A, __B), + _mm_setzero_hi()); } static __inline__ __mmask32 __DEFAULT_FN_ATTRS _mm256_test_epi8_mask (__m256i __A, __m256i __B) { - return (__mmask32) __builtin_ia32_ptestmb256 ((__v32qi) __A, - (__v32qi) __B, - (__mmask32) -1); + return _mm256_cmpneq_epi8_mask (_mm256_and_si256(__A, __B), + _mm256_setzero_si256()); } static __inline__ __mmask32 __DEFAULT_FN_ATTRS _mm256_mask_test_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B) { - return (__mmask32) __builtin_ia32_ptestmb256 ((__v32qi) __A, - (__v32qi) __B, __U); + return _mm256_mask_cmpneq_epi8_mask (__U, _mm256_and_si256(__A, __B), + _mm256_setzero_si256()); } static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm_test_epi16_mask (__m128i __A, __m128i __B) { - return (__mmask8) __builtin_ia32_ptestmw128 ((__v8hi) __A, - (__v8hi) __B, - (__mmask8) -1); + return _mm_cmpneq_epi16_mask (_mm_and_si128 (__A, __B), _mm_setzero_hi()); } static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm_mask_test_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B) { - return (__mmask8) __builtin_ia32_ptestmw128 ((__v8hi) __A, - (__v8hi) __B, __U); + return _mm_mask_cmpneq_epi16_mask (__U, _mm_and_si128 (__A, __B), + _mm_setzero_hi()); } static __inline__ __mmask16 __DEFAULT_FN_ATTRS _mm256_test_epi16_mask (__m256i __A, __m256i __B) { - return (__mmask16) __builtin_ia32_ptestmw256 ((__v16hi) __A, - (__v16hi) __B, - (__mmask16) -1); + return _mm256_cmpneq_epi16_mask (_mm256_and_si256 (__A, __B), + _mm256_setzero_si256 ()); } static __inline__ __mmask16 __DEFAULT_FN_ATTRS _mm256_mask_test_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B) { - return (__mmask16) __builtin_ia32_ptestmw256 ((__v16hi) __A, - (__v16hi) __B, __U); + return _mm256_mask_cmpneq_epi16_mask (__U, _mm256_and_si256(__A, __B), + _mm256_setzero_si256()); } static __inline__ __mmask16 __DEFAULT_FN_ATTRS _mm_testn_epi8_mask (__m128i __A, __m128i __B) { - return (__mmask16) __builtin_ia32_ptestnmb128 ((__v16qi) __A, - (__v16qi) __B, - (__mmask16) -1); + return _mm_cmpeq_epi8_mask (_mm_and_si128 (__A, __B), _mm_setzero_hi()); } static __inline__ __mmask16 __DEFAULT_FN_ATTRS _mm_mask_testn_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B) { - return (__mmask16) __builtin_ia32_ptestnmb128 ((__v16qi) __A, - (__v16qi) __B, __U); + return _mm_mask_cmpeq_epi8_mask (__U, _mm_and_si128 (__A, __B), + _mm_setzero_hi()); } static __inline__ __mmask32 __DEFAULT_FN_ATTRS _mm256_testn_epi8_mask (__m256i __A, __m256i __B) { - return (__mmask32) __builtin_ia32_ptestnmb256 ((__v32qi) __A, - (__v32qi) __B, - (__mmask32) -1); + return _mm256_cmpeq_epi8_mask (_mm256_and_si256 (__A, __B), + _mm256_setzero_si256()); } static __inline__ __mmask32 __DEFAULT_FN_ATTRS _mm256_mask_testn_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B) { - return (__mmask32) __builtin_ia32_ptestnmb256 ((__v32qi) __A, - (__v32qi) __B, __U); + return _mm256_mask_cmpeq_epi8_mask (__U, _mm256_and_si256 (__A, __B), + _mm256_setzero_si256()); } static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm_testn_epi16_mask (__m128i __A, __m128i __B) { - return (__mmask8) __builtin_ia32_ptestnmw128 ((__v8hi) __A, - (__v8hi) __B, - (__mmask8) -1); + 
return _mm_cmpeq_epi16_mask (_mm_and_si128 (__A, __B), _mm_setzero_hi()); } static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm_mask_testn_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B) { - return (__mmask8) __builtin_ia32_ptestnmw128 ((__v8hi) __A, - (__v8hi) __B, __U); + return _mm_mask_cmpeq_epi16_mask (__U, _mm_and_si128(__A, __B), _mm_setzero_hi()); } static __inline__ __mmask16 __DEFAULT_FN_ATTRS _mm256_testn_epi16_mask (__m256i __A, __m256i __B) { - return (__mmask16) __builtin_ia32_ptestnmw256 ((__v16hi) __A, - (__v16hi) __B, - (__mmask16) -1); + return _mm256_cmpeq_epi16_mask (_mm256_and_si256(__A, __B), + _mm256_setzero_si256()); } static __inline__ __mmask16 __DEFAULT_FN_ATTRS _mm256_mask_testn_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B) { - return (__mmask16) __builtin_ia32_ptestnmw256 ((__v16hi) __A, - (__v16hi) __B, __U); + return _mm256_mask_cmpeq_epi16_mask (__U, _mm256_and_si256 (__A, __B), + _mm256_setzero_si256()); } static __inline__ __mmask16 __DEFAULT_FN_ATTRS diff --git a/c_headers/avx512vlcdintrin.h b/c_headers/avx512vlcdintrin.h index 7b02e2e1f9..8f1cd25f0b 100644 --- a/c_headers/avx512vlcdintrin.h +++ b/c_headers/avx512vlcdintrin.h @@ -33,26 +33,26 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_broadcastmb_epi64 (__mmask8 __A) -{ - return (__m128i) __builtin_ia32_broadcastmb128 (__A); +{ + return (__m128i) _mm_set1_epi64x((long long) __A); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_broadcastmb_epi64 (__mmask8 __A) { - return (__m256i) __builtin_ia32_broadcastmb256 (__A); + return (__m256i) _mm256_set1_epi64x((long long)__A); } static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_broadcastmw_epi32 (__mmask16 __A) { - return (__m128i) __builtin_ia32_broadcastmw128 (__A); + return (__m128i) _mm_set1_epi32((int)__A); } static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_broadcastmw_epi32 (__mmask16 __A) { - return (__m256i) __builtin_ia32_broadcastmw256 (__A); + return (__m256i) _mm256_set1_epi32((int)__A); } diff --git a/c_headers/avx512vlintrin.h b/c_headers/avx512vlintrin.h index 7e17cff05f..fb8056e3f8 100644 --- a/c_headers/avx512vlintrin.h +++ b/c_headers/avx512vlintrin.h @@ -38,582 +38,205 @@ _mm_setzero_di(void) { /* Integer compare */ -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpeq_epi32_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_pcmpeqd128_mask((__v4si)__a, (__v4si)__b, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpeq_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_pcmpeqd128_mask((__v4si)__a, (__v4si)__b, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpeq_epu32_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 0, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpeq_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 0, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmpeq_epi32_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_pcmpeqd256_mask((__v8si)__a, (__v8si)__b, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmpeq_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_pcmpeqd256_mask((__v8si)__a, (__v8si)__b, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmpeq_epu32_mask(__m256i __a, __m256i __b) { - return 
(__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 0, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmpeq_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 0, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpeq_epi64_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_pcmpeqq128_mask((__v2di)__a, (__v2di)__b, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpeq_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_pcmpeqq128_mask((__v2di)__a, (__v2di)__b, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpeq_epu64_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 0, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpeq_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 0, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmpeq_epi64_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_pcmpeqq256_mask((__v4di)__a, (__v4di)__b, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmpeq_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_pcmpeqq256_mask((__v4di)__a, (__v4di)__b, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmpeq_epu64_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 0, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmpeq_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 0, - __u); -} - - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpge_epi32_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 5, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpge_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 5, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpge_epu32_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 5, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpge_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 5, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmpge_epi32_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 5, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmpge_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 5, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmpge_epu32_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 5, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmpge_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 5, - __u); -} - -static __inline__ 
__mmask8 __DEFAULT_FN_ATTRS -_mm_cmpge_epi64_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 5, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpge_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 5, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpge_epu64_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 5, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpge_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 5, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmpge_epi64_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 5, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmpge_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 5, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmpge_epu64_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 5, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmpge_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 5, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpgt_epi32_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_pcmpgtd128_mask((__v4si)__a, (__v4si)__b, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpgt_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_pcmpgtd128_mask((__v4si)__a, (__v4si)__b, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpgt_epu32_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 6, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpgt_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 6, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmpgt_epi32_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_pcmpgtd256_mask((__v8si)__a, (__v8si)__b, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmpgt_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_pcmpgtd256_mask((__v8si)__a, (__v8si)__b, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmpgt_epu32_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 6, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmpgt_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 6, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpgt_epi64_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_pcmpgtq128_mask((__v2di)__a, (__v2di)__b, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpgt_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return 
(__mmask8)__builtin_ia32_pcmpgtq128_mask((__v2di)__a, (__v2di)__b, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpgt_epu64_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 6, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpgt_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 6, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmpgt_epi64_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_pcmpgtq256_mask((__v4di)__a, (__v4di)__b, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmpgt_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_pcmpgtq256_mask((__v4di)__a, (__v4di)__b, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmpgt_epu64_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 6, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmpgt_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 6, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmple_epi32_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 2, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmple_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 2, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmple_epu32_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 2, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmple_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 2, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmple_epi32_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 2, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmple_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 2, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmple_epu32_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 2, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmple_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 2, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmple_epi64_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 2, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmple_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 2, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmple_epu64_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 2, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS 
-_mm_mask_cmple_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 2, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmple_epi64_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 2, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmple_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 2, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmple_epu64_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 2, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmple_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 2, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmplt_epi32_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 1, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmplt_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 1, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmplt_epu32_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 1, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmplt_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 1, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmplt_epi32_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 1, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmplt_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 1, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmplt_epu32_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 1, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmplt_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 1, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmplt_epi64_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 1, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmplt_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 1, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmplt_epu64_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 1, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmplt_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 1, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmplt_epi64_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, 
(__v4di)__b, 1, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmplt_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 1, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmplt_epu64_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 1, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmplt_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 1, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpneq_epi32_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 4, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpneq_epi32_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)__a, (__v4si)__b, 4, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpneq_epu32_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 4, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpneq_epu32_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)__a, (__v4si)__b, 4, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmpneq_epi32_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 4, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmpneq_epi32_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)__a, (__v8si)__b, 4, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmpneq_epu32_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 4, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmpneq_epu32_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)__a, (__v8si)__b, 4, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpneq_epi64_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 4, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpneq_epi64_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)__a, (__v2di)__b, 4, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_cmpneq_epu64_mask(__m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 4, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm_mask_cmpneq_epu64_mask(__mmask8 __u, __m128i __a, __m128i __b) { - return (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)__a, (__v2di)__b, 4, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_cmpneq_epi64_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 4, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmpneq_epi64_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)__a, (__v4di)__b, 4, - __u); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS 
-_mm256_cmpneq_epu64_mask(__m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 4, - (__mmask8)-1); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_mm256_mask_cmpneq_epu64_mask(__mmask8 __u, __m256i __a, __m256i __b) { - return (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)__a, (__v4di)__b, 4, - __u); -} +#define _mm_cmpeq_epi32_mask(A, B) \ + _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_EQ) +#define _mm_mask_cmpeq_epi32_mask(k, A, B) \ + _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm_cmpge_epi32_mask(A, B) \ + _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_GE) +#define _mm_mask_cmpge_epi32_mask(k, A, B) \ + _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm_cmpgt_epi32_mask(A, B) \ + _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_GT) +#define _mm_mask_cmpgt_epi32_mask(k, A, B) \ + _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm_cmple_epi32_mask(A, B) \ + _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_LE) +#define _mm_mask_cmple_epi32_mask(k, A, B) \ + _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm_cmplt_epi32_mask(A, B) \ + _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_LT) +#define _mm_mask_cmplt_epi32_mask(k, A, B) \ + _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm_cmpneq_epi32_mask(A, B) \ + _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_NE) +#define _mm_mask_cmpneq_epi32_mask(k, A, B) \ + _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm256_cmpeq_epi32_mask(A, B) \ + _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_EQ) +#define _mm256_mask_cmpeq_epi32_mask(k, A, B) \ + _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm256_cmpge_epi32_mask(A, B) \ + _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_GE) +#define _mm256_mask_cmpge_epi32_mask(k, A, B) \ + _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm256_cmpgt_epi32_mask(A, B) \ + _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_GT) +#define _mm256_mask_cmpgt_epi32_mask(k, A, B) \ + _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm256_cmple_epi32_mask(A, B) \ + _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_LE) +#define _mm256_mask_cmple_epi32_mask(k, A, B) \ + _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm256_cmplt_epi32_mask(A, B) \ + _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_LT) +#define _mm256_mask_cmplt_epi32_mask(k, A, B) \ + _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm256_cmpneq_epi32_mask(A, B) \ + _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_NE) +#define _mm256_mask_cmpneq_epi32_mask(k, A, B) \ + _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm_cmpeq_epu32_mask(A, B) \ + _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_EQ) +#define _mm_mask_cmpeq_epu32_mask(k, A, B) \ + _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm_cmpge_epu32_mask(A, B) \ + _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_GE) +#define _mm_mask_cmpge_epu32_mask(k, A, B) \ + _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm_cmpgt_epu32_mask(A, B) \ + _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_GT) +#define _mm_mask_cmpgt_epu32_mask(k, A, B) \ + _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm_cmple_epu32_mask(A, B) \ + _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_LE) +#define _mm_mask_cmple_epu32_mask(k, A, B) \ + _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm_cmplt_epu32_mask(A, B) \ + _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_LT) +#define _mm_mask_cmplt_epu32_mask(k, A, B) \ + 
_mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm_cmpneq_epu32_mask(A, B) \ + _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_NE) +#define _mm_mask_cmpneq_epu32_mask(k, A, B) \ + _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm256_cmpeq_epu32_mask(A, B) \ + _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_EQ) +#define _mm256_mask_cmpeq_epu32_mask(k, A, B) \ + _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm256_cmpge_epu32_mask(A, B) \ + _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_GE) +#define _mm256_mask_cmpge_epu32_mask(k, A, B) \ + _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm256_cmpgt_epu32_mask(A, B) \ + _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_GT) +#define _mm256_mask_cmpgt_epu32_mask(k, A, B) \ + _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm256_cmple_epu32_mask(A, B) \ + _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_LE) +#define _mm256_mask_cmple_epu32_mask(k, A, B) \ + _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm256_cmplt_epu32_mask(A, B) \ + _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_LT) +#define _mm256_mask_cmplt_epu32_mask(k, A, B) \ + _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm256_cmpneq_epu32_mask(A, B) \ + _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_NE) +#define _mm256_mask_cmpneq_epu32_mask(k, A, B) \ + _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm_cmpeq_epi64_mask(A, B) \ + _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_EQ) +#define _mm_mask_cmpeq_epi64_mask(k, A, B) \ + _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm_cmpge_epi64_mask(A, B) \ + _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_GE) +#define _mm_mask_cmpge_epi64_mask(k, A, B) \ + _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm_cmpgt_epi64_mask(A, B) \ + _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_GT) +#define _mm_mask_cmpgt_epi64_mask(k, A, B) \ + _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm_cmple_epi64_mask(A, B) \ + _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_LE) +#define _mm_mask_cmple_epi64_mask(k, A, B) \ + _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm_cmplt_epi64_mask(A, B) \ + _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_LT) +#define _mm_mask_cmplt_epi64_mask(k, A, B) \ + _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm_cmpneq_epi64_mask(A, B) \ + _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_NE) +#define _mm_mask_cmpneq_epi64_mask(k, A, B) \ + _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm256_cmpeq_epi64_mask(A, B) \ + _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_EQ) +#define _mm256_mask_cmpeq_epi64_mask(k, A, B) \ + _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm256_cmpge_epi64_mask(A, B) \ + _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_GE) +#define _mm256_mask_cmpge_epi64_mask(k, A, B) \ + _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm256_cmpgt_epi64_mask(A, B) \ + _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_GT) +#define _mm256_mask_cmpgt_epi64_mask(k, A, B) \ + _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm256_cmple_epi64_mask(A, B) \ + _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_LE) +#define _mm256_mask_cmple_epi64_mask(k, A, B) \ + _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm256_cmplt_epi64_mask(A, B) \ + _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_LT) +#define _mm256_mask_cmplt_epi64_mask(k, A, B) \ + _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LT) +#define 
_mm256_cmpneq_epi64_mask(A, B) \ + _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_NE) +#define _mm256_mask_cmpneq_epi64_mask(k, A, B) \ + _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm_cmpeq_epu64_mask(A, B) \ + _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_EQ) +#define _mm_mask_cmpeq_epu64_mask(k, A, B) \ + _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm_cmpge_epu64_mask(A, B) \ + _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_GE) +#define _mm_mask_cmpge_epu64_mask(k, A, B) \ + _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm_cmpgt_epu64_mask(A, B) \ + _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_GT) +#define _mm_mask_cmpgt_epu64_mask(k, A, B) \ + _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm_cmple_epu64_mask(A, B) \ + _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_LE) +#define _mm_mask_cmple_epu64_mask(k, A, B) \ + _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm_cmplt_epu64_mask(A, B) \ + _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_LT) +#define _mm_mask_cmplt_epu64_mask(k, A, B) \ + _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm_cmpneq_epu64_mask(A, B) \ + _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_NE) +#define _mm_mask_cmpneq_epu64_mask(k, A, B) \ + _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm256_cmpeq_epu64_mask(A, B) \ + _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_EQ) +#define _mm256_mask_cmpeq_epu64_mask(k, A, B) \ + _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm256_cmpge_epu64_mask(A, B) \ + _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_GE) +#define _mm256_mask_cmpge_epu64_mask(k, A, B) \ + _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm256_cmpgt_epu64_mask(A, B) \ + _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_GT) +#define _mm256_mask_cmpgt_epu64_mask(k, A, B) \ + _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm256_cmple_epu64_mask(A, B) \ + _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_LE) +#define _mm256_mask_cmple_epu64_mask(k, A, B) \ + _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm256_cmplt_epu64_mask(A, B) \ + _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_LT) +#define _mm256_mask_cmplt_epu64_mask(k, A, B) \ + _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm256_cmpneq_epu64_mask(A, B) \ + _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_NE) +#define _mm256_mask_cmpneq_epu64_mask(k, A, B) \ + _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE) static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_mask_add_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) @@ -6503,125 +6126,111 @@ _mm256_maskz_permutevar_ps(__mmask8 __U, __m256 __A, __m256i __C) static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm_test_epi32_mask (__m128i __A, __m128i __B) { - return (__mmask8) __builtin_ia32_ptestmd128 ((__v4si) __A, - (__v4si) __B, - (__mmask8) -1); + return _mm_cmpneq_epi32_mask (_mm_and_si128 (__A, __B), _mm_setzero_di()); } static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm_mask_test_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B) { - return (__mmask8) __builtin_ia32_ptestmd128 ((__v4si) __A, - (__v4si) __B, __U); + return _mm_mask_cmpneq_epi32_mask (__U, _mm_and_si128 (__A, __B), + _mm_setzero_di()); } static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm256_test_epi32_mask (__m256i __A, __m256i __B) { - return (__mmask8) __builtin_ia32_ptestmd256 ((__v8si) __A, - (__v8si) __B, - (__mmask8) -1); + return _mm256_cmpneq_epi32_mask (_mm256_and_si256 (__A, __B), + 
_mm256_setzero_si256()); } static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm256_mask_test_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B) { - return (__mmask8) __builtin_ia32_ptestmd256 ((__v8si) __A, - (__v8si) __B, __U); + return _mm256_mask_cmpneq_epi32_mask (__U, _mm256_and_si256 (__A, __B), + _mm256_setzero_si256()); } static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm_test_epi64_mask (__m128i __A, __m128i __B) { - return (__mmask8) __builtin_ia32_ptestmq128 ((__v2di) __A, - (__v2di) __B, - (__mmask8) -1); + return _mm_cmpneq_epi64_mask (_mm_and_si128 (__A, __B), _mm_setzero_di()); } static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm_mask_test_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B) { - return (__mmask8) __builtin_ia32_ptestmq128 ((__v2di) __A, - (__v2di) __B, __U); + return _mm_mask_cmpneq_epi64_mask (__U, _mm_and_si128 (__A, __B), + _mm_setzero_di()); } static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm256_test_epi64_mask (__m256i __A, __m256i __B) { - return (__mmask8) __builtin_ia32_ptestmq256 ((__v4di) __A, - (__v4di) __B, - (__mmask8) -1); + return _mm256_cmpneq_epi64_mask (_mm256_and_si256 (__A, __B), + _mm256_setzero_si256()); } static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm256_mask_test_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B) { - return (__mmask8) __builtin_ia32_ptestmq256 ((__v4di) __A, - (__v4di) __B, __U); + return _mm256_mask_cmpneq_epi64_mask (__U, _mm256_and_si256 (__A, __B), + _mm256_setzero_si256()); } static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm_testn_epi32_mask (__m128i __A, __m128i __B) { - return (__mmask8) __builtin_ia32_ptestnmd128 ((__v4si) __A, - (__v4si) __B, - (__mmask8) -1); + return _mm_cmpeq_epi32_mask (_mm_and_si128 (__A, __B), _mm_setzero_di()); } static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm_mask_testn_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B) { - return (__mmask8) __builtin_ia32_ptestnmd128 ((__v4si) __A, - (__v4si) __B, __U); + return _mm_mask_cmpeq_epi32_mask (__U, _mm_and_si128 (__A, __B), + _mm_setzero_di()); } static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm256_testn_epi32_mask (__m256i __A, __m256i __B) { - return (__mmask8) __builtin_ia32_ptestnmd256 ((__v8si) __A, - (__v8si) __B, - (__mmask8) -1); + return _mm256_cmpeq_epi32_mask (_mm256_and_si256 (__A, __B), + _mm256_setzero_si256()); } static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm256_mask_testn_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B) { - return (__mmask8) __builtin_ia32_ptestnmd256 ((__v8si) __A, - (__v8si) __B, __U); + return _mm256_mask_cmpeq_epi32_mask (__U, _mm256_and_si256 (__A, __B), + _mm256_setzero_si256()); } static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm_testn_epi64_mask (__m128i __A, __m128i __B) { - return (__mmask8) __builtin_ia32_ptestnmq128 ((__v2di) __A, - (__v2di) __B, - (__mmask8) -1); + return _mm_cmpeq_epi64_mask (_mm_and_si128 (__A, __B), _mm_setzero_di()); } static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm_mask_testn_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B) { - return (__mmask8) __builtin_ia32_ptestnmq128 ((__v2di) __A, - (__v2di) __B, __U); + return _mm_mask_cmpeq_epi64_mask (__U, _mm_and_si128 (__A, __B), + _mm_setzero_di()); } static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm256_testn_epi64_mask (__m256i __A, __m256i __B) { - return (__mmask8) __builtin_ia32_ptestnmq256 ((__v4di) __A, - (__v4di) __B, - (__mmask8) -1); + return _mm256_cmpeq_epi64_mask (_mm256_and_si256 (__A, __B), + _mm256_setzero_si256()); } static __inline__ __mmask8 __DEFAULT_FN_ATTRS _mm256_mask_testn_epi64_mask 
(__mmask8 __U, __m256i __A, __m256i __B) { - return (__mmask8) __builtin_ia32_ptestnmq256 ((__v4di) __A, - (__v4di) __B, __U); + return _mm256_mask_cmpeq_epi64_mask (__U, _mm256_and_si256 (__A, __B), + _mm256_setzero_si256()); } - - static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mask_unpackhi_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { @@ -6977,85 +6586,81 @@ _mm256_maskz_srai_epi64(__mmask8 __U, __m256i __A, int __imm) #define _mm256_shuffle_f32x4(A, B, imm) __extension__ ({ \ - (__m256)__builtin_ia32_shuf_f32x4_256_mask((__v8sf)(__m256)(A), \ - (__v8sf)(__m256)(B), (int)(imm), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)-1); }) + (__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \ + (__v8sf)(__m256)(B), \ + 0 + ((((imm) >> 0) & 0x1) * 4), \ + 1 + ((((imm) >> 0) & 0x1) * 4), \ + 2 + ((((imm) >> 0) & 0x1) * 4), \ + 3 + ((((imm) >> 0) & 0x1) * 4), \ + 8 + ((((imm) >> 1) & 0x1) * 4), \ + 9 + ((((imm) >> 1) & 0x1) * 4), \ + 10 + ((((imm) >> 1) & 0x1) * 4), \ + 11 + ((((imm) >> 1) & 0x1) * 4)); }) #define _mm256_mask_shuffle_f32x4(W, U, A, B, imm) __extension__ ({ \ - (__m256)__builtin_ia32_shuf_f32x4_256_mask((__v8sf)(__m256)(A), \ - (__v8sf)(__m256)(B), (int)(imm), \ - (__v8sf)(__m256)(W), \ - (__mmask8)(U)); }) + (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ + (__v8sf)_mm256_shuffle_f32x4((A), (B), (imm)), \ + (__v8sf)(__m256)(W)); }) #define _mm256_maskz_shuffle_f32x4(U, A, B, imm) __extension__ ({ \ - (__m256)__builtin_ia32_shuf_f32x4_256_mask((__v8sf)(__m256)(A), \ - (__v8sf)(__m256)(B), (int)(imm), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)(U)); }) + (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ + (__v8sf)_mm256_shuffle_f32x4((A), (B), (imm)), \ + (__v8sf)_mm256_setzero_ps()); }) #define _mm256_shuffle_f64x2(A, B, imm) __extension__ ({ \ - (__m256d)__builtin_ia32_shuf_f64x2_256_mask((__v4df)(__m256d)(A), \ - (__v4df)(__m256d)(B), \ - (int)(imm), \ - (__v4df)_mm256_setzero_pd(), \ - (__mmask8)-1); }) + (__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \ + (__v4df)(__m256d)(B), \ + 0 + ((((imm) >> 0) & 0x1) * 2), \ + 1 + ((((imm) >> 0) & 0x1) * 2), \ + 4 + ((((imm) >> 1) & 0x1) * 2), \ + 5 + ((((imm) >> 1) & 0x1) * 2)); }) #define _mm256_mask_shuffle_f64x2(W, U, A, B, imm) __extension__ ({ \ - (__m256d)__builtin_ia32_shuf_f64x2_256_mask((__v4df)(__m256d)(A), \ - (__v4df)(__m256d)(B), \ - (int)(imm), \ - (__v4df)(__m256d)(W), \ - (__mmask8)(U)); }) + (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + (__v4df)_mm256_shuffle_f64x2((A), (B), (imm)), \ + (__v4df)(__m256)(W)); }) #define _mm256_maskz_shuffle_f64x2(U, A, B, imm) __extension__ ({ \ - (__m256d)__builtin_ia32_shuf_f64x2_256_mask((__v4df)(__m256d)(A), \ - (__v4df)(__m256d)(B), \ - (int)(imm), \ - (__v4df)_mm256_setzero_pd(), \ - (__mmask8)(U)); }) + (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + (__v4df)_mm256_shuffle_f64x2((A), (B), (imm)), \ + (__v4df)_mm256_setzero_pd()); }) #define _mm256_shuffle_i32x4(A, B, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_shuf_i32x4_256_mask((__v8si)(__m256i)(A), \ - (__v8si)(__m256i)(B), \ - (int)(imm), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)-1); }) + (__m256i)__builtin_shufflevector((__v4di)(__m256i)(A), \ + (__v4di)(__m256i)(B), \ + 0 + ((((imm) >> 0) & 0x1) * 2), \ + 1 + ((((imm) >> 0) & 0x1) * 2), \ + 4 + ((((imm) >> 1) & 0x1) * 2), \ + 5 + ((((imm) >> 1) & 0x1) * 2)); }) #define _mm256_mask_shuffle_i32x4(W, U, A, B, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_shuf_i32x4_256_mask((__v8si)(__m256i)(A), \ - 
(__v8si)(__m256i)(B), \ - (int)(imm), \ - (__v8si)(__m256i)(W), \ - (__mmask8)(U)); }) + (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_shuffle_i32x4((A), (B), (imm)), \ + (__v8si)(__m256)(W)); }) #define _mm256_maskz_shuffle_i32x4(U, A, B, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_shuf_i32x4_256_mask((__v8si)(__m256i)(A), \ - (__v8si)(__m256i)(B), \ - (int)(imm), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)(U)); }) + (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_shuffle_i32x4((A), (B), (imm)), \ + (__v8si)_mm256_setzero_si256()); }) #define _mm256_shuffle_i64x2(A, B, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_shuf_i64x2_256_mask((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), \ - (int)(imm), \ - (__v4di)_mm256_setzero_si256(), \ - (__mmask8)-1); }) + (__m256i)__builtin_shufflevector((__v4di)(__m256i)(A), \ + (__v4di)(__m256i)(B), \ + 0 + ((((imm) >> 0) & 0x1) * 2), \ + 1 + ((((imm) >> 0) & 0x1) * 2), \ + 4 + ((((imm) >> 1) & 0x1) * 2), \ + 5 + ((((imm) >> 1) & 0x1) * 2)); }) #define _mm256_mask_shuffle_i64x2(W, U, A, B, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_shuf_i64x2_256_mask((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), \ - (int)(imm), \ - (__v4di)(__m256i)(W), \ - (__mmask8)(U)); }) + (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_shuffle_i64x2((A), (B), (imm)), \ + (__v4di)(__m256)(W)); }) + #define _mm256_maskz_shuffle_i64x2(U, A, B, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_shuf_i64x2_256_mask((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), \ - (int)(imm), \ - (__v4di)_mm256_setzero_si256(), \ - (__mmask8)(U)); }) + (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_shuffle_i64x2((A), (B), (imm)), \ + (__v4di)_mm256_setzero_si256()); }) #define _mm_mask_shuffle_pd(W, U, A, B, M) __extension__ ({ \ (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ diff --git a/c_headers/avx512vlvbmi2intrin.h b/c_headers/avx512vlvbmi2intrin.h new file mode 100644 index 0000000000..d1ec4976f2 --- /dev/null +++ b/c_headers/avx512vlvbmi2intrin.h @@ -0,0 +1,748 @@ +/*===------------- avx512vlvbmi2intrin.h - VBMI2 intrinsics -----------------=== + * + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use <avx512vlvbmi2intrin.h> directly; include <immintrin.h> instead." 
+#endif + +#ifndef __AVX512VLVBMI2INTRIN_H +#define __AVX512VLVBMI2INTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vbmi2"))) + +static __inline __m128i __DEFAULT_FN_ATTRS +_mm128_setzero_hi(void) { + return (__m128i)(__v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 }; +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_mask_compress_epi16(__m128i __S, __mmask8 __U, __m128i __D) +{ + return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D, + (__v8hi) __S, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_maskz_compress_epi16(__mmask8 __U, __m128i __D) +{ + return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D, + (__v8hi) _mm128_setzero_hi(), + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_mask_compress_epi8(__m128i __S, __mmask16 __U, __m128i __D) +{ + return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D, + (__v16qi) __S, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_maskz_compress_epi8(__mmask16 __U, __m128i __D) +{ + return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D, + (__v16qi) _mm128_setzero_hi(), + __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS +_mm128_mask_compressstoreu_epi16(void *__P, __mmask8 __U, __m128i __D) +{ + __builtin_ia32_compressstorehi128_mask ((__v8hi *) __P, (__v8hi) __D, + __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS +_mm128_mask_compressstoreu_epi8(void *__P, __mmask16 __U, __m128i __D) +{ + __builtin_ia32_compressstoreqi128_mask ((__v16qi *) __P, (__v16qi) __D, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_mask_expand_epi16(__m128i __S, __mmask8 __U, __m128i __D) +{ + return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D, + (__v8hi) __S, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_maskz_expand_epi16(__mmask8 __U, __m128i __D) +{ + return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D, + (__v8hi) _mm128_setzero_hi(), + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_mask_expand_epi8(__m128i __S, __mmask16 __U, __m128i __D) +{ + return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D, + (__v16qi) __S, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_maskz_expand_epi8(__mmask16 __U, __m128i __D) +{ + return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D, + (__v16qi) _mm128_setzero_hi(), + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_mask_expandloadu_epi16(__m128i __S, __mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P, + (__v8hi) __S, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_maskz_expandloadu_epi16(__mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P, + (__v8hi) _mm128_setzero_hi(), + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_mask_expandloadu_epi8(__m128i __S, __mmask16 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P, + (__v16qi) __S, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_maskz_expandloadu_epi8(__mmask16 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P, + (__v16qi) _mm128_setzero_hi(), + __U); +} + +static __inline __m256i __DEFAULT_FN_ATTRS +_mm256_setzero_hi(void) { + return (__m256i)(__v16hi){ 0, 
0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 }; +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_compress_epi16(__m256i __S, __mmask16 __U, __m256i __D) +{ + return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D, + (__v16hi) __S, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_compress_epi16(__mmask16 __U, __m256i __D) +{ + return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D, + (__v16hi) _mm256_setzero_hi(), + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_compress_epi8(__m256i __S, __mmask32 __U, __m256i __D) +{ + return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D, + (__v32qi) __S, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_compress_epi8(__mmask32 __U, __m256i __D) +{ + return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D, + (__v32qi) _mm256_setzero_hi(), + __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS +_mm256_mask_compressstoreu_epi16(void *__P, __mmask16 __U, __m256i __D) +{ + __builtin_ia32_compressstorehi256_mask ((__v16hi *) __P, (__v16hi) __D, + __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS +_mm256_mask_compressstoreu_epi8(void *__P, __mmask32 __U, __m256i __D) +{ + __builtin_ia32_compressstoreqi256_mask ((__v32qi *) __P, (__v32qi) __D, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_expand_epi16(__m256i __S, __mmask16 __U, __m256i __D) +{ + return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D, + (__v16hi) __S, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_expand_epi16(__mmask16 __U, __m256i __D) +{ + return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D, + (__v16hi) _mm256_setzero_hi(), + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_expand_epi8(__m256i __S, __mmask32 __U, __m256i __D) +{ + return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D, + (__v32qi) __S, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_expand_epi8(__mmask32 __U, __m256i __D) +{ + return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D, + (__v32qi) _mm256_setzero_hi(), + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_expandloadu_epi16(__m256i __S, __mmask16 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P, + (__v16hi) __S, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_expandloadu_epi16(__mmask16 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P, + (__v16hi) _mm256_setzero_hi(), + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_expandloadu_epi8(__m256i __S, __mmask32 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P, + (__v32qi) __S, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P, + (__v32qi) _mm256_setzero_hi(), + __U); +} + +#define _mm256_mask_shldi_epi64(S, U, A, B, I) __extension__ ({ \ + (__m256i)__builtin_ia32_vpshldq256_mask((__v4di)(A), \ + (__v4di)(B), \ + (int)(I), \ + (__v4di)(S), \ + (__mmask8)(U)); }) + +#define _mm256_maskz_shldi_epi64(U, A, B, I) \ + _mm256_mask_shldi_epi64(_mm256_setzero_hi(), (U), (A), (B), (I)) + +#define _mm256_shldi_epi64(A, B, I) \ + _mm256_mask_shldi_epi64(_mm256_undefined_si256(), (__mmask8)(-1), 
(A), (B), (I)) + +#define _mm128_mask_shldi_epi64(S, U, A, B, I) __extension__ ({ \ + (__m128i)__builtin_ia32_vpshldq128_mask((__v2di)(A), \ + (__v2di)(B), \ + (int)(I), \ + (__v2di)(S), \ + (__mmask8)(U)); }) + +#define _mm128_maskz_shldi_epi64(U, A, B, I) \ + _mm128_mask_shldi_epi64(_mm128_setzero_hi(), (U), (A), (B), (I)) + +#define _mm128_shldi_epi64(A, B, I) \ + _mm128_mask_shldi_epi64(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I)) + +#define _mm256_mask_shldi_epi32(S, U, A, B, I) __extension__ ({ \ + (__m256i)__builtin_ia32_vpshldd256_mask((__v8si)(A), \ + (__v8si)(B), \ + (int)(I), \ + (__v8si)(S), \ + (__mmask8)(U)); }) + +#define _mm256_maskz_shldi_epi32(U, A, B, I) \ + _mm256_mask_shldi_epi32(_mm256_setzero_hi(), (U), (A), (B), (I)) + +#define _mm256_shldi_epi32(A, B, I) \ + _mm256_mask_shldi_epi32(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I)) + +#define _mm128_mask_shldi_epi32(S, U, A, B, I) __extension__ ({ \ + (__m128i)__builtin_ia32_vpshldd128_mask((__v4si)(A), \ + (__v4si)(B), \ + (int)(I), \ + (__v4si)(S), \ + (__mmask8)(U)); }) + +#define _mm128_maskz_shldi_epi32(U, A, B, I) \ + _mm128_mask_shldi_epi32(_mm128_setzero_hi(), (U), (A), (B), (I)) + +#define _mm128_shldi_epi32(A, B, I) \ + _mm128_mask_shldi_epi32(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I)) + +#define _mm256_mask_shldi_epi16(S, U, A, B, I) __extension__ ({ \ + (__m256i)__builtin_ia32_vpshldw256_mask((__v16hi)(A), \ + (__v16hi)(B), \ + (int)(I), \ + (__v16hi)(S), \ + (__mmask16)(U)); }) + +#define _mm256_maskz_shldi_epi16(U, A, B, I) \ + _mm256_mask_shldi_epi16(_mm256_setzero_hi(), (U), (A), (B), (I)) + +#define _mm256_shldi_epi16(A, B, I) \ + _mm256_mask_shldi_epi16(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I)) + +#define _mm128_mask_shldi_epi16(S, U, A, B, I) __extension__ ({ \ + (__m128i)__builtin_ia32_vpshldw128_mask((__v8hi)(A), \ + (__v8hi)(B), \ + (int)(I), \ + (__v8hi)(S), \ + (__mmask8)(U)); }) + +#define _mm128_maskz_shldi_epi16(U, A, B, I) \ + _mm128_mask_shldi_epi16(_mm128_setzero_hi(), (U), (A), (B), (I)) + +#define _mm128_shldi_epi16(A, B, I) \ + _mm128_mask_shldi_epi16(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I)) + +#define _mm256_mask_shrdi_epi64(S, U, A, B, I) __extension__ ({ \ + (__m256i)__builtin_ia32_vpshrdq256_mask((__v4di)(A), \ + (__v4di)(B), \ + (int)(I), \ + (__v4di)(S), \ + (__mmask8)(U)); }) + +#define _mm256_maskz_shrdi_epi64(U, A, B, I) \ + _mm256_mask_shrdi_epi64(_mm256_setzero_hi(), (U), (A), (B), (I)) + +#define _mm256_shrdi_epi64(A, B, I) \ + _mm256_mask_shrdi_epi64(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I)) + +#define _mm128_mask_shrdi_epi64(S, U, A, B, I) __extension__ ({ \ + (__m128i)__builtin_ia32_vpshrdq128_mask((__v2di)(A), \ + (__v2di)(B), \ + (int)(I), \ + (__v2di)(S), \ + (__mmask8)(U)); }) + +#define _mm128_maskz_shrdi_epi64(U, A, B, I) \ + _mm128_mask_shrdi_epi64(_mm128_setzero_hi(), (U), (A), (B), (I)) + +#define _mm128_shrdi_epi64(A, B, I) \ + _mm128_mask_shrdi_epi64(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I)) + +#define _mm256_mask_shrdi_epi32(S, U, A, B, I) __extension__ ({ \ + (__m256i)__builtin_ia32_vpshrdd256_mask((__v8si)(A), \ + (__v8si)(B), \ + (int)(I), \ + (__v8si)(S), \ + (__mmask8)(U)); }) + +#define _mm256_maskz_shrdi_epi32(U, A, B, I) \ + _mm256_mask_shrdi_epi32(_mm256_setzero_hi(), (U), (A), (B), (I)) + +#define _mm256_shrdi_epi32(A, B, I) \ + _mm256_mask_shrdi_epi32(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I)) + +#define _mm128_mask_shrdi_epi32(S, U, A, B, I) 
__extension__ ({ \ + (__m128i)__builtin_ia32_vpshrdd128_mask((__v4si)(A), \ + (__v4si)(B), \ + (int)(I), \ + (__v4si)(S), \ + (__mmask8)(U)); }) + +#define _mm128_maskz_shrdi_epi32(U, A, B, I) \ + _mm128_mask_shrdi_epi32(_mm128_setzero_hi(), (U), (A), (B), (I)) + +#define _mm128_shrdi_epi32(A, B, I) \ + _mm128_mask_shrdi_epi32(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I)) + +#define _mm256_mask_shrdi_epi16(S, U, A, B, I) __extension__ ({ \ + (__m256i)__builtin_ia32_vpshrdw256_mask((__v16hi)(A), \ + (__v16hi)(B), \ + (int)(I), \ + (__v16hi)(S), \ + (__mmask16)(U)); }) + +#define _mm256_maskz_shrdi_epi16(U, A, B, I) \ + _mm256_mask_shrdi_epi16(_mm256_setzero_hi(), (U), (A), (B), (I)) + +#define _mm256_shrdi_epi16(A, B, I) \ + _mm256_mask_shrdi_epi16(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I)) + +#define _mm128_mask_shrdi_epi16(S, U, A, B, I) __extension__ ({ \ + (__m128i)__builtin_ia32_vpshrdw128_mask((__v8hi)(A), \ + (__v8hi)(B), \ + (int)(I), \ + (__v8hi)(S), \ + (__mmask8)(U)); }) + +#define _mm128_maskz_shrdi_epi16(U, A, B, I) \ + _mm128_mask_shrdi_epi16(_mm128_setzero_hi(), (U), (A), (B), (I)) + +#define _mm128_shrdi_epi16(A, B, I) \ + _mm128_mask_shrdi_epi16(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I)) + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_shldv_epi64(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpshldvq256_mask ((__v4di) __S, + (__v4di) __A, + (__v4di) __B, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_shldv_epi64(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpshldvq256_maskz ((__v4di) __S, + (__v4di) __A, + (__v4di) __B, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_shldv_epi64(__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpshldvq256_mask ((__v4di) __S, + (__v4di) __A, + (__v4di) __B, + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_mask_shldv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshldvq128_mask ((__v2di) __S, + (__v2di) __A, + (__v2di) __B, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_maskz_shldv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshldvq128_maskz ((__v2di) __S, + (__v2di) __A, + (__v2di) __B, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_shldv_epi64(__m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshldvq128_mask ((__v2di) __S, + (__v2di) __A, + (__v2di) __B, + (__mmask8) -1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_shldv_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpshldvd256_mask ((__v8si) __S, + (__v8si) __A, + (__v8si) __B, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_shldv_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpshldvd256_maskz ((__v8si) __S, + (__v8si) __A, + (__v8si) __B, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_shldv_epi32(__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpshldvd256_mask ((__v8si) __S, + (__v8si) __A, + (__v8si) __B, + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_mask_shldv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) 
__builtin_ia32_vpshldvd128_mask ((__v4si) __S, + (__v4si) __A, + (__v4si) __B, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_maskz_shldv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshldvd128_maskz ((__v4si) __S, + (__v4si) __A, + (__v4si) __B, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_shldv_epi32(__m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshldvd128_mask ((__v4si) __S, + (__v4si) __A, + (__v4si) __B, + (__mmask8) -1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_shldv_epi16(__m256i __S, __mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpshldvw256_mask ((__v16hi) __S, + (__v16hi) __A, + (__v16hi) __B, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_shldv_epi16(__mmask16 __U, __m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpshldvw256_maskz ((__v16hi) __S, + (__v16hi) __A, + (__v16hi) __B, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_shldv_epi16(__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpshldvw256_mask ((__v16hi) __S, + (__v16hi) __A, + (__v16hi) __B, + (__mmask16) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_mask_shldv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshldvw128_mask ((__v8hi) __S, + (__v8hi) __A, + (__v8hi) __B, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_maskz_shldv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshldvw128_maskz ((__v8hi) __S, + (__v8hi) __A, + (__v8hi) __B, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_shldv_epi16(__m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshldvw128_mask ((__v8hi) __S, + (__v8hi) __A, + (__v8hi) __B, + (__mmask8) -1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_shrdv_epi64(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpshrdvq256_mask ((__v4di) __S, + (__v4di) __A, + (__v4di) __B, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_shrdv_epi64(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpshrdvq256_maskz ((__v4di) __S, + (__v4di) __A, + (__v4di) __B, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_shrdv_epi64(__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpshrdvq256_mask ((__v4di) __S, + (__v4di) __A, + (__v4di) __B, + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_mask_shrdv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshrdvq128_mask ((__v2di) __S, + (__v2di) __A, + (__v2di) __B, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_maskz_shrdv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshrdvq128_maskz ((__v2di) __S, + (__v2di) __A, + (__v2di) __B, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_shrdv_epi64(__m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshrdvq128_mask ((__v2di) __S, + (__v2di) __A, + (__v2di) __B, + (__mmask8) -1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_shrdv_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) 
__builtin_ia32_vpshrdvd256_mask ((__v8si) __S, + (__v8si) __A, + (__v8si) __B, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_shrdv_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpshrdvd256_maskz ((__v8si) __S, + (__v8si) __A, + (__v8si) __B, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_shrdv_epi32(__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpshrdvd256_mask ((__v8si) __S, + (__v8si) __A, + (__v8si) __B, + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_mask_shrdv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshrdvd128_mask ((__v4si) __S, + (__v4si) __A, + (__v4si) __B, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_maskz_shrdv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshrdvd128_maskz ((__v4si) __S, + (__v4si) __A, + (__v4si) __B, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_shrdv_epi32(__m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshrdvd128_mask ((__v4si) __S, + (__v4si) __A, + (__v4si) __B, + (__mmask8) -1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_shrdv_epi16(__m256i __S, __mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpshrdvw256_mask ((__v16hi) __S, + (__v16hi) __A, + (__v16hi) __B, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_shrdv_epi16(__mmask16 __U, __m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpshrdvw256_maskz ((__v16hi) __S, + (__v16hi) __A, + (__v16hi) __B, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_shrdv_epi16(__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpshrdvw256_mask ((__v16hi) __S, + (__v16hi) __A, + (__v16hi) __B, + (__mmask16) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_mask_shrdv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshrdvw128_mask ((__v8hi) __S, + (__v8hi) __A, + (__v8hi) __B, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_maskz_shrdv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshrdvw128_maskz ((__v8hi) __S, + (__v8hi) __A, + (__v8hi) __B, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_shrdv_epi16(__m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshrdvw128_mask ((__v8hi) __S, + (__v8hi) __A, + (__v8hi) __B, + (__mmask8) -1); +} + + +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/c_headers/avx512vlvnniintrin.h b/c_headers/avx512vlvnniintrin.h new file mode 100644 index 0000000000..745ae8b7ad --- /dev/null +++ b/c_headers/avx512vlvnniintrin.h @@ -0,0 +1,254 @@ +/*===------------- avx512vlvnniintrin.h - VNNI intrinsics ------------------=== + * + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included 
in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use <avx512vlvnniintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef __AVX512VLVNNIINTRIN_H +#define __AVX512VLVNNIINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"))) + + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpdpbusd256_mask ((__v8si) __S, + (__v8si) __A, + (__v8si) __B, + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpdpbusd256_maskz ((__v8si) __S, + (__v8si) __A, + (__v8si) __B, + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpdpbusd256_mask ((__v8si) __S, + (__v8si) __A, + (__v8si) __B, + (__mmask8) -1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpdpbusds256_mask ((__v8si) __S, + (__v8si) __A, + (__v8si) __B, + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpdpbusds256_maskz ((__v8si) __S, + (__v8si) __A, + (__v8si) __B, + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpdpbusds256_mask ((__v8si) __S, + (__v8si) __A, + (__v8si) __B, + (__mmask8) -1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpdpwssd256_mask ((__v8si) __S, + (__v8si) __A, + (__v8si) __B, + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpdpwssd256_maskz ((__v8si) __S, + (__v8si) __A, + (__v8si) __B, + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpdpwssd256_mask ((__v8si) __S, + (__v8si) __A, + (__v8si) __B, + (__mmask8) -1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpdpwssds256_mask ((__v8si) __S, + (__v8si) __A, + (__v8si) __B, + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_dpwssds_epi32(__mmask8 __U, 
__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpdpwssds256_maskz ((__v8si) __S, + (__v8si) __A, + (__v8si) __B, + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpdpwssds256_mask ((__v8si) __S, + (__v8si) __A, + (__v8si) __B, + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpdpbusd128_mask ((__v4si) __S, + (__v4si) __A, + (__v4si) __B, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpdpbusd128_maskz ((__v4si) __S, + (__v4si) __A, + (__v4si) __B, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpdpbusd128_mask ((__v4si) __S, + (__v4si) __A, + (__v4si) __B, + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpdpbusds128_mask ((__v4si) __S, + (__v4si) __A, + (__v4si) __B, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpdpbusds128_maskz ((__v4si) __S, + (__v4si) __A, + (__v4si) __B, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpdpbusds128_mask ((__v4si) __S, + (__v4si) __A, + (__v4si) __B, + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpdpwssd128_mask ((__v4si) __S, + (__v4si) __A, + (__v4si) __B, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpdpwssd128_maskz ((__v4si) __S, + (__v4si) __A, + (__v4si) __B, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpdpwssd128_mask ((__v4si) __S, + (__v4si) __A, + (__v4si) __B, + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpdpwssds128_mask ((__v4si) __S, + (__v4si) __A, + (__v4si) __B, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpdpwssds128_maskz ((__v4si) __S, + (__v4si) __A, + (__v4si) __B, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm128_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpdpwssds128_mask ((__v4si) __S, + (__v4si) __A, + (__v4si) __B, + (__mmask8) -1); +} + + +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/c_headers/avx512vnniintrin.h b/c_headers/avx512vnniintrin.h new file mode 100644 index 0000000000..0c6badd231 --- /dev/null +++ b/c_headers/avx512vnniintrin.h @@ 
-0,0 +1,146 @@ +/*===------------- avx512vnniintrin.h - VNNI intrinsics ------------------=== + * + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use <avx512vnniintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef __AVX512VNNIINTRIN_H +#define __AVX512VNNIINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vnni"))) + + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_dpbusd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpdpbusd512_mask ((__v16si) __S, + (__v16si) __A, + (__v16si) __B, + (__mmask16) __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_dpbusd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpdpbusd512_maskz ((__v16si) __S, + (__v16si) __A, + (__v16si) __B, + (__mmask16) __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpdpbusd512_mask ((__v16si) __S, + (__v16si) __A, + (__v16si) __B, + (__mmask16) -1); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_dpbusds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpdpbusds512_mask ((__v16si) __S, + (__v16si) __A, + (__v16si) __B, + (__mmask16) __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpdpbusds512_maskz ((__v16si) __S, + (__v16si) __A, + (__v16si) __B, + (__mmask16) __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpdpbusds512_mask ((__v16si) __S, + (__v16si) __A, + (__v16si) __B, + (__mmask16) -1); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpdpwssd512_mask ((__v16si) __S, + (__v16si) __A, + (__v16si) __B, + (__mmask16) __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) 
__builtin_ia32_vpdpwssd512_maskz ((__v16si) __S, + (__v16si) __A, + (__v16si) __B, + (__mmask16) __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpdpwssd512_mask ((__v16si) __S, + (__v16si) __A, + (__v16si) __B, + (__mmask16) -1); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpdpwssds512_mask ((__v16si) __S, + (__v16si) __A, + (__v16si) __B, + (__mmask16) __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpdpwssds512_maskz ((__v16si) __S, + (__v16si) __A, + (__v16si) __B, + (__mmask16) __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpdpwssds512_mask ((__v16si) __S, + (__v16si) __A, + (__v16si) __B, + (__mmask16) -1); +} + + +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/c_headers/avx512vpopcntdqvlintrin.h b/c_headers/avx512vpopcntdqvlintrin.h new file mode 100644 index 0000000000..c2058a8f51 --- /dev/null +++ b/c_headers/avx512vpopcntdqvlintrin.h @@ -0,0 +1,99 @@ +/*===------------- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics + *------------------=== + * + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error \ + "Never use <avx512vpopcntdqvlintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef __AVX512VPOPCNTDQVLINTRIN_H +#define __AVX512VPOPCNTDQVLINTRIN_H + +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq,avx512vl"))) + +static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_popcnt_epi64(__m128i __A) { + return (__m128i)__builtin_ia32_vpopcntq_128((__v2di)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_mask_popcnt_epi64(__m128i __W, __mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_selectq_128( + (__mmask8)__U, (__v2di)_mm_popcnt_epi64(__A), (__v2di)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_maskz_popcnt_epi64(__mmask8 __U, __m128i __A) { + return _mm_mask_popcnt_epi64((__m128i)_mm_setzero_si128(), __U, __A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_popcnt_epi32(__m128i __A) { + return (__m128i)__builtin_ia32_vpopcntd_128((__v4si)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_mask_popcnt_epi32(__m128i __W, __mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_selectd_128( + (__mmask8)__U, (__v4si)_mm_popcnt_epi32(__A), (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_maskz_popcnt_epi32(__mmask8 __U, __m128i __A) { + return _mm_mask_popcnt_epi32((__m128i)_mm_setzero_si128(), __U, __A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_popcnt_epi64(__m256i __A) { + return (__m256i)__builtin_ia32_vpopcntq_256((__v4di)__A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_popcnt_epi64(__m256i __W, __mmask8 __U, __m256i __A) { + return (__m256i)__builtin_ia32_selectq_256( + (__mmask8)__U, (__v4di)_mm256_popcnt_epi64(__A), (__v4di)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_popcnt_epi64(__mmask8 __U, __m256i __A) { + return _mm256_mask_popcnt_epi64((__m256i)_mm256_setzero_si256(), __U, __A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_popcnt_epi32(__m256i __A) { + return (__m256i)__builtin_ia32_vpopcntd_256((__v8si)__A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_mask_popcnt_epi32(__m256i __W, __mmask8 __U, __m256i __A) { + return (__m256i)__builtin_ia32_selectd_256( + (__mmask8)__U, (__v8si)_mm256_popcnt_epi32(__A), (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_maskz_popcnt_epi32(__mmask8 __U, __m256i __A) { + return _mm256_mask_popcnt_epi32((__m256i)_mm256_setzero_si256(), __U, __A); +} + +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/c_headers/cetintrin.h b/c_headers/cetintrin.h new file mode 100644 index 0000000000..1256a3f63a --- /dev/null +++ b/c_headers/cetintrin.h @@ -0,0 +1,93 @@ +/*===---- cetintrin.h - CET intrinsic ------------------------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use <cetintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef __CETINTRIN_H +#define __CETINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("shstk"))) + +static __inline__ void __DEFAULT_FN_ATTRS _incsspd(int __a) { + __builtin_ia32_incsspd(__a); +} + +#ifdef __x86_64__ +static __inline__ void __DEFAULT_FN_ATTRS _incsspq(unsigned long long __a) { + __builtin_ia32_incsspq(__a); +} +#endif /* __x86_64__ */ + +static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd(unsigned int __a) { + return __builtin_ia32_rdsspd(__a); +} + +#ifdef __x86_64__ +static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq(unsigned long long __a) { + return __builtin_ia32_rdsspq(__a); +} +#endif /* __x86_64__ */ + +static __inline__ void __DEFAULT_FN_ATTRS _saveprevssp() { + __builtin_ia32_saveprevssp(); +} + +static __inline__ void __DEFAULT_FN_ATTRS _rstorssp(void * __p) { + __builtin_ia32_rstorssp(__p); +} + +static __inline__ void __DEFAULT_FN_ATTRS _wrssd(unsigned int __a, void * __p) { + __builtin_ia32_wrssd(__a, __p); +} + +#ifdef __x86_64__ +static __inline__ void __DEFAULT_FN_ATTRS _wrssq(unsigned long long __a, void * __p) { + __builtin_ia32_wrssq(__a, __p); +} +#endif /* __x86_64__ */ + +static __inline__ void __DEFAULT_FN_ATTRS _wrussd(unsigned int __a, void * __p) { + __builtin_ia32_wrussd(__a, __p); +} + +#ifdef __x86_64__ +static __inline__ void __DEFAULT_FN_ATTRS _wrussq(unsigned long long __a, void * __p) { + __builtin_ia32_wrussq(__a, __p); +} +#endif /* __x86_64__ */ + +static __inline__ void __DEFAULT_FN_ATTRS _setssbsy() { + __builtin_ia32_setssbsy(); +} + +static __inline__ void __DEFAULT_FN_ATTRS _clrssbsy(void * __p) { + __builtin_ia32_clrssbsy(__p); +} + +#undef __DEFAULT_FN_ATTRS + +#endif /* __CETINTRIN_H */ diff --git a/c_headers/cpuid.h b/c_headers/cpuid.h index 2dd0add236..3ae90de0b9 100644 --- a/c_headers/cpuid.h +++ b/c_headers/cpuid.h @@ -173,16 +173,24 @@ #define bit_AVX512VL 0x80000000 /* Features in %ecx for leaf 7 sub-leaf 0 */ -#define bit_PREFTCHWT1 0x00000001 -#define bit_AVX512VBMI 0x00000002 -#define bit_PKU 0x00000004 -#define bit_OSPKE 0x00000010 +#define bit_PREFTCHWT1 0x00000001 +#define bit_AVX512VBMI 0x00000002 +#define bit_PKU 0x00000004 +#define bit_OSPKE 0x00000010 +#define bit_AVX512VBMI2 0x00000040 +#define bit_SHSTK 0x00000080 +#define bit_GFNI 0x00000100 +#define bit_VAES 0x00000200 +#define bit_VPCLMULQDQ 0x00000400 +#define bit_AVX512VNNI 0x00000800 +#define bit_AVX512BITALG 0x00001000 #define bit_AVX512VPOPCNTDQ 0x00004000 -#define bit_RDPID 0x00400000 +#define bit_RDPID 0x00400000 /* Features in %edx for leaf 7 sub-leaf 0 */ #define bit_AVX5124VNNIW 0x00000004 #define bit_AVX5124FMAPS 0x00000008 +#define bit_IBT 0x00100000 /* Features in %eax for leaf 13 sub-leaf 1 */ #define bit_XSAVEOPT 0x00000001 @@ -192,6 +200,7 @@ /* Features in %ecx for leaf 0x80000001 */ #define bit_LAHF_LM 0x00000001 #define bit_ABM 0x00000020 +#define bit_LZCNT bit_ABM /* for gcc compat */ #define bit_SSE4a 0x00000040 #define bit_PRFCHW 
0x00000100 #define bit_XOP 0x00000800 diff --git a/c_headers/cuda_wrappers/algorithm b/c_headers/cuda_wrappers/algorithm index 95d9beb73c..cedd70762c 100644 --- a/c_headers/cuda_wrappers/algorithm +++ b/c_headers/cuda_wrappers/algorithm @@ -80,7 +80,7 @@ min(const __T &__a, const __T &__b, __Cmp __cmp) { template <class __T> inline __device__ const __T & min(const __T &__a, const __T &__b) { - return __a < __b ? __b : __a; + return __a < __b ? __a : __b; } #ifdef _LIBCPP_END_NAMESPACE_STD diff --git a/c_headers/emmintrin.h b/c_headers/emmintrin.h index 3372508a7f..b332eeec20 100644 --- a/c_headers/emmintrin.h +++ b/c_headers/emmintrin.h @@ -217,8 +217,8 @@ _mm_div_pd(__m128d __a, __m128d __b) /// \brief Calculates the square root of the lower double-precision value of /// the second operand and returns it in the lower 64 bits of the result. -/// The upper 64 bits of the result are copied from the upper double- -/// precision value of the first operand. +/// The upper 64 bits of the result are copied from the upper +/// double-precision value of the first operand. /// /// \headerfile <x86intrin.h> /// @@ -260,8 +260,8 @@ _mm_sqrt_pd(__m128d __a) /// \brief Compares lower 64-bit double-precision values of both operands, and /// returns the lesser of the pair of values in the lower 64-bits of the -/// result. The upper 64 bits of the result are copied from the upper double- -/// precision value of the first operand. +/// result. The upper 64 bits of the result are copied from the upper +/// double-precision value of the first operand. /// /// \headerfile <x86intrin.h> /// @@ -304,8 +304,8 @@ _mm_min_pd(__m128d __a, __m128d __b) /// \brief Compares lower 64-bit double-precision values of both operands, and /// returns the greater of the pair of values in the lower 64-bits of the -/// result. The upper 64 bits of the result are copied from the upper double- -/// precision value of the first operand. +/// result. The upper 64 bits of the result are copied from the upper +/// double-precision value of the first operand. /// /// \headerfile <x86intrin.h> /// @@ -983,8 +983,10 @@ _mm_cmpnge_sd(__m128d __a, __m128d __b) } /// \brief Compares the lower double-precision floating-point values in each of -/// the two 128-bit floating-point vectors of [2 x double] for equality. The -/// comparison yields 0 for false, 1 for true. +/// the two 128-bit floating-point vectors of [2 x double] for equality. +/// +/// The comparison yields 0 for false, 1 for true. If either of the two +/// lower double-precision values is NaN, 0 is returned. /// /// \headerfile <x86intrin.h> /// @@ -996,7 +998,8 @@ _mm_cmpnge_sd(__m128d __a, __m128d __b) /// \param __b /// A 128-bit vector of [2 x double]. The lower double-precision value is /// compared to the lower double-precision value of \a __a. -/// \returns An integer containing the comparison results. +/// \returns An integer containing the comparison results. If either of the two +/// lower double-precision values is NaN, 0 is returned. static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a, __m128d __b) { @@ -1008,7 +1011,8 @@ _mm_comieq_sd(__m128d __a, __m128d __b) /// the value in the first parameter is less than the corresponding value in /// the second parameter. /// -/// The comparison yields 0 for false, 1 for true. +/// The comparison yields 0 for false, 1 for true. If either of the two +/// lower double-precision values is NaN, 0 is returned. 
/// /// \headerfile <x86intrin.h> /// @@ -1020,7 +1024,8 @@ _mm_comieq_sd(__m128d __a, __m128d __b) /// \param __b /// A 128-bit vector of [2 x double]. The lower double-precision value is /// compared to the lower double-precision value of \a __a. -/// \returns An integer containing the comparison results. +/// \returns An integer containing the comparison results. If either of the two +/// lower double-precision values is NaN, 0 is returned. static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a, __m128d __b) { @@ -1032,7 +1037,8 @@ _mm_comilt_sd(__m128d __a, __m128d __b) /// the value in the first parameter is less than or equal to the /// corresponding value in the second parameter. /// -/// The comparison yields 0 for false, 1 for true. +/// The comparison yields 0 for false, 1 for true. If either of the two +/// lower double-precision values is NaN, 0 is returned. /// /// \headerfile <x86intrin.h> /// @@ -1044,7 +1050,8 @@ _mm_comilt_sd(__m128d __a, __m128d __b) /// \param __b /// A 128-bit vector of [2 x double]. The lower double-precision value is /// compared to the lower double-precision value of \a __a. -/// \returns An integer containing the comparison results. +/// \returns An integer containing the comparison results. If either of the two +/// lower double-precision values is NaN, 0 is returned. static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a, __m128d __b) { @@ -1056,7 +1063,8 @@ _mm_comile_sd(__m128d __a, __m128d __b) /// the value in the first parameter is greater than the corresponding value /// in the second parameter. /// -/// The comparison yields 0 for false, 1 for true. +/// The comparison yields 0 for false, 1 for true. If either of the two +/// lower double-precision values is NaN, 0 is returned. /// /// \headerfile <x86intrin.h> /// @@ -1068,7 +1076,8 @@ _mm_comile_sd(__m128d __a, __m128d __b) /// \param __b /// A 128-bit vector of [2 x double]. The lower double-precision value is /// compared to the lower double-precision value of \a __a. -/// \returns An integer containing the comparison results. +/// \returns An integer containing the comparison results. If either of the two +/// lower double-precision values is NaN, 0 is returned. static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a, __m128d __b) { @@ -1080,7 +1089,8 @@ _mm_comigt_sd(__m128d __a, __m128d __b) /// the value in the first parameter is greater than or equal to the /// corresponding value in the second parameter. /// -/// The comparison yields 0 for false, 1 for true. +/// The comparison yields 0 for false, 1 for true. If either of the two +/// lower double-precision values is NaN, 0 is returned. /// /// \headerfile <x86intrin.h> /// @@ -1092,7 +1102,8 @@ _mm_comigt_sd(__m128d __a, __m128d __b) /// \param __b /// A 128-bit vector of [2 x double]. The lower double-precision value is /// compared to the lower double-precision value of \a __a. -/// \returns An integer containing the comparison results. +/// \returns An integer containing the comparison results. If either of the two +/// lower double-precision values is NaN, 0 is returned. static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a, __m128d __b) { @@ -1104,7 +1115,8 @@ _mm_comige_sd(__m128d __a, __m128d __b) /// the value in the first parameter is unequal to the corresponding value in /// the second parameter. /// -/// The comparison yields 0 for false, 1 for true. +/// The comparison yields 0 for false, 1 for true. 
If either of the two +/// lower double-precision values is NaN, 1 is returned. /// /// \headerfile <x86intrin.h> /// @@ -1116,7 +1128,8 @@ _mm_comige_sd(__m128d __a, __m128d __b) /// \param __b /// A 128-bit vector of [2 x double]. The lower double-precision value is /// compared to the lower double-precision value of \a __a. -/// \returns An integer containing the comparison results. +/// \returns An integer containing the comparison results. If either of the two +/// lower double-precision values is NaN, 1 is returned. static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a, __m128d __b) { @@ -1127,7 +1140,7 @@ _mm_comineq_sd(__m128d __a, __m128d __b) /// the two 128-bit floating-point vectors of [2 x double] for equality. The /// comparison yields 0 for false, 1 for true. /// -/// If either of the two lower double-precision values is NaN, 1 is returned. +/// If either of the two lower double-precision values is NaN, 0 is returned. /// /// \headerfile <x86intrin.h> /// @@ -1140,7 +1153,7 @@ _mm_comineq_sd(__m128d __a, __m128d __b) /// A 128-bit vector of [2 x double]. The lower double-precision value is /// compared to the lower double-precision value of \a __a. /// \returns An integer containing the comparison results. If either of the two -/// lower double-precision values is NaN, 1 is returned. +/// lower double-precision values is NaN, 0 is returned. static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a, __m128d __b) { @@ -1153,7 +1166,7 @@ _mm_ucomieq_sd(__m128d __a, __m128d __b) /// the second parameter. /// /// The comparison yields 0 for false, 1 for true. If either of the two lower -/// double-precision values is NaN, 1 is returned. +/// double-precision values is NaN, 0 is returned. /// /// \headerfile <x86intrin.h> /// @@ -1166,7 +1179,7 @@ _mm_ucomieq_sd(__m128d __a, __m128d __b) /// A 128-bit vector of [2 x double]. The lower double-precision value is /// compared to the lower double-precision value of \a __a. /// \returns An integer containing the comparison results. If either of the two -/// lower double-precision values is NaN, 1 is returned. +/// lower double-precision values is NaN, 0 is returned. static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a, __m128d __b) { @@ -1179,7 +1192,7 @@ _mm_ucomilt_sd(__m128d __a, __m128d __b) /// corresponding value in the second parameter. /// /// The comparison yields 0 for false, 1 for true. If either of the two lower -/// double-precision values is NaN, 1 is returned. +/// double-precision values is NaN, 0 is returned. /// /// \headerfile <x86intrin.h> /// @@ -1192,7 +1205,7 @@ _mm_ucomilt_sd(__m128d __a, __m128d __b) /// A 128-bit vector of [2 x double]. The lower double-precision value is /// compared to the lower double-precision value of \a __a. /// \returns An integer containing the comparison results. If either of the two -/// lower double-precision values is NaN, 1 is returned. +/// lower double-precision values is NaN, 0 is returned. static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a, __m128d __b) { @@ -1257,7 +1270,7 @@ _mm_ucomige_sd(__m128d __a, __m128d __b) /// the second parameter. /// /// The comparison yields 0 for false, 1 for true. If either of the two lower -/// double-precision values is NaN, 0 is returned. +/// double-precision values is NaN, 1 is returned. /// /// \headerfile <x86intrin.h> /// @@ -1270,7 +1283,7 @@ _mm_ucomige_sd(__m128d __a, __m128d __b) /// A 128-bit vector of [2 x double]. 
The lower double-precision value is /// compared to the lower double-precision value of \a __a. /// \returns An integer containing the comparison result. If either of the two -/// lower double-precision values is NaN, 0 is returned. +/// lower double-precision values is NaN, 1 is returned. static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a, __m128d __b) { @@ -1935,14 +1948,15 @@ _mm_store_pd(double *__dp, __m128d __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the <c>VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction. +/// This intrinsic corresponds to the +/// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction. /// /// \param __dp /// A pointer to a memory location that can store two double-precision /// values. /// \param __a /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each -/// of the values in \a dp. +/// of the values in \a __dp. static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp, __m128d __a) { @@ -1950,18 +1964,20 @@ _mm_store1_pd(double *__dp, __m128d __a) _mm_store_pd(__dp, __a); } -/// \brief Stores a 128-bit vector of [2 x double] into an aligned memory -/// location. +/// \brief Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to +/// the upper and lower 64 bits of a memory location. /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction. +/// This intrinsic corresponds to the +/// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction. /// /// \param __dp -/// A pointer to a 128-bit memory location. The address of the memory -/// location has to be 16-byte aligned. +/// A pointer to a memory location that can store two double-precision +/// values. /// \param __a -/// A 128-bit vector of [2 x double] containing the values to be stored. +/// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each +/// of the values in \a __dp. static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp, __m128d __a) { @@ -3846,8 +3862,7 @@ _mm_set1_epi8(char __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c> -/// instruction. +/// This intrinsic does not correspond to a specific instruction. /// /// \param __q0 /// A 64-bit integral value used to initialize the lower 64 bits of the @@ -4018,7 +4033,7 @@ _mm_storeu_si128(__m128i *__p, __m128i __b) /// specified unaligned memory location. When a mask bit is 1, the /// corresponding byte is written, otherwise it is not written. /// -/// To minimize caching, the date is flagged as non-temporal (unlikely to be +/// To minimize caching, the data is flagged as non-temporal (unlikely to be /// used again soon). Exception and trap behavior for elements not selected /// for storage to memory are implementation dependent. /// @@ -4532,8 +4547,8 @@ _mm_unpackhi_epi32(__m128i __a, __m128i __b) return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3); } -/// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors -/// of [2 x i64] and interleaves them into a 128-bit vector of [2 x i64]. +/// \brief Unpacks the high-order 64-bit elements from two 128-bit vectors of +/// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64]. /// /// \headerfile <x86intrin.h> /// @@ -4665,7 +4680,7 @@ _mm_unpacklo_epi64(__m128i __a, __m128i __b) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic has no corresponding instruction. 
+/// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction. /// /// \param __a /// A 128-bit integer vector operand. The lower 64 bits are moved to the @@ -4682,7 +4697,7 @@ _mm_movepi64_pi64(__m128i __a) /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the <c> VMOVQ / MOVQ / MOVD </c> instruction. +/// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction. /// /// \param __a /// A 64-bit value. @@ -4712,8 +4727,8 @@ _mm_move_epi64(__m128i __a) return __builtin_shufflevector((__v2di)__a, (__m128i){ 0 }, 0, 2); } -/// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors -/// of [2 x double] and interleaves them into a 128-bit vector of [2 x +/// \brief Unpacks the high-order 64-bit elements from two 128-bit vectors of +/// [2 x double] and interleaves them into a 128-bit vector of [2 x /// double]. /// /// \headerfile <x86intrin.h> @@ -4733,7 +4748,7 @@ _mm_unpackhi_pd(__m128d __a, __m128d __b) return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1); } -/// \brief Unpacks the low-order (even-indexed) values from two 128-bit vectors +/// \brief Unpacks the low-order 64-bit elements from two 128-bit vectors /// of [2 x double] and interleaves them into a 128-bit vector of [2 x /// double]. /// @@ -4792,9 +4807,9 @@ _mm_movemask_pd(__m128d __a) /// A 128-bit vector of [2 x double]. /// \param i /// An 8-bit immediate value. The least significant two bits specify which -/// elements to copy from a and b: \n -/// Bit[0] = 0: lower element of a copied to lower element of result. \n -/// Bit[0] = 1: upper element of a copied to lower element of result. \n +/// elements to copy from \a a and \a b: \n +/// Bit[0] = 0: lower element of \a a copied to lower element of result. \n +/// Bit[0] = 1: upper element of \a a copied to lower element of result. \n /// Bit[1] = 0: lower element of \a b copied to upper element of result. \n /// Bit[1] = 1: upper element of \a b copied to upper element of result. \n /// \returns A 128-bit vector of [2 x double] containing the shuffled values. 
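The emmintrin.h hunks above are documentation-only: they correct the stated NaN behavior of the scalar double-precision comparisons. For the ordered forms (_mm_comieq_sd, _mm_comilt_sd, _mm_comile_sd, _mm_comigt_sd, _mm_comige_sd) a NaN in either lower element yields 0, while _mm_comineq_sd yields 1; the unordered _mm_ucomi*_sd forms follow the same pattern. A minimal sketch of that documented behavior, assuming an SSE2-capable compiler (illustrative only, not part of the header diff):

#include <emmintrin.h>
#include <math.h>
#include <stdio.h>

int main(void) {
    __m128d a = _mm_set_sd(NAN);   /* lower element is NaN, upper is 0.0 */
    __m128d b = _mm_set_sd(1.0);

    /* Ordered comparisons report 0 ("false") when either lower value is NaN... */
    printf("comieq   = %d\n", _mm_comieq_sd(a, b));   /* 0 */
    printf("comilt   = %d\n", _mm_comilt_sd(a, b));   /* 0 */
    /* ...except the not-equal form, which reports 1. */
    printf("comineq  = %d\n", _mm_comineq_sd(a, b));  /* 1 */

    /* The unordered (ucomi) variants return the same values for NaN inputs. */
    printf("ucomieq  = %d\n", _mm_ucomieq_sd(a, b));  /* 0 */
    printf("ucomineq = %d\n", _mm_ucomineq_sd(a, b)); /* 1 */
    return 0;
}

The ordered (comi) and unordered (ucomi) forms return the same integer results here; they differ in how NaN operands are signalled, not in the value returned.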
diff --git a/c_headers/fma4intrin.h b/c_headers/fma4intrin.h index 11aa8ceacf..962b1a60a2 100644 --- a/c_headers/fma4intrin.h +++ b/c_headers/fma4intrin.h @@ -60,73 +60,73 @@ _mm_macc_sd(__m128d __A, __m128d __B, __m128d __C) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_msub_ps(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); + return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_msub_pd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C); + return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_msub_ss(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); + return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_msub_sd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C); + return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, -(__v2df)__C); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfnmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); + return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfnmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C); + return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfnmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); + return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfnmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C); + return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, (__v2df)__C); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfnmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); + return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfnmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C); + return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfnmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); + return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfnmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C); + return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C); } static __inline__ __m128 __DEFAULT_FN_ATTRS @@ -144,13 +144,13 @@ _mm_maddsub_pd(__m128d __A, __m128d __B, __m128d __C) static __inline__ __m128 __DEFAULT_FN_ATTRS 
_mm_msubadd_ps(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfmsubaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); + return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfmsubaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C); + return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C); } static __inline__ __m256 __DEFAULT_FN_ATTRS @@ -168,37 +168,37 @@ _mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C) static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C) { - return (__m256)__builtin_ia32_vfmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); + return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C) { - return (__m256d)__builtin_ia32_vfmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C); + return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C); } static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C) { - return (__m256)__builtin_ia32_vfnmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); + return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C) { - return (__m256d)__builtin_ia32_vfnmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C); + return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C); } static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C) { - return (__m256)__builtin_ia32_vfnmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); + return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C) { - return (__m256d)__builtin_ia32_vfnmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C); + return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C); } static __inline__ __m256 __DEFAULT_FN_ATTRS @@ -216,13 +216,13 @@ _mm256_maddsub_pd(__m256d __A, __m256d __B, __m256d __C) static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_msubadd_ps(__m256 __A, __m256 __B, __m256 __C) { - return (__m256)__builtin_ia32_vfmsubaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); + return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C) { - return (__m256d)__builtin_ia32_vfmsubaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C); + return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C); } #undef __DEFAULT_FN_ATTRS diff --git a/c_headers/fmaintrin.h b/c_headers/fmaintrin.h index 0e2ef0b171..478a0ac81c 100644 --- a/c_headers/fmaintrin.h +++ b/c_headers/fmaintrin.h @@ -46,85 +46,85 @@ _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); + return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C) { - return 
(__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C); + return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); + return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C); + return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); + return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C); + return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfnmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); + return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfnmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C); + return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfnmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); + return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfnmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C); + return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfnmsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); + return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfnmsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C); + return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C); } static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C) { - return (__m128)__builtin_ia32_vfnmsubss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); + return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfnmsubsd((__v2df)__A, (__v2df)__B, (__v2df)__C); + return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C); } static __inline__ __m128 __DEFAULT_FN_ATTRS @@ -142,13 +142,13 @@ _mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 
__C) { - return (__m128)__builtin_ia32_vfmsubaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); + return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); } static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C) { - return (__m128d)__builtin_ia32_vfmsubaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C); + return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C); } static __inline__ __m256 __DEFAULT_FN_ATTRS @@ -166,37 +166,37 @@ _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C) static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C) { - return (__m256)__builtin_ia32_vfmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); + return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C) { - return (__m256d)__builtin_ia32_vfmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C); + return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C); } static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C) { - return (__m256)__builtin_ia32_vfnmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); + return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C) { - return (__m256d)__builtin_ia32_vfnmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C); + return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C); } static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C) { - return (__m256)__builtin_ia32_vfnmsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); + return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C) { - return (__m256d)__builtin_ia32_vfnmsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C); + return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C); } static __inline__ __m256 __DEFAULT_FN_ATTRS @@ -214,13 +214,13 @@ _mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C) static __inline__ __m256 __DEFAULT_FN_ATTRS _mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C) { - return (__m256)__builtin_ia32_vfmsubaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); + return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); } static __inline__ __m256d __DEFAULT_FN_ATTRS _mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C) { - return (__m256d)__builtin_ia32_vfmsubaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C); + return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C); } #undef __DEFAULT_FN_ATTRS diff --git a/c_headers/gfniintrin.h b/c_headers/gfniintrin.h new file mode 100644 index 0000000000..20fadccfaa --- /dev/null +++ b/c_headers/gfniintrin.h @@ -0,0 +1,202 @@ +/*===----------------- gfniintrin.h - GFNI intrinsics ----------------------=== + * + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit 
persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use <gfniintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef __GFNIINTRIN_H +#define __GFNIINTRIN_H + + +#define _mm_gf2p8affineinv_epi64_epi8(A, B, I) __extension__ ({ \ + (__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), \ + (char)(I)); }) + +#define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) __extension__ ({ \ + (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ + (__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I), \ + (__v16qi)(__m128i)(S)); }) + + +#define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) __extension__ ({ \ + (__m128i)_mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(), \ + U, A, B, I); }) + + +#define _mm256_gf2p8affineinv_epi64_epi8(A, B, I) __extension__ ({ \ + (__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A), \ + (__v32qi)(__m256i)(B), \ + (char)(I)); }) + +#define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) __extension__ ({ \ + (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ + (__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I), \ + (__v32qi)(__m256i)(S)); }) + +#define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) __extension__ ({ \ + (__m256i)_mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \ + U, A, B, I); }) + + +#define _mm512_gf2p8affineinv_epi64_epi8(A, B, I) __extension__ ({ \ + (__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)(__m512i)(A), \ + (__v64qi)(__m512i)(B), \ + (char)(I)); }) + +#define _mm512_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) __extension__ ({ \ + (__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \ + (__v64qi)_mm512_gf2p8affineinv_epi64_epi8(A, B, I), \ + (__v64qi)(__m512i)(S)); }) + +#define _mm512_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) __extension__ ({ \ + (__m512i)_mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_qi(), \ + U, A, B, I); }) + +#define _mm_gf2p8affine_epi64_epi8(A, B, I) __extension__ ({ \ + (__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), \ + (char)(I)); }) + +#define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) __extension__ ({ \ + (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ + (__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I), \ + (__v16qi)(__m128i)(S)); }) + + +#define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I) __extension__ ({ \ + (__m128i)_mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(), \ + U, A, B, I); }) + + +#define _mm256_gf2p8affine_epi64_epi8(A, B, I) __extension__ ({ \ + (__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A), \ + (__v32qi)(__m256i)(B), \ + (char)(I)); }) + +#define _mm256_mask_gf2p8affine_epi64_epi8(S, 
U, A, B, I) __extension__ ({ \ + (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ + (__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I), \ + (__v32qi)(__m256i)(S)); }) + +#define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I) __extension__ ({ \ + (__m256i)_mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(), \ + U, A, B, I); }) + + +#define _mm512_gf2p8affine_epi64_epi8(A, B, I) __extension__ ({ \ + (__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)(__m512i)(A), \ + (__v64qi)(__m512i)(B), \ + (char)(I)); }) + +#define _mm512_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) __extension__ ({ \ + (__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \ + (__v64qi)_mm512_gf2p8affine_epi64_epi8(A, B, I), \ + (__v64qi)(__m512i)(S)); }) + +#define _mm512_maskz_gf2p8affine_epi64_epi8(U, A, B, I) __extension__ ({ \ + (__m512i)_mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_qi(), \ + U, A, B, I); }) + +/* Default attributes for simple form (no masking). */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("gfni"))) + +/* Default attributes for ZMM forms. */ +#define __DEFAULT_FN_ATTRS_F __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni"))) + +/* Default attributes for VLX forms. */ +#define __DEFAULT_FN_ATTRS_VL __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"))) + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_gf2p8mul_epi8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi((__v16qi) __A, + (__v16qi) __B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS_VL +_mm_mask_gf2p8mul_epi8(__m128i __S, __mmask16 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_selectb_128(__U, + (__v16qi) _mm_gf2p8mul_epi8(__A, __B), + (__v16qi) __S); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS_VL +_mm_maskz_gf2p8mul_epi8(__mmask16 __U, __m128i __A, __m128i __B) +{ + return _mm_mask_gf2p8mul_epi8((__m128i)_mm_setzero_si128(), + __U, __A, __B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_gf2p8mul_epi8(__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi((__v32qi) __A, + (__v32qi) __B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS_VL +_mm256_mask_gf2p8mul_epi8(__m256i __S, __mmask32 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_selectb_256(__U, + (__v32qi) _mm256_gf2p8mul_epi8(__A, __B), + (__v32qi) __S); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS_VL +_mm256_maskz_gf2p8mul_epi8(__mmask32 __U, __m256i __A, __m256i __B) +{ + return _mm256_mask_gf2p8mul_epi8((__m256i)_mm256_setzero_si256(), + __U, __A, __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS_F +_mm512_gf2p8mul_epi8(__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi((__v64qi) __A, + (__v64qi) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS_F +_mm512_mask_gf2p8mul_epi8(__m512i __S, __mmask64 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_selectb_512(__U, + (__v64qi) _mm512_gf2p8mul_epi8(__A, __B), + (__v64qi) __S); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS_F +_mm512_maskz_gf2p8mul_epi8(__mmask64 __U, __m512i __A, __m512i __B) +{ + return _mm512_mask_gf2p8mul_epi8((__m512i)_mm512_setzero_qi(), + __U, __A, __B); +} + +#undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS_F +#undef __DEFAULT_FN_ATTRS_VL + +#endif // __GFNIINTRIN_H + diff --git a/c_headers/immintrin.h b/c_headers/immintrin.h index d86e0efb82..d3421dc86c 100644 --- 
a/c_headers/immintrin.h +++ b/c_headers/immintrin.h @@ -118,6 +118,10 @@ _mm256_cvtph_ps(__m128i __a) } #endif /* __AVX2__ */ +#if !defined(_MSC_VER) || __has_feature(modules) || defined(__VPCLMULQDQ__) +#include <vpclmulqdqintrin.h> +#endif + #if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI__) #include <bmiintrin.h> #endif @@ -146,6 +150,10 @@ _mm256_cvtph_ps(__m128i __a) #include <avx512bwintrin.h> #endif +#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512BITALG__) +#include <avx512bitalgintrin.h> +#endif + #if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512CD__) #include <avx512cdintrin.h> #endif @@ -154,11 +162,30 @@ _mm256_cvtph_ps(__m128i __a) #include <avx512vpopcntdqintrin.h> #endif +#if !defined(_MSC_VER) || __has_feature(modules) || \ + (defined(__AVX512VL__) && defined(__AVX512VPOPCNTDQ__)) +#include <avx512vpopcntdqvlintrin.h> +#endif + +#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VNNI__) +#include <avx512vnniintrin.h> +#endif + +#if !defined(_MSC_VER) || __has_feature(modules) || \ + (defined(__AVX512VL__) && defined(__AVX512VNNI__)) +#include <avx512vlvnniintrin.h> +#endif + #if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512DQ__) #include <avx512dqintrin.h> #endif #if !defined(_MSC_VER) || __has_feature(modules) || \ + (defined(__AVX512VL__) && defined(__AVX512BITALG__)) +#include <avx512vlbitalgintrin.h> +#endif + +#if !defined(_MSC_VER) || __has_feature(modules) || \ (defined(__AVX512VL__) && defined(__AVX512BW__)) #include <avx512vlbwintrin.h> #endif @@ -195,6 +222,15 @@ _mm256_cvtph_ps(__m128i __a) #include <avx512vbmivlintrin.h> #endif +#if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512VBMI2__) +#include <avx512vbmi2intrin.h> +#endif + +#if !defined(_MSC_VER) || __has_feature(modules) || \ + (defined(__AVX512VBMI2__) && defined(__AVX512VL__)) +#include <avx512vlvbmi2intrin.h> +#endif + #if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX512PF__) #include <avx512pfintrin.h> #endif @@ -203,6 +239,14 @@ _mm256_cvtph_ps(__m128i __a) #include <pkuintrin.h> #endif +#if !defined(_MSC_VER) || __has_feature(modules) || defined(__VAES__) +#include <vaesintrin.h> +#endif + +#if !defined(_MSC_VER) || __has_feature(modules) || defined(__GFNI__) +#include <gfniintrin.h> +#endif + #if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDRND__) static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd"))) _rdrand16_step(unsigned short *__p) @@ -319,6 +363,10 @@ _writegsbase_u64(unsigned long long __V) #include <xsavesintrin.h> #endif +#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SHSTK__) +#include <cetintrin.h> +#endif + /* Some intrinsics inside adxintrin.h are available only on processors with ADX, * whereas others are also available at all times. 
*/ #include <adxintrin.h> diff --git a/c_headers/opencl-c.h b/c_headers/opencl-c.h index 35fb0a82bc..ce204b04c0 100644 --- a/c_headers/opencl-c.h +++ b/c_headers/opencl-c.h @@ -15886,6 +15886,313 @@ double __ovld __conv sub_group_scan_inclusive_max(double x); #endif //cl_khr_subgroups cl_intel_subgroups +#if defined(cl_intel_subgroups) +// Intel-Specific Sub Group Functions +float __ovld __conv intel_sub_group_shuffle( float x, uint c ); +float2 __ovld __conv intel_sub_group_shuffle( float2 x, uint c ); +float3 __ovld __conv intel_sub_group_shuffle( float3 x, uint c ); +float4 __ovld __conv intel_sub_group_shuffle( float4 x, uint c ); +float8 __ovld __conv intel_sub_group_shuffle( float8 x, uint c ); +float16 __ovld __conv intel_sub_group_shuffle( float16 x, uint c ); + +int __ovld __conv intel_sub_group_shuffle( int x, uint c ); +int2 __ovld __conv intel_sub_group_shuffle( int2 x, uint c ); +int3 __ovld __conv intel_sub_group_shuffle( int3 x, uint c ); +int4 __ovld __conv intel_sub_group_shuffle( int4 x, uint c ); +int8 __ovld __conv intel_sub_group_shuffle( int8 x, uint c ); +int16 __ovld __conv intel_sub_group_shuffle( int16 x, uint c ); + +uint __ovld __conv intel_sub_group_shuffle( uint x, uint c ); +uint2 __ovld __conv intel_sub_group_shuffle( uint2 x, uint c ); +uint3 __ovld __conv intel_sub_group_shuffle( uint3 x, uint c ); +uint4 __ovld __conv intel_sub_group_shuffle( uint4 x, uint c ); +uint8 __ovld __conv intel_sub_group_shuffle( uint8 x, uint c ); +uint16 __ovld __conv intel_sub_group_shuffle( uint16 x, uint c ); + +long __ovld __conv intel_sub_group_shuffle( long x, uint c ); +ulong __ovld __conv intel_sub_group_shuffle( ulong x, uint c ); + +float __ovld __conv intel_sub_group_shuffle_down( float cur, float next, uint c ); +float2 __ovld __conv intel_sub_group_shuffle_down( float2 cur, float2 next, uint c ); +float3 __ovld __conv intel_sub_group_shuffle_down( float3 cur, float3 next, uint c ); +float4 __ovld __conv intel_sub_group_shuffle_down( float4 cur, float4 next, uint c ); +float8 __ovld __conv intel_sub_group_shuffle_down( float8 cur, float8 next, uint c ); +float16 __ovld __conv intel_sub_group_shuffle_down( float16 cur, float16 next, uint c ); + +int __ovld __conv intel_sub_group_shuffle_down( int cur, int next, uint c ); +int2 __ovld __conv intel_sub_group_shuffle_down( int2 cur, int2 next, uint c ); +int3 __ovld __conv intel_sub_group_shuffle_down( int3 cur, int3 next, uint c ); +int4 __ovld __conv intel_sub_group_shuffle_down( int4 cur, int4 next, uint c ); +int8 __ovld __conv intel_sub_group_shuffle_down( int8 cur, int8 next, uint c ); +int16 __ovld __conv intel_sub_group_shuffle_down( int16 cur, int16 next, uint c ); + +uint __ovld __conv intel_sub_group_shuffle_down( uint cur, uint next, uint c ); +uint2 __ovld __conv intel_sub_group_shuffle_down( uint2 cur, uint2 next, uint c ); +uint3 __ovld __conv intel_sub_group_shuffle_down( uint3 cur, uint3 next, uint c ); +uint4 __ovld __conv intel_sub_group_shuffle_down( uint4 cur, uint4 next, uint c ); +uint8 __ovld __conv intel_sub_group_shuffle_down( uint8 cur, uint8 next, uint c ); +uint16 __ovld __conv intel_sub_group_shuffle_down( uint16 cur, uint16 next, uint c ); + +long __ovld __conv intel_sub_group_shuffle_down( long prev, long cur, uint c ); +ulong __ovld __conv intel_sub_group_shuffle_down( ulong prev, ulong cur, uint c ); + +float __ovld __conv intel_sub_group_shuffle_up( float prev, float cur, uint c ); +float2 __ovld __conv intel_sub_group_shuffle_up( float2 prev, float2 cur, uint c ); +float3 __ovld 
__conv intel_sub_group_shuffle_up( float3 prev, float3 cur, uint c ); +float4 __ovld __conv intel_sub_group_shuffle_up( float4 prev, float4 cur, uint c ); +float8 __ovld __conv intel_sub_group_shuffle_up( float8 prev, float8 cur, uint c ); +float16 __ovld __conv intel_sub_group_shuffle_up( float16 prev, float16 cur, uint c ); + +int __ovld __conv intel_sub_group_shuffle_up( int prev, int cur, uint c ); +int2 __ovld __conv intel_sub_group_shuffle_up( int2 prev, int2 cur, uint c ); +int3 __ovld __conv intel_sub_group_shuffle_up( int3 prev, int3 cur, uint c ); +int4 __ovld __conv intel_sub_group_shuffle_up( int4 prev, int4 cur, uint c ); +int8 __ovld __conv intel_sub_group_shuffle_up( int8 prev, int8 cur, uint c ); +int16 __ovld __conv intel_sub_group_shuffle_up( int16 prev, int16 cur, uint c ); + +uint __ovld __conv intel_sub_group_shuffle_up( uint prev, uint cur, uint c ); +uint2 __ovld __conv intel_sub_group_shuffle_up( uint2 prev, uint2 cur, uint c ); +uint3 __ovld __conv intel_sub_group_shuffle_up( uint3 prev, uint3 cur, uint c ); +uint4 __ovld __conv intel_sub_group_shuffle_up( uint4 prev, uint4 cur, uint c ); +uint8 __ovld __conv intel_sub_group_shuffle_up( uint8 prev, uint8 cur, uint c ); +uint16 __ovld __conv intel_sub_group_shuffle_up( uint16 prev, uint16 cur, uint c ); + +long __ovld __conv intel_sub_group_shuffle_up( long prev, long cur, uint c ); +ulong __ovld __conv intel_sub_group_shuffle_up( ulong prev, ulong cur, uint c ); + +float __ovld __conv intel_sub_group_shuffle_xor( float x, uint c ); +float2 __ovld __conv intel_sub_group_shuffle_xor( float2 x, uint c ); +float3 __ovld __conv intel_sub_group_shuffle_xor( float3 x, uint c ); +float4 __ovld __conv intel_sub_group_shuffle_xor( float4 x, uint c ); +float8 __ovld __conv intel_sub_group_shuffle_xor( float8 x, uint c ); +float16 __ovld __conv intel_sub_group_shuffle_xor( float16 x, uint c ); + +int __ovld __conv intel_sub_group_shuffle_xor( int x, uint c ); +int2 __ovld __conv intel_sub_group_shuffle_xor( int2 x, uint c ); +int3 __ovld __conv intel_sub_group_shuffle_xor( int3 x, uint c ); +int4 __ovld __conv intel_sub_group_shuffle_xor( int4 x, uint c ); +int8 __ovld __conv intel_sub_group_shuffle_xor( int8 x, uint c ); +int16 __ovld __conv intel_sub_group_shuffle_xor( int16 x, uint c ); + +uint __ovld __conv intel_sub_group_shuffle_xor( uint x, uint c ); +uint2 __ovld __conv intel_sub_group_shuffle_xor( uint2 x, uint c ); +uint3 __ovld __conv intel_sub_group_shuffle_xor( uint3 x, uint c ); +uint4 __ovld __conv intel_sub_group_shuffle_xor( uint4 x, uint c ); +uint8 __ovld __conv intel_sub_group_shuffle_xor( uint8 x, uint c ); +uint16 __ovld __conv intel_sub_group_shuffle_xor( uint16 x, uint c ); + +long __ovld __conv intel_sub_group_shuffle_xor( long x, uint c ); +ulong __ovld __conv intel_sub_group_shuffle_xor( ulong x, uint c ); + +uint __ovld __conv intel_sub_group_block_read( read_only image2d_t image, int2 coord ); +uint2 __ovld __conv intel_sub_group_block_read2( read_only image2d_t image, int2 coord ); +uint4 __ovld __conv intel_sub_group_block_read4( read_only image2d_t image, int2 coord ); +uint8 __ovld __conv intel_sub_group_block_read8( read_only image2d_t image, int2 coord ); + +#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) +uint __ovld __conv intel_sub_group_block_read(read_write image2d_t image, int2 coord); +uint2 __ovld __conv intel_sub_group_block_read2(read_write image2d_t image, int2 coord); +uint4 __ovld __conv intel_sub_group_block_read4(read_write image2d_t image, int2 coord); +uint8 __ovld __conv 
intel_sub_group_block_read8(read_write image2d_t image, int2 coord); +#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) + +uint __ovld __conv intel_sub_group_block_read( const __global uint* p ); +uint2 __ovld __conv intel_sub_group_block_read2( const __global uint* p ); +uint4 __ovld __conv intel_sub_group_block_read4( const __global uint* p ); +uint8 __ovld __conv intel_sub_group_block_read8( const __global uint* p ); + +void __ovld __conv intel_sub_group_block_write(write_only image2d_t image, int2 coord, uint data); +void __ovld __conv intel_sub_group_block_write2(write_only image2d_t image, int2 coord, uint2 data); +void __ovld __conv intel_sub_group_block_write4(write_only image2d_t image, int2 coord, uint4 data); +void __ovld __conv intel_sub_group_block_write8(write_only image2d_t image, int2 coord, uint8 data); + +#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) +void __ovld __conv intel_sub_group_block_write(read_write image2d_t image, int2 coord, uint data); +void __ovld __conv intel_sub_group_block_write2(read_write image2d_t image, int2 coord, uint2 data); +void __ovld __conv intel_sub_group_block_write4(read_write image2d_t image, int2 coord, uint4 data); +void __ovld __conv intel_sub_group_block_write8(read_write image2d_t image, int2 coord, uint8 data); +#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) + +void __ovld __conv intel_sub_group_block_write( __global uint* p, uint data ); +void __ovld __conv intel_sub_group_block_write2( __global uint* p, uint2 data ); +void __ovld __conv intel_sub_group_block_write4( __global uint* p, uint4 data ); +void __ovld __conv intel_sub_group_block_write8( __global uint* p, uint8 data ); + +#ifdef cl_khr_fp16 +half __ovld __conv intel_sub_group_shuffle( half x, uint c ); +half __ovld __conv intel_sub_group_shuffle_down( half prev, half cur, uint c ); +half __ovld __conv intel_sub_group_shuffle_up( half prev, half cur, uint c ); +half __ovld __conv intel_sub_group_shuffle_xor( half x, uint c ); +#endif + +#if defined(cl_khr_fp64) +double __ovld __conv intel_sub_group_shuffle( double x, uint c ); +double __ovld __conv intel_sub_group_shuffle_down( double prev, double cur, uint c ); +double __ovld __conv intel_sub_group_shuffle_up( double prev, double cur, uint c ); +double __ovld __conv intel_sub_group_shuffle_xor( double x, uint c ); +#endif + +#endif //cl_intel_subgroups + +#if defined(cl_intel_subgroups_short) +short __ovld __conv intel_sub_group_broadcast( short x, uint sub_group_local_id ); +short2 __ovld __conv intel_sub_group_broadcast( short2 x, uint sub_group_local_id ); +short3 __ovld __conv intel_sub_group_broadcast( short3 x, uint sub_group_local_id ); +short4 __ovld __conv intel_sub_group_broadcast( short4 x, uint sub_group_local_id ); +short8 __ovld __conv intel_sub_group_broadcast( short8 x, uint sub_group_local_id ); + +ushort __ovld __conv intel_sub_group_broadcast( ushort x, uint sub_group_local_id ); +ushort2 __ovld __conv intel_sub_group_broadcast( ushort2 x, uint sub_group_local_id ); +ushort3 __ovld __conv intel_sub_group_broadcast( ushort3 x, uint sub_group_local_id ); +ushort4 __ovld __conv intel_sub_group_broadcast( ushort4 x, uint sub_group_local_id ); +ushort8 __ovld __conv intel_sub_group_broadcast( ushort8 x, uint sub_group_local_id ); + +short __ovld __conv intel_sub_group_shuffle( short x, uint c ); +short2 __ovld __conv intel_sub_group_shuffle( short2 x, uint c ); +short3 __ovld __conv intel_sub_group_shuffle( short3 x, uint c ); +short4 __ovld __conv intel_sub_group_shuffle( short4 x, uint c ); +short8 
__ovld __conv intel_sub_group_shuffle( short8 x, uint c ); +short16 __ovld __conv intel_sub_group_shuffle( short16 x, uint c); + +ushort __ovld __conv intel_sub_group_shuffle( ushort x, uint c ); +ushort2 __ovld __conv intel_sub_group_shuffle( ushort2 x, uint c ); +ushort3 __ovld __conv intel_sub_group_shuffle( ushort3 x, uint c ); +ushort4 __ovld __conv intel_sub_group_shuffle( ushort4 x, uint c ); +ushort8 __ovld __conv intel_sub_group_shuffle( ushort8 x, uint c ); +ushort16 __ovld __conv intel_sub_group_shuffle( ushort16 x, uint c ); + +short __ovld __conv intel_sub_group_shuffle_down( short cur, short next, uint c ); +short2 __ovld __conv intel_sub_group_shuffle_down( short2 cur, short2 next, uint c ); +short3 __ovld __conv intel_sub_group_shuffle_down( short3 cur, short3 next, uint c ); +short4 __ovld __conv intel_sub_group_shuffle_down( short4 cur, short4 next, uint c ); +short8 __ovld __conv intel_sub_group_shuffle_down( short8 cur, short8 next, uint c ); +short16 __ovld __conv intel_sub_group_shuffle_down( short16 cur, short16 next, uint c ); + +ushort __ovld __conv intel_sub_group_shuffle_down( ushort cur, ushort next, uint c ); +ushort2 __ovld __conv intel_sub_group_shuffle_down( ushort2 cur, ushort2 next, uint c ); +ushort3 __ovld __conv intel_sub_group_shuffle_down( ushort3 cur, ushort3 next, uint c ); +ushort4 __ovld __conv intel_sub_group_shuffle_down( ushort4 cur, ushort4 next, uint c ); +ushort8 __ovld __conv intel_sub_group_shuffle_down( ushort8 cur, ushort8 next, uint c ); +ushort16 __ovld __conv intel_sub_group_shuffle_down( ushort16 cur, ushort16 next, uint c ); + +short __ovld __conv intel_sub_group_shuffle_up( short cur, short next, uint c ); +short2 __ovld __conv intel_sub_group_shuffle_up( short2 cur, short2 next, uint c ); +short3 __ovld __conv intel_sub_group_shuffle_up( short3 cur, short3 next, uint c ); +short4 __ovld __conv intel_sub_group_shuffle_up( short4 cur, short4 next, uint c ); +short8 __ovld __conv intel_sub_group_shuffle_up( short8 cur, short8 next, uint c ); +short16 __ovld __conv intel_sub_group_shuffle_up( short16 cur, short16 next, uint c ); + +ushort __ovld __conv intel_sub_group_shuffle_up( ushort cur, ushort next, uint c ); +ushort2 __ovld __conv intel_sub_group_shuffle_up( ushort2 cur, ushort2 next, uint c ); +ushort3 __ovld __conv intel_sub_group_shuffle_up( ushort3 cur, ushort3 next, uint c ); +ushort4 __ovld __conv intel_sub_group_shuffle_up( ushort4 cur, ushort4 next, uint c ); +ushort8 __ovld __conv intel_sub_group_shuffle_up( ushort8 cur, ushort8 next, uint c ); +ushort16 __ovld __conv intel_sub_group_shuffle_up( ushort16 cur, ushort16 next, uint c ); + +short __ovld __conv intel_sub_group_shuffle_xor( short x, uint c ); +short2 __ovld __conv intel_sub_group_shuffle_xor( short2 x, uint c ); +short3 __ovld __conv intel_sub_group_shuffle_xor( short3 x, uint c ); +short4 __ovld __conv intel_sub_group_shuffle_xor( short4 x, uint c ); +short8 __ovld __conv intel_sub_group_shuffle_xor( short8 x, uint c ); +short16 __ovld __conv intel_sub_group_shuffle_xor( short16 x, uint c ); + +ushort __ovld __conv intel_sub_group_shuffle_xor( ushort x, uint c ); +ushort2 __ovld __conv intel_sub_group_shuffle_xor( ushort2 x, uint c ); +ushort3 __ovld __conv intel_sub_group_shuffle_xor( ushort3 x, uint c ); +ushort4 __ovld __conv intel_sub_group_shuffle_xor( ushort4 x, uint c ); +ushort8 __ovld __conv intel_sub_group_shuffle_xor( ushort8 x, uint c ); +ushort16 __ovld __conv intel_sub_group_shuffle_xor( ushort16 x, uint c ); + +short __ovld __conv 
intel_sub_group_reduce_add( short x ); +ushort __ovld __conv intel_sub_group_reduce_add( ushort x ); +short __ovld __conv intel_sub_group_reduce_min( short x ); +ushort __ovld __conv intel_sub_group_reduce_min( ushort x ); +short __ovld __conv intel_sub_group_reduce_max( short x ); +ushort __ovld __conv intel_sub_group_reduce_max( ushort x ); + +short __ovld __conv intel_sub_group_scan_exclusive_add( short x ); +ushort __ovld __conv intel_sub_group_scan_exclusive_add( ushort x ); +short __ovld __conv intel_sub_group_scan_exclusive_min( short x ); +ushort __ovld __conv intel_sub_group_scan_exclusive_min( ushort x ); +short __ovld __conv intel_sub_group_scan_exclusive_max( short x ); +ushort __ovld __conv intel_sub_group_scan_exclusive_max( ushort x ); + +short __ovld __conv intel_sub_group_scan_inclusive_add( short x ); +ushort __ovld __conv intel_sub_group_scan_inclusive_add( ushort x ); +short __ovld __conv intel_sub_group_scan_inclusive_min( short x ); +ushort __ovld __conv intel_sub_group_scan_inclusive_min( ushort x ); +short __ovld __conv intel_sub_group_scan_inclusive_max( short x ); +ushort __ovld __conv intel_sub_group_scan_inclusive_max( ushort x ); + +uint __ovld __conv intel_sub_group_block_read_ui( read_only image2d_t image, int2 byte_coord ); +uint2 __ovld __conv intel_sub_group_block_read_ui2( read_only image2d_t image, int2 byte_coord ); +uint4 __ovld __conv intel_sub_group_block_read_ui4( read_only image2d_t image, int2 byte_coord ); +uint8 __ovld __conv intel_sub_group_block_read_ui8( read_only image2d_t image, int2 byte_coord ); + +#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) +uint __ovld __conv intel_sub_group_block_read_ui( read_write image2d_t image, int2 byte_coord ); +uint2 __ovld __conv intel_sub_group_block_read_ui2( read_write image2d_t image, int2 byte_coord ); +uint4 __ovld __conv intel_sub_group_block_read_ui4( read_write image2d_t image, int2 byte_coord ); +uint8 __ovld __conv intel_sub_group_block_read_ui8( read_write image2d_t image, int2 byte_coord ); +#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) + +uint __ovld __conv intel_sub_group_block_read_ui( const __global uint* p ); +uint2 __ovld __conv intel_sub_group_block_read_ui2( const __global uint* p ); +uint4 __ovld __conv intel_sub_group_block_read_ui4( const __global uint* p ); +uint8 __ovld __conv intel_sub_group_block_read_ui8( const __global uint* p ); + +void __ovld __conv intel_sub_group_block_write_ui( read_only image2d_t image, int2 byte_coord, uint data ); +void __ovld __conv intel_sub_group_block_write_ui2( read_only image2d_t image, int2 byte_coord, uint2 data ); +void __ovld __conv intel_sub_group_block_write_ui4( read_only image2d_t image, int2 byte_coord, uint4 data ); +void __ovld __conv intel_sub_group_block_write_ui8( read_only image2d_t image, int2 byte_coord, uint8 data ); + +#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) +void __ovld __conv intel_sub_group_block_write_ui( read_write image2d_t image, int2 byte_coord, uint data ); +void __ovld __conv intel_sub_group_block_write_ui2( read_write image2d_t image, int2 byte_coord, uint2 data ); +void __ovld __conv intel_sub_group_block_write_ui4( read_write image2d_t image, int2 byte_coord, uint4 data ); +void __ovld __conv intel_sub_group_block_write_ui8( read_write image2d_t image, int2 byte_coord, uint8 data ); +#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) + +void __ovld __conv intel_sub_group_block_write_ui( __global uint* p, uint data ); +void __ovld __conv intel_sub_group_block_write_ui2( __global uint* p, uint2 data ); +void 
__ovld __conv intel_sub_group_block_write_ui4( __global uint* p, uint4 data ); +void __ovld __conv intel_sub_group_block_write_ui8( __global uint* p, uint8 data ); + +ushort __ovld __conv intel_sub_group_block_read_us( read_only image2d_t image, int2 coord ); +ushort2 __ovld __conv intel_sub_group_block_read_us2( read_only image2d_t image, int2 coord ); +ushort4 __ovld __conv intel_sub_group_block_read_us4( read_only image2d_t image, int2 coord ); +ushort8 __ovld __conv intel_sub_group_block_read_us8( read_only image2d_t image, int2 coord ); + +#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) +ushort __ovld __conv intel_sub_group_block_read_us(read_write image2d_t image, int2 coord); +ushort2 __ovld __conv intel_sub_group_block_read_us2(read_write image2d_t image, int2 coord); +ushort4 __ovld __conv intel_sub_group_block_read_us4(read_write image2d_t image, int2 coord); +ushort8 __ovld __conv intel_sub_group_block_read_us8(read_write image2d_t image, int2 coord); +#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) + +ushort __ovld __conv intel_sub_group_block_read_us( const __global ushort* p ); +ushort2 __ovld __conv intel_sub_group_block_read_us2( const __global ushort* p ); +ushort4 __ovld __conv intel_sub_group_block_read_us4( const __global ushort* p ); +ushort8 __ovld __conv intel_sub_group_block_read_us8( const __global ushort* p ); + +void __ovld __conv intel_sub_group_block_write_us(write_only image2d_t image, int2 coord, ushort data); +void __ovld __conv intel_sub_group_block_write_us2(write_only image2d_t image, int2 coord, ushort2 data); +void __ovld __conv intel_sub_group_block_write_us4(write_only image2d_t image, int2 coord, ushort4 data); +void __ovld __conv intel_sub_group_block_write_us8(write_only image2d_t image, int2 coord, ushort8 data); + +#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) +void __ovld __conv intel_sub_group_block_write_us(read_write image2d_t image, int2 coord, ushort data); +void __ovld __conv intel_sub_group_block_write_us2(read_write image2d_t image, int2 coord, ushort2 data); +void __ovld __conv intel_sub_group_block_write_us4(read_write image2d_t image, int2 coord, ushort4 data); +void __ovld __conv intel_sub_group_block_write_us8(read_write image2d_t image, int2 coord, ushort8 data); +#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) + +void __ovld __conv intel_sub_group_block_write_us( __global ushort* p, ushort data ); +void __ovld __conv intel_sub_group_block_write_us2( __global ushort* p, ushort2 data ); +void __ovld __conv intel_sub_group_block_write_us4( __global ushort* p, ushort4 data ); +void __ovld __conv intel_sub_group_block_write_us8( __global ushort* p, ushort8 data ); +#endif // cl_intel_subgroups_short + #ifdef cl_amd_media_ops uint __ovld amd_bitalign(uint a, uint b, uint c); uint2 __ovld amd_bitalign(uint2 a, uint2 b, uint2 c); diff --git a/c_headers/pmmintrin.h b/c_headers/pmmintrin.h index 559ece2e39..7ec08a1bcb 100644 --- a/c_headers/pmmintrin.h +++ b/c_headers/pmmintrin.h @@ -115,8 +115,8 @@ _mm_hsub_ps(__m128 __a, __m128 __b) return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b); } -/// \brief Moves and duplicates high-order (odd-indexed) values from a 128-bit -/// vector of [4 x float] to float values stored in a 128-bit vector of +/// \brief Moves and duplicates odd-indexed values from a 128-bit vector +/// of [4 x float] to float values stored in a 128-bit vector of /// [4 x float]. 
/// /// \headerfile <x86intrin.h> @@ -137,7 +137,7 @@ _mm_movehdup_ps(__m128 __a) return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3); } -/// \brief Duplicates low-order (even-indexed) values from a 128-bit vector of +/// \brief Duplicates even-indexed values from a 128-bit vector of /// [4 x float] to float values stored in a 128-bit vector of [4 x float]. /// /// \headerfile <x86intrin.h> diff --git a/c_headers/smmintrin.h b/c_headers/smmintrin.h index c2fa5a452b..e02775cea3 100644 --- a/c_headers/smmintrin.h +++ b/c_headers/smmintrin.h @@ -648,7 +648,7 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2) /// input vectors are used as an input for dot product; otherwise that input /// is treated as zero. Bits [1:0] determine which elements of the result /// will receive a copy of the final dot product, with bit [0] corresponding -/// to the lowest element and bit [3] corresponding to the highest element of +/// to the lowest element and bit [1] corresponding to the highest element of /// each [2 x double] vector. If a bit is set, the dot product is returned in /// the corresponding element; otherwise that element is set to zero. #define _mm_dp_pd(X, Y, M) __extension__ ({\ @@ -866,8 +866,8 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2) /// 11: Copies the selected bits from \a Y to result bits [127:96]. \n /// Bits[3:0]: If any of these bits are set, the corresponding result /// element is cleared. -/// \returns A 128-bit vector of [4 x float] containing the copied single- -/// precision floating point elements from the operands. +/// \returns A 128-bit vector of [4 x float] containing the copied +/// single-precision floating point elements from the operands. #define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N)) /// \brief Extracts a 32-bit integer from a 128-bit vector of [4 x float] and diff --git a/c_headers/stdbool.h b/c_headers/stdbool.h index 0467893f34..5cb66b55d0 100644 --- a/c_headers/stdbool.h +++ b/c_headers/stdbool.h @@ -32,12 +32,15 @@ #define true 1 #define false 0 #elif defined(__GNUC__) && !defined(__STRICT_ANSI__) -/* Define _Bool, bool, false, true as a GNU extension. */ +/* Define _Bool as a GNU extension. */ #define _Bool bool +#if __cplusplus < 201103L +/* For C++98, define bool, false, true as a GNU extension. */ #define bool bool #define false false #define true true #endif +#endif #define __bool_true_false_are_defined 1 diff --git a/c_headers/vaesintrin.h b/c_headers/vaesintrin.h new file mode 100644 index 0000000000..efbb8a5652 --- /dev/null +++ b/c_headers/vaesintrin.h @@ -0,0 +1,98 @@ +/*===------------------ vaesintrin.h - VAES intrinsics ---------------------=== + * + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use <vaesintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef __VAESINTRIN_H +#define __VAESINTRIN_H + +/* Default attributes for YMM forms. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("vaes"))) + +/* Default attributes for ZMM forms. */ +#define __DEFAULT_FN_ATTRS_F __attribute__((__always_inline__, __nodebug__, __target__("avx512f,vaes"))) + + +static __inline__ __m256i __DEFAULT_FN_ATTRS + _mm256_aesenc_epi128(__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_aesenc256((__v4di) __A, + (__v4di) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS_F + _mm512_aesenc_epi128(__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_aesenc512((__v8di) __A, + (__v8di) __B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS + _mm256_aesdec_epi128(__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_aesdec256((__v4di) __A, + (__v4di) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS_F + _mm512_aesdec_epi128(__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_aesdec512((__v8di) __A, + (__v8di) __B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS + _mm256_aesenclast_epi128(__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_aesenclast256((__v4di) __A, + (__v4di) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS_F + _mm512_aesenclast_epi128(__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_aesenclast512((__v8di) __A, + (__v8di) __B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS + _mm256_aesdeclast_epi128(__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_aesdeclast256((__v4di) __A, + (__v4di) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS_F + _mm512_aesdeclast_epi128(__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_aesdeclast512((__v8di) __A, + (__v8di) __B); +} + + +#undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS_F + +#endif diff --git a/c_headers/vpclmulqdqintrin.h b/c_headers/vpclmulqdqintrin.h new file mode 100644 index 0000000000..21cda22210 --- /dev/null +++ b/c_headers/vpclmulqdqintrin.h @@ -0,0 +1,42 @@ +/*===------------ vpclmulqdqintrin.h - VPCLMULQDQ intrinsics ---------------=== + * + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use <vpclmulqdqintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef __VPCLMULQDQINTRIN_H +#define __VPCLMULQDQINTRIN_H + +#define _mm256_clmulepi64_epi128(A, B, I) __extension__ ({ \ + (__m256i)__builtin_ia32_pclmulqdq256((__v4di)(__m256i)(A), \ + (__v4di)(__m256i)(B), \ + (char)(I)); }) + +#define _mm512_clmulepi64_epi128(A, B, I) __extension__ ({ \ + (__m512i)__builtin_ia32_pclmulqdq512((__v8di)(__m512i)(A), \ + (__v8di)(__m512i)(B), \ + (char)(I)); }) + +#endif // __VPCLMULQDQINTRIN_H + diff --git a/c_headers/xmmintrin.h b/c_headers/xmmintrin.h index bbc2117b4e..279c0275d9 100644 --- a/c_headers/xmmintrin.h +++ b/c_headers/xmmintrin.h @@ -2035,9 +2035,11 @@ _mm_storer_ps(float *__p, __m128 __a) _mm_store_ps(__p, __a); } -#define _MM_HINT_T0 3 -#define _MM_HINT_T1 2 -#define _MM_HINT_T2 1 +#define _MM_HINT_ET0 7 +#define _MM_HINT_ET1 6 +#define _MM_HINT_T0 3 +#define _MM_HINT_T1 2 +#define _MM_HINT_T2 1 #define _MM_HINT_NTA 0 #ifndef _MSC_VER @@ -2068,7 +2070,8 @@ _mm_storer_ps(float *__p, __m128 __a) /// be generated. \n /// _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will /// be generated. -#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), 0, (sel))) +#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)(a), \ + ((sel) >> 2) & 1, (sel) & 0x3)) #endif /// \brief Stores a 64-bit integer in the specified aligned memory location. To |
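The xmmintrin.h hunk at the end replaces the fixed read-only _mm_prefetch with a macro that decodes the hint value itself: bit 2 of sel becomes __builtin_prefetch's read/write argument and bits [1:0] its locality, which is how the newly added _MM_HINT_ET0 and _MM_HINT_ET1 values can request a prefetch with write intent. A small sketch of the decoding, using local copies of the hint values from the diff (illustrative only, not part of the header):

#include <stdio.h>

/* Local copies of the hint encodings added in the diff:
   bit 2 = write intent, bits [1:0] = temporal locality. */
#define MY_HINT_ET0 7
#define MY_HINT_ET1 6
#define MY_HINT_T0  3
#define MY_HINT_NTA 0

static void show(const char *name, int sel) {
    int rw       = (sel >> 2) & 1;  /* second argument to __builtin_prefetch */
    int locality = sel & 0x3;       /* third argument to __builtin_prefetch  */
    printf("%-12s rw=%d locality=%d\n", name, rw, locality);
}

int main(void) {
    show("_MM_HINT_T0",  MY_HINT_T0);   /* rw=0 locality=3: read prefetch, all cache levels */
    show("_MM_HINT_NTA", MY_HINT_NTA);  /* rw=0 locality=0: non-temporal read prefetch      */
    show("_MM_HINT_ET0", MY_HINT_ET0);  /* rw=1 locality=3: prefetch with write intent      */
    show("_MM_HINT_ET1", MY_HINT_ET1);  /* rw=1 locality=2: prefetch with write intent      */
    return 0;
}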
