87 files changed, 19181 insertions, 16044 deletions
diff --git a/c_headers/__clang_cuda_builtin_vars.h b/c_headers/__clang_cuda_builtin_vars.h
index 6f5eb9c78d..290c4b2984 100644
--- a/c_headers/__clang_cuda_builtin_vars.h
+++ b/c_headers/__clang_cuda_builtin_vars.h
@@ -54,7 +54,7 @@ struct dim3;
 #define __DELETE
 #endif

-// Make sure nobody can create instances of the special varible types. nvcc
+// Make sure nobody can create instances of the special variable types. nvcc
 // also disallows taking address of special variables, so we disable address-of
 // operator as well.
 #define __CUDA_DISALLOW_BUILTINVAR_ACCESS(TypeName) \
diff --git a/c_headers/__clang_cuda_device_functions.h b/c_headers/__clang_cuda_device_functions.h
new file mode 100644
index 0000000000..67bbc68b16
--- /dev/null
+++ b/c_headers/__clang_cuda_device_functions.h
@@ -0,0 +1,1768 @@
+/*===---- __clang_cuda_device_functions.h - CUDA runtime support -----------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_CUDA_DEVICE_FUNCTIONS_H__
+#define __CLANG_CUDA_DEVICE_FUNCTIONS_H__
+
+#if CUDA_VERSION < 9000
+#error This file is intended to be used with CUDA-9+ only.
+#endif
+
+// __DEVICE__ is a helper macro with a common set of attributes for the wrappers
+// we implement in this file. We need static in order to avoid emitting unused
+// functions and __forceinline__ helps inlining these wrappers at -O1.
+#pragma push_macro("__DEVICE__")
+#define __DEVICE__ static __device__ __forceinline__
+
+// libdevice provides fast low-precision and slow full-precision implementations
+// for some functions. Which one gets selected depends on
+// __CLANG_CUDA_APPROX_TRANSCENDENTALS__ which gets defined by clang if
+// -ffast-math or -fcuda-approx-transcendentals are in effect.
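To make that selection concrete, here is an illustrative expansion (an editorial sketch, not part of the patch): the __FAST_OR_SLOW macro defined just below picks its first argument when approximate transcendentals are enabled and its second otherwise, so a wrapper such as the cosf one that appears later in this file resolves to either the fast or the full-precision libdevice entry point at preprocessing time.

// Editorial illustration only; mirrors the cosf wrapper defined further down.
__DEVICE__ float cosf(float __a) {
  // With -ffast-math / -fcuda-approx-transcendentals this becomes
  // __nv_fast_cosf(__a); otherwise it becomes __nv_cosf(__a).
  return __FAST_OR_SLOW(__nv_fast_cosf, __nv_cosf)(__a);
}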
+#pragma push_macro("__FAST_OR_SLOW") +#if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__) +#define __FAST_OR_SLOW(fast, slow) fast +#else +#define __FAST_OR_SLOW(fast, slow) slow +#endif + +__DEVICE__ int __all(int __a) { return __nvvm_vote_all(__a); } +__DEVICE__ int __any(int __a) { return __nvvm_vote_any(__a); } +__DEVICE__ unsigned int __ballot(int __a) { return __nvvm_vote_ballot(__a); } +__DEVICE__ unsigned int __brev(unsigned int __a) { return __nv_brev(__a); } +__DEVICE__ unsigned long long __brevll(unsigned long long __a) { + return __nv_brevll(__a); +} +__DEVICE__ void __brkpt() { asm volatile("brkpt;"); } +__DEVICE__ void __brkpt(int __a) { __brkpt(); } +__DEVICE__ unsigned int __byte_perm(unsigned int __a, unsigned int __b, + unsigned int __c) { + return __nv_byte_perm(__a, __b, __c); +} +__DEVICE__ int __clz(int __a) { return __nv_clz(__a); } +__DEVICE__ int __clzll(long long __a) { return __nv_clzll(__a); } +__DEVICE__ float __cosf(float __a) { return __nv_fast_cosf(__a); } +__DEVICE__ double __dAtomicAdd(double *__p, double __v) { + return __nvvm_atom_add_gen_d(__p, __v); +} +__DEVICE__ double __dAtomicAdd_block(double *__p, double __v) { + return __nvvm_atom_cta_add_gen_d(__p, __v); +} +__DEVICE__ double __dAtomicAdd_system(double *__p, double __v) { + return __nvvm_atom_sys_add_gen_d(__p, __v); +} +__DEVICE__ double __dadd_rd(double __a, double __b) { + return __nv_dadd_rd(__a, __b); +} +__DEVICE__ double __dadd_rn(double __a, double __b) { + return __nv_dadd_rn(__a, __b); +} +__DEVICE__ double __dadd_ru(double __a, double __b) { + return __nv_dadd_ru(__a, __b); +} +__DEVICE__ double __dadd_rz(double __a, double __b) { + return __nv_dadd_rz(__a, __b); +} +__DEVICE__ double __ddiv_rd(double __a, double __b) { + return __nv_ddiv_rd(__a, __b); +} +__DEVICE__ double __ddiv_rn(double __a, double __b) { + return __nv_ddiv_rn(__a, __b); +} +__DEVICE__ double __ddiv_ru(double __a, double __b) { + return __nv_ddiv_ru(__a, __b); +} +__DEVICE__ double __ddiv_rz(double __a, double __b) { + return __nv_ddiv_rz(__a, __b); +} +__DEVICE__ double __dmul_rd(double __a, double __b) { + return __nv_dmul_rd(__a, __b); +} +__DEVICE__ double __dmul_rn(double __a, double __b) { + return __nv_dmul_rn(__a, __b); +} +__DEVICE__ double __dmul_ru(double __a, double __b) { + return __nv_dmul_ru(__a, __b); +} +__DEVICE__ double __dmul_rz(double __a, double __b) { + return __nv_dmul_rz(__a, __b); +} +__DEVICE__ float __double2float_rd(double __a) { + return __nv_double2float_rd(__a); +} +__DEVICE__ float __double2float_rn(double __a) { + return __nv_double2float_rn(__a); +} +__DEVICE__ float __double2float_ru(double __a) { + return __nv_double2float_ru(__a); +} +__DEVICE__ float __double2float_rz(double __a) { + return __nv_double2float_rz(__a); +} +__DEVICE__ int __double2hiint(double __a) { return __nv_double2hiint(__a); } +__DEVICE__ int __double2int_rd(double __a) { return __nv_double2int_rd(__a); } +__DEVICE__ int __double2int_rn(double __a) { return __nv_double2int_rn(__a); } +__DEVICE__ int __double2int_ru(double __a) { return __nv_double2int_ru(__a); } +__DEVICE__ int __double2int_rz(double __a) { return __nv_double2int_rz(__a); } +__DEVICE__ long long __double2ll_rd(double __a) { + return __nv_double2ll_rd(__a); +} +__DEVICE__ long long __double2ll_rn(double __a) { + return __nv_double2ll_rn(__a); +} +__DEVICE__ long long __double2ll_ru(double __a) { + return __nv_double2ll_ru(__a); +} +__DEVICE__ long long __double2ll_rz(double __a) { + return __nv_double2ll_rz(__a); +} +__DEVICE__ int 
__double2loint(double __a) { return __nv_double2loint(__a); } +__DEVICE__ unsigned int __double2uint_rd(double __a) { + return __nv_double2uint_rd(__a); +} +__DEVICE__ unsigned int __double2uint_rn(double __a) { + return __nv_double2uint_rn(__a); +} +__DEVICE__ unsigned int __double2uint_ru(double __a) { + return __nv_double2uint_ru(__a); +} +__DEVICE__ unsigned int __double2uint_rz(double __a) { + return __nv_double2uint_rz(__a); +} +__DEVICE__ unsigned long long __double2ull_rd(double __a) { + return __nv_double2ull_rd(__a); +} +__DEVICE__ unsigned long long __double2ull_rn(double __a) { + return __nv_double2ull_rn(__a); +} +__DEVICE__ unsigned long long __double2ull_ru(double __a) { + return __nv_double2ull_ru(__a); +} +__DEVICE__ unsigned long long __double2ull_rz(double __a) { + return __nv_double2ull_rz(__a); +} +__DEVICE__ long long __double_as_longlong(double __a) { + return __nv_double_as_longlong(__a); +} +__DEVICE__ double __drcp_rd(double __a) { return __nv_drcp_rd(__a); } +__DEVICE__ double __drcp_rn(double __a) { return __nv_drcp_rn(__a); } +__DEVICE__ double __drcp_ru(double __a) { return __nv_drcp_ru(__a); } +__DEVICE__ double __drcp_rz(double __a) { return __nv_drcp_rz(__a); } +__DEVICE__ double __dsqrt_rd(double __a) { return __nv_dsqrt_rd(__a); } +__DEVICE__ double __dsqrt_rn(double __a) { return __nv_dsqrt_rn(__a); } +__DEVICE__ double __dsqrt_ru(double __a) { return __nv_dsqrt_ru(__a); } +__DEVICE__ double __dsqrt_rz(double __a) { return __nv_dsqrt_rz(__a); } +__DEVICE__ double __dsub_rd(double __a, double __b) { + return __nv_dsub_rd(__a, __b); +} +__DEVICE__ double __dsub_rn(double __a, double __b) { + return __nv_dsub_rn(__a, __b); +} +__DEVICE__ double __dsub_ru(double __a, double __b) { + return __nv_dsub_ru(__a, __b); +} +__DEVICE__ double __dsub_rz(double __a, double __b) { + return __nv_dsub_rz(__a, __b); +} +__DEVICE__ float __exp10f(float __a) { return __nv_fast_exp10f(__a); } +__DEVICE__ float __expf(float __a) { return __nv_fast_expf(__a); } +__DEVICE__ float __fAtomicAdd(float *__p, float __v) { + return __nvvm_atom_add_gen_f(__p, __v); +} +__DEVICE__ float __fAtomicAdd_block(float *__p, float __v) { + return __nvvm_atom_cta_add_gen_f(__p, __v); +} +__DEVICE__ float __fAtomicAdd_system(float *__p, float __v) { + return __nvvm_atom_sys_add_gen_f(__p, __v); +} +__DEVICE__ float __fAtomicExch(float *__p, float __v) { + return __nv_int_as_float( + __nvvm_atom_xchg_gen_i((int *)__p, __nv_float_as_int(__v))); +} +__DEVICE__ float __fAtomicExch_block(float *__p, float __v) { + return __nv_int_as_float( + __nvvm_atom_cta_xchg_gen_i((int *)__p, __nv_float_as_int(__v))); +} +__DEVICE__ float __fAtomicExch_system(float *__p, float __v) { + return __nv_int_as_float( + __nvvm_atom_sys_xchg_gen_i((int *)__p, __nv_float_as_int(__v))); +} +__DEVICE__ float __fadd_rd(float __a, float __b) { + return __nv_fadd_rd(__a, __b); +} +__DEVICE__ float __fadd_rn(float __a, float __b) { + return __nv_fadd_rn(__a, __b); +} +__DEVICE__ float __fadd_ru(float __a, float __b) { + return __nv_fadd_ru(__a, __b); +} +__DEVICE__ float __fadd_rz(float __a, float __b) { + return __nv_fadd_rz(__a, __b); +} +__DEVICE__ float __fdiv_rd(float __a, float __b) { + return __nv_fdiv_rd(__a, __b); +} +__DEVICE__ float __fdiv_rn(float __a, float __b) { + return __nv_fdiv_rn(__a, __b); +} +__DEVICE__ float __fdiv_ru(float __a, float __b) { + return __nv_fdiv_ru(__a, __b); +} +__DEVICE__ float __fdiv_rz(float __a, float __b) { + return __nv_fdiv_rz(__a, __b); +} +__DEVICE__ float __fdividef(float __a, 
float __b) { + return __nv_fast_fdividef(__a, __b); +} +__DEVICE__ int __ffs(int __a) { return __nv_ffs(__a); } +__DEVICE__ int __ffsll(long long __a) { return __nv_ffsll(__a); } +__DEVICE__ int __finite(double __a) { return __nv_isfinited(__a); } +__DEVICE__ int __finitef(float __a) { return __nv_finitef(__a); } +__DEVICE__ int __float2int_rd(float __a) { return __nv_float2int_rd(__a); } +__DEVICE__ int __float2int_rn(float __a) { return __nv_float2int_rn(__a); } +__DEVICE__ int __float2int_ru(float __a) { return __nv_float2int_ru(__a); } +__DEVICE__ int __float2int_rz(float __a) { return __nv_float2int_rz(__a); } +__DEVICE__ long long __float2ll_rd(float __a) { return __nv_float2ll_rd(__a); } +__DEVICE__ long long __float2ll_rn(float __a) { return __nv_float2ll_rn(__a); } +__DEVICE__ long long __float2ll_ru(float __a) { return __nv_float2ll_ru(__a); } +__DEVICE__ long long __float2ll_rz(float __a) { return __nv_float2ll_rz(__a); } +__DEVICE__ unsigned int __float2uint_rd(float __a) { + return __nv_float2uint_rd(__a); +} +__DEVICE__ unsigned int __float2uint_rn(float __a) { + return __nv_float2uint_rn(__a); +} +__DEVICE__ unsigned int __float2uint_ru(float __a) { + return __nv_float2uint_ru(__a); +} +__DEVICE__ unsigned int __float2uint_rz(float __a) { + return __nv_float2uint_rz(__a); +} +__DEVICE__ unsigned long long __float2ull_rd(float __a) { + return __nv_float2ull_rd(__a); +} +__DEVICE__ unsigned long long __float2ull_rn(float __a) { + return __nv_float2ull_rn(__a); +} +__DEVICE__ unsigned long long __float2ull_ru(float __a) { + return __nv_float2ull_ru(__a); +} +__DEVICE__ unsigned long long __float2ull_rz(float __a) { + return __nv_float2ull_rz(__a); +} +__DEVICE__ int __float_as_int(float __a) { return __nv_float_as_int(__a); } +__DEVICE__ unsigned int __float_as_uint(float __a) { + return __nv_float_as_uint(__a); +} +__DEVICE__ double __fma_rd(double __a, double __b, double __c) { + return __nv_fma_rd(__a, __b, __c); +} +__DEVICE__ double __fma_rn(double __a, double __b, double __c) { + return __nv_fma_rn(__a, __b, __c); +} +__DEVICE__ double __fma_ru(double __a, double __b, double __c) { + return __nv_fma_ru(__a, __b, __c); +} +__DEVICE__ double __fma_rz(double __a, double __b, double __c) { + return __nv_fma_rz(__a, __b, __c); +} +__DEVICE__ float __fmaf_ieee_rd(float __a, float __b, float __c) { + return __nv_fmaf_ieee_rd(__a, __b, __c); +} +__DEVICE__ float __fmaf_ieee_rn(float __a, float __b, float __c) { + return __nv_fmaf_ieee_rn(__a, __b, __c); +} +__DEVICE__ float __fmaf_ieee_ru(float __a, float __b, float __c) { + return __nv_fmaf_ieee_ru(__a, __b, __c); +} +__DEVICE__ float __fmaf_ieee_rz(float __a, float __b, float __c) { + return __nv_fmaf_ieee_rz(__a, __b, __c); +} +__DEVICE__ float __fmaf_rd(float __a, float __b, float __c) { + return __nv_fmaf_rd(__a, __b, __c); +} +__DEVICE__ float __fmaf_rn(float __a, float __b, float __c) { + return __nv_fmaf_rn(__a, __b, __c); +} +__DEVICE__ float __fmaf_ru(float __a, float __b, float __c) { + return __nv_fmaf_ru(__a, __b, __c); +} +__DEVICE__ float __fmaf_rz(float __a, float __b, float __c) { + return __nv_fmaf_rz(__a, __b, __c); +} +__DEVICE__ float __fmul_rd(float __a, float __b) { + return __nv_fmul_rd(__a, __b); +} +__DEVICE__ float __fmul_rn(float __a, float __b) { + return __nv_fmul_rn(__a, __b); +} +__DEVICE__ float __fmul_ru(float __a, float __b) { + return __nv_fmul_ru(__a, __b); +} +__DEVICE__ float __fmul_rz(float __a, float __b) { + return __nv_fmul_rz(__a, __b); +} +__DEVICE__ float __frcp_rd(float __a) { return 
__nv_frcp_rd(__a); } +__DEVICE__ float __frcp_rn(float __a) { return __nv_frcp_rn(__a); } +__DEVICE__ float __frcp_ru(float __a) { return __nv_frcp_ru(__a); } +__DEVICE__ float __frcp_rz(float __a) { return __nv_frcp_rz(__a); } +__DEVICE__ float __frsqrt_rn(float __a) { return __nv_frsqrt_rn(__a); } +__DEVICE__ float __fsqrt_rd(float __a) { return __nv_fsqrt_rd(__a); } +__DEVICE__ float __fsqrt_rn(float __a) { return __nv_fsqrt_rn(__a); } +__DEVICE__ float __fsqrt_ru(float __a) { return __nv_fsqrt_ru(__a); } +__DEVICE__ float __fsqrt_rz(float __a) { return __nv_fsqrt_rz(__a); } +__DEVICE__ float __fsub_rd(float __a, float __b) { + return __nv_fsub_rd(__a, __b); +} +__DEVICE__ float __fsub_rn(float __a, float __b) { + return __nv_fsub_rn(__a, __b); +} +__DEVICE__ float __fsub_ru(float __a, float __b) { + return __nv_fsub_ru(__a, __b); +} +__DEVICE__ float __fsub_rz(float __a, float __b) { + return __nv_fsub_rz(__a, __b); +} +__DEVICE__ int __hadd(int __a, int __b) { return __nv_hadd(__a, __b); } +__DEVICE__ double __hiloint2double(int __a, int __b) { + return __nv_hiloint2double(__a, __b); +} +__DEVICE__ int __iAtomicAdd(int *__p, int __v) { + return __nvvm_atom_add_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicAdd_block(int *__p, int __v) { + __nvvm_atom_cta_add_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicAdd_system(int *__p, int __v) { + __nvvm_atom_sys_add_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicAnd(int *__p, int __v) { + return __nvvm_atom_and_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicAnd_block(int *__p, int __v) { + return __nvvm_atom_cta_and_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicAnd_system(int *__p, int __v) { + return __nvvm_atom_sys_and_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicCAS(int *__p, int __cmp, int __v) { + return __nvvm_atom_cas_gen_i(__p, __cmp, __v); +} +__DEVICE__ int __iAtomicCAS_block(int *__p, int __cmp, int __v) { + return __nvvm_atom_cta_cas_gen_i(__p, __cmp, __v); +} +__DEVICE__ int __iAtomicCAS_system(int *__p, int __cmp, int __v) { + return __nvvm_atom_sys_cas_gen_i(__p, __cmp, __v); +} +__DEVICE__ int __iAtomicExch(int *__p, int __v) { + return __nvvm_atom_xchg_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicExch_block(int *__p, int __v) { + return __nvvm_atom_cta_xchg_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicExch_system(int *__p, int __v) { + return __nvvm_atom_sys_xchg_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicMax(int *__p, int __v) { + return __nvvm_atom_max_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicMax_block(int *__p, int __v) { + return __nvvm_atom_cta_max_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicMax_system(int *__p, int __v) { + return __nvvm_atom_sys_max_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicMin(int *__p, int __v) { + return __nvvm_atom_min_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicMin_block(int *__p, int __v) { + return __nvvm_atom_cta_min_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicMin_system(int *__p, int __v) { + return __nvvm_atom_sys_min_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicOr(int *__p, int __v) { + return __nvvm_atom_or_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicOr_block(int *__p, int __v) { + return __nvvm_atom_cta_or_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicOr_system(int *__p, int __v) { + return __nvvm_atom_sys_or_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicXor(int *__p, int __v) { + return __nvvm_atom_xor_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicXor_block(int *__p, int __v) { + return __nvvm_atom_cta_xor_gen_i(__p, __v); +} +__DEVICE__ int __iAtomicXor_system(int *__p, int __v) { + 
return __nvvm_atom_sys_xor_gen_i(__p, __v); +} +__DEVICE__ long long __illAtomicMax(long long *__p, long long __v) { + return __nvvm_atom_max_gen_ll(__p, __v); +} +__DEVICE__ long long __illAtomicMax_block(long long *__p, long long __v) { + return __nvvm_atom_cta_max_gen_ll(__p, __v); +} +__DEVICE__ long long __illAtomicMax_system(long long *__p, long long __v) { + return __nvvm_atom_sys_max_gen_ll(__p, __v); +} +__DEVICE__ long long __illAtomicMin(long long *__p, long long __v) { + return __nvvm_atom_min_gen_ll(__p, __v); +} +__DEVICE__ long long __illAtomicMin_block(long long *__p, long long __v) { + return __nvvm_atom_cta_min_gen_ll(__p, __v); +} +__DEVICE__ long long __illAtomicMin_system(long long *__p, long long __v) { + return __nvvm_atom_sys_min_gen_ll(__p, __v); +} +__DEVICE__ double __int2double_rn(int __a) { return __nv_int2double_rn(__a); } +__DEVICE__ float __int2float_rd(int __a) { return __nv_int2float_rd(__a); } +__DEVICE__ float __int2float_rn(int __a) { return __nv_int2float_rn(__a); } +__DEVICE__ float __int2float_ru(int __a) { return __nv_int2float_ru(__a); } +__DEVICE__ float __int2float_rz(int __a) { return __nv_int2float_rz(__a); } +__DEVICE__ float __int_as_float(int __a) { return __nv_int_as_float(__a); } +__DEVICE__ int __isfinited(double __a) { return __nv_isfinited(__a); } +__DEVICE__ int __isinf(double __a) { return __nv_isinfd(__a); } +__DEVICE__ int __isinff(float __a) { return __nv_isinff(__a); } +__DEVICE__ int __isnan(double __a) { return __nv_isnand(__a); } +__DEVICE__ int __isnanf(float __a) { return __nv_isnanf(__a); } +__DEVICE__ double __ll2double_rd(long long __a) { + return __nv_ll2double_rd(__a); +} +__DEVICE__ double __ll2double_rn(long long __a) { + return __nv_ll2double_rn(__a); +} +__DEVICE__ double __ll2double_ru(long long __a) { + return __nv_ll2double_ru(__a); +} +__DEVICE__ double __ll2double_rz(long long __a) { + return __nv_ll2double_rz(__a); +} +__DEVICE__ float __ll2float_rd(long long __a) { return __nv_ll2float_rd(__a); } +__DEVICE__ float __ll2float_rn(long long __a) { return __nv_ll2float_rn(__a); } +__DEVICE__ float __ll2float_ru(long long __a) { return __nv_ll2float_ru(__a); } +__DEVICE__ float __ll2float_rz(long long __a) { return __nv_ll2float_rz(__a); } +__DEVICE__ long long __llAtomicAnd(long long *__p, long long __v) { + return __nvvm_atom_and_gen_ll(__p, __v); +} +__DEVICE__ long long __llAtomicAnd_block(long long *__p, long long __v) { + return __nvvm_atom_cta_and_gen_ll(__p, __v); +} +__DEVICE__ long long __llAtomicAnd_system(long long *__p, long long __v) { + return __nvvm_atom_sys_and_gen_ll(__p, __v); +} +__DEVICE__ long long __llAtomicOr(long long *__p, long long __v) { + return __nvvm_atom_or_gen_ll(__p, __v); +} +__DEVICE__ long long __llAtomicOr_block(long long *__p, long long __v) { + return __nvvm_atom_cta_or_gen_ll(__p, __v); +} +__DEVICE__ long long __llAtomicOr_system(long long *__p, long long __v) { + return __nvvm_atom_sys_or_gen_ll(__p, __v); +} +__DEVICE__ long long __llAtomicXor(long long *__p, long long __v) { + return __nvvm_atom_xor_gen_ll(__p, __v); +} +__DEVICE__ long long __llAtomicXor_block(long long *__p, long long __v) { + return __nvvm_atom_cta_xor_gen_ll(__p, __v); +} +__DEVICE__ long long __llAtomicXor_system(long long *__p, long long __v) { + return __nvvm_atom_sys_xor_gen_ll(__p, __v); +} +__DEVICE__ float __log10f(float __a) { return __nv_fast_log10f(__a); } +__DEVICE__ float __log2f(float __a) { return __nv_fast_log2f(__a); } +__DEVICE__ float __logf(float __a) { return __nv_fast_logf(__a); 
} +__DEVICE__ double __longlong_as_double(long long __a) { + return __nv_longlong_as_double(__a); +} +__DEVICE__ int __mul24(int __a, int __b) { return __nv_mul24(__a, __b); } +__DEVICE__ long long __mul64hi(long long __a, long long __b) { + return __nv_mul64hi(__a, __b); +} +__DEVICE__ int __mulhi(int __a, int __b) { return __nv_mulhi(__a, __b); } +__DEVICE__ unsigned int __pm0(void) { return __nvvm_read_ptx_sreg_pm0(); } +__DEVICE__ unsigned int __pm1(void) { return __nvvm_read_ptx_sreg_pm1(); } +__DEVICE__ unsigned int __pm2(void) { return __nvvm_read_ptx_sreg_pm2(); } +__DEVICE__ unsigned int __pm3(void) { return __nvvm_read_ptx_sreg_pm3(); } +__DEVICE__ int __popc(int __a) { return __nv_popc(__a); } +__DEVICE__ int __popcll(long long __a) { return __nv_popcll(__a); } +__DEVICE__ float __powf(float __a, float __b) { + return __nv_fast_powf(__a, __b); +} + +// Parameter must have a known integer value. +#define __prof_trigger(__a) asm __volatile__("pmevent \t%0;" ::"i"(__a)) +__DEVICE__ int __rhadd(int __a, int __b) { return __nv_rhadd(__a, __b); } +__DEVICE__ unsigned int __sad(int __a, int __b, unsigned int __c) { + return __nv_sad(__a, __b, __c); +} +__DEVICE__ float __saturatef(float __a) { return __nv_saturatef(__a); } +__DEVICE__ int __signbitd(double __a) { return __nv_signbitd(__a); } +__DEVICE__ int __signbitf(float __a) { return __nv_signbitf(__a); } +__DEVICE__ void __sincosf(float __a, float *__sptr, float *__cptr) { + return __nv_fast_sincosf(__a, __sptr, __cptr); +} +__DEVICE__ float __sinf(float __a) { return __nv_fast_sinf(__a); } +__DEVICE__ int __syncthreads_and(int __a) { return __nvvm_bar0_and(__a); } +__DEVICE__ int __syncthreads_count(int __a) { return __nvvm_bar0_popc(__a); } +__DEVICE__ int __syncthreads_or(int __a) { return __nvvm_bar0_or(__a); } +__DEVICE__ float __tanf(float __a) { return __nv_fast_tanf(__a); } +__DEVICE__ void __threadfence(void) { __nvvm_membar_gl(); } +__DEVICE__ void __threadfence_block(void) { __nvvm_membar_cta(); }; +__DEVICE__ void __threadfence_system(void) { __nvvm_membar_sys(); }; +__DEVICE__ void __trap(void) { asm volatile("trap;"); } +__DEVICE__ unsigned int __uAtomicAdd(unsigned int *__p, unsigned int __v) { + return __nvvm_atom_add_gen_i((int *)__p, __v); +} +__DEVICE__ unsigned int __uAtomicAdd_block(unsigned int *__p, + unsigned int __v) { + return __nvvm_atom_cta_add_gen_i((int *)__p, __v); +} +__DEVICE__ unsigned int __uAtomicAdd_system(unsigned int *__p, + unsigned int __v) { + return __nvvm_atom_sys_add_gen_i((int *)__p, __v); +} +__DEVICE__ unsigned int __uAtomicAnd(unsigned int *__p, unsigned int __v) { + return __nvvm_atom_and_gen_i((int *)__p, __v); +} +__DEVICE__ unsigned int __uAtomicAnd_block(unsigned int *__p, + unsigned int __v) { + return __nvvm_atom_cta_and_gen_i((int *)__p, __v); +} +__DEVICE__ unsigned int __uAtomicAnd_system(unsigned int *__p, + unsigned int __v) { + return __nvvm_atom_sys_and_gen_i((int *)__p, __v); +} +__DEVICE__ unsigned int __uAtomicCAS(unsigned int *__p, unsigned int __cmp, + unsigned int __v) { + return __nvvm_atom_cas_gen_i((int *)__p, __cmp, __v); +} +__DEVICE__ unsigned int +__uAtomicCAS_block(unsigned int *__p, unsigned int __cmp, unsigned int __v) { + return __nvvm_atom_cta_cas_gen_i((int *)__p, __cmp, __v); +} +__DEVICE__ unsigned int +__uAtomicCAS_system(unsigned int *__p, unsigned int __cmp, unsigned int __v) { + return __nvvm_atom_sys_cas_gen_i((int *)__p, __cmp, __v); +} +__DEVICE__ unsigned int __uAtomicDec(unsigned int *__p, unsigned int __v) { + return 
__nvvm_atom_dec_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicDec_block(unsigned int *__p, + unsigned int __v) { + return __nvvm_atom_cta_dec_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicDec_system(unsigned int *__p, + unsigned int __v) { + return __nvvm_atom_sys_dec_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicExch(unsigned int *__p, unsigned int __v) { + return __nvvm_atom_xchg_gen_i((int *)__p, __v); +} +__DEVICE__ unsigned int __uAtomicExch_block(unsigned int *__p, + unsigned int __v) { + return __nvvm_atom_cta_xchg_gen_i((int *)__p, __v); +} +__DEVICE__ unsigned int __uAtomicExch_system(unsigned int *__p, + unsigned int __v) { + return __nvvm_atom_sys_xchg_gen_i((int *)__p, __v); +} +__DEVICE__ unsigned int __uAtomicInc(unsigned int *__p, unsigned int __v) { + return __nvvm_atom_inc_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicInc_block(unsigned int *__p, + unsigned int __v) { + return __nvvm_atom_cta_inc_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicInc_system(unsigned int *__p, + unsigned int __v) { + return __nvvm_atom_sys_inc_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicMax(unsigned int *__p, unsigned int __v) { + return __nvvm_atom_max_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicMax_block(unsigned int *__p, + unsigned int __v) { + return __nvvm_atom_cta_max_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicMax_system(unsigned int *__p, + unsigned int __v) { + return __nvvm_atom_sys_max_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicMin(unsigned int *__p, unsigned int __v) { + return __nvvm_atom_min_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicMin_block(unsigned int *__p, + unsigned int __v) { + return __nvvm_atom_cta_min_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicMin_system(unsigned int *__p, + unsigned int __v) { + return __nvvm_atom_sys_min_gen_ui(__p, __v); +} +__DEVICE__ unsigned int __uAtomicOr(unsigned int *__p, unsigned int __v) { + return __nvvm_atom_or_gen_i((int *)__p, __v); +} +__DEVICE__ unsigned int __uAtomicOr_block(unsigned int *__p, unsigned int __v) { + return __nvvm_atom_cta_or_gen_i((int *)__p, __v); +} +__DEVICE__ unsigned int __uAtomicOr_system(unsigned int *__p, + unsigned int __v) { + return __nvvm_atom_sys_or_gen_i((int *)__p, __v); +} +__DEVICE__ unsigned int __uAtomicXor(unsigned int *__p, unsigned int __v) { + return __nvvm_atom_xor_gen_i((int *)__p, __v); +} +__DEVICE__ unsigned int __uAtomicXor_block(unsigned int *__p, + unsigned int __v) { + return __nvvm_atom_cta_xor_gen_i((int *)__p, __v); +} +__DEVICE__ unsigned int __uAtomicXor_system(unsigned int *__p, + unsigned int __v) { + return __nvvm_atom_sys_xor_gen_i((int *)__p, __v); +} +__DEVICE__ unsigned int __uhadd(unsigned int __a, unsigned int __b) { + return __nv_uhadd(__a, __b); +} +__DEVICE__ double __uint2double_rn(unsigned int __a) { + return __nv_uint2double_rn(__a); +} +__DEVICE__ float __uint2float_rd(unsigned int __a) { + return __nv_uint2float_rd(__a); +} +__DEVICE__ float __uint2float_rn(unsigned int __a) { + return __nv_uint2float_rn(__a); +} +__DEVICE__ float __uint2float_ru(unsigned int __a) { + return __nv_uint2float_ru(__a); +} +__DEVICE__ float __uint2float_rz(unsigned int __a) { + return __nv_uint2float_rz(__a); +} +__DEVICE__ float __uint_as_float(unsigned int __a) { + return __nv_uint_as_float(__a); +} // +__DEVICE__ double __ull2double_rd(unsigned long long __a) { + return __nv_ull2double_rd(__a); +} +__DEVICE__ double __ull2double_rn(unsigned long long __a) { + return 
__nv_ull2double_rn(__a); +} +__DEVICE__ double __ull2double_ru(unsigned long long __a) { + return __nv_ull2double_ru(__a); +} +__DEVICE__ double __ull2double_rz(unsigned long long __a) { + return __nv_ull2double_rz(__a); +} +__DEVICE__ float __ull2float_rd(unsigned long long __a) { + return __nv_ull2float_rd(__a); +} +__DEVICE__ float __ull2float_rn(unsigned long long __a) { + return __nv_ull2float_rn(__a); +} +__DEVICE__ float __ull2float_ru(unsigned long long __a) { + return __nv_ull2float_ru(__a); +} +__DEVICE__ float __ull2float_rz(unsigned long long __a) { + return __nv_ull2float_rz(__a); +} +__DEVICE__ unsigned long long __ullAtomicAdd(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_add_gen_ll((long long *)__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicAdd_block(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_cta_add_gen_ll((long long *)__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicAdd_system(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_sys_add_gen_ll((long long *)__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicAnd(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_and_gen_ll((long long *)__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicAnd_block(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_cta_and_gen_ll((long long *)__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicAnd_system(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_sys_and_gen_ll((long long *)__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicCAS(unsigned long long *__p, + unsigned long long __cmp, + unsigned long long __v) { + return __nvvm_atom_cas_gen_ll((long long *)__p, __cmp, __v); +} +__DEVICE__ unsigned long long __ullAtomicCAS_block(unsigned long long *__p, + unsigned long long __cmp, + unsigned long long __v) { + return __nvvm_atom_cta_cas_gen_ll((long long *)__p, __cmp, __v); +} +__DEVICE__ unsigned long long __ullAtomicCAS_system(unsigned long long *__p, + unsigned long long __cmp, + unsigned long long __v) { + return __nvvm_atom_sys_cas_gen_ll((long long *)__p, __cmp, __v); +} +__DEVICE__ unsigned long long __ullAtomicExch(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_xchg_gen_ll((long long *)__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicExch_block(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_cta_xchg_gen_ll((long long *)__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicExch_system(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_sys_xchg_gen_ll((long long *)__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicMax(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_max_gen_ull(__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicMax_block(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_cta_max_gen_ull(__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicMax_system(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_sys_max_gen_ull(__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicMin(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_min_gen_ull(__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicMin_block(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_cta_min_gen_ull(__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicMin_system(unsigned long long *__p, + unsigned long long 
__v) { + return __nvvm_atom_sys_min_gen_ull(__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicOr(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_or_gen_ll((long long *)__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicOr_block(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_cta_or_gen_ll((long long *)__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicOr_system(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_sys_or_gen_ll((long long *)__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicXor(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_xor_gen_ll((long long *)__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicXor_block(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_cta_xor_gen_ll((long long *)__p, __v); +} +__DEVICE__ unsigned long long __ullAtomicXor_system(unsigned long long *__p, + unsigned long long __v) { + return __nvvm_atom_sys_xor_gen_ll((long long *)__p, __v); +} +__DEVICE__ unsigned int __umul24(unsigned int __a, unsigned int __b) { + return __nv_umul24(__a, __b); +} +__DEVICE__ unsigned long long __umul64hi(unsigned long long __a, + unsigned long long __b) { + return __nv_umul64hi(__a, __b); +} +__DEVICE__ unsigned int __umulhi(unsigned int __a, unsigned int __b) { + return __nv_umulhi(__a, __b); +} +__DEVICE__ unsigned int __urhadd(unsigned int __a, unsigned int __b) { + return __nv_urhadd(__a, __b); +} +__DEVICE__ unsigned int __usad(unsigned int __a, unsigned int __b, + unsigned int __c) { + return __nv_usad(__a, __b, __c); +} + +#if CUDA_VERSION >= 9000 && CUDA_VERSION < 9020 +__DEVICE__ unsigned int __vabs2(unsigned int __a) { return __nv_vabs2(__a); } +__DEVICE__ unsigned int __vabs4(unsigned int __a) { return __nv_vabs4(__a); } +__DEVICE__ unsigned int __vabsdiffs2(unsigned int __a, unsigned int __b) { + return __nv_vabsdiffs2(__a, __b); +} +__DEVICE__ unsigned int __vabsdiffs4(unsigned int __a, unsigned int __b) { + return __nv_vabsdiffs4(__a, __b); +} +__DEVICE__ unsigned int __vabsdiffu2(unsigned int __a, unsigned int __b) { + return __nv_vabsdiffu2(__a, __b); +} +__DEVICE__ unsigned int __vabsdiffu4(unsigned int __a, unsigned int __b) { + return __nv_vabsdiffu4(__a, __b); +} +__DEVICE__ unsigned int __vabsss2(unsigned int __a) { + return __nv_vabsss2(__a); +} +__DEVICE__ unsigned int __vabsss4(unsigned int __a) { + return __nv_vabsss4(__a); +} +__DEVICE__ unsigned int __vadd2(unsigned int __a, unsigned int __b) { + return __nv_vadd2(__a, __b); +} +__DEVICE__ unsigned int __vadd4(unsigned int __a, unsigned int __b) { + return __nv_vadd4(__a, __b); +} +__DEVICE__ unsigned int __vaddss2(unsigned int __a, unsigned int __b) { + return __nv_vaddss2(__a, __b); +} +__DEVICE__ unsigned int __vaddss4(unsigned int __a, unsigned int __b) { + return __nv_vaddss4(__a, __b); +} +__DEVICE__ unsigned int __vaddus2(unsigned int __a, unsigned int __b) { + return __nv_vaddus2(__a, __b); +} +__DEVICE__ unsigned int __vaddus4(unsigned int __a, unsigned int __b) { + return __nv_vaddus4(__a, __b); +} +__DEVICE__ unsigned int __vavgs2(unsigned int __a, unsigned int __b) { + return __nv_vavgs2(__a, __b); +} +__DEVICE__ unsigned int __vavgs4(unsigned int __a, unsigned int __b) { + return __nv_vavgs4(__a, __b); +} +__DEVICE__ unsigned int __vavgu2(unsigned int __a, unsigned int __b) { + return __nv_vavgu2(__a, __b); +} +__DEVICE__ unsigned int __vavgu4(unsigned int __a, unsigned int __b) { + return __nv_vavgu4(__a, __b); +} +__DEVICE__ 
unsigned int __vcmpeq2(unsigned int __a, unsigned int __b) { + return __nv_vcmpeq2(__a, __b); +} +__DEVICE__ unsigned int __vcmpeq4(unsigned int __a, unsigned int __b) { + return __nv_vcmpeq4(__a, __b); +} +__DEVICE__ unsigned int __vcmpges2(unsigned int __a, unsigned int __b) { + return __nv_vcmpges2(__a, __b); +} +__DEVICE__ unsigned int __vcmpges4(unsigned int __a, unsigned int __b) { + return __nv_vcmpges4(__a, __b); +} +__DEVICE__ unsigned int __vcmpgeu2(unsigned int __a, unsigned int __b) { + return __nv_vcmpgeu2(__a, __b); +} +__DEVICE__ unsigned int __vcmpgeu4(unsigned int __a, unsigned int __b) { + return __nv_vcmpgeu4(__a, __b); +} +__DEVICE__ unsigned int __vcmpgts2(unsigned int __a, unsigned int __b) { + return __nv_vcmpgts2(__a, __b); +} +__DEVICE__ unsigned int __vcmpgts4(unsigned int __a, unsigned int __b) { + return __nv_vcmpgts4(__a, __b); +} +__DEVICE__ unsigned int __vcmpgtu2(unsigned int __a, unsigned int __b) { + return __nv_vcmpgtu2(__a, __b); +} +__DEVICE__ unsigned int __vcmpgtu4(unsigned int __a, unsigned int __b) { + return __nv_vcmpgtu4(__a, __b); +} +__DEVICE__ unsigned int __vcmples2(unsigned int __a, unsigned int __b) { + return __nv_vcmples2(__a, __b); +} +__DEVICE__ unsigned int __vcmples4(unsigned int __a, unsigned int __b) { + return __nv_vcmples4(__a, __b); +} +__DEVICE__ unsigned int __vcmpleu2(unsigned int __a, unsigned int __b) { + return __nv_vcmpleu2(__a, __b); +} +__DEVICE__ unsigned int __vcmpleu4(unsigned int __a, unsigned int __b) { + return __nv_vcmpleu4(__a, __b); +} +__DEVICE__ unsigned int __vcmplts2(unsigned int __a, unsigned int __b) { + return __nv_vcmplts2(__a, __b); +} +__DEVICE__ unsigned int __vcmplts4(unsigned int __a, unsigned int __b) { + return __nv_vcmplts4(__a, __b); +} +__DEVICE__ unsigned int __vcmpltu2(unsigned int __a, unsigned int __b) { + return __nv_vcmpltu2(__a, __b); +} +__DEVICE__ unsigned int __vcmpltu4(unsigned int __a, unsigned int __b) { + return __nv_vcmpltu4(__a, __b); +} +__DEVICE__ unsigned int __vcmpne2(unsigned int __a, unsigned int __b) { + return __nv_vcmpne2(__a, __b); +} +__DEVICE__ unsigned int __vcmpne4(unsigned int __a, unsigned int __b) { + return __nv_vcmpne4(__a, __b); +} +__DEVICE__ unsigned int __vhaddu2(unsigned int __a, unsigned int __b) { + return __nv_vhaddu2(__a, __b); +} +__DEVICE__ unsigned int __vhaddu4(unsigned int __a, unsigned int __b) { + return __nv_vhaddu4(__a, __b); +} +__DEVICE__ unsigned int __vmaxs2(unsigned int __a, unsigned int __b) { + return __nv_vmaxs2(__a, __b); +} +__DEVICE__ unsigned int __vmaxs4(unsigned int __a, unsigned int __b) { + return __nv_vmaxs4(__a, __b); +} +__DEVICE__ unsigned int __vmaxu2(unsigned int __a, unsigned int __b) { + return __nv_vmaxu2(__a, __b); +} +__DEVICE__ unsigned int __vmaxu4(unsigned int __a, unsigned int __b) { + return __nv_vmaxu4(__a, __b); +} +__DEVICE__ unsigned int __vmins2(unsigned int __a, unsigned int __b) { + return __nv_vmins2(__a, __b); +} +__DEVICE__ unsigned int __vmins4(unsigned int __a, unsigned int __b) { + return __nv_vmins4(__a, __b); +} +__DEVICE__ unsigned int __vminu2(unsigned int __a, unsigned int __b) { + return __nv_vminu2(__a, __b); +} +__DEVICE__ unsigned int __vminu4(unsigned int __a, unsigned int __b) { + return __nv_vminu4(__a, __b); +} +__DEVICE__ unsigned int __vneg2(unsigned int __a) { return __nv_vneg2(__a); } +__DEVICE__ unsigned int __vneg4(unsigned int __a) { return __nv_vneg4(__a); } +__DEVICE__ unsigned int __vnegss2(unsigned int __a) { + return __nv_vnegss2(__a); +} +__DEVICE__ unsigned int 
__vnegss4(unsigned int __a) { + return __nv_vnegss4(__a); +} +__DEVICE__ unsigned int __vsads2(unsigned int __a, unsigned int __b) { + return __nv_vsads2(__a, __b); +} +__DEVICE__ unsigned int __vsads4(unsigned int __a, unsigned int __b) { + return __nv_vsads4(__a, __b); +} +__DEVICE__ unsigned int __vsadu2(unsigned int __a, unsigned int __b) { + return __nv_vsadu2(__a, __b); +} +__DEVICE__ unsigned int __vsadu4(unsigned int __a, unsigned int __b) { + return __nv_vsadu4(__a, __b); +} +__DEVICE__ unsigned int __vseteq2(unsigned int __a, unsigned int __b) { + return __nv_vseteq2(__a, __b); +} +__DEVICE__ unsigned int __vseteq4(unsigned int __a, unsigned int __b) { + return __nv_vseteq4(__a, __b); +} +__DEVICE__ unsigned int __vsetges2(unsigned int __a, unsigned int __b) { + return __nv_vsetges2(__a, __b); +} +__DEVICE__ unsigned int __vsetges4(unsigned int __a, unsigned int __b) { + return __nv_vsetges4(__a, __b); +} +__DEVICE__ unsigned int __vsetgeu2(unsigned int __a, unsigned int __b) { + return __nv_vsetgeu2(__a, __b); +} +__DEVICE__ unsigned int __vsetgeu4(unsigned int __a, unsigned int __b) { + return __nv_vsetgeu4(__a, __b); +} +__DEVICE__ unsigned int __vsetgts2(unsigned int __a, unsigned int __b) { + return __nv_vsetgts2(__a, __b); +} +__DEVICE__ unsigned int __vsetgts4(unsigned int __a, unsigned int __b) { + return __nv_vsetgts4(__a, __b); +} +__DEVICE__ unsigned int __vsetgtu2(unsigned int __a, unsigned int __b) { + return __nv_vsetgtu2(__a, __b); +} +__DEVICE__ unsigned int __vsetgtu4(unsigned int __a, unsigned int __b) { + return __nv_vsetgtu4(__a, __b); +} +__DEVICE__ unsigned int __vsetles2(unsigned int __a, unsigned int __b) { + return __nv_vsetles2(__a, __b); +} +__DEVICE__ unsigned int __vsetles4(unsigned int __a, unsigned int __b) { + return __nv_vsetles4(__a, __b); +} +__DEVICE__ unsigned int __vsetleu2(unsigned int __a, unsigned int __b) { + return __nv_vsetleu2(__a, __b); +} +__DEVICE__ unsigned int __vsetleu4(unsigned int __a, unsigned int __b) { + return __nv_vsetleu4(__a, __b); +} +__DEVICE__ unsigned int __vsetlts2(unsigned int __a, unsigned int __b) { + return __nv_vsetlts2(__a, __b); +} +__DEVICE__ unsigned int __vsetlts4(unsigned int __a, unsigned int __b) { + return __nv_vsetlts4(__a, __b); +} +__DEVICE__ unsigned int __vsetltu2(unsigned int __a, unsigned int __b) { + return __nv_vsetltu2(__a, __b); +} +__DEVICE__ unsigned int __vsetltu4(unsigned int __a, unsigned int __b) { + return __nv_vsetltu4(__a, __b); +} +__DEVICE__ unsigned int __vsetne2(unsigned int __a, unsigned int __b) { + return __nv_vsetne2(__a, __b); +} +__DEVICE__ unsigned int __vsetne4(unsigned int __a, unsigned int __b) { + return __nv_vsetne4(__a, __b); +} +__DEVICE__ unsigned int __vsub2(unsigned int __a, unsigned int __b) { + return __nv_vsub2(__a, __b); +} +__DEVICE__ unsigned int __vsub4(unsigned int __a, unsigned int __b) { + return __nv_vsub4(__a, __b); +} +__DEVICE__ unsigned int __vsubss2(unsigned int __a, unsigned int __b) { + return __nv_vsubss2(__a, __b); +} +__DEVICE__ unsigned int __vsubss4(unsigned int __a, unsigned int __b) { + return __nv_vsubss4(__a, __b); +} +__DEVICE__ unsigned int __vsubus2(unsigned int __a, unsigned int __b) { + return __nv_vsubus2(__a, __b); +} +__DEVICE__ unsigned int __vsubus4(unsigned int __a, unsigned int __b) { + return __nv_vsubus4(__a, __b); +} +#else // CUDA_VERSION >= 9020 +// CUDA no longer provides inline assembly (or bitcode) implementation of these +// functions, so we have to reimplment them. 
The implementation is naive and is +// not optimized for performance. + +// Helper function to convert N-bit boolean subfields into all-0 or all-1. +// E.g. __bool2mask(0x01000100,8) -> 0xff00ff00 +// __bool2mask(0x00010000,16) -> 0xffff0000 +__DEVICE__ unsigned int __bool2mask(unsigned int __a, int shift) { + return (__a << shift) - __a; +} +__DEVICE__ unsigned int __vabs2(unsigned int __a) { + unsigned int r; + asm("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(0), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vabs4(unsigned int __a) { + unsigned int r; + asm("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(0), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vabsdiffs2(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vabsdiff2.s32.s32.s32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} + +__DEVICE__ unsigned int __vabsdiffs4(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vabsdiff4.s32.s32.s32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vabsdiffu2(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vabsdiff2.u32.u32.u32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vabsdiffu4(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vabsdiff4.u32.u32.u32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vabsss2(unsigned int __a) { + unsigned int r; + asm("vabsdiff2.s32.s32.s32.sat %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(0), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vabsss4(unsigned int __a) { + unsigned int r; + asm("vabsdiff4.s32.s32.s32.sat %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(0), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vadd2(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vadd2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vadd4(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vadd4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vaddss2(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vadd2.s32.s32.s32.sat %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vaddss4(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vadd4.s32.s32.s32.sat %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vaddus2(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vadd2.u32.u32.u32.sat %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vaddus4(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vadd4.u32.u32.u32.sat %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vavgs2(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vavrg2.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vavgs4(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vavrg4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vavgu2(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vavrg2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ 
unsigned int __vavgu4(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vavrg4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vseteq2(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vset2.u32.u32.eq %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpeq2(unsigned int __a, unsigned int __b) { + return __bool2mask(__vseteq2(__a, __b), 16); +} +__DEVICE__ unsigned int __vseteq4(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vset4.u32.u32.eq %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpeq4(unsigned int __a, unsigned int __b) { + return __bool2mask(__vseteq4(__a, __b), 8); +} +__DEVICE__ unsigned int __vsetges2(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vset2.s32.s32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpges2(unsigned int __a, unsigned int __b) { + return __bool2mask(__vsetges2(__a, __b), 16); +} +__DEVICE__ unsigned int __vsetges4(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vset4.s32.s32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpges4(unsigned int __a, unsigned int __b) { + return __bool2mask(__vsetges4(__a, __b), 8); +} +__DEVICE__ unsigned int __vsetgeu2(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vset2.u32.u32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpgeu2(unsigned int __a, unsigned int __b) { + return __bool2mask(__vsetgeu2(__a, __b), 16); +} +__DEVICE__ unsigned int __vsetgeu4(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vset4.u32.u32.ge %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpgeu4(unsigned int __a, unsigned int __b) { + return __bool2mask(__vsetgeu4(__a, __b), 8); +} +__DEVICE__ unsigned int __vsetgts2(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vset2.s32.s32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpgts2(unsigned int __a, unsigned int __b) { + return __bool2mask(__vsetgts2(__a, __b), 16); +} +__DEVICE__ unsigned int __vsetgts4(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vset4.s32.s32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpgts4(unsigned int __a, unsigned int __b) { + return __bool2mask(__vsetgts4(__a, __b), 8); +} +__DEVICE__ unsigned int __vsetgtu2(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vset2.u32.u32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpgtu2(unsigned int __a, unsigned int __b) { + return __bool2mask(__vsetgtu2(__a, __b), 16); +} +__DEVICE__ unsigned int __vsetgtu4(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vset4.u32.u32.gt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpgtu4(unsigned int __a, unsigned int __b) { + return __bool2mask(__vsetgtu4(__a, __b), 8); +} +__DEVICE__ unsigned int __vsetles2(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vset2.s32.s32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmples2(unsigned int __a, unsigned int __b) { + return 
__bool2mask(__vsetles2(__a, __b), 16); +} +__DEVICE__ unsigned int __vsetles4(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vset4.s32.s32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmples4(unsigned int __a, unsigned int __b) { + return __bool2mask(__vsetles4(__a, __b), 8); +} +__DEVICE__ unsigned int __vsetleu2(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vset2.u32.u32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpleu2(unsigned int __a, unsigned int __b) { + return __bool2mask(__vsetleu2(__a, __b), 16); +} +__DEVICE__ unsigned int __vsetleu4(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vset4.u32.u32.le %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpleu4(unsigned int __a, unsigned int __b) { + return __bool2mask(__vsetleu4(__a, __b), 8); +} +__DEVICE__ unsigned int __vsetlts2(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vset2.s32.s32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmplts2(unsigned int __a, unsigned int __b) { + return __bool2mask(__vsetlts2(__a, __b), 16); +} +__DEVICE__ unsigned int __vsetlts4(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vset4.s32.s32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmplts4(unsigned int __a, unsigned int __b) { + return __bool2mask(__vsetlts4(__a, __b), 8); +} +__DEVICE__ unsigned int __vsetltu2(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vset2.u32.u32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpltu2(unsigned int __a, unsigned int __b) { + return __bool2mask(__vsetltu2(__a, __b), 16); +} +__DEVICE__ unsigned int __vsetltu4(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vset4.u32.u32.lt %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpltu4(unsigned int __a, unsigned int __b) { + return __bool2mask(__vsetltu4(__a, __b), 8); +} +__DEVICE__ unsigned int __vsetne2(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vset2.u32.u32.ne %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpne2(unsigned int __a, unsigned int __b) { + return __bool2mask(__vsetne2(__a, __b), 16); +} +__DEVICE__ unsigned int __vsetne4(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vset4.u32.u32.ne %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vcmpne4(unsigned int __a, unsigned int __b) { + return __bool2mask(__vsetne4(__a, __b), 8); +} + +// Based on ITEM 23 in AIM-239: http://dspace.mit.edu/handle/1721.1/6086 +// (a & b) + (a | b) = a + b = (a ^ b) + 2 * (a & b) => +// (a + b) / 2 = ((a ^ b) >> 1) + (a & b) +// To operate on multiple sub-elements we need to make sure to mask out bits +// that crossed over into adjacent elements during the shift. 
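A concrete instance of the identity above (an editorial worked example with made-up byte values, not part of the patch): for unsigned bytes a = 200 and b = 100,

  a ^ b        = 0xAC (172)
  (a ^ b) >> 1 = 0x56 ( 86)
  a & b        = 0x40 ( 64)
  86 + 64      = 150  = (200 + 100) / 2

When four such bytes are packed into one 32-bit word, the single >> 1 shifts each lane's low bit into the high bit of the lane below it, so the per-lane high bits are cleared with ~0x80808080u in __vhaddu4 (and with ~0x80008000u for the 16-bit lanes of __vhaddu2), as the implementations below do.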
+__DEVICE__ unsigned int __vhaddu2(unsigned int __a, unsigned int __b) { + return (((__a ^ __b) >> 1) & ~0x80008000u) + (__a & __b); +} +__DEVICE__ unsigned int __vhaddu4(unsigned int __a, unsigned int __b) { + return (((__a ^ __b) >> 1) & ~0x80808080u) + (__a & __b); +} + +__DEVICE__ unsigned int __vmaxs2(unsigned int __a, unsigned int __b) { + unsigned int r; + if ((__a & 0x8000) && (__b & 0x8000)) { + // Work around a bug in ptxas which produces invalid result if low element + // is negative. + unsigned mask = __vcmpgts2(__a, __b); + r = (__a & mask) | (__b & ~mask); + } else { + asm("vmax2.s32.s32.s32 %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + } + return r; +} +__DEVICE__ unsigned int __vmaxs4(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vmax4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vmaxu2(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vmax2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vmaxu4(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vmax4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vmins2(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vmin2.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vmins4(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vmin4.s32.s32.s32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vminu2(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vmin2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vminu4(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vmin4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vsads2(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vabsdiff2.s32.s32.s32.add %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vsads4(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vabsdiff4.s32.s32.s32.add %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vsadu2(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vabsdiff2.u32.u32.u32.add %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vsadu4(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vabsdiff4.u32.u32.u32.add %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} + +__DEVICE__ unsigned int __vsub2(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vsub2.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vneg2(unsigned int __a) { return __vsub2(0, __a); } + +__DEVICE__ unsigned int __vsub4(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vsub4.u32.u32.u32 %0,%1,%2,%3;" : "=r"(r) : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vneg4(unsigned int __a) { return __vsub4(0, __a); } +__DEVICE__ unsigned int __vsubss2(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vsub2.s32.s32.s32.sat %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vnegss2(unsigned int __a) { + 
return __vsubss2(0, __a); +} +__DEVICE__ unsigned int __vsubss4(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vsub4.s32.s32.s32.sat %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vnegss4(unsigned int __a) { + return __vsubss4(0, __a); +} +__DEVICE__ unsigned int __vsubus2(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vsub2.u32.u32.u32.sat %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +__DEVICE__ unsigned int __vsubus4(unsigned int __a, unsigned int __b) { + unsigned int r; + asm("vsub4.u32.u32.u32.sat %0,%1,%2,%3;" + : "=r"(r) + : "r"(__a), "r"(__b), "r"(0)); + return r; +} +#endif // CUDA_VERSION >= 9020 +__DEVICE__ int abs(int __a) { return __nv_abs(__a); } +__DEVICE__ double acos(double __a) { return __nv_acos(__a); } +__DEVICE__ float acosf(float __a) { return __nv_acosf(__a); } +__DEVICE__ double acosh(double __a) { return __nv_acosh(__a); } +__DEVICE__ float acoshf(float __a) { return __nv_acoshf(__a); } +__DEVICE__ double asin(double __a) { return __nv_asin(__a); } +__DEVICE__ float asinf(float __a) { return __nv_asinf(__a); } +__DEVICE__ double asinh(double __a) { return __nv_asinh(__a); } +__DEVICE__ float asinhf(float __a) { return __nv_asinhf(__a); } +__DEVICE__ double atan(double __a) { return __nv_atan(__a); } +__DEVICE__ double atan2(double __a, double __b) { return __nv_atan2(__a, __b); } +__DEVICE__ float atan2f(float __a, float __b) { return __nv_atan2f(__a, __b); } +__DEVICE__ float atanf(float __a) { return __nv_atanf(__a); } +__DEVICE__ double atanh(double __a) { return __nv_atanh(__a); } +__DEVICE__ float atanhf(float __a) { return __nv_atanhf(__a); } +__DEVICE__ double cbrt(double __a) { return __nv_cbrt(__a); } +__DEVICE__ float cbrtf(float __a) { return __nv_cbrtf(__a); } +__DEVICE__ double ceil(double __a) { return __nv_ceil(__a); } +__DEVICE__ float ceilf(float __a) { return __nv_ceilf(__a); } +__DEVICE__ int clock() { return __nvvm_read_ptx_sreg_clock(); } +__DEVICE__ long long clock64() { return __nvvm_read_ptx_sreg_clock64(); } +__DEVICE__ double copysign(double __a, double __b) { + return __nv_copysign(__a, __b); +} +__DEVICE__ float copysignf(float __a, float __b) { + return __nv_copysignf(__a, __b); +} +__DEVICE__ double cos(double __a) { return __nv_cos(__a); } +__DEVICE__ float cosf(float __a) { + return __FAST_OR_SLOW(__nv_fast_cosf, __nv_cosf)(__a); +} +__DEVICE__ double cosh(double __a) { return __nv_cosh(__a); } +__DEVICE__ float coshf(float __a) { return __nv_coshf(__a); } +__DEVICE__ double cospi(double __a) { return __nv_cospi(__a); } +__DEVICE__ float cospif(float __a) { return __nv_cospif(__a); } +__DEVICE__ double cyl_bessel_i0(double __a) { return __nv_cyl_bessel_i0(__a); } +__DEVICE__ float cyl_bessel_i0f(float __a) { return __nv_cyl_bessel_i0f(__a); } +__DEVICE__ double cyl_bessel_i1(double __a) { return __nv_cyl_bessel_i1(__a); } +__DEVICE__ float cyl_bessel_i1f(float __a) { return __nv_cyl_bessel_i1f(__a); } +__DEVICE__ double erf(double __a) { return __nv_erf(__a); } +__DEVICE__ double erfc(double __a) { return __nv_erfc(__a); } +__DEVICE__ float erfcf(float __a) { return __nv_erfcf(__a); } +__DEVICE__ double erfcinv(double __a) { return __nv_erfcinv(__a); } +__DEVICE__ float erfcinvf(float __a) { return __nv_erfcinvf(__a); } +__DEVICE__ double erfcx(double __a) { return __nv_erfcx(__a); } +__DEVICE__ float erfcxf(float __a) { return __nv_erfcxf(__a); } +__DEVICE__ float erff(float __a) { return __nv_erff(__a); } +__DEVICE__ 
double erfinv(double __a) { return __nv_erfinv(__a); } +__DEVICE__ float erfinvf(float __a) { return __nv_erfinvf(__a); } +__DEVICE__ double exp(double __a) { return __nv_exp(__a); } +__DEVICE__ double exp10(double __a) { return __nv_exp10(__a); } +__DEVICE__ float exp10f(float __a) { return __nv_exp10f(__a); } +__DEVICE__ double exp2(double __a) { return __nv_exp2(__a); } +__DEVICE__ float exp2f(float __a) { return __nv_exp2f(__a); } +__DEVICE__ float expf(float __a) { return __nv_expf(__a); } +__DEVICE__ double expm1(double __a) { return __nv_expm1(__a); } +__DEVICE__ float expm1f(float __a) { return __nv_expm1f(__a); } +__DEVICE__ double fabs(double __a) { return __nv_fabs(__a); } +__DEVICE__ float fabsf(float __a) { return __nv_fabsf(__a); } +__DEVICE__ double fdim(double __a, double __b) { return __nv_fdim(__a, __b); } +__DEVICE__ float fdimf(float __a, float __b) { return __nv_fdimf(__a, __b); } +__DEVICE__ double fdivide(double __a, double __b) { return __a / __b; } +__DEVICE__ float fdividef(float __a, float __b) { +#if __FAST_MATH__ && !__CUDA_PREC_DIV + return __nv_fast_fdividef(__a, __b); +#else + return __a / __b; +#endif +} +__DEVICE__ double floor(double __f) { return __nv_floor(__f); } +__DEVICE__ float floorf(float __f) { return __nv_floorf(__f); } +__DEVICE__ double fma(double __a, double __b, double __c) { + return __nv_fma(__a, __b, __c); +} +__DEVICE__ float fmaf(float __a, float __b, float __c) { + return __nv_fmaf(__a, __b, __c); +} +__DEVICE__ double fmax(double __a, double __b) { return __nv_fmax(__a, __b); } +__DEVICE__ float fmaxf(float __a, float __b) { return __nv_fmaxf(__a, __b); } +__DEVICE__ double fmin(double __a, double __b) { return __nv_fmin(__a, __b); } +__DEVICE__ float fminf(float __a, float __b) { return __nv_fminf(__a, __b); } +__DEVICE__ double fmod(double __a, double __b) { return __nv_fmod(__a, __b); } +__DEVICE__ float fmodf(float __a, float __b) { return __nv_fmodf(__a, __b); } +__DEVICE__ double frexp(double __a, int *__b) { return __nv_frexp(__a, __b); } +__DEVICE__ float frexpf(float __a, int *__b) { return __nv_frexpf(__a, __b); } +__DEVICE__ double hypot(double __a, double __b) { return __nv_hypot(__a, __b); } +__DEVICE__ float hypotf(float __a, float __b) { return __nv_hypotf(__a, __b); } +__DEVICE__ int ilogb(double __a) { return __nv_ilogb(__a); } +__DEVICE__ int ilogbf(float __a) { return __nv_ilogbf(__a); } +__DEVICE__ double j0(double __a) { return __nv_j0(__a); } +__DEVICE__ float j0f(float __a) { return __nv_j0f(__a); } +__DEVICE__ double j1(double __a) { return __nv_j1(__a); } +__DEVICE__ float j1f(float __a) { return __nv_j1f(__a); } +__DEVICE__ double jn(int __n, double __a) { return __nv_jn(__n, __a); } +__DEVICE__ float jnf(int __n, float __a) { return __nv_jnf(__n, __a); } +#if defined(__LP64__) +__DEVICE__ long labs(long __a) { return llabs(__a); }; +#else +__DEVICE__ long labs(long __a) { return __nv_abs(__a); }; +#endif +__DEVICE__ double ldexp(double __a, int __b) { return __nv_ldexp(__a, __b); } +__DEVICE__ float ldexpf(float __a, int __b) { return __nv_ldexpf(__a, __b); } +__DEVICE__ double lgamma(double __a) { return __nv_lgamma(__a); } +__DEVICE__ float lgammaf(float __a) { return __nv_lgammaf(__a); } +__DEVICE__ long long llabs(long long __a) { return __nv_llabs(__a); } +__DEVICE__ long long llmax(long long __a, long long __b) { + return __nv_llmax(__a, __b); +} +__DEVICE__ long long llmin(long long __a, long long __b) { + return __nv_llmin(__a, __b); +} +__DEVICE__ long long llrint(double __a) { return 
__nv_llrint(__a); } +__DEVICE__ long long llrintf(float __a) { return __nv_llrintf(__a); } +__DEVICE__ long long llround(double __a) { return __nv_llround(__a); } +__DEVICE__ long long llroundf(float __a) { return __nv_llroundf(__a); } +__DEVICE__ double log(double __a) { return __nv_log(__a); } +__DEVICE__ double log10(double __a) { return __nv_log10(__a); } +__DEVICE__ float log10f(float __a) { return __nv_log10f(__a); } +__DEVICE__ double log1p(double __a) { return __nv_log1p(__a); } +__DEVICE__ float log1pf(float __a) { return __nv_log1pf(__a); } +__DEVICE__ double log2(double __a) { return __nv_log2(__a); } +__DEVICE__ float log2f(float __a) { + return __FAST_OR_SLOW(__nv_fast_log2f, __nv_log2f)(__a); +} +__DEVICE__ double logb(double __a) { return __nv_logb(__a); } +__DEVICE__ float logbf(float __a) { return __nv_logbf(__a); } +__DEVICE__ float logf(float __a) { + return __FAST_OR_SLOW(__nv_fast_logf, __nv_logf)(__a); +} +#if defined(__LP64__) +__DEVICE__ long lrint(double __a) { return llrint(__a); } +__DEVICE__ long lrintf(float __a) { return __float2ll_rn(__a); } +__DEVICE__ long lround(double __a) { return llround(__a); } +__DEVICE__ long lroundf(float __a) { return llroundf(__a); } +#else +__DEVICE__ long lrint(double __a) { return (long)rint(__a); } +__DEVICE__ long lrintf(float __a) { return __float2int_rn(__a); } +__DEVICE__ long lround(double __a) { return round(__a); } +__DEVICE__ long lroundf(float __a) { return roundf(__a); } +#endif +__DEVICE__ int max(int __a, int __b) { return __nv_max(__a, __b); } +__DEVICE__ void *memcpy(void *__a, const void *__b, size_t __c) { + return __builtin_memcpy(__a, __b, __c); +} +__DEVICE__ void *memset(void *__a, int __b, size_t __c) { + return __builtin_memset(__a, __b, __c); +} +__DEVICE__ int min(int __a, int __b) { return __nv_min(__a, __b); } +__DEVICE__ double modf(double __a, double *__b) { return __nv_modf(__a, __b); } +__DEVICE__ float modff(float __a, float *__b) { return __nv_modff(__a, __b); } +__DEVICE__ double nearbyint(double __a) { return __nv_nearbyint(__a); } +__DEVICE__ float nearbyintf(float __a) { return __nv_nearbyintf(__a); } +__DEVICE__ double nextafter(double __a, double __b) { + return __nv_nextafter(__a, __b); +} +__DEVICE__ float nextafterf(float __a, float __b) { + return __nv_nextafterf(__a, __b); +} +__DEVICE__ double norm(int __dim, const double *__t) { + return __nv_norm(__dim, __t); +} +__DEVICE__ double norm3d(double __a, double __b, double __c) { + return __nv_norm3d(__a, __b, __c); +} +__DEVICE__ float norm3df(float __a, float __b, float __c) { + return __nv_norm3df(__a, __b, __c); +} +__DEVICE__ double norm4d(double __a, double __b, double __c, double __d) { + return __nv_norm4d(__a, __b, __c, __d); +} +__DEVICE__ float norm4df(float __a, float __b, float __c, float __d) { + return __nv_norm4df(__a, __b, __c, __d); +} +__DEVICE__ double normcdf(double __a) { return __nv_normcdf(__a); } +__DEVICE__ float normcdff(float __a) { return __nv_normcdff(__a); } +__DEVICE__ double normcdfinv(double __a) { return __nv_normcdfinv(__a); } +__DEVICE__ float normcdfinvf(float __a) { return __nv_normcdfinvf(__a); } +__DEVICE__ float normf(int __dim, const float *__t) { + return __nv_normf(__dim, __t); +} +__DEVICE__ double pow(double __a, double __b) { return __nv_pow(__a, __b); } +__DEVICE__ float powf(float __a, float __b) { return __nv_powf(__a, __b); } +__DEVICE__ double powi(double __a, int __b) { return __nv_powi(__a, __b); } +__DEVICE__ float powif(float __a, int __b) { return __nv_powif(__a, __b); } 
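[Editor's aside, not part of the patch] The wrappers above simply forward the standard libm names to their libdevice counterparts, with __FAST_OR_SLOW selecting the approximate __nv_fast_* entry points only when clang defines __CLANG_CUDA_APPROX_TRANSCENDENTALS__ (i.e. under -ffast-math / -fcuda-approx-transcendentals). A minimal sketch of what that means for ordinary device code; the kernel name and buffers below are hypothetical and only logf/fminf come from this header:

__global__ void log_clamp(float *out, const float *in, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // logf() as defined above resolves to __nv_fast_logf() when
    // __CLANG_CUDA_APPROX_TRANSCENDENTALS__ is defined, and to the
    // full-precision __nv_logf() otherwise; fminf() always maps to
    // __nv_fminf().
    out[i] = fminf(logf(in[i]), 10.0f);
  }
}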
+__DEVICE__ double rcbrt(double __a) { return __nv_rcbrt(__a); } +__DEVICE__ float rcbrtf(float __a) { return __nv_rcbrtf(__a); } +__DEVICE__ double remainder(double __a, double __b) { + return __nv_remainder(__a, __b); +} +__DEVICE__ float remainderf(float __a, float __b) { + return __nv_remainderf(__a, __b); +} +__DEVICE__ double remquo(double __a, double __b, int *__c) { + return __nv_remquo(__a, __b, __c); +} +__DEVICE__ float remquof(float __a, float __b, int *__c) { + return __nv_remquof(__a, __b, __c); +} +__DEVICE__ double rhypot(double __a, double __b) { + return __nv_rhypot(__a, __b); +} +__DEVICE__ float rhypotf(float __a, float __b) { + return __nv_rhypotf(__a, __b); +} +__DEVICE__ double rint(double __a) { return __nv_rint(__a); } +__DEVICE__ float rintf(float __a) { return __nv_rintf(__a); } +__DEVICE__ double rnorm(int __a, const double *__b) { + return __nv_rnorm(__a, __b); +} +__DEVICE__ double rnorm3d(double __a, double __b, double __c) { + return __nv_rnorm3d(__a, __b, __c); +} +__DEVICE__ float rnorm3df(float __a, float __b, float __c) { + return __nv_rnorm3df(__a, __b, __c); +} +__DEVICE__ double rnorm4d(double __a, double __b, double __c, double __d) { + return __nv_rnorm4d(__a, __b, __c, __d); +} +__DEVICE__ float rnorm4df(float __a, float __b, float __c, float __d) { + return __nv_rnorm4df(__a, __b, __c, __d); +} +__DEVICE__ float rnormf(int __dim, const float *__t) { + return __nv_rnormf(__dim, __t); +} +__DEVICE__ double round(double __a) { return __nv_round(__a); } +__DEVICE__ float roundf(float __a) { return __nv_roundf(__a); } +__DEVICE__ double rsqrt(double __a) { return __nv_rsqrt(__a); } +__DEVICE__ float rsqrtf(float __a) { return __nv_rsqrtf(__a); } +__DEVICE__ double scalbn(double __a, int __b) { return __nv_scalbn(__a, __b); } +__DEVICE__ float scalbnf(float __a, int __b) { return __nv_scalbnf(__a, __b); } +__DEVICE__ double scalbln(double __a, long __b) { + if (__b > INT_MAX) + return __a > 0 ? HUGE_VAL : -HUGE_VAL; + if (__b < INT_MIN) + return __a > 0 ? 0.0 : -0.0; + return scalbn(__a, (int)__b); +} +__DEVICE__ float scalblnf(float __a, long __b) { + if (__b > INT_MAX) + return __a > 0 ? HUGE_VALF : -HUGE_VALF; + if (__b < INT_MIN) + return __a > 0 ? 
0.f : -0.f; + return scalbnf(__a, (int)__b); +} +__DEVICE__ double sin(double __a) { return __nv_sin(__a); } +__DEVICE__ void sincos(double __a, double *__sptr, double *__cptr) { + return __nv_sincos(__a, __sptr, __cptr); +} +__DEVICE__ void sincosf(float __a, float *__sptr, float *__cptr) { + return __FAST_OR_SLOW(__nv_fast_sincosf, __nv_sincosf)(__a, __sptr, __cptr); +} +__DEVICE__ void sincospi(double __a, double *__sptr, double *__cptr) { + return __nv_sincospi(__a, __sptr, __cptr); +} +__DEVICE__ void sincospif(float __a, float *__sptr, float *__cptr) { + return __nv_sincospif(__a, __sptr, __cptr); +} +__DEVICE__ float sinf(float __a) { + return __FAST_OR_SLOW(__nv_fast_sinf, __nv_sinf)(__a); +} +__DEVICE__ double sinh(double __a) { return __nv_sinh(__a); } +__DEVICE__ float sinhf(float __a) { return __nv_sinhf(__a); } +__DEVICE__ double sinpi(double __a) { return __nv_sinpi(__a); } +__DEVICE__ float sinpif(float __a) { return __nv_sinpif(__a); } +__DEVICE__ double sqrt(double __a) { return __nv_sqrt(__a); } +__DEVICE__ float sqrtf(float __a) { return __nv_sqrtf(__a); } +__DEVICE__ double tan(double __a) { return __nv_tan(__a); } +__DEVICE__ float tanf(float __a) { return __nv_tanf(__a); } +__DEVICE__ double tanh(double __a) { return __nv_tanh(__a); } +__DEVICE__ float tanhf(float __a) { return __nv_tanhf(__a); } +__DEVICE__ double tgamma(double __a) { return __nv_tgamma(__a); } +__DEVICE__ float tgammaf(float __a) { return __nv_tgammaf(__a); } +__DEVICE__ double trunc(double __a) { return __nv_trunc(__a); } +__DEVICE__ float truncf(float __a) { return __nv_truncf(__a); } +__DEVICE__ unsigned long long ullmax(unsigned long long __a, + unsigned long long __b) { + return __nv_ullmax(__a, __b); +} +__DEVICE__ unsigned long long ullmin(unsigned long long __a, + unsigned long long __b) { + return __nv_ullmin(__a, __b); +} +__DEVICE__ unsigned int umax(unsigned int __a, unsigned int __b) { + return __nv_umax(__a, __b); +} +__DEVICE__ unsigned int umin(unsigned int __a, unsigned int __b) { + return __nv_umin(__a, __b); +} +__DEVICE__ double y0(double __a) { return __nv_y0(__a); } +__DEVICE__ float y0f(float __a) { return __nv_y0f(__a); } +__DEVICE__ double y1(double __a) { return __nv_y1(__a); } +__DEVICE__ float y1f(float __a) { return __nv_y1f(__a); } +__DEVICE__ double yn(int __a, double __b) { return __nv_yn(__a, __b); } +__DEVICE__ float ynf(int __a, float __b) { return __nv_ynf(__a, __b); } + +#pragma pop_macro("__DEVICE__") +#pragma pop_macro("__FAST_OR_SLOW") +#endif // __CLANG_CUDA_DEVICE_FUNCTIONS_H__ diff --git a/c_headers/__clang_cuda_intrinsics.h b/c_headers/__clang_cuda_intrinsics.h index 1794eb3dc1..3c0cde94ed 100644 --- a/c_headers/__clang_cuda_intrinsics.h +++ b/c_headers/__clang_cuda_intrinsics.h @@ -277,6 +277,9 @@ inline __device__ long long __ldg(const long long *ptr) { inline __device__ unsigned char __ldg(const unsigned char *ptr) { return __nvvm_ldg_uc(ptr); } +inline __device__ signed char __ldg(const signed char *ptr) { + return __nvvm_ldg_uc((const unsigned char *)ptr); +} inline __device__ unsigned short __ldg(const unsigned short *ptr) { return __nvvm_ldg_us(ptr); } diff --git a/c_headers/__clang_cuda_libdevice_declares.h b/c_headers/__clang_cuda_libdevice_declares.h new file mode 100644 index 0000000000..71df7f849d --- /dev/null +++ b/c_headers/__clang_cuda_libdevice_declares.h @@ -0,0 +1,466 @@ +/*===-- __clang_cuda_libdevice_declares.h - decls for libdevice functions --=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * 
of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __CLANG_CUDA_LIBDEVICE_DECLARES_H__ +#define __CLANG_CUDA_LIBDEVICE_DECLARES_H__ + +extern "C" { + +__device__ int __nv_abs(int __a); +__device__ double __nv_acos(double __a); +__device__ float __nv_acosf(float __a); +__device__ double __nv_acosh(double __a); +__device__ float __nv_acoshf(float __a); +__device__ double __nv_asin(double __a); +__device__ float __nv_asinf(float __a); +__device__ double __nv_asinh(double __a); +__device__ float __nv_asinhf(float __a); +__device__ double __nv_atan2(double __a, double __b); +__device__ float __nv_atan2f(float __a, float __b); +__device__ double __nv_atan(double __a); +__device__ float __nv_atanf(float __a); +__device__ double __nv_atanh(double __a); +__device__ float __nv_atanhf(float __a); +__device__ int __nv_brev(int __a); +__device__ long long __nv_brevll(long long __a); +__device__ int __nv_byte_perm(int __a, int __b, int __c); +__device__ double __nv_cbrt(double __a); +__device__ float __nv_cbrtf(float __a); +__device__ double __nv_ceil(double __a); +__device__ float __nv_ceilf(float __a); +__device__ int __nv_clz(int __a); +__device__ int __nv_clzll(long long __a); +__device__ double __nv_copysign(double __a, double __b); +__device__ float __nv_copysignf(float __a, float __b); +__device__ double __nv_cos(double __a); +__device__ float __nv_cosf(float __a); +__device__ double __nv_cosh(double __a); +__device__ float __nv_coshf(float __a); +__device__ double __nv_cospi(double __a); +__device__ float __nv_cospif(float __a); +__device__ double __nv_cyl_bessel_i0(double __a); +__device__ float __nv_cyl_bessel_i0f(float __a); +__device__ double __nv_cyl_bessel_i1(double __a); +__device__ float __nv_cyl_bessel_i1f(float __a); +__device__ double __nv_dadd_rd(double __a, double __b); +__device__ double __nv_dadd_rn(double __a, double __b); +__device__ double __nv_dadd_ru(double __a, double __b); +__device__ double __nv_dadd_rz(double __a, double __b); +__device__ double __nv_ddiv_rd(double __a, double __b); +__device__ double __nv_ddiv_rn(double __a, double __b); +__device__ double __nv_ddiv_ru(double __a, double __b); +__device__ double __nv_ddiv_rz(double __a, double __b); +__device__ double __nv_dmul_rd(double __a, double __b); +__device__ double __nv_dmul_rn(double __a, double __b); +__device__ double __nv_dmul_ru(double __a, double __b); +__device__ double __nv_dmul_rz(double __a, double __b); +__device__ float __nv_double2float_rd(double __a); +__device__ 
float __nv_double2float_rn(double __a); +__device__ float __nv_double2float_ru(double __a); +__device__ float __nv_double2float_rz(double __a); +__device__ int __nv_double2hiint(double __a); +__device__ int __nv_double2int_rd(double __a); +__device__ int __nv_double2int_rn(double __a); +__device__ int __nv_double2int_ru(double __a); +__device__ int __nv_double2int_rz(double __a); +__device__ long long __nv_double2ll_rd(double __a); +__device__ long long __nv_double2ll_rn(double __a); +__device__ long long __nv_double2ll_ru(double __a); +__device__ long long __nv_double2ll_rz(double __a); +__device__ int __nv_double2loint(double __a); +__device__ unsigned int __nv_double2uint_rd(double __a); +__device__ unsigned int __nv_double2uint_rn(double __a); +__device__ unsigned int __nv_double2uint_ru(double __a); +__device__ unsigned int __nv_double2uint_rz(double __a); +__device__ unsigned long long __nv_double2ull_rd(double __a); +__device__ unsigned long long __nv_double2ull_rn(double __a); +__device__ unsigned long long __nv_double2ull_ru(double __a); +__device__ unsigned long long __nv_double2ull_rz(double __a); +__device__ unsigned long long __nv_double_as_longlong(double __a); +__device__ double __nv_drcp_rd(double __a); +__device__ double __nv_drcp_rn(double __a); +__device__ double __nv_drcp_ru(double __a); +__device__ double __nv_drcp_rz(double __a); +__device__ double __nv_dsqrt_rd(double __a); +__device__ double __nv_dsqrt_rn(double __a); +__device__ double __nv_dsqrt_ru(double __a); +__device__ double __nv_dsqrt_rz(double __a); +__device__ double __nv_dsub_rd(double __a, double __b); +__device__ double __nv_dsub_rn(double __a, double __b); +__device__ double __nv_dsub_ru(double __a, double __b); +__device__ double __nv_dsub_rz(double __a, double __b); +__device__ double __nv_erfc(double __a); +__device__ float __nv_erfcf(float __a); +__device__ double __nv_erfcinv(double __a); +__device__ float __nv_erfcinvf(float __a); +__device__ double __nv_erfcx(double __a); +__device__ float __nv_erfcxf(float __a); +__device__ double __nv_erf(double __a); +__device__ float __nv_erff(float __a); +__device__ double __nv_erfinv(double __a); +__device__ float __nv_erfinvf(float __a); +__device__ double __nv_exp10(double __a); +__device__ float __nv_exp10f(float __a); +__device__ double __nv_exp2(double __a); +__device__ float __nv_exp2f(float __a); +__device__ double __nv_exp(double __a); +__device__ float __nv_expf(float __a); +__device__ double __nv_expm1(double __a); +__device__ float __nv_expm1f(float __a); +__device__ double __nv_fabs(double __a); +__device__ float __nv_fabsf(float __a); +__device__ float __nv_fadd_rd(float __a, float __b); +__device__ float __nv_fadd_rn(float __a, float __b); +__device__ float __nv_fadd_ru(float __a, float __b); +__device__ float __nv_fadd_rz(float __a, float __b); +__device__ float __nv_fast_cosf(float __a); +__device__ float __nv_fast_exp10f(float __a); +__device__ float __nv_fast_expf(float __a); +__device__ float __nv_fast_fdividef(float __a, float __b); +__device__ float __nv_fast_log10f(float __a); +__device__ float __nv_fast_log2f(float __a); +__device__ float __nv_fast_logf(float __a); +__device__ float __nv_fast_powf(float __a, float __b); +__device__ void __nv_fast_sincosf(float __a, float *__sptr, float *__cptr); +__device__ float __nv_fast_sinf(float __a); +__device__ float __nv_fast_tanf(float __a); +__device__ double __nv_fdim(double __a, double __b); +__device__ float __nv_fdimf(float __a, float __b); +__device__ float __nv_fdiv_rd(float __a, 
float __b); +__device__ float __nv_fdiv_rn(float __a, float __b); +__device__ float __nv_fdiv_ru(float __a, float __b); +__device__ float __nv_fdiv_rz(float __a, float __b); +__device__ int __nv_ffs(int __a); +__device__ int __nv_ffsll(long long __a); +__device__ int __nv_finitef(float __a); +__device__ unsigned short __nv_float2half_rn(float __a); +__device__ int __nv_float2int_rd(float __a); +__device__ int __nv_float2int_rn(float __a); +__device__ int __nv_float2int_ru(float __a); +__device__ int __nv_float2int_rz(float __a); +__device__ long long __nv_float2ll_rd(float __a); +__device__ long long __nv_float2ll_rn(float __a); +__device__ long long __nv_float2ll_ru(float __a); +__device__ long long __nv_float2ll_rz(float __a); +__device__ unsigned int __nv_float2uint_rd(float __a); +__device__ unsigned int __nv_float2uint_rn(float __a); +__device__ unsigned int __nv_float2uint_ru(float __a); +__device__ unsigned int __nv_float2uint_rz(float __a); +__device__ unsigned long long __nv_float2ull_rd(float __a); +__device__ unsigned long long __nv_float2ull_rn(float __a); +__device__ unsigned long long __nv_float2ull_ru(float __a); +__device__ unsigned long long __nv_float2ull_rz(float __a); +__device__ int __nv_float_as_int(float __a); +__device__ unsigned int __nv_float_as_uint(float __a); +__device__ double __nv_floor(double __a); +__device__ float __nv_floorf(float __a); +__device__ double __nv_fma(double __a, double __b, double __c); +__device__ float __nv_fmaf(float __a, float __b, float __c); +__device__ float __nv_fmaf_ieee_rd(float __a, float __b, float __c); +__device__ float __nv_fmaf_ieee_rn(float __a, float __b, float __c); +__device__ float __nv_fmaf_ieee_ru(float __a, float __b, float __c); +__device__ float __nv_fmaf_ieee_rz(float __a, float __b, float __c); +__device__ float __nv_fmaf_rd(float __a, float __b, float __c); +__device__ float __nv_fmaf_rn(float __a, float __b, float __c); +__device__ float __nv_fmaf_ru(float __a, float __b, float __c); +__device__ float __nv_fmaf_rz(float __a, float __b, float __c); +__device__ double __nv_fma_rd(double __a, double __b, double __c); +__device__ double __nv_fma_rn(double __a, double __b, double __c); +__device__ double __nv_fma_ru(double __a, double __b, double __c); +__device__ double __nv_fma_rz(double __a, double __b, double __c); +__device__ double __nv_fmax(double __a, double __b); +__device__ float __nv_fmaxf(float __a, float __b); +__device__ double __nv_fmin(double __a, double __b); +__device__ float __nv_fminf(float __a, float __b); +__device__ double __nv_fmod(double __a, double __b); +__device__ float __nv_fmodf(float __a, float __b); +__device__ float __nv_fmul_rd(float __a, float __b); +__device__ float __nv_fmul_rn(float __a, float __b); +__device__ float __nv_fmul_ru(float __a, float __b); +__device__ float __nv_fmul_rz(float __a, float __b); +__device__ float __nv_frcp_rd(float __a); +__device__ float __nv_frcp_rn(float __a); +__device__ float __nv_frcp_ru(float __a); +__device__ float __nv_frcp_rz(float __a); +__device__ double __nv_frexp(double __a, int *__b); +__device__ float __nv_frexpf(float __a, int *__b); +__device__ float __nv_frsqrt_rn(float __a); +__device__ float __nv_fsqrt_rd(float __a); +__device__ float __nv_fsqrt_rn(float __a); +__device__ float __nv_fsqrt_ru(float __a); +__device__ float __nv_fsqrt_rz(float __a); +__device__ float __nv_fsub_rd(float __a, float __b); +__device__ float __nv_fsub_rn(float __a, float __b); +__device__ float __nv_fsub_ru(float __a, float __b); +__device__ float 
__nv_fsub_rz(float __a, float __b); +__device__ int __nv_hadd(int __a, int __b); +__device__ float __nv_half2float(unsigned short __h); +__device__ double __nv_hiloint2double(int __a, int __b); +__device__ double __nv_hypot(double __a, double __b); +__device__ float __nv_hypotf(float __a, float __b); +__device__ int __nv_ilogb(double __a); +__device__ int __nv_ilogbf(float __a); +__device__ double __nv_int2double_rn(int __a); +__device__ float __nv_int2float_rd(int __a); +__device__ float __nv_int2float_rn(int __a); +__device__ float __nv_int2float_ru(int __a); +__device__ float __nv_int2float_rz(int __a); +__device__ float __nv_int_as_float(int __a); +__device__ int __nv_isfinited(double __a); +__device__ int __nv_isinfd(double __a); +__device__ int __nv_isinff(float __a); +__device__ int __nv_isnand(double __a); +__device__ int __nv_isnanf(float __a); +__device__ double __nv_j0(double __a); +__device__ float __nv_j0f(float __a); +__device__ double __nv_j1(double __a); +__device__ float __nv_j1f(float __a); +__device__ float __nv_jnf(int __a, float __b); +__device__ double __nv_jn(int __a, double __b); +__device__ double __nv_ldexp(double __a, int __b); +__device__ float __nv_ldexpf(float __a, int __b); +__device__ double __nv_lgamma(double __a); +__device__ float __nv_lgammaf(float __a); +__device__ double __nv_ll2double_rd(long long __a); +__device__ double __nv_ll2double_rn(long long __a); +__device__ double __nv_ll2double_ru(long long __a); +__device__ double __nv_ll2double_rz(long long __a); +__device__ float __nv_ll2float_rd(long long __a); +__device__ float __nv_ll2float_rn(long long __a); +__device__ float __nv_ll2float_ru(long long __a); +__device__ float __nv_ll2float_rz(long long __a); +__device__ long long __nv_llabs(long long __a); +__device__ long long __nv_llmax(long long __a, long long __b); +__device__ long long __nv_llmin(long long __a, long long __b); +__device__ long long __nv_llrint(double __a); +__device__ long long __nv_llrintf(float __a); +__device__ long long __nv_llround(double __a); +__device__ long long __nv_llroundf(float __a); +__device__ double __nv_log10(double __a); +__device__ float __nv_log10f(float __a); +__device__ double __nv_log1p(double __a); +__device__ float __nv_log1pf(float __a); +__device__ double __nv_log2(double __a); +__device__ float __nv_log2f(float __a); +__device__ double __nv_logb(double __a); +__device__ float __nv_logbf(float __a); +__device__ double __nv_log(double __a); +__device__ float __nv_logf(float __a); +__device__ double __nv_longlong_as_double(long long __a); +__device__ int __nv_max(int __a, int __b); +__device__ int __nv_min(int __a, int __b); +__device__ double __nv_modf(double __a, double *__b); +__device__ float __nv_modff(float __a, float *__b); +__device__ int __nv_mul24(int __a, int __b); +__device__ long long __nv_mul64hi(long long __a, long long __b); +__device__ int __nv_mulhi(int __a, int __b); +__device__ double __nv_nan(const signed char *__a); +__device__ float __nv_nanf(const signed char *__a); +__device__ double __nv_nearbyint(double __a); +__device__ float __nv_nearbyintf(float __a); +__device__ double __nv_nextafter(double __a, double __b); +__device__ float __nv_nextafterf(float __a, float __b); +__device__ double __nv_norm3d(double __a, double __b, double __c); +__device__ float __nv_norm3df(float __a, float __b, float __c); +__device__ double __nv_norm4d(double __a, double __b, double __c, double __d); +__device__ float __nv_norm4df(float __a, float __b, float __c, float __d); +__device__ double 
__nv_normcdf(double __a); +__device__ float __nv_normcdff(float __a); +__device__ double __nv_normcdfinv(double __a); +__device__ float __nv_normcdfinvf(float __a); +__device__ float __nv_normf(int __a, const float *__b); +__device__ double __nv_norm(int __a, const double *__b); +__device__ int __nv_popc(int __a); +__device__ int __nv_popcll(long long __a); +__device__ double __nv_pow(double __a, double __b); +__device__ float __nv_powf(float __a, float __b); +__device__ double __nv_powi(double __a, int __b); +__device__ float __nv_powif(float __a, int __b); +__device__ double __nv_rcbrt(double __a); +__device__ float __nv_rcbrtf(float __a); +__device__ double __nv_rcp64h(double __a); +__device__ double __nv_remainder(double __a, double __b); +__device__ float __nv_remainderf(float __a, float __b); +__device__ double __nv_remquo(double __a, double __b, int *__c); +__device__ float __nv_remquof(float __a, float __b, int *__c); +__device__ int __nv_rhadd(int __a, int __b); +__device__ double __nv_rhypot(double __a, double __b); +__device__ float __nv_rhypotf(float __a, float __b); +__device__ double __nv_rint(double __a); +__device__ float __nv_rintf(float __a); +__device__ double __nv_rnorm3d(double __a, double __b, double __c); +__device__ float __nv_rnorm3df(float __a, float __b, float __c); +__device__ double __nv_rnorm4d(double __a, double __b, double __c, double __d); +__device__ float __nv_rnorm4df(float __a, float __b, float __c, float __d); +__device__ float __nv_rnormf(int __a, const float *__b); +__device__ double __nv_rnorm(int __a, const double *__b); +__device__ double __nv_round(double __a); +__device__ float __nv_roundf(float __a); +__device__ double __nv_rsqrt(double __a); +__device__ float __nv_rsqrtf(float __a); +__device__ int __nv_sad(int __a, int __b, int __c); +__device__ float __nv_saturatef(float __a); +__device__ double __nv_scalbn(double __a, int __b); +__device__ float __nv_scalbnf(float __a, int __b); +__device__ int __nv_signbitd(double __a); +__device__ int __nv_signbitf(float __a); +__device__ void __nv_sincos(double __a, double *__b, double *__c); +__device__ void __nv_sincosf(float __a, float *__b, float *__c); +__device__ void __nv_sincospi(double __a, double *__b, double *__c); +__device__ void __nv_sincospif(float __a, float *__b, float *__c); +__device__ double __nv_sin(double __a); +__device__ float __nv_sinf(float __a); +__device__ double __nv_sinh(double __a); +__device__ float __nv_sinhf(float __a); +__device__ double __nv_sinpi(double __a); +__device__ float __nv_sinpif(float __a); +__device__ double __nv_sqrt(double __a); +__device__ float __nv_sqrtf(float __a); +__device__ double __nv_tan(double __a); +__device__ float __nv_tanf(float __a); +__device__ double __nv_tanh(double __a); +__device__ float __nv_tanhf(float __a); +__device__ double __nv_tgamma(double __a); +__device__ float __nv_tgammaf(float __a); +__device__ double __nv_trunc(double __a); +__device__ float __nv_truncf(float __a); +__device__ int __nv_uhadd(unsigned int __a, unsigned int __b); +__device__ double __nv_uint2double_rn(unsigned int __i); +__device__ float __nv_uint2float_rd(unsigned int __a); +__device__ float __nv_uint2float_rn(unsigned int __a); +__device__ float __nv_uint2float_ru(unsigned int __a); +__device__ float __nv_uint2float_rz(unsigned int __a); +__device__ float __nv_uint_as_float(unsigned int __a); +__device__ double __nv_ull2double_rd(unsigned long long __a); +__device__ double __nv_ull2double_rn(unsigned long long __a); +__device__ double 
__nv_ull2double_ru(unsigned long long __a); +__device__ double __nv_ull2double_rz(unsigned long long __a); +__device__ float __nv_ull2float_rd(unsigned long long __a); +__device__ float __nv_ull2float_rn(unsigned long long __a); +__device__ float __nv_ull2float_ru(unsigned long long __a); +__device__ float __nv_ull2float_rz(unsigned long long __a); +__device__ unsigned long long __nv_ullmax(unsigned long long __a, + unsigned long long __b); +__device__ unsigned long long __nv_ullmin(unsigned long long __a, + unsigned long long __b); +__device__ unsigned int __nv_umax(unsigned int __a, unsigned int __b); +__device__ unsigned int __nv_umin(unsigned int __a, unsigned int __b); +__device__ unsigned int __nv_umul24(unsigned int __a, unsigned int __b); +__device__ unsigned long long __nv_umul64hi(unsigned long long __a, + unsigned long long __b); +__device__ unsigned int __nv_umulhi(unsigned int __a, unsigned int __b); +__device__ unsigned int __nv_urhadd(unsigned int __a, unsigned int __b); +__device__ unsigned int __nv_usad(unsigned int __a, unsigned int __b, + unsigned int __c); +#if CUDA_VERSION >= 9000 && CUDA_VERSION < 9020 +__device__ int __nv_vabs2(int __a); +__device__ int __nv_vabs4(int __a); +__device__ int __nv_vabsdiffs2(int __a, int __b); +__device__ int __nv_vabsdiffs4(int __a, int __b); +__device__ int __nv_vabsdiffu2(int __a, int __b); +__device__ int __nv_vabsdiffu4(int __a, int __b); +__device__ int __nv_vabsss2(int __a); +__device__ int __nv_vabsss4(int __a); +__device__ int __nv_vadd2(int __a, int __b); +__device__ int __nv_vadd4(int __a, int __b); +__device__ int __nv_vaddss2(int __a, int __b); +__device__ int __nv_vaddss4(int __a, int __b); +__device__ int __nv_vaddus2(int __a, int __b); +__device__ int __nv_vaddus4(int __a, int __b); +__device__ int __nv_vavgs2(int __a, int __b); +__device__ int __nv_vavgs4(int __a, int __b); +__device__ int __nv_vavgu2(int __a, int __b); +__device__ int __nv_vavgu4(int __a, int __b); +__device__ int __nv_vcmpeq2(int __a, int __b); +__device__ int __nv_vcmpeq4(int __a, int __b); +__device__ int __nv_vcmpges2(int __a, int __b); +__device__ int __nv_vcmpges4(int __a, int __b); +__device__ int __nv_vcmpgeu2(int __a, int __b); +__device__ int __nv_vcmpgeu4(int __a, int __b); +__device__ int __nv_vcmpgts2(int __a, int __b); +__device__ int __nv_vcmpgts4(int __a, int __b); +__device__ int __nv_vcmpgtu2(int __a, int __b); +__device__ int __nv_vcmpgtu4(int __a, int __b); +__device__ int __nv_vcmples2(int __a, int __b); +__device__ int __nv_vcmples4(int __a, int __b); +__device__ int __nv_vcmpleu2(int __a, int __b); +__device__ int __nv_vcmpleu4(int __a, int __b); +__device__ int __nv_vcmplts2(int __a, int __b); +__device__ int __nv_vcmplts4(int __a, int __b); +__device__ int __nv_vcmpltu2(int __a, int __b); +__device__ int __nv_vcmpltu4(int __a, int __b); +__device__ int __nv_vcmpne2(int __a, int __b); +__device__ int __nv_vcmpne4(int __a, int __b); +__device__ int __nv_vhaddu2(int __a, int __b); +__device__ int __nv_vhaddu4(int __a, int __b); +__device__ int __nv_vmaxs2(int __a, int __b); +__device__ int __nv_vmaxs4(int __a, int __b); +__device__ int __nv_vmaxu2(int __a, int __b); +__device__ int __nv_vmaxu4(int __a, int __b); +__device__ int __nv_vmins2(int __a, int __b); +__device__ int __nv_vmins4(int __a, int __b); +__device__ int __nv_vminu2(int __a, int __b); +__device__ int __nv_vminu4(int __a, int __b); +__device__ int __nv_vneg2(int __a); +__device__ int __nv_vneg4(int __a); +__device__ int __nv_vnegss2(int __a); +__device__ int 
__nv_vnegss4(int __a); +__device__ int __nv_vsads2(int __a, int __b); +__device__ int __nv_vsads4(int __a, int __b); +__device__ int __nv_vsadu2(int __a, int __b); +__device__ int __nv_vsadu4(int __a, int __b); +__device__ int __nv_vseteq2(int __a, int __b); +__device__ int __nv_vseteq4(int __a, int __b); +__device__ int __nv_vsetges2(int __a, int __b); +__device__ int __nv_vsetges4(int __a, int __b); +__device__ int __nv_vsetgeu2(int __a, int __b); +__device__ int __nv_vsetgeu4(int __a, int __b); +__device__ int __nv_vsetgts2(int __a, int __b); +__device__ int __nv_vsetgts4(int __a, int __b); +__device__ int __nv_vsetgtu2(int __a, int __b); +__device__ int __nv_vsetgtu4(int __a, int __b); +__device__ int __nv_vsetles2(int __a, int __b); +__device__ int __nv_vsetles4(int __a, int __b); +__device__ int __nv_vsetleu2(int __a, int __b); +__device__ int __nv_vsetleu4(int __a, int __b); +__device__ int __nv_vsetlts2(int __a, int __b); +__device__ int __nv_vsetlts4(int __a, int __b); +__device__ int __nv_vsetltu2(int __a, int __b); +__device__ int __nv_vsetltu4(int __a, int __b); +__device__ int __nv_vsetne2(int __a, int __b); +__device__ int __nv_vsetne4(int __a, int __b); +__device__ int __nv_vsub2(int __a, int __b); +__device__ int __nv_vsub4(int __a, int __b); +__device__ int __nv_vsubss2(int __a, int __b); +__device__ int __nv_vsubss4(int __a, int __b); +__device__ int __nv_vsubus2(int __a, int __b); +__device__ int __nv_vsubus4(int __a, int __b); +#endif // CUDA_VERSION +__device__ double __nv_y0(double __a); +__device__ float __nv_y0f(float __a); +__device__ double __nv_y1(double __a); +__device__ float __nv_y1f(float __a); +__device__ float __nv_ynf(int __a, float __b); +__device__ double __nv_yn(int __a, double __b); +} // extern "C" +#endif // __CLANG_CUDA_LIBDEVICE_DECLARES_H__ diff --git a/c_headers/__clang_cuda_runtime_wrapper.h b/c_headers/__clang_cuda_runtime_wrapper.h index a82a8490f3..09705a273a 100644 --- a/c_headers/__clang_cuda_runtime_wrapper.h +++ b/c_headers/__clang_cuda_runtime_wrapper.h @@ -62,7 +62,7 @@ #include "cuda.h" #if !defined(CUDA_VERSION) #error "cuda.h did not define CUDA_VERSION" -#elif CUDA_VERSION < 7000 || CUDA_VERSION > 9000 +#elif CUDA_VERSION < 7000 || CUDA_VERSION > 9020 #error "Unsupported CUDA version!" #endif @@ -84,6 +84,9 @@ #define __DEVICE_FUNCTIONS_H__ #define __MATH_FUNCTIONS_H__ #define __COMMON_FUNCTIONS_H__ +// device_functions_decls is replaced by __clang_cuda_device_functions.h +// included below. +#define __DEVICE_FUNCTIONS_DECLS_H__ #undef __CUDACC__ #if CUDA_VERSION < 9000 @@ -97,11 +100,17 @@ #include "host_config.h" #include "host_defines.h" +// Temporarily replace "nv_weak" with weak, so __attribute__((nv_weak)) in +// cuda_device_runtime_api.h ends up being __attribute__((weak)) which is the +// functional equivalent of what we need. +#pragma push_macro("nv_weak") +#define nv_weak weak #undef __CUDABE__ #undef __CUDA_LIBDEVICE__ #define __CUDACC__ #include "cuda_runtime.h" +#pragma pop_macro("nv_weak") #undef __CUDACC__ #define __CUDABE__ @@ -137,20 +146,22 @@ inline __host__ double __signbitd(double x) { } #endif -// We need decls for functions in CUDA's libdevice with __device__ -// attribute only. Alas they come either as __host__ __device__ or -// with no attributes at all. To work around that, define __CUDA_RTC__ -// which produces HD variant and undef __host__ which gives us desided -// decls with __device__ attribute. 
-#pragma push_macro("__host__") -#define __host__ -#define __CUDACC_RTC__ -#include "device_functions_decls.h" -#undef __CUDACC_RTC__ +// CUDA 9.1 no longer provides declarations for libdevice functions, so we need +// to provide our own. +#include <__clang_cuda_libdevice_declares.h> -// Temporarily poison __host__ macro to ensure it's not used by any of -// the headers we're about to include. -#define __host__ UNEXPECTED_HOST_ATTRIBUTE +// Wrappers for many device-side standard library functions became compiler +// builtins in CUDA-9 and have been removed from the CUDA headers. Clang now +// provides its own implementation of the wrappers. +#if CUDA_VERSION >= 9000 +#include <__clang_cuda_device_functions.h> +#endif + +// __THROW is redefined to be empty by device_functions_decls.h in CUDA. Clang's +// counterpart does not do it, so we need to make it empty here to keep +// following CUDA includes happy. +#undef __THROW +#define __THROW // CUDA 8.0.41 relies on __USE_FAST_MATH__ and __CUDA_PREC_DIV's values. // Previous versions used to check whether they are defined or not. @@ -167,24 +178,20 @@ inline __host__ double __signbitd(double x) { #endif #endif +// Temporarily poison __host__ macro to ensure it's not used by any of +// the headers we're about to include. +#pragma push_macro("__host__") +#define __host__ UNEXPECTED_HOST_ATTRIBUTE + // device_functions.hpp and math_functions*.hpp use 'static // __forceinline__' (with no __device__) for definitions of device // functions. Temporarily redefine __forceinline__ to include // __device__. #pragma push_macro("__forceinline__") #define __forceinline__ __device__ __inline__ __attribute__((always_inline)) - -#pragma push_macro("__float2half_rn") -#if CUDA_VERSION >= 9000 -// CUDA-9 has conflicting prototypes for __float2half_rn(float f) in -// cuda_fp16.h[pp] and device_functions.hpp. We need to get the one in -// device_functions.hpp out of the way. -#define __float2half_rn __float2half_rn_disabled -#endif - +#if CUDA_VERSION < 9000 #include "device_functions.hpp" -#pragma pop_macro("__float2half_rn") - +#endif // math_function.hpp uses the __USE_FAST_MATH__ macro to determine whether we // get the slow-but-accurate or fast-but-inaccurate versions of functions like @@ -196,17 +203,32 @@ inline __host__ double __signbitd(double x) { #if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__) #define __USE_FAST_MATH__ 1 #endif + +#if CUDA_VERSION >= 9000 +// CUDA-9.2 needs host-side memcpy for some host functions in +// device_functions.hpp +#if CUDA_VERSION >= 9020 +#include <string.h> +#endif +#include "crt/math_functions.hpp" +#else #include "math_functions.hpp" +#endif + #pragma pop_macro("__USE_FAST_MATH__") +#if CUDA_VERSION < 9000 #include "math_functions_dbl_ptx3.hpp" +#endif #pragma pop_macro("__forceinline__") // Pull in host-only functions that are only available when neither // __CUDACC__ nor __CUDABE__ are defined. #undef __MATH_FUNCTIONS_HPP__ #undef __CUDABE__ +#if CUDA_VERSION < 9000 #include "math_functions.hpp" +#endif // Alas, additional overloads for these functions are hard to get to. // Considering that we only need these overloads for a few functions, // we can provide them here. @@ -222,22 +244,36 @@ static inline float normcdfinv(float __a) { return normcdfinvf(__a); } static inline float normcdf(float __a) { return normcdff(__a); } static inline float erfcx(float __a) { return erfcxf(__a); } +#if CUDA_VERSION < 9000 // For some reason single-argument variant is not always declared by // CUDA headers. 
Alas, device_functions.hpp included below needs it. static inline __device__ void __brkpt(int __c) { __brkpt(); } +#endif // Now include *.hpp with definitions of various GPU functions. Alas, // a lot of thins get declared/defined with __host__ attribute which // we don't want and we have to define it out. We also have to include // {device,math}_functions.hpp again in order to extract the other // branch of #if/else inside. - #define __host__ #undef __CUDABE__ #define __CUDACC__ +#if CUDA_VERSION >= 9000 +// Some atomic functions became compiler builtins in CUDA-9 , so we need their +// declarations. +#include "device_atomic_functions.h" +#endif #undef __DEVICE_FUNCTIONS_HPP__ #include "device_atomic_functions.hpp" +#if CUDA_VERSION >= 9000 +#include "crt/device_functions.hpp" +#include "crt/device_double_functions.hpp" +#else #include "device_functions.hpp" +#define __CUDABE__ +#include "device_double_functions.h" +#undef __CUDABE__ +#endif #include "sm_20_atomic_functions.hpp" #include "sm_20_intrinsics.hpp" #include "sm_32_atomic_functions.hpp" @@ -251,8 +287,11 @@ static inline __device__ void __brkpt(int __c) { __brkpt(); } // reason about our code. #if CUDA_VERSION >= 8000 +#pragma push_macro("__CUDA_ARCH__") +#undef __CUDA_ARCH__ #include "sm_60_atomic_functions.hpp" #include "sm_61_intrinsics.hpp" +#pragma pop_macro("__CUDA_ARCH__") #endif #undef __MATH_FUNCTIONS_HPP__ @@ -279,7 +318,11 @@ static inline __device__ void __brkpt(int __c) { __brkpt(); } #endif #endif +#if CUDA_VERSION >= 9000 +#include "crt/math_functions.hpp" +#else #include "math_functions.hpp" +#endif #pragma pop_macro("_GLIBCXX_MATH_H") #pragma pop_macro("_LIBCPP_VERSION") #pragma pop_macro("__GNUC__") diff --git a/c_headers/__wmmintrin_aes.h b/c_headers/__wmmintrin_aes.h index 3a2ee1b2ef..70c355efc4 100644 --- a/c_headers/__wmmintrin_aes.h +++ b/c_headers/__wmmintrin_aes.h @@ -20,15 +20,18 @@ * *===-----------------------------------------------------------------------=== */ -#ifndef _WMMINTRIN_AES_H -#define _WMMINTRIN_AES_H -#include <emmintrin.h> +#ifndef __WMMINTRIN_H +#error "Never use <__wmmintrin_aes.h> directly; include <wmmintrin.h> instead." +#endif + +#ifndef __WMMINTRIN_AES_H +#define __WMMINTRIN_AES_H /* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("aes"))) +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("aes"), __min_vector_width__(128))) -/// \brief Performs a single round of AES encryption using the Equivalent +/// Performs a single round of AES encryption using the Equivalent /// Inverse Cipher, transforming the state value from the first source /// operand using a 128-bit round key value contained in the second source /// operand, and writes the result to the destination. @@ -48,7 +51,7 @@ _mm_aesenc_si128(__m128i __V, __m128i __R) return (__m128i)__builtin_ia32_aesenc128((__v2di)__V, (__v2di)__R); } -/// \brief Performs the final round of AES encryption using the Equivalent +/// Performs the final round of AES encryption using the Equivalent /// Inverse Cipher, transforming the state value from the first source /// operand using a 128-bit round key value contained in the second source /// operand, and writes the result to the destination. 
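[Editor's aside, not part of the patch] For context on the AES intrinsics documented in the surrounding hunks, here is a minimal sketch of how _mm_aesenc_si128 and _mm_aesenclast_si128 are typically combined to encrypt one 16-byte block. The helper name and the rk[] key schedule are hypothetical, and the AES-128 key expansion is assumed to be done elsewhere; only the two intrinsics (plus _mm_xor_si128) are taken from the headers shown here.

#include <wmmintrin.h>  /* compile with -maes */

static __m128i aes128_encrypt_block(__m128i block, const __m128i rk[11]) {
  block = _mm_xor_si128(block, rk[0]);          /* initial AddRoundKey      */
  for (int i = 1; i < 10; ++i)
    block = _mm_aesenc_si128(block, rk[i]);     /* rounds 1..9              */
  return _mm_aesenclast_si128(block, rk[10]);   /* final round, no MixColumns */
}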
@@ -68,7 +71,7 @@ _mm_aesenclast_si128(__m128i __V, __m128i __R) return (__m128i)__builtin_ia32_aesenclast128((__v2di)__V, (__v2di)__R); } -/// \brief Performs a single round of AES decryption using the Equivalent +/// Performs a single round of AES decryption using the Equivalent /// Inverse Cipher, transforming the state value from the first source /// operand using a 128-bit round key value contained in the second source /// operand, and writes the result to the destination. @@ -88,7 +91,7 @@ _mm_aesdec_si128(__m128i __V, __m128i __R) return (__m128i)__builtin_ia32_aesdec128((__v2di)__V, (__v2di)__R); } -/// \brief Performs the final round of AES decryption using the Equivalent +/// Performs the final round of AES decryption using the Equivalent /// Inverse Cipher, transforming the state value from the first source /// operand using a 128-bit round key value contained in the second source /// operand, and writes the result to the destination. @@ -108,7 +111,7 @@ _mm_aesdeclast_si128(__m128i __V, __m128i __R) return (__m128i)__builtin_ia32_aesdeclast128((__v2di)__V, (__v2di)__R); } -/// \brief Applies the AES InvMixColumns() transformation to an expanded key +/// Applies the AES InvMixColumns() transformation to an expanded key /// contained in the source operand, and writes the result to the /// destination. /// @@ -125,7 +128,7 @@ _mm_aesimc_si128(__m128i __V) return (__m128i)__builtin_ia32_aesimc128((__v2di)__V); } -/// \brief Generates a round key for AES encyption, operating on 128-bit data +/// Generates a round key for AES encryption, operating on 128-bit data /// specified in the first source operand and using an 8-bit round constant /// specified by the second source operand, and writes the result to the /// destination. @@ -148,4 +151,4 @@ _mm_aesimc_si128(__m128i __V) #undef __DEFAULT_FN_ATTRS -#endif /* _WMMINTRIN_AES_H */ +#endif /* __WMMINTRIN_AES_H */ diff --git a/c_headers/__wmmintrin_pclmul.h b/c_headers/__wmmintrin_pclmul.h index e9c6a9f6d4..e0f928796a 100644 --- a/c_headers/__wmmintrin_pclmul.h +++ b/c_headers/__wmmintrin_pclmul.h @@ -20,10 +20,15 @@ * *===-----------------------------------------------------------------------=== */ -#ifndef _WMMINTRIN_PCLMUL_H -#define _WMMINTRIN_PCLMUL_H -/// \brief Multiplies two 64-bit integer values, which are selected from source +#ifndef __WMMINTRIN_H +#error "Never use <__wmmintrin_pclmul.h> directly; include <wmmintrin.h> instead." +#endif + +#ifndef __WMMINTRIN_PCLMUL_H +#define __WMMINTRIN_PCLMUL_H + +/// Multiplies two 64-bit integer values, which are selected from source /// operands using the immediate-value operand. The multiplication is a /// carry-less multiplication, and the 128-bit integer product is stored in /// the destination. @@ -50,8 +55,8 @@ /// Bit[4]=1 indicates that bits[127:64] of operand \a __Y are used. /// \returns The 128-bit integer vector containing the result of the carry-less /// multiplication of the selected 64-bit values. 
-#define _mm_clmulepi64_si128(__X, __Y, __I) \ - ((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(__X), \ - (__v2di)(__m128i)(__Y), (char)(__I))) +#define _mm_clmulepi64_si128(X, Y, I) \ + ((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(X), \ + (__v2di)(__m128i)(Y), (char)(I))) -#endif /* _WMMINTRIN_PCLMUL_H */ +#endif /* __WMMINTRIN_PCLMUL_H */ diff --git a/c_headers/ammintrin.h b/c_headers/ammintrin.h index 2843a7a267..680b4465ea 100644 --- a/c_headers/ammintrin.h +++ b/c_headers/ammintrin.h @@ -27,9 +27,9 @@ #include <pmmintrin.h> /* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4a"))) +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4a"), __min_vector_width__(128))) -/// \brief Extracts the specified bits from the lower 64 bits of the 128-bit +/// Extracts the specified bits from the lower 64 bits of the 128-bit /// integer vector operand at the index \a idx and of the length \a len. /// /// \headerfile <x86intrin.h> @@ -57,7 +57,7 @@ ((__m128i)__builtin_ia32_extrqi((__v2di)(__m128i)(x), \ (char)(len), (char)(idx))) -/// \brief Extracts the specified bits from the lower 64 bits of the 128-bit +/// Extracts the specified bits from the lower 64 bits of the 128-bit /// integer vector operand at the index and of the length specified by /// \a __y. /// @@ -82,7 +82,7 @@ _mm_extract_si64(__m128i __x, __m128i __y) return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y); } -/// \brief Inserts bits of a specified length from the source integer vector +/// Inserts bits of a specified length from the source integer vector /// \a y into the lower 64 bits of the destination integer vector \a x at /// the index \a idx and of the length \a len. /// @@ -120,7 +120,7 @@ _mm_extract_si64(__m128i __x, __m128i __y) (__v2di)(__m128i)(y), \ (char)(len), (char)(idx))) -/// \brief Inserts bits of a specified length from the source integer vector +/// Inserts bits of a specified length from the source integer vector /// \a __y into the lower 64 bits of the destination integer vector \a __x /// at the index and of the length specified by \a __y. /// @@ -152,7 +152,7 @@ _mm_insert_si64(__m128i __x, __m128i __y) return (__m128i)__builtin_ia32_insertq((__v2di)__x, (__v2di)__y); } -/// \brief Stores a 64-bit double-precision value in a 64-bit memory location. +/// Stores a 64-bit double-precision value in a 64-bit memory location. /// To minimize caching, the data is flagged as non-temporal (unlikely to be /// used again soon). /// @@ -170,7 +170,7 @@ _mm_stream_sd(double *__p, __m128d __a) __builtin_ia32_movntsd(__p, (__v2df)__a); } -/// \brief Stores a 32-bit single-precision floating-point value in a 32-bit +/// Stores a 32-bit single-precision floating-point value in a 32-bit /// memory location. To minimize caching, the data is flagged as /// non-temporal (unlikely to be used again soon). 
/// diff --git a/c_headers/arm_fp16.h b/c_headers/arm_fp16.h new file mode 100644 index 0000000000..45ff14f346 --- /dev/null +++ b/c_headers/arm_fp16.h @@ -0,0 +1,1499 @@ +/*===---- arm_fp16.h - ARM FP16 intrinsics ---------------------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __ARM_FP16_H +#define __ARM_FP16_H + +#include <stdint.h> + +typedef __fp16 float16_t; +#define __ai static inline __attribute__((__always_inline__, __nodebug__)) + +#if defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) && defined(__aarch64__) +#ifdef __LITTLE_ENDIAN__ +#define vabdh_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vabdh_f16(__s0, __s1); \ + __ret; \ +}) +#else +#define vabdh_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vabdh_f16(__s0, __s1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vabsh_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vabsh_f16(__s0); \ + __ret; \ +}) +#else +#define vabsh_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vabsh_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vaddh_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vaddh_f16(__s0, __s1); \ + __ret; \ +}) +#else +#define vaddh_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vaddh_f16(__s0, __s1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcageh_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vcageh_f16(__s0, __s1); \ + __ret; \ +}) +#else +#define vcageh_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vcageh_f16(__s0, __s1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcagth_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + uint16_t __ret; \ + __ret 
= (uint16_t) __builtin_neon_vcagth_f16(__s0, __s1); \ + __ret; \ +}) +#else +#define vcagth_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vcagth_f16(__s0, __s1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcaleh_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vcaleh_f16(__s0, __s1); \ + __ret; \ +}) +#else +#define vcaleh_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vcaleh_f16(__s0, __s1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcalth_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vcalth_f16(__s0, __s1); \ + __ret; \ +}) +#else +#define vcalth_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vcalth_f16(__s0, __s1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vceqh_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vceqh_f16(__s0, __s1); \ + __ret; \ +}) +#else +#define vceqh_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vceqh_f16(__s0, __s1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vceqzh_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vceqzh_f16(__s0); \ + __ret; \ +}) +#else +#define vceqzh_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vceqzh_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcgeh_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vcgeh_f16(__s0, __s1); \ + __ret; \ +}) +#else +#define vcgeh_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vcgeh_f16(__s0, __s1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcgezh_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vcgezh_f16(__s0); \ + __ret; \ +}) +#else +#define vcgezh_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vcgezh_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcgth_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vcgth_f16(__s0, __s1); \ + __ret; \ +}) +#else +#define vcgth_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vcgth_f16(__s0, __s1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcgtzh_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vcgtzh_f16(__s0); \ + __ret; \ +}) +#else +#define vcgtzh_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint16_t __ret; \ + __ret = 
(uint16_t) __builtin_neon_vcgtzh_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcleh_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vcleh_f16(__s0, __s1); \ + __ret; \ +}) +#else +#define vcleh_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vcleh_f16(__s0, __s1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vclezh_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vclezh_f16(__s0); \ + __ret; \ +}) +#else +#define vclezh_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vclezh_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vclth_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vclth_f16(__s0, __s1); \ + __ret; \ +}) +#else +#define vclth_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vclth_f16(__s0, __s1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcltzh_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vcltzh_f16(__s0); \ + __ret; \ +}) +#else +#define vcltzh_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vcltzh_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvth_n_s16_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + int16_t __ret; \ + __ret = (int16_t) __builtin_neon_vcvth_n_s16_f16(__s0, __p1); \ + __ret; \ +}) +#else +#define vcvth_n_s16_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + int16_t __ret; \ + __ret = (int16_t) __builtin_neon_vcvth_n_s16_f16(__s0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvth_n_s32_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + int32_t __ret; \ + __ret = (int32_t) __builtin_neon_vcvth_n_s32_f16(__s0, __p1); \ + __ret; \ +}) +#else +#define vcvth_n_s32_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + int32_t __ret; \ + __ret = (int32_t) __builtin_neon_vcvth_n_s32_f16(__s0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvth_n_s64_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + int64_t __ret; \ + __ret = (int64_t) __builtin_neon_vcvth_n_s64_f16(__s0, __p1); \ + __ret; \ +}) +#else +#define vcvth_n_s64_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + int64_t __ret; \ + __ret = (int64_t) __builtin_neon_vcvth_n_s64_f16(__s0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvth_n_u16_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vcvth_n_u16_f16(__s0, __p1); \ + __ret; \ +}) +#else +#define vcvth_n_u16_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vcvth_n_u16_f16(__s0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvth_n_u32_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint32_t __ret; \ + __ret = (uint32_t) __builtin_neon_vcvth_n_u32_f16(__s0, __p1); \ + __ret; \ +}) +#else 
+#define vcvth_n_u32_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint32_t __ret; \ + __ret = (uint32_t) __builtin_neon_vcvth_n_u32_f16(__s0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvth_n_u64_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint64_t __ret; \ + __ret = (uint64_t) __builtin_neon_vcvth_n_u64_f16(__s0, __p1); \ + __ret; \ +}) +#else +#define vcvth_n_u64_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint64_t __ret; \ + __ret = (uint64_t) __builtin_neon_vcvth_n_u64_f16(__s0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvth_s16_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + int16_t __ret; \ + __ret = (int16_t) __builtin_neon_vcvth_s16_f16(__s0); \ + __ret; \ +}) +#else +#define vcvth_s16_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + int16_t __ret; \ + __ret = (int16_t) __builtin_neon_vcvth_s16_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvth_s32_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + int32_t __ret; \ + __ret = (int32_t) __builtin_neon_vcvth_s32_f16(__s0); \ + __ret; \ +}) +#else +#define vcvth_s32_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + int32_t __ret; \ + __ret = (int32_t) __builtin_neon_vcvth_s32_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvth_s64_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + int64_t __ret; \ + __ret = (int64_t) __builtin_neon_vcvth_s64_f16(__s0); \ + __ret; \ +}) +#else +#define vcvth_s64_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + int64_t __ret; \ + __ret = (int64_t) __builtin_neon_vcvth_s64_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvth_u16_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vcvth_u16_f16(__s0); \ + __ret; \ +}) +#else +#define vcvth_u16_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vcvth_u16_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvth_u32_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint32_t __ret; \ + __ret = (uint32_t) __builtin_neon_vcvth_u32_f16(__s0); \ + __ret; \ +}) +#else +#define vcvth_u32_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint32_t __ret; \ + __ret = (uint32_t) __builtin_neon_vcvth_u32_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvth_u64_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint64_t __ret; \ + __ret = (uint64_t) __builtin_neon_vcvth_u64_f16(__s0); \ + __ret; \ +}) +#else +#define vcvth_u64_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint64_t __ret; \ + __ret = (uint64_t) __builtin_neon_vcvth_u64_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvtah_s16_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + int16_t __ret; \ + __ret = (int16_t) __builtin_neon_vcvtah_s16_f16(__s0); \ + __ret; \ +}) +#else +#define vcvtah_s16_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + int16_t __ret; \ + __ret = (int16_t) __builtin_neon_vcvtah_s16_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvtah_s32_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + int32_t __ret; \ + __ret = (int32_t) __builtin_neon_vcvtah_s32_f16(__s0); \ + __ret; \ +}) +#else +#define vcvtah_s32_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + int32_t 
__ret; \ + __ret = (int32_t) __builtin_neon_vcvtah_s32_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvtah_s64_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + int64_t __ret; \ + __ret = (int64_t) __builtin_neon_vcvtah_s64_f16(__s0); \ + __ret; \ +}) +#else +#define vcvtah_s64_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + int64_t __ret; \ + __ret = (int64_t) __builtin_neon_vcvtah_s64_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvtah_u16_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vcvtah_u16_f16(__s0); \ + __ret; \ +}) +#else +#define vcvtah_u16_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vcvtah_u16_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvtah_u32_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint32_t __ret; \ + __ret = (uint32_t) __builtin_neon_vcvtah_u32_f16(__s0); \ + __ret; \ +}) +#else +#define vcvtah_u32_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint32_t __ret; \ + __ret = (uint32_t) __builtin_neon_vcvtah_u32_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvtah_u64_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint64_t __ret; \ + __ret = (uint64_t) __builtin_neon_vcvtah_u64_f16(__s0); \ + __ret; \ +}) +#else +#define vcvtah_u64_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint64_t __ret; \ + __ret = (uint64_t) __builtin_neon_vcvtah_u64_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16_t vcvth_f16_u32(uint32_t __p0) { + float16_t __ret; + __ret = (float16_t) __builtin_neon_vcvth_f16_u32(__p0); + return __ret; +} +#else +__ai float16_t vcvth_f16_u32(uint32_t __p0) { + float16_t __ret; + __ret = (float16_t) __builtin_neon_vcvth_f16_u32(__p0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16_t vcvth_f16_u64(uint64_t __p0) { + float16_t __ret; + __ret = (float16_t) __builtin_neon_vcvth_f16_u64(__p0); + return __ret; +} +#else +__ai float16_t vcvth_f16_u64(uint64_t __p0) { + float16_t __ret; + __ret = (float16_t) __builtin_neon_vcvth_f16_u64(__p0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16_t vcvth_f16_u16(uint16_t __p0) { + float16_t __ret; + __ret = (float16_t) __builtin_neon_vcvth_f16_u16(__p0); + return __ret; +} +#else +__ai float16_t vcvth_f16_u16(uint16_t __p0) { + float16_t __ret; + __ret = (float16_t) __builtin_neon_vcvth_f16_u16(__p0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16_t vcvth_f16_s32(int32_t __p0) { + float16_t __ret; + __ret = (float16_t) __builtin_neon_vcvth_f16_s32(__p0); + return __ret; +} +#else +__ai float16_t vcvth_f16_s32(int32_t __p0) { + float16_t __ret; + __ret = (float16_t) __builtin_neon_vcvth_f16_s32(__p0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16_t vcvth_f16_s64(int64_t __p0) { + float16_t __ret; + __ret = (float16_t) __builtin_neon_vcvth_f16_s64(__p0); + return __ret; +} +#else +__ai float16_t vcvth_f16_s64(int64_t __p0) { + float16_t __ret; + __ret = (float16_t) __builtin_neon_vcvth_f16_s64(__p0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16_t vcvth_f16_s16(int16_t __p0) { + float16_t __ret; + __ret = (float16_t) __builtin_neon_vcvth_f16_s16(__p0); + return __ret; +} +#else +__ai float16_t vcvth_f16_s16(int16_t __p0) { + float16_t __ret; + __ret = (float16_t) 
__builtin_neon_vcvth_f16_s16(__p0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvth_n_f16_u32(__p0, __p1) __extension__ ({ \ + uint32_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vcvth_n_f16_u32(__s0, __p1); \ + __ret; \ +}) +#else +#define vcvth_n_f16_u32(__p0, __p1) __extension__ ({ \ + uint32_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vcvth_n_f16_u32(__s0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvth_n_f16_u64(__p0, __p1) __extension__ ({ \ + uint64_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vcvth_n_f16_u64(__s0, __p1); \ + __ret; \ +}) +#else +#define vcvth_n_f16_u64(__p0, __p1) __extension__ ({ \ + uint64_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vcvth_n_f16_u64(__s0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvth_n_f16_u16(__p0, __p1) __extension__ ({ \ + uint16_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vcvth_n_f16_u16(__s0, __p1); \ + __ret; \ +}) +#else +#define vcvth_n_f16_u16(__p0, __p1) __extension__ ({ \ + uint16_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vcvth_n_f16_u16(__s0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvth_n_f16_s32(__p0, __p1) __extension__ ({ \ + int32_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vcvth_n_f16_s32(__s0, __p1); \ + __ret; \ +}) +#else +#define vcvth_n_f16_s32(__p0, __p1) __extension__ ({ \ + int32_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vcvth_n_f16_s32(__s0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvth_n_f16_s64(__p0, __p1) __extension__ ({ \ + int64_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vcvth_n_f16_s64(__s0, __p1); \ + __ret; \ +}) +#else +#define vcvth_n_f16_s64(__p0, __p1) __extension__ ({ \ + int64_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vcvth_n_f16_s64(__s0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvth_n_f16_s16(__p0, __p1) __extension__ ({ \ + int16_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vcvth_n_f16_s16(__s0, __p1); \ + __ret; \ +}) +#else +#define vcvth_n_f16_s16(__p0, __p1) __extension__ ({ \ + int16_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vcvth_n_f16_s16(__s0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvtmh_s16_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + int16_t __ret; \ + __ret = (int16_t) __builtin_neon_vcvtmh_s16_f16(__s0); \ + __ret; \ +}) +#else +#define vcvtmh_s16_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + int16_t __ret; \ + __ret = (int16_t) __builtin_neon_vcvtmh_s16_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvtmh_s32_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + int32_t __ret; \ + __ret = (int32_t) __builtin_neon_vcvtmh_s32_f16(__s0); \ + __ret; \ +}) +#else +#define vcvtmh_s32_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + int32_t __ret; \ + __ret = (int32_t) __builtin_neon_vcvtmh_s32_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvtmh_s64_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + int64_t __ret; \ + __ret = (int64_t) __builtin_neon_vcvtmh_s64_f16(__s0); \ + __ret; \ +}) +#else +#define vcvtmh_s64_f16(__p0) 
__extension__ ({ \ + float16_t __s0 = __p0; \ + int64_t __ret; \ + __ret = (int64_t) __builtin_neon_vcvtmh_s64_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvtmh_u16_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vcvtmh_u16_f16(__s0); \ + __ret; \ +}) +#else +#define vcvtmh_u16_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vcvtmh_u16_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvtmh_u32_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint32_t __ret; \ + __ret = (uint32_t) __builtin_neon_vcvtmh_u32_f16(__s0); \ + __ret; \ +}) +#else +#define vcvtmh_u32_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint32_t __ret; \ + __ret = (uint32_t) __builtin_neon_vcvtmh_u32_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvtmh_u64_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint64_t __ret; \ + __ret = (uint64_t) __builtin_neon_vcvtmh_u64_f16(__s0); \ + __ret; \ +}) +#else +#define vcvtmh_u64_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint64_t __ret; \ + __ret = (uint64_t) __builtin_neon_vcvtmh_u64_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvtnh_s16_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + int16_t __ret; \ + __ret = (int16_t) __builtin_neon_vcvtnh_s16_f16(__s0); \ + __ret; \ +}) +#else +#define vcvtnh_s16_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + int16_t __ret; \ + __ret = (int16_t) __builtin_neon_vcvtnh_s16_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvtnh_s32_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + int32_t __ret; \ + __ret = (int32_t) __builtin_neon_vcvtnh_s32_f16(__s0); \ + __ret; \ +}) +#else +#define vcvtnh_s32_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + int32_t __ret; \ + __ret = (int32_t) __builtin_neon_vcvtnh_s32_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvtnh_s64_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + int64_t __ret; \ + __ret = (int64_t) __builtin_neon_vcvtnh_s64_f16(__s0); \ + __ret; \ +}) +#else +#define vcvtnh_s64_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + int64_t __ret; \ + __ret = (int64_t) __builtin_neon_vcvtnh_s64_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvtnh_u16_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vcvtnh_u16_f16(__s0); \ + __ret; \ +}) +#else +#define vcvtnh_u16_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vcvtnh_u16_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvtnh_u32_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint32_t __ret; \ + __ret = (uint32_t) __builtin_neon_vcvtnh_u32_f16(__s0); \ + __ret; \ +}) +#else +#define vcvtnh_u32_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint32_t __ret; \ + __ret = (uint32_t) __builtin_neon_vcvtnh_u32_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvtnh_u64_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint64_t __ret; \ + __ret = (uint64_t) __builtin_neon_vcvtnh_u64_f16(__s0); \ + __ret; \ +}) +#else +#define vcvtnh_u64_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint64_t __ret; \ + __ret = (uint64_t) 
__builtin_neon_vcvtnh_u64_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvtph_s16_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + int16_t __ret; \ + __ret = (int16_t) __builtin_neon_vcvtph_s16_f16(__s0); \ + __ret; \ +}) +#else +#define vcvtph_s16_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + int16_t __ret; \ + __ret = (int16_t) __builtin_neon_vcvtph_s16_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvtph_s32_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + int32_t __ret; \ + __ret = (int32_t) __builtin_neon_vcvtph_s32_f16(__s0); \ + __ret; \ +}) +#else +#define vcvtph_s32_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + int32_t __ret; \ + __ret = (int32_t) __builtin_neon_vcvtph_s32_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvtph_s64_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + int64_t __ret; \ + __ret = (int64_t) __builtin_neon_vcvtph_s64_f16(__s0); \ + __ret; \ +}) +#else +#define vcvtph_s64_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + int64_t __ret; \ + __ret = (int64_t) __builtin_neon_vcvtph_s64_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvtph_u16_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vcvtph_u16_f16(__s0); \ + __ret; \ +}) +#else +#define vcvtph_u16_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint16_t __ret; \ + __ret = (uint16_t) __builtin_neon_vcvtph_u16_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvtph_u32_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint32_t __ret; \ + __ret = (uint32_t) __builtin_neon_vcvtph_u32_f16(__s0); \ + __ret; \ +}) +#else +#define vcvtph_u32_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint32_t __ret; \ + __ret = (uint32_t) __builtin_neon_vcvtph_u32_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vcvtph_u64_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint64_t __ret; \ + __ret = (uint64_t) __builtin_neon_vcvtph_u64_f16(__s0); \ + __ret; \ +}) +#else +#define vcvtph_u64_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + uint64_t __ret; \ + __ret = (uint64_t) __builtin_neon_vcvtph_u64_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vdivh_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vdivh_f16(__s0, __s1); \ + __ret; \ +}) +#else +#define vdivh_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vdivh_f16(__s0, __s1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmah_f16(__p0, __p1, __p2) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16_t __s2 = __p2; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vfmah_f16(__s0, __s1, __s2); \ + __ret; \ +}) +#else +#define vfmah_f16(__p0, __p1, __p2) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16_t __s2 = __p2; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vfmah_f16(__s0, __s1, __s2); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vfmsh_f16(__p0, __p1, __p2) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16_t __s2 = __p2; \ + float16_t __ret; \ + __ret = (float16_t) 
__builtin_neon_vfmsh_f16(__s0, __s1, __s2); \ + __ret; \ +}) +#else +#define vfmsh_f16(__p0, __p1, __p2) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16_t __s2 = __p2; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vfmsh_f16(__s0, __s1, __s2); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmaxh_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vmaxh_f16(__s0, __s1); \ + __ret; \ +}) +#else +#define vmaxh_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vmaxh_f16(__s0, __s1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmaxnmh_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vmaxnmh_f16(__s0, __s1); \ + __ret; \ +}) +#else +#define vmaxnmh_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vmaxnmh_f16(__s0, __s1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vminh_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vminh_f16(__s0, __s1); \ + __ret; \ +}) +#else +#define vminh_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vminh_f16(__s0, __s1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vminnmh_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vminnmh_f16(__s0, __s1); \ + __ret; \ +}) +#else +#define vminnmh_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vminnmh_f16(__s0, __s1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmulh_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vmulh_f16(__s0, __s1); \ + __ret; \ +}) +#else +#define vmulh_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vmulh_f16(__s0, __s1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmulxh_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vmulxh_f16(__s0, __s1); \ + __ret; \ +}) +#else +#define vmulxh_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vmulxh_f16(__s0, __s1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vnegh_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vnegh_f16(__s0); \ + __ret; \ +}) +#else +#define vnegh_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vnegh_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vrecpeh_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) 
__builtin_neon_vrecpeh_f16(__s0); \ + __ret; \ +}) +#else +#define vrecpeh_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vrecpeh_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vrecpsh_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vrecpsh_f16(__s0, __s1); \ + __ret; \ +}) +#else +#define vrecpsh_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vrecpsh_f16(__s0, __s1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vrecpxh_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vrecpxh_f16(__s0); \ + __ret; \ +}) +#else +#define vrecpxh_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vrecpxh_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vrndh_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vrndh_f16(__s0); \ + __ret; \ +}) +#else +#define vrndh_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vrndh_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vrndah_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vrndah_f16(__s0); \ + __ret; \ +}) +#else +#define vrndah_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vrndah_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vrndih_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vrndih_f16(__s0); \ + __ret; \ +}) +#else +#define vrndih_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vrndih_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vrndmh_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vrndmh_f16(__s0); \ + __ret; \ +}) +#else +#define vrndmh_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vrndmh_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vrndnh_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vrndnh_f16(__s0); \ + __ret; \ +}) +#else +#define vrndnh_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vrndnh_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vrndph_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vrndph_f16(__s0); \ + __ret; \ +}) +#else +#define vrndph_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vrndph_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vrndxh_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vrndxh_f16(__s0); \ + __ret; \ +}) +#else +#define vrndxh_f16(__p0) __extension__ ({ \ + 
float16_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vrndxh_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vrsqrteh_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vrsqrteh_f16(__s0); \ + __ret; \ +}) +#else +#define vrsqrteh_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vrsqrteh_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vrsqrtsh_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vrsqrtsh_f16(__s0, __s1); \ + __ret; \ +}) +#else +#define vrsqrtsh_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vrsqrtsh_f16(__s0, __s1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vsqrth_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vsqrth_f16(__s0); \ + __ret; \ +}) +#else +#define vsqrth_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vsqrth_f16(__s0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vsubh_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vsubh_f16(__s0, __s1); \ + __ret; \ +}) +#else +#define vsubh_f16(__p0, __p1) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16_t __s1 = __p1; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vsubh_f16(__s0, __s1); \ + __ret; \ +}) +#endif + +#endif + +#undef __ai + +#endif /* __ARM_FP16_H */ diff --git a/c_headers/arm_neon.h b/c_headers/arm_neon.h index 3da63d994d..e0efa76904 100644 --- a/c_headers/arm_neon.h +++ b/c_headers/arm_neon.h @@ -8587,6 +8587,1278 @@ __ai int16x4_t vhsub_s16(int16x4_t __p0, int16x4_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ +#define vld1_p8_x2(__p0) __extension__ ({ \ + poly8x8x2_t __ret; \ + __builtin_neon_vld1_x2_v(&__ret, __p0, 4); \ + __ret; \ +}) +#else +#define vld1_p8_x2(__p0) __extension__ ({ \ + poly8x8x2_t __ret; \ + __builtin_neon_vld1_x2_v(&__ret, __p0, 4); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1_p16_x2(__p0) __extension__ ({ \ + poly16x4x2_t __ret; \ + __builtin_neon_vld1_x2_v(&__ret, __p0, 5); \ + __ret; \ +}) +#else +#define vld1_p16_x2(__p0) __extension__ ({ \ + poly16x4x2_t __ret; \ + __builtin_neon_vld1_x2_v(&__ret, __p0, 5); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1q_p8_x2(__p0) __extension__ ({ \ + poly8x16x2_t __ret; \ + __builtin_neon_vld1q_x2_v(&__ret, __p0, 36); \ + __ret; \ +}) +#else +#define vld1q_p8_x2(__p0) __extension__ ({ \ + poly8x16x2_t __ret; \ + __builtin_neon_vld1q_x2_v(&__ret, __p0, 36); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 
12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1q_p16_x2(__p0) __extension__ ({ \ + poly16x8x2_t __ret; \ + __builtin_neon_vld1q_x2_v(&__ret, __p0, 37); \ + __ret; \ +}) +#else +#define vld1q_p16_x2(__p0) __extension__ ({ \ + poly16x8x2_t __ret; \ + __builtin_neon_vld1q_x2_v(&__ret, __p0, 37); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1q_u8_x2(__p0) __extension__ ({ \ + uint8x16x2_t __ret; \ + __builtin_neon_vld1q_x2_v(&__ret, __p0, 48); \ + __ret; \ +}) +#else +#define vld1q_u8_x2(__p0) __extension__ ({ \ + uint8x16x2_t __ret; \ + __builtin_neon_vld1q_x2_v(&__ret, __p0, 48); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1q_u32_x2(__p0) __extension__ ({ \ + uint32x4x2_t __ret; \ + __builtin_neon_vld1q_x2_v(&__ret, __p0, 50); \ + __ret; \ +}) +#else +#define vld1q_u32_x2(__p0) __extension__ ({ \ + uint32x4x2_t __ret; \ + __builtin_neon_vld1q_x2_v(&__ret, __p0, 50); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1q_u64_x2(__p0) __extension__ ({ \ + uint64x2x2_t __ret; \ + __builtin_neon_vld1q_x2_v(&__ret, __p0, 51); \ + __ret; \ +}) +#else +#define vld1q_u64_x2(__p0) __extension__ ({ \ + uint64x2x2_t __ret; \ + __builtin_neon_vld1q_x2_v(&__ret, __p0, 51); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1q_u16_x2(__p0) __extension__ ({ \ + uint16x8x2_t __ret; \ + __builtin_neon_vld1q_x2_v(&__ret, __p0, 49); \ + __ret; \ +}) +#else +#define vld1q_u16_x2(__p0) __extension__ ({ \ + uint16x8x2_t __ret; \ + __builtin_neon_vld1q_x2_v(&__ret, __p0, 49); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1q_s8_x2(__p0) __extension__ ({ \ + int8x16x2_t __ret; \ + __builtin_neon_vld1q_x2_v(&__ret, __p0, 32); \ + __ret; \ +}) +#else +#define vld1q_s8_x2(__p0) __extension__ ({ \ + int8x16x2_t __ret; \ + __builtin_neon_vld1q_x2_v(&__ret, __p0, 32); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1q_f32_x2(__p0) __extension__ ({ \ + float32x4x2_t __ret; \ + __builtin_neon_vld1q_x2_v(&__ret, __p0, 41); \ + __ret; \ +}) +#else +#define vld1q_f32_x2(__p0) __extension__ ({ \ + float32x4x2_t __ret; \ + __builtin_neon_vld1q_x2_v(&__ret, __p0, 41); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); 
\ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1q_f16_x2(__p0) __extension__ ({ \ + float16x8x2_t __ret; \ + __builtin_neon_vld1q_x2_v(&__ret, __p0, 40); \ + __ret; \ +}) +#else +#define vld1q_f16_x2(__p0) __extension__ ({ \ + float16x8x2_t __ret; \ + __builtin_neon_vld1q_x2_v(&__ret, __p0, 40); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1q_s32_x2(__p0) __extension__ ({ \ + int32x4x2_t __ret; \ + __builtin_neon_vld1q_x2_v(&__ret, __p0, 34); \ + __ret; \ +}) +#else +#define vld1q_s32_x2(__p0) __extension__ ({ \ + int32x4x2_t __ret; \ + __builtin_neon_vld1q_x2_v(&__ret, __p0, 34); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1q_s64_x2(__p0) __extension__ ({ \ + int64x2x2_t __ret; \ + __builtin_neon_vld1q_x2_v(&__ret, __p0, 35); \ + __ret; \ +}) +#else +#define vld1q_s64_x2(__p0) __extension__ ({ \ + int64x2x2_t __ret; \ + __builtin_neon_vld1q_x2_v(&__ret, __p0, 35); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1q_s16_x2(__p0) __extension__ ({ \ + int16x8x2_t __ret; \ + __builtin_neon_vld1q_x2_v(&__ret, __p0, 33); \ + __ret; \ +}) +#else +#define vld1q_s16_x2(__p0) __extension__ ({ \ + int16x8x2_t __ret; \ + __builtin_neon_vld1q_x2_v(&__ret, __p0, 33); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1_u8_x2(__p0) __extension__ ({ \ + uint8x8x2_t __ret; \ + __builtin_neon_vld1_x2_v(&__ret, __p0, 16); \ + __ret; \ +}) +#else +#define vld1_u8_x2(__p0) __extension__ ({ \ + uint8x8x2_t __ret; \ + __builtin_neon_vld1_x2_v(&__ret, __p0, 16); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1_u32_x2(__p0) __extension__ ({ \ + uint32x2x2_t __ret; \ + __builtin_neon_vld1_x2_v(&__ret, __p0, 18); \ + __ret; \ +}) +#else +#define vld1_u32_x2(__p0) __extension__ ({ \ + uint32x2x2_t __ret; \ + __builtin_neon_vld1_x2_v(&__ret, __p0, 18); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1_u64_x2(__p0) __extension__ ({ \ + uint64x1x2_t __ret; \ + __builtin_neon_vld1_x2_v(&__ret, __p0, 19); \ + __ret; \ +}) +#else +#define vld1_u64_x2(__p0) __extension__ ({ \ + uint64x1x2_t __ret; \ + __builtin_neon_vld1_x2_v(&__ret, __p0, 19); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1_u16_x2(__p0) __extension__ ({ \ + uint16x4x2_t __ret; \ + __builtin_neon_vld1_x2_v(&__ret, __p0, 17); \ + __ret; \ +}) +#else +#define vld1_u16_x2(__p0) 
__extension__ ({ \ + uint16x4x2_t __ret; \ + __builtin_neon_vld1_x2_v(&__ret, __p0, 17); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1_s8_x2(__p0) __extension__ ({ \ + int8x8x2_t __ret; \ + __builtin_neon_vld1_x2_v(&__ret, __p0, 0); \ + __ret; \ +}) +#else +#define vld1_s8_x2(__p0) __extension__ ({ \ + int8x8x2_t __ret; \ + __builtin_neon_vld1_x2_v(&__ret, __p0, 0); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1_f32_x2(__p0) __extension__ ({ \ + float32x2x2_t __ret; \ + __builtin_neon_vld1_x2_v(&__ret, __p0, 9); \ + __ret; \ +}) +#else +#define vld1_f32_x2(__p0) __extension__ ({ \ + float32x2x2_t __ret; \ + __builtin_neon_vld1_x2_v(&__ret, __p0, 9); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1_f16_x2(__p0) __extension__ ({ \ + float16x4x2_t __ret; \ + __builtin_neon_vld1_x2_v(&__ret, __p0, 8); \ + __ret; \ +}) +#else +#define vld1_f16_x2(__p0) __extension__ ({ \ + float16x4x2_t __ret; \ + __builtin_neon_vld1_x2_v(&__ret, __p0, 8); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1_s32_x2(__p0) __extension__ ({ \ + int32x2x2_t __ret; \ + __builtin_neon_vld1_x2_v(&__ret, __p0, 2); \ + __ret; \ +}) +#else +#define vld1_s32_x2(__p0) __extension__ ({ \ + int32x2x2_t __ret; \ + __builtin_neon_vld1_x2_v(&__ret, __p0, 2); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1_s64_x2(__p0) __extension__ ({ \ + int64x1x2_t __ret; \ + __builtin_neon_vld1_x2_v(&__ret, __p0, 3); \ + __ret; \ +}) +#else +#define vld1_s64_x2(__p0) __extension__ ({ \ + int64x1x2_t __ret; \ + __builtin_neon_vld1_x2_v(&__ret, __p0, 3); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1_s16_x2(__p0) __extension__ ({ \ + int16x4x2_t __ret; \ + __builtin_neon_vld1_x2_v(&__ret, __p0, 1); \ + __ret; \ +}) +#else +#define vld1_s16_x2(__p0) __extension__ ({ \ + int16x4x2_t __ret; \ + __builtin_neon_vld1_x2_v(&__ret, __p0, 1); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1_p8_x3(__p0) __extension__ ({ \ + poly8x8x3_t __ret; \ + __builtin_neon_vld1_x3_v(&__ret, __p0, 4); \ + __ret; \ +}) +#else +#define vld1_p8_x3(__p0) __extension__ ({ \ + poly8x8x3_t __ret; \ + __builtin_neon_vld1_x3_v(&__ret, __p0, 4); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ + 
__ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1_p16_x3(__p0) __extension__ ({ \ + poly16x4x3_t __ret; \ + __builtin_neon_vld1_x3_v(&__ret, __p0, 5); \ + __ret; \ +}) +#else +#define vld1_p16_x3(__p0) __extension__ ({ \ + poly16x4x3_t __ret; \ + __builtin_neon_vld1_x3_v(&__ret, __p0, 5); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1q_p8_x3(__p0) __extension__ ({ \ + poly8x16x3_t __ret; \ + __builtin_neon_vld1q_x3_v(&__ret, __p0, 36); \ + __ret; \ +}) +#else +#define vld1q_p8_x3(__p0) __extension__ ({ \ + poly8x16x3_t __ret; \ + __builtin_neon_vld1q_x3_v(&__ret, __p0, 36); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1q_p16_x3(__p0) __extension__ ({ \ + poly16x8x3_t __ret; \ + __builtin_neon_vld1q_x3_v(&__ret, __p0, 37); \ + __ret; \ +}) +#else +#define vld1q_p16_x3(__p0) __extension__ ({ \ + poly16x8x3_t __ret; \ + __builtin_neon_vld1q_x3_v(&__ret, __p0, 37); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1q_u8_x3(__p0) __extension__ ({ \ + uint8x16x3_t __ret; \ + __builtin_neon_vld1q_x3_v(&__ret, __p0, 48); \ + __ret; \ +}) +#else +#define vld1q_u8_x3(__p0) __extension__ ({ \ + uint8x16x3_t __ret; \ + __builtin_neon_vld1q_x3_v(&__ret, __p0, 48); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1q_u32_x3(__p0) __extension__ ({ \ + uint32x4x3_t __ret; \ + __builtin_neon_vld1q_x3_v(&__ret, __p0, 50); \ + __ret; \ +}) +#else +#define vld1q_u32_x3(__p0) __extension__ ({ \ + uint32x4x3_t __ret; \ + __builtin_neon_vld1q_x3_v(&__ret, __p0, 50); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1q_u64_x3(__p0) __extension__ ({ \ + uint64x2x3_t __ret; \ + __builtin_neon_vld1q_x3_v(&__ret, __p0, 51); \ + __ret; \ +}) +#else +#define vld1q_u64_x3(__p0) __extension__ ({ \ + uint64x2x3_t __ret; \ + __builtin_neon_vld1q_x3_v(&__ret, __p0, 51); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ + __ret.val[1] = 
__builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1q_u16_x3(__p0) __extension__ ({ \ + uint16x8x3_t __ret; \ + __builtin_neon_vld1q_x3_v(&__ret, __p0, 49); \ + __ret; \ +}) +#else +#define vld1q_u16_x3(__p0) __extension__ ({ \ + uint16x8x3_t __ret; \ + __builtin_neon_vld1q_x3_v(&__ret, __p0, 49); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1q_s8_x3(__p0) __extension__ ({ \ + int8x16x3_t __ret; \ + __builtin_neon_vld1q_x3_v(&__ret, __p0, 32); \ + __ret; \ +}) +#else +#define vld1q_s8_x3(__p0) __extension__ ({ \ + int8x16x3_t __ret; \ + __builtin_neon_vld1q_x3_v(&__ret, __p0, 32); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1q_f32_x3(__p0) __extension__ ({ \ + float32x4x3_t __ret; \ + __builtin_neon_vld1q_x3_v(&__ret, __p0, 41); \ + __ret; \ +}) +#else +#define vld1q_f32_x3(__p0) __extension__ ({ \ + float32x4x3_t __ret; \ + __builtin_neon_vld1q_x3_v(&__ret, __p0, 41); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1q_f16_x3(__p0) __extension__ ({ \ + float16x8x3_t __ret; \ + __builtin_neon_vld1q_x3_v(&__ret, __p0, 40); \ + __ret; \ +}) +#else +#define vld1q_f16_x3(__p0) __extension__ ({ \ + float16x8x3_t __ret; \ + __builtin_neon_vld1q_x3_v(&__ret, __p0, 40); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1q_s32_x3(__p0) __extension__ ({ \ + int32x4x3_t __ret; \ + __builtin_neon_vld1q_x3_v(&__ret, __p0, 34); \ + __ret; \ +}) +#else +#define vld1q_s32_x3(__p0) __extension__ ({ \ + int32x4x3_t __ret; \ + __builtin_neon_vld1q_x3_v(&__ret, __p0, 34); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1q_s64_x3(__p0) __extension__ ({ \ + int64x2x3_t __ret; \ + __builtin_neon_vld1q_x3_v(&__ret, __p0, 35); \ + __ret; \ +}) +#else +#define vld1q_s64_x3(__p0) __extension__ ({ \ + int64x2x3_t __ret; \ + __builtin_neon_vld1q_x3_v(&__ret, __p0, 35); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], 
__ret.val[0], 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1q_s16_x3(__p0) __extension__ ({ \ + int16x8x3_t __ret; \ + __builtin_neon_vld1q_x3_v(&__ret, __p0, 33); \ + __ret; \ +}) +#else +#define vld1q_s16_x3(__p0) __extension__ ({ \ + int16x8x3_t __ret; \ + __builtin_neon_vld1q_x3_v(&__ret, __p0, 33); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1_u8_x3(__p0) __extension__ ({ \ + uint8x8x3_t __ret; \ + __builtin_neon_vld1_x3_v(&__ret, __p0, 16); \ + __ret; \ +}) +#else +#define vld1_u8_x3(__p0) __extension__ ({ \ + uint8x8x3_t __ret; \ + __builtin_neon_vld1_x3_v(&__ret, __p0, 16); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1_u32_x3(__p0) __extension__ ({ \ + uint32x2x3_t __ret; \ + __builtin_neon_vld1_x3_v(&__ret, __p0, 18); \ + __ret; \ +}) +#else +#define vld1_u32_x3(__p0) __extension__ ({ \ + uint32x2x3_t __ret; \ + __builtin_neon_vld1_x3_v(&__ret, __p0, 18); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1_u64_x3(__p0) __extension__ ({ \ + uint64x1x3_t __ret; \ + __builtin_neon_vld1_x3_v(&__ret, __p0, 19); \ + __ret; \ +}) +#else +#define vld1_u64_x3(__p0) __extension__ ({ \ + uint64x1x3_t __ret; \ + __builtin_neon_vld1_x3_v(&__ret, __p0, 19); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1_u16_x3(__p0) __extension__ ({ \ + uint16x4x3_t __ret; \ + __builtin_neon_vld1_x3_v(&__ret, __p0, 17); \ + __ret; \ +}) +#else +#define vld1_u16_x3(__p0) __extension__ ({ \ + uint16x4x3_t __ret; \ + __builtin_neon_vld1_x3_v(&__ret, __p0, 17); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1_s8_x3(__p0) __extension__ ({ \ + int8x8x3_t __ret; \ + __builtin_neon_vld1_x3_v(&__ret, __p0, 0); \ + __ret; \ +}) +#else +#define vld1_s8_x3(__p0) __extension__ ({ \ + int8x8x3_t __ret; \ + __builtin_neon_vld1_x3_v(&__ret, __p0, 0); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1_f32_x3(__p0) __extension__ ({ \ + float32x2x3_t __ret; \ + __builtin_neon_vld1_x3_v(&__ret, 
__p0, 9); \ + __ret; \ +}) +#else +#define vld1_f32_x3(__p0) __extension__ ({ \ + float32x2x3_t __ret; \ + __builtin_neon_vld1_x3_v(&__ret, __p0, 9); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1_f16_x3(__p0) __extension__ ({ \ + float16x4x3_t __ret; \ + __builtin_neon_vld1_x3_v(&__ret, __p0, 8); \ + __ret; \ +}) +#else +#define vld1_f16_x3(__p0) __extension__ ({ \ + float16x4x3_t __ret; \ + __builtin_neon_vld1_x3_v(&__ret, __p0, 8); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1_s32_x3(__p0) __extension__ ({ \ + int32x2x3_t __ret; \ + __builtin_neon_vld1_x3_v(&__ret, __p0, 2); \ + __ret; \ +}) +#else +#define vld1_s32_x3(__p0) __extension__ ({ \ + int32x2x3_t __ret; \ + __builtin_neon_vld1_x3_v(&__ret, __p0, 2); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1_s64_x3(__p0) __extension__ ({ \ + int64x1x3_t __ret; \ + __builtin_neon_vld1_x3_v(&__ret, __p0, 3); \ + __ret; \ +}) +#else +#define vld1_s64_x3(__p0) __extension__ ({ \ + int64x1x3_t __ret; \ + __builtin_neon_vld1_x3_v(&__ret, __p0, 3); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1_s16_x3(__p0) __extension__ ({ \ + int16x4x3_t __ret; \ + __builtin_neon_vld1_x3_v(&__ret, __p0, 1); \ + __ret; \ +}) +#else +#define vld1_s16_x3(__p0) __extension__ ({ \ + int16x4x3_t __ret; \ + __builtin_neon_vld1_x3_v(&__ret, __p0, 1); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1_p8_x4(__p0) __extension__ ({ \ + poly8x8x4_t __ret; \ + __builtin_neon_vld1_x4_v(&__ret, __p0, 4); \ + __ret; \ +}) +#else +#define vld1_p8_x4(__p0) __extension__ ({ \ + poly8x8x4_t __ret; \ + __builtin_neon_vld1_x4_v(&__ret, __p0, 4); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1_p16_x4(__p0) __extension__ ({ \ + poly16x4x4_t __ret; \ + __builtin_neon_vld1_x4_v(&__ret, __p0, 5); \ + __ret; \ +}) +#else +#define vld1_p16_x4(__p0) __extension__ ({ \ + poly16x4x4_t __ret; \ + __builtin_neon_vld1_x4_v(&__ret, __p0, 5); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ + __ret.val[2] = 
__builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ + __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1q_p8_x4(__p0) __extension__ ({ \ + poly8x16x4_t __ret; \ + __builtin_neon_vld1q_x4_v(&__ret, __p0, 36); \ + __ret; \ +}) +#else +#define vld1q_p8_x4(__p0) __extension__ ({ \ + poly8x16x4_t __ret; \ + __builtin_neon_vld1q_x4_v(&__ret, __p0, 36); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1q_p16_x4(__p0) __extension__ ({ \ + poly16x8x4_t __ret; \ + __builtin_neon_vld1q_x4_v(&__ret, __p0, 37); \ + __ret; \ +}) +#else +#define vld1q_p16_x4(__p0) __extension__ ({ \ + poly16x8x4_t __ret; \ + __builtin_neon_vld1q_x4_v(&__ret, __p0, 37); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1q_u8_x4(__p0) __extension__ ({ \ + uint8x16x4_t __ret; \ + __builtin_neon_vld1q_x4_v(&__ret, __p0, 48); \ + __ret; \ +}) +#else +#define vld1q_u8_x4(__p0) __extension__ ({ \ + uint8x16x4_t __ret; \ + __builtin_neon_vld1q_x4_v(&__ret, __p0, 48); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1q_u32_x4(__p0) __extension__ ({ \ + uint32x4x4_t __ret; \ + __builtin_neon_vld1q_x4_v(&__ret, __p0, 50); \ + __ret; \ +}) +#else +#define vld1q_u32_x4(__p0) __extension__ ({ \ + uint32x4x4_t __ret; \ + __builtin_neon_vld1q_x4_v(&__ret, __p0, 50); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ + __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1q_u64_x4(__p0) __extension__ ({ \ + uint64x2x4_t __ret; \ + __builtin_neon_vld1q_x4_v(&__ret, __p0, 51); \ + __ret; \ +}) +#else +#define vld1q_u64_x4(__p0) __extension__ ({ \ + uint64x2x4_t __ret; \ + __builtin_neon_vld1q_x4_v(&__ret, __p0, 51); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ + __ret.val[1] = 
__builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \ + __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1q_u16_x4(__p0) __extension__ ({ \ + uint16x8x4_t __ret; \ + __builtin_neon_vld1q_x4_v(&__ret, __p0, 49); \ + __ret; \ +}) +#else +#define vld1q_u16_x4(__p0) __extension__ ({ \ + uint16x8x4_t __ret; \ + __builtin_neon_vld1q_x4_v(&__ret, __p0, 49); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1q_s8_x4(__p0) __extension__ ({ \ + int8x16x4_t __ret; \ + __builtin_neon_vld1q_x4_v(&__ret, __p0, 32); \ + __ret; \ +}) +#else +#define vld1q_s8_x4(__p0) __extension__ ({ \ + int8x16x4_t __ret; \ + __builtin_neon_vld1q_x4_v(&__ret, __p0, 32); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1q_f32_x4(__p0) __extension__ ({ \ + float32x4x4_t __ret; \ + __builtin_neon_vld1q_x4_v(&__ret, __p0, 41); \ + __ret; \ +}) +#else +#define vld1q_f32_x4(__p0) __extension__ ({ \ + float32x4x4_t __ret; \ + __builtin_neon_vld1q_x4_v(&__ret, __p0, 41); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ + __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1q_f16_x4(__p0) __extension__ ({ \ + float16x8x4_t __ret; \ + __builtin_neon_vld1q_x4_v(&__ret, __p0, 40); \ + __ret; \ +}) +#else +#define vld1q_f16_x4(__p0) __extension__ ({ \ + float16x8x4_t __ret; \ + __builtin_neon_vld1q_x4_v(&__ret, __p0, 40); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1q_s32_x4(__p0) __extension__ ({ \ + int32x4x4_t __ret; \ + __builtin_neon_vld1q_x4_v(&__ret, __p0, 34); \ + __ret; \ +}) +#else +#define vld1q_s32_x4(__p0) __extension__ ({ \ + int32x4x4_t __ret; \ + __builtin_neon_vld1q_x4_v(&__ret, __p0, 34); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], 
__ret.val[1], 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ + __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1q_s64_x4(__p0) __extension__ ({ \ + int64x2x4_t __ret; \ + __builtin_neon_vld1q_x4_v(&__ret, __p0, 35); \ + __ret; \ +}) +#else +#define vld1q_s64_x4(__p0) __extension__ ({ \ + int64x2x4_t __ret; \ + __builtin_neon_vld1q_x4_v(&__ret, __p0, 35); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \ + __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1q_s16_x4(__p0) __extension__ ({ \ + int16x8x4_t __ret; \ + __builtin_neon_vld1q_x4_v(&__ret, __p0, 33); \ + __ret; \ +}) +#else +#define vld1q_s16_x4(__p0) __extension__ ({ \ + int16x8x4_t __ret; \ + __builtin_neon_vld1q_x4_v(&__ret, __p0, 33); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1_u8_x4(__p0) __extension__ ({ \ + uint8x8x4_t __ret; \ + __builtin_neon_vld1_x4_v(&__ret, __p0, 16); \ + __ret; \ +}) +#else +#define vld1_u8_x4(__p0) __extension__ ({ \ + uint8x8x4_t __ret; \ + __builtin_neon_vld1_x4_v(&__ret, __p0, 16); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1_u32_x4(__p0) __extension__ ({ \ + uint32x2x4_t __ret; \ + __builtin_neon_vld1_x4_v(&__ret, __p0, 18); \ + __ret; \ +}) +#else +#define vld1_u32_x4(__p0) __extension__ ({ \ + uint32x2x4_t __ret; \ + __builtin_neon_vld1_x4_v(&__ret, __p0, 18); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \ + __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1_u64_x4(__p0) __extension__ ({ \ + uint64x1x4_t __ret; \ + __builtin_neon_vld1_x4_v(&__ret, __p0, 19); \ + __ret; \ +}) +#else +#define vld1_u64_x4(__p0) __extension__ ({ \ + uint64x1x4_t __ret; \ + __builtin_neon_vld1_x4_v(&__ret, __p0, 19); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1_u16_x4(__p0) __extension__ ({ \ + uint16x4x4_t __ret; \ + __builtin_neon_vld1_x4_v(&__ret, __p0, 17); \ + __ret; \ +}) +#else +#define vld1_u16_x4(__p0) __extension__ ({ \ + uint16x4x4_t __ret; \ + __builtin_neon_vld1_x4_v(&__ret, __p0, 17); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ + 
__ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ + __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1_s8_x4(__p0) __extension__ ({ \ + int8x8x4_t __ret; \ + __builtin_neon_vld1_x4_v(&__ret, __p0, 0); \ + __ret; \ +}) +#else +#define vld1_s8_x4(__p0) __extension__ ({ \ + int8x8x4_t __ret; \ + __builtin_neon_vld1_x4_v(&__ret, __p0, 0); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1_f32_x4(__p0) __extension__ ({ \ + float32x2x4_t __ret; \ + __builtin_neon_vld1_x4_v(&__ret, __p0, 9); \ + __ret; \ +}) +#else +#define vld1_f32_x4(__p0) __extension__ ({ \ + float32x2x4_t __ret; \ + __builtin_neon_vld1_x4_v(&__ret, __p0, 9); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \ + __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1_f16_x4(__p0) __extension__ ({ \ + float16x4x4_t __ret; \ + __builtin_neon_vld1_x4_v(&__ret, __p0, 8); \ + __ret; \ +}) +#else +#define vld1_f16_x4(__p0) __extension__ ({ \ + float16x4x4_t __ret; \ + __builtin_neon_vld1_x4_v(&__ret, __p0, 8); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ + __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1_s32_x4(__p0) __extension__ ({ \ + int32x2x4_t __ret; \ + __builtin_neon_vld1_x4_v(&__ret, __p0, 2); \ + __ret; \ +}) +#else +#define vld1_s32_x4(__p0) __extension__ ({ \ + int32x2x4_t __ret; \ + __builtin_neon_vld1_x4_v(&__ret, __p0, 2); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \ + __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1_s64_x4(__p0) __extension__ ({ \ + int64x1x4_t __ret; \ + __builtin_neon_vld1_x4_v(&__ret, __p0, 3); \ + __ret; \ +}) +#else +#define vld1_s64_x4(__p0) __extension__ ({ \ + int64x1x4_t __ret; \ + __builtin_neon_vld1_x4_v(&__ret, __p0, 3); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld1_s16_x4(__p0) __extension__ ({ \ + int16x4x4_t __ret; \ + __builtin_neon_vld1_x4_v(&__ret, __p0, 1); \ + __ret; \ +}) +#else +#define vld1_s16_x4(__p0) __extension__ ({ \ + int16x4x4_t __ret; \ + __builtin_neon_vld1_x4_v(&__ret, __p0, 1); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ + __ret.val[1] = 
__builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ + __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ #define vld2_p8(__p0) __extension__ ({ \ poly8x8x2_t __ret; \ __builtin_neon_vld2_v(&__ret, __p0, 4); \ @@ -8989,6 +10261,210 @@ __ai int16x4_t vhsub_s16(int16x4_t __p0, int16x4_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ +#define vld2q_dup_p8(__p0) __extension__ ({ \ + poly8x16x2_t __ret; \ + __builtin_neon_vld2q_dup_v(&__ret, __p0, 36); \ + __ret; \ +}) +#else +#define vld2q_dup_p8(__p0) __extension__ ({ \ + poly8x16x2_t __ret; \ + __builtin_neon_vld2q_dup_v(&__ret, __p0, 36); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld2q_dup_p16(__p0) __extension__ ({ \ + poly16x8x2_t __ret; \ + __builtin_neon_vld2q_dup_v(&__ret, __p0, 37); \ + __ret; \ +}) +#else +#define vld2q_dup_p16(__p0) __extension__ ({ \ + poly16x8x2_t __ret; \ + __builtin_neon_vld2q_dup_v(&__ret, __p0, 37); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld2q_dup_u8(__p0) __extension__ ({ \ + uint8x16x2_t __ret; \ + __builtin_neon_vld2q_dup_v(&__ret, __p0, 48); \ + __ret; \ +}) +#else +#define vld2q_dup_u8(__p0) __extension__ ({ \ + uint8x16x2_t __ret; \ + __builtin_neon_vld2q_dup_v(&__ret, __p0, 48); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld2q_dup_u32(__p0) __extension__ ({ \ + uint32x4x2_t __ret; \ + __builtin_neon_vld2q_dup_v(&__ret, __p0, 50); \ + __ret; \ +}) +#else +#define vld2q_dup_u32(__p0) __extension__ ({ \ + uint32x4x2_t __ret; \ + __builtin_neon_vld2q_dup_v(&__ret, __p0, 50); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld2q_dup_u64(__p0) __extension__ ({ \ + uint64x2x2_t __ret; \ + __builtin_neon_vld2q_dup_v(&__ret, __p0, 51); \ + __ret; \ +}) +#else +#define vld2q_dup_u64(__p0) __extension__ ({ \ + uint64x2x2_t __ret; \ + __builtin_neon_vld2q_dup_v(&__ret, __p0, 51); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld2q_dup_u16(__p0) __extension__ ({ \ + uint16x8x2_t __ret; \ + __builtin_neon_vld2q_dup_v(&__ret, __p0, 49); \ + __ret; \ +}) +#else +#define vld2q_dup_u16(__p0) __extension__ ({ \ + uint16x8x2_t __ret; \ + __builtin_neon_vld2q_dup_v(&__ret, __p0, 49); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], 
__ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld2q_dup_s8(__p0) __extension__ ({ \ + int8x16x2_t __ret; \ + __builtin_neon_vld2q_dup_v(&__ret, __p0, 32); \ + __ret; \ +}) +#else +#define vld2q_dup_s8(__p0) __extension__ ({ \ + int8x16x2_t __ret; \ + __builtin_neon_vld2q_dup_v(&__ret, __p0, 32); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld2q_dup_f32(__p0) __extension__ ({ \ + float32x4x2_t __ret; \ + __builtin_neon_vld2q_dup_v(&__ret, __p0, 41); \ + __ret; \ +}) +#else +#define vld2q_dup_f32(__p0) __extension__ ({ \ + float32x4x2_t __ret; \ + __builtin_neon_vld2q_dup_v(&__ret, __p0, 41); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld2q_dup_f16(__p0) __extension__ ({ \ + float16x8x2_t __ret; \ + __builtin_neon_vld2q_dup_v(&__ret, __p0, 40); \ + __ret; \ +}) +#else +#define vld2q_dup_f16(__p0) __extension__ ({ \ + float16x8x2_t __ret; \ + __builtin_neon_vld2q_dup_v(&__ret, __p0, 40); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld2q_dup_s32(__p0) __extension__ ({ \ + int32x4x2_t __ret; \ + __builtin_neon_vld2q_dup_v(&__ret, __p0, 34); \ + __ret; \ +}) +#else +#define vld2q_dup_s32(__p0) __extension__ ({ \ + int32x4x2_t __ret; \ + __builtin_neon_vld2q_dup_v(&__ret, __p0, 34); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld2q_dup_s64(__p0) __extension__ ({ \ + int64x2x2_t __ret; \ + __builtin_neon_vld2q_dup_v(&__ret, __p0, 35); \ + __ret; \ +}) +#else +#define vld2q_dup_s64(__p0) __extension__ ({ \ + int64x2x2_t __ret; \ + __builtin_neon_vld2q_dup_v(&__ret, __p0, 35); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld2q_dup_s16(__p0) __extension__ ({ \ + int16x8x2_t __ret; \ + __builtin_neon_vld2q_dup_v(&__ret, __p0, 33); \ + __ret; \ +}) +#else +#define vld2q_dup_s16(__p0) __extension__ ({ \ + int16x8x2_t __ret; \ + __builtin_neon_vld2q_dup_v(&__ret, __p0, 33); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ #define vld2_dup_u8(__p0) __extension__ ({ \ uint8x8x2_t __ret; \ __builtin_neon_vld2_dup_v(&__ret, __p0, 16); \ @@ -9951,6 +11427,222 @@ __ai int16x4_t vhsub_s16(int16x4_t __p0, int16x4_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ +#define vld3q_dup_p8(__p0) __extension__ ({ \ + poly8x16x3_t __ret; \ + __builtin_neon_vld3q_dup_v(&__ret, __p0, 36); \ + __ret; \ +}) +#else +#define vld3q_dup_p8(__p0) __extension__ 
({ \ + poly8x16x3_t __ret; \ + __builtin_neon_vld3q_dup_v(&__ret, __p0, 36); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld3q_dup_p16(__p0) __extension__ ({ \ + poly16x8x3_t __ret; \ + __builtin_neon_vld3q_dup_v(&__ret, __p0, 37); \ + __ret; \ +}) +#else +#define vld3q_dup_p16(__p0) __extension__ ({ \ + poly16x8x3_t __ret; \ + __builtin_neon_vld3q_dup_v(&__ret, __p0, 37); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld3q_dup_u8(__p0) __extension__ ({ \ + uint8x16x3_t __ret; \ + __builtin_neon_vld3q_dup_v(&__ret, __p0, 48); \ + __ret; \ +}) +#else +#define vld3q_dup_u8(__p0) __extension__ ({ \ + uint8x16x3_t __ret; \ + __builtin_neon_vld3q_dup_v(&__ret, __p0, 48); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld3q_dup_u32(__p0) __extension__ ({ \ + uint32x4x3_t __ret; \ + __builtin_neon_vld3q_dup_v(&__ret, __p0, 50); \ + __ret; \ +}) +#else +#define vld3q_dup_u32(__p0) __extension__ ({ \ + uint32x4x3_t __ret; \ + __builtin_neon_vld3q_dup_v(&__ret, __p0, 50); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld3q_dup_u64(__p0) __extension__ ({ \ + uint64x2x3_t __ret; \ + __builtin_neon_vld3q_dup_v(&__ret, __p0, 51); \ + __ret; \ +}) +#else +#define vld3q_dup_u64(__p0) __extension__ ({ \ + uint64x2x3_t __ret; \ + __builtin_neon_vld3q_dup_v(&__ret, __p0, 51); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld3q_dup_u16(__p0) __extension__ ({ \ + uint16x8x3_t __ret; \ + __builtin_neon_vld3q_dup_v(&__ret, __p0, 49); \ + __ret; \ +}) +#else +#define vld3q_dup_u16(__p0) __extension__ ({ \ + uint16x8x3_t __ret; \ + __builtin_neon_vld3q_dup_v(&__ret, __p0, 49); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ 
+#define vld3q_dup_s8(__p0) __extension__ ({ \ + int8x16x3_t __ret; \ + __builtin_neon_vld3q_dup_v(&__ret, __p0, 32); \ + __ret; \ +}) +#else +#define vld3q_dup_s8(__p0) __extension__ ({ \ + int8x16x3_t __ret; \ + __builtin_neon_vld3q_dup_v(&__ret, __p0, 32); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld3q_dup_f32(__p0) __extension__ ({ \ + float32x4x3_t __ret; \ + __builtin_neon_vld3q_dup_v(&__ret, __p0, 41); \ + __ret; \ +}) +#else +#define vld3q_dup_f32(__p0) __extension__ ({ \ + float32x4x3_t __ret; \ + __builtin_neon_vld3q_dup_v(&__ret, __p0, 41); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld3q_dup_f16(__p0) __extension__ ({ \ + float16x8x3_t __ret; \ + __builtin_neon_vld3q_dup_v(&__ret, __p0, 40); \ + __ret; \ +}) +#else +#define vld3q_dup_f16(__p0) __extension__ ({ \ + float16x8x3_t __ret; \ + __builtin_neon_vld3q_dup_v(&__ret, __p0, 40); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld3q_dup_s32(__p0) __extension__ ({ \ + int32x4x3_t __ret; \ + __builtin_neon_vld3q_dup_v(&__ret, __p0, 34); \ + __ret; \ +}) +#else +#define vld3q_dup_s32(__p0) __extension__ ({ \ + int32x4x3_t __ret; \ + __builtin_neon_vld3q_dup_v(&__ret, __p0, 34); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld3q_dup_s64(__p0) __extension__ ({ \ + int64x2x3_t __ret; \ + __builtin_neon_vld3q_dup_v(&__ret, __p0, 35); \ + __ret; \ +}) +#else +#define vld3q_dup_s64(__p0) __extension__ ({ \ + int64x2x3_t __ret; \ + __builtin_neon_vld3q_dup_v(&__ret, __p0, 35); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld3q_dup_s16(__p0) __extension__ ({ \ + int16x8x3_t __ret; \ + __builtin_neon_vld3q_dup_v(&__ret, __p0, 33); \ + __ret; \ +}) +#else +#define vld3q_dup_s16(__p0) __extension__ ({ \ + int16x8x3_t __ret; \ + __builtin_neon_vld3q_dup_v(&__ret, __p0, 33); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 
0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ #define vld3_dup_u8(__p0) __extension__ ({ \ uint8x8x3_t __ret; \ __builtin_neon_vld3_dup_v(&__ret, __p0, 16); \ @@ -10977,6 +12669,234 @@ __ai int16x4_t vhsub_s16(int16x4_t __p0, int16x4_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ +#define vld4q_dup_p8(__p0) __extension__ ({ \ + poly8x16x4_t __ret; \ + __builtin_neon_vld4q_dup_v(&__ret, __p0, 36); \ + __ret; \ +}) +#else +#define vld4q_dup_p8(__p0) __extension__ ({ \ + poly8x16x4_t __ret; \ + __builtin_neon_vld4q_dup_v(&__ret, __p0, 36); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld4q_dup_p16(__p0) __extension__ ({ \ + poly16x8x4_t __ret; \ + __builtin_neon_vld4q_dup_v(&__ret, __p0, 37); \ + __ret; \ +}) +#else +#define vld4q_dup_p16(__p0) __extension__ ({ \ + poly16x8x4_t __ret; \ + __builtin_neon_vld4q_dup_v(&__ret, __p0, 37); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld4q_dup_u8(__p0) __extension__ ({ \ + uint8x16x4_t __ret; \ + __builtin_neon_vld4q_dup_v(&__ret, __p0, 48); \ + __ret; \ +}) +#else +#define vld4q_dup_u8(__p0) __extension__ ({ \ + uint8x16x4_t __ret; \ + __builtin_neon_vld4q_dup_v(&__ret, __p0, 48); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld4q_dup_u32(__p0) __extension__ ({ \ + uint32x4x4_t __ret; \ + __builtin_neon_vld4q_dup_v(&__ret, __p0, 50); \ + __ret; \ +}) +#else +#define vld4q_dup_u32(__p0) __extension__ ({ \ + uint32x4x4_t __ret; \ + __builtin_neon_vld4q_dup_v(&__ret, __p0, 50); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ + __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld4q_dup_u64(__p0) __extension__ ({ \ + uint64x2x4_t __ret; \ + __builtin_neon_vld4q_dup_v(&__ret, __p0, 51); \ + __ret; \ +}) +#else +#define vld4q_dup_u64(__p0) __extension__ ({ \ + uint64x2x4_t __ret; \ + __builtin_neon_vld4q_dup_v(&__ret, __p0, 
51); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \ + __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld4q_dup_u16(__p0) __extension__ ({ \ + uint16x8x4_t __ret; \ + __builtin_neon_vld4q_dup_v(&__ret, __p0, 49); \ + __ret; \ +}) +#else +#define vld4q_dup_u16(__p0) __extension__ ({ \ + uint16x8x4_t __ret; \ + __builtin_neon_vld4q_dup_v(&__ret, __p0, 49); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld4q_dup_s8(__p0) __extension__ ({ \ + int8x16x4_t __ret; \ + __builtin_neon_vld4q_dup_v(&__ret, __p0, 32); \ + __ret; \ +}) +#else +#define vld4q_dup_s8(__p0) __extension__ ({ \ + int8x16x4_t __ret; \ + __builtin_neon_vld4q_dup_v(&__ret, __p0, 32); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld4q_dup_f32(__p0) __extension__ ({ \ + float32x4x4_t __ret; \ + __builtin_neon_vld4q_dup_v(&__ret, __p0, 41); \ + __ret; \ +}) +#else +#define vld4q_dup_f32(__p0) __extension__ ({ \ + float32x4x4_t __ret; \ + __builtin_neon_vld4q_dup_v(&__ret, __p0, 41); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ + __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld4q_dup_f16(__p0) __extension__ ({ \ + float16x8x4_t __ret; \ + __builtin_neon_vld4q_dup_v(&__ret, __p0, 40); \ + __ret; \ +}) +#else +#define vld4q_dup_f16(__p0) __extension__ ({ \ + float16x8x4_t __ret; \ + __builtin_neon_vld4q_dup_v(&__ret, __p0, 40); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld4q_dup_s32(__p0) __extension__ ({ \ + int32x4x4_t __ret; \ + __builtin_neon_vld4q_dup_v(&__ret, __p0, 34); \ + __ret; \ +}) +#else +#define vld4q_dup_s32(__p0) __extension__ ({ \ + int32x4x4_t __ret; \ + __builtin_neon_vld4q_dup_v(&__ret, __p0, 34); \ + \ + __ret.val[0] = 
__builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ + __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld4q_dup_s64(__p0) __extension__ ({ \ + int64x2x4_t __ret; \ + __builtin_neon_vld4q_dup_v(&__ret, __p0, 35); \ + __ret; \ +}) +#else +#define vld4q_dup_s64(__p0) __extension__ ({ \ + int64x2x4_t __ret; \ + __builtin_neon_vld4q_dup_v(&__ret, __p0, 35); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \ + __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vld4q_dup_s16(__p0) __extension__ ({ \ + int16x8x4_t __ret; \ + __builtin_neon_vld4q_dup_v(&__ret, __p0, 33); \ + __ret; \ +}) +#else +#define vld4q_dup_s16(__p0) __extension__ ({ \ + int16x8x4_t __ret; \ + __builtin_neon_vld4q_dup_v(&__ret, __p0, 33); \ + \ + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ #define vld4_dup_u8(__p0) __extension__ ({ \ uint8x8x4_t __ret; \ __builtin_neon_vld4_dup_v(&__ret, __p0, 16); \ @@ -25581,6 +27501,1134 @@ __ai int16x4_t vshl_s16(int16x4_t __p0, int16x4_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ +#define vst1_p8_x2(__p0, __p1) __extension__ ({ \ + poly8x8x2_t __s1 = __p1; \ + __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 4); \ +}) +#else +#define vst1_p8_x2(__p0, __p1) __extension__ ({ \ + poly8x8x2_t __s1 = __p1; \ + poly8x8x2_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 4); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1_p16_x2(__p0, __p1) __extension__ ({ \ + poly16x4x2_t __s1 = __p1; \ + __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 5); \ +}) +#else +#define vst1_p16_x2(__p0, __p1) __extension__ ({ \ + poly16x4x2_t __s1 = __p1; \ + poly16x4x2_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ + __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 5); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1q_p8_x2(__p0, __p1) __extension__ ({ \ + poly8x16x2_t __s1 = __p1; \ + __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 36); \ +}) +#else +#define vst1q_p8_x2(__p0, __p1) __extension__ ({ \ + poly8x16x2_t __s1 = __p1; \ + poly8x16x2_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[1] = 
__builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 36); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1q_p16_x2(__p0, __p1) __extension__ ({ \ + poly16x8x2_t __s1 = __p1; \ + __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 37); \ +}) +#else +#define vst1q_p16_x2(__p0, __p1) __extension__ ({ \ + poly16x8x2_t __s1 = __p1; \ + poly16x8x2_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 37); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1q_u8_x2(__p0, __p1) __extension__ ({ \ + uint8x16x2_t __s1 = __p1; \ + __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 48); \ +}) +#else +#define vst1q_u8_x2(__p0, __p1) __extension__ ({ \ + uint8x16x2_t __s1 = __p1; \ + uint8x16x2_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 48); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1q_u32_x2(__p0, __p1) __extension__ ({ \ + uint32x4x2_t __s1 = __p1; \ + __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 50); \ +}) +#else +#define vst1q_u32_x2(__p0, __p1) __extension__ ({ \ + uint32x4x2_t __s1 = __p1; \ + uint32x4x2_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ + __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 50); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1q_u64_x2(__p0, __p1) __extension__ ({ \ + uint64x2x2_t __s1 = __p1; \ + __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 51); \ +}) +#else +#define vst1q_u64_x2(__p0, __p1) __extension__ ({ \ + uint64x2x2_t __s1 = __p1; \ + uint64x2x2_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \ + __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 51); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1q_u16_x2(__p0, __p1) __extension__ ({ \ + uint16x8x2_t __s1 = __p1; \ + __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 49); \ +}) +#else +#define vst1q_u16_x2(__p0, __p1) __extension__ ({ \ + uint16x8x2_t __s1 = __p1; \ + uint16x8x2_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 49); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1q_s8_x2(__p0, __p1) __extension__ ({ \ + int8x16x2_t __s1 = __p1; \ + __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 32); \ +}) +#else +#define vst1q_s8_x2(__p0, __p1) __extension__ ({ \ + int8x16x2_t 
__s1 = __p1; \ + int8x16x2_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 32); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1q_f32_x2(__p0, __p1) __extension__ ({ \ + float32x4x2_t __s1 = __p1; \ + __builtin_neon_vst1q_x2_v(__p0, __s1.val[0], __s1.val[1], 41); \ +}) +#else +#define vst1q_f32_x2(__p0, __p1) __extension__ ({ \ + float32x4x2_t __s1 = __p1; \ + float32x4x2_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ + __builtin_neon_vst1q_x2_v(__p0, __rev1.val[0], __rev1.val[1], 41); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1q_f16_x2(__p0, __p1) __extension__ ({ \ + float16x8x2_t __s1 = __p1; \ + __builtin_neon_vst1q_x2_v(__p0, __s1.val[0], __s1.val[1], 40); \ +}) +#else +#define vst1q_f16_x2(__p0, __p1) __extension__ ({ \ + float16x8x2_t __s1 = __p1; \ + float16x8x2_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __builtin_neon_vst1q_x2_v(__p0, __rev1.val[0], __rev1.val[1], 40); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1q_s32_x2(__p0, __p1) __extension__ ({ \ + int32x4x2_t __s1 = __p1; \ + __builtin_neon_vst1q_x2_v(__p0, __s1.val[0], __s1.val[1], 34); \ +}) +#else +#define vst1q_s32_x2(__p0, __p1) __extension__ ({ \ + int32x4x2_t __s1 = __p1; \ + int32x4x2_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ + __builtin_neon_vst1q_x2_v(__p0, __rev1.val[0], __rev1.val[1], 34); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1q_s64_x2(__p0, __p1) __extension__ ({ \ + int64x2x2_t __s1 = __p1; \ + __builtin_neon_vst1q_x2_v(__p0, __s1.val[0], __s1.val[1], 35); \ +}) +#else +#define vst1q_s64_x2(__p0, __p1) __extension__ ({ \ + int64x2x2_t __s1 = __p1; \ + int64x2x2_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \ + __builtin_neon_vst1q_x2_v(__p0, __rev1.val[0], __rev1.val[1], 35); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1q_s16_x2(__p0, __p1) __extension__ ({ \ + int16x8x2_t __s1 = __p1; \ + __builtin_neon_vst1q_x2_v(__p0, __s1.val[0], __s1.val[1], 33); \ +}) +#else +#define vst1q_s16_x2(__p0, __p1) __extension__ ({ \ + int16x8x2_t __s1 = __p1; \ + int16x8x2_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __builtin_neon_vst1q_x2_v(__p0, __rev1.val[0], __rev1.val[1], 33); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1_u8_x2(__p0, __p1) __extension__ ({ \ + uint8x8x2_t __s1 = __p1; \ + __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 16); \ +}) +#else +#define vst1_u8_x2(__p0, __p1) __extension__ ({ \ + uint8x8x2_t __s1 = __p1; \ + uint8x8x2_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 
0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 16); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1_u32_x2(__p0, __p1) __extension__ ({ \ + uint32x2x2_t __s1 = __p1; \ + __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 18); \ +}) +#else +#define vst1_u32_x2(__p0, __p1) __extension__ ({ \ + uint32x2x2_t __s1 = __p1; \ + uint32x2x2_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \ + __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 18); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1_u64_x2(__p0, __p1) __extension__ ({ \ + uint64x1x2_t __s1 = __p1; \ + __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 19); \ +}) +#else +#define vst1_u64_x2(__p0, __p1) __extension__ ({ \ + uint64x1x2_t __s1 = __p1; \ + __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 19); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1_u16_x2(__p0, __p1) __extension__ ({ \ + uint16x4x2_t __s1 = __p1; \ + __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 17); \ +}) +#else +#define vst1_u16_x2(__p0, __p1) __extension__ ({ \ + uint16x4x2_t __s1 = __p1; \ + uint16x4x2_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ + __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 17); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1_s8_x2(__p0, __p1) __extension__ ({ \ + int8x8x2_t __s1 = __p1; \ + __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 0); \ +}) +#else +#define vst1_s8_x2(__p0, __p1) __extension__ ({ \ + int8x8x2_t __s1 = __p1; \ + int8x8x2_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 0); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1_f32_x2(__p0, __p1) __extension__ ({ \ + float32x2x2_t __s1 = __p1; \ + __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 9); \ +}) +#else +#define vst1_f32_x2(__p0, __p1) __extension__ ({ \ + float32x2x2_t __s1 = __p1; \ + float32x2x2_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \ + __builtin_neon_vst1_x2_v(__p0, __rev1.val[0], __rev1.val[1], 9); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1_f16_x2(__p0, __p1) __extension__ ({ \ + float16x4x2_t __s1 = __p1; \ + __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 8); \ +}) +#else +#define vst1_f16_x2(__p0, __p1) __extension__ ({ \ + float16x4x2_t __s1 = __p1; \ + float16x4x2_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ + __builtin_neon_vst1_x2_v(__p0, __rev1.val[0], __rev1.val[1], 8); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1_s32_x2(__p0, __p1) __extension__ ({ \ + int32x2x2_t __s1 = __p1; \ + __builtin_neon_vst1_x2_v(__p0, 
__s1.val[0], __s1.val[1], 2); \ +}) +#else +#define vst1_s32_x2(__p0, __p1) __extension__ ({ \ + int32x2x2_t __s1 = __p1; \ + int32x2x2_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \ + __builtin_neon_vst1_x2_v(__p0, __rev1.val[0], __rev1.val[1], 2); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1_s64_x2(__p0, __p1) __extension__ ({ \ + int64x1x2_t __s1 = __p1; \ + __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 3); \ +}) +#else +#define vst1_s64_x2(__p0, __p1) __extension__ ({ \ + int64x1x2_t __s1 = __p1; \ + __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 3); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1_s16_x2(__p0, __p1) __extension__ ({ \ + int16x4x2_t __s1 = __p1; \ + __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 1); \ +}) +#else +#define vst1_s16_x2(__p0, __p1) __extension__ ({ \ + int16x4x2_t __s1 = __p1; \ + int16x4x2_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ + __builtin_neon_vst1_x2_v(__p0, __rev1.val[0], __rev1.val[1], 1); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1_p8_x3(__p0, __p1) __extension__ ({ \ + poly8x8x3_t __s1 = __p1; \ + __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 4); \ +}) +#else +#define vst1_p8_x3(__p0, __p1) __extension__ ({ \ + poly8x8x3_t __s1 = __p1; \ + poly8x8x3_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ + __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 4); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1_p16_x3(__p0, __p1) __extension__ ({ \ + poly16x4x3_t __s1 = __p1; \ + __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 5); \ +}) +#else +#define vst1_p16_x3(__p0, __p1) __extension__ ({ \ + poly16x4x3_t __s1 = __p1; \ + poly16x4x3_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \ + __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 5); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1q_p8_x3(__p0, __p1) __extension__ ({ \ + poly8x16x3_t __s1 = __p1; \ + __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 36); \ +}) +#else +#define vst1q_p8_x3(__p0, __p1) __extension__ ({ \ + poly8x16x3_t __s1 = __p1; \ + poly8x16x3_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 36); \ +}) 
+#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1q_p16_x3(__p0, __p1) __extension__ ({ \ + poly16x8x3_t __s1 = __p1; \ + __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 37); \ +}) +#else +#define vst1q_p16_x3(__p0, __p1) __extension__ ({ \ + poly16x8x3_t __s1 = __p1; \ + poly16x8x3_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ + __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 37); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1q_u8_x3(__p0, __p1) __extension__ ({ \ + uint8x16x3_t __s1 = __p1; \ + __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 48); \ +}) +#else +#define vst1q_u8_x3(__p0, __p1) __extension__ ({ \ + uint8x16x3_t __s1 = __p1; \ + uint8x16x3_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 48); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1q_u32_x3(__p0, __p1) __extension__ ({ \ + uint32x4x3_t __s1 = __p1; \ + __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 50); \ +}) +#else +#define vst1q_u32_x3(__p0, __p1) __extension__ ({ \ + uint32x4x3_t __s1 = __p1; \ + uint32x4x3_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \ + __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 50); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1q_u64_x3(__p0, __p1) __extension__ ({ \ + uint64x2x3_t __s1 = __p1; \ + __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 51); \ +}) +#else +#define vst1q_u64_x3(__p0, __p1) __extension__ ({ \ + uint64x2x3_t __s1 = __p1; \ + uint64x2x3_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \ + __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 51); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1q_u16_x3(__p0, __p1) __extension__ ({ \ + uint16x8x3_t __s1 = __p1; \ + __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 49); \ +}) +#else +#define vst1q_u16_x3(__p0, __p1) __extension__ ({ \ + uint16x8x3_t __s1 = __p1; \ + uint16x8x3_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 
5, 4, 3, 2, 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ + __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 49); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1q_s8_x3(__p0, __p1) __extension__ ({ \ + int8x16x3_t __s1 = __p1; \ + __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 32); \ +}) +#else +#define vst1q_s8_x3(__p0, __p1) __extension__ ({ \ + int8x16x3_t __s1 = __p1; \ + int8x16x3_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 32); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1q_f32_x3(__p0, __p1) __extension__ ({ \ + float32x4x3_t __s1 = __p1; \ + __builtin_neon_vst1q_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 41); \ +}) +#else +#define vst1q_f32_x3(__p0, __p1) __extension__ ({ \ + float32x4x3_t __s1 = __p1; \ + float32x4x3_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \ + __builtin_neon_vst1q_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 41); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1q_f16_x3(__p0, __p1) __extension__ ({ \ + float16x8x3_t __s1 = __p1; \ + __builtin_neon_vst1q_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 40); \ +}) +#else +#define vst1q_f16_x3(__p0, __p1) __extension__ ({ \ + float16x8x3_t __s1 = __p1; \ + float16x8x3_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ + __builtin_neon_vst1q_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 40); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1q_s32_x3(__p0, __p1) __extension__ ({ \ + int32x4x3_t __s1 = __p1; \ + __builtin_neon_vst1q_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 34); \ +}) +#else +#define vst1q_s32_x3(__p0, __p1) __extension__ ({ \ + int32x4x3_t __s1 = __p1; \ + int32x4x3_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \ + __builtin_neon_vst1q_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 34); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1q_s64_x3(__p0, __p1) __extension__ ({ \ + int64x2x3_t __s1 = __p1; \ + __builtin_neon_vst1q_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 35); \ +}) +#else +#define vst1q_s64_x3(__p0, __p1) __extension__ ({ \ + int64x2x3_t __s1 = __p1; \ + int64x2x3_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 
1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \ + __builtin_neon_vst1q_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 35); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1q_s16_x3(__p0, __p1) __extension__ ({ \ + int16x8x3_t __s1 = __p1; \ + __builtin_neon_vst1q_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 33); \ +}) +#else +#define vst1q_s16_x3(__p0, __p1) __extension__ ({ \ + int16x8x3_t __s1 = __p1; \ + int16x8x3_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ + __builtin_neon_vst1q_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 33); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1_u8_x3(__p0, __p1) __extension__ ({ \ + uint8x8x3_t __s1 = __p1; \ + __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 16); \ +}) +#else +#define vst1_u8_x3(__p0, __p1) __extension__ ({ \ + uint8x8x3_t __s1 = __p1; \ + uint8x8x3_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ + __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 16); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1_u32_x3(__p0, __p1) __extension__ ({ \ + uint32x2x3_t __s1 = __p1; \ + __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 18); \ +}) +#else +#define vst1_u32_x3(__p0, __p1) __extension__ ({ \ + uint32x2x3_t __s1 = __p1; \ + uint32x2x3_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \ + __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 18); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1_u64_x3(__p0, __p1) __extension__ ({ \ + uint64x1x3_t __s1 = __p1; \ + __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 19); \ +}) +#else +#define vst1_u64_x3(__p0, __p1) __extension__ ({ \ + uint64x1x3_t __s1 = __p1; \ + __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 19); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1_u16_x3(__p0, __p1) __extension__ ({ \ + uint16x4x3_t __s1 = __p1; \ + __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 17); \ +}) +#else +#define vst1_u16_x3(__p0, __p1) __extension__ ({ \ + uint16x4x3_t __s1 = __p1; \ + uint16x4x3_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \ + __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 17); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1_s8_x3(__p0, __p1) __extension__ ({ \ + int8x8x3_t 
__s1 = __p1; \ + __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 0); \ +}) +#else +#define vst1_s8_x3(__p0, __p1) __extension__ ({ \ + int8x8x3_t __s1 = __p1; \ + int8x8x3_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ + __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 0); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1_f32_x3(__p0, __p1) __extension__ ({ \ + float32x2x3_t __s1 = __p1; \ + __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 9); \ +}) +#else +#define vst1_f32_x3(__p0, __p1) __extension__ ({ \ + float32x2x3_t __s1 = __p1; \ + float32x2x3_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \ + __builtin_neon_vst1_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 9); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1_f16_x3(__p0, __p1) __extension__ ({ \ + float16x4x3_t __s1 = __p1; \ + __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 8); \ +}) +#else +#define vst1_f16_x3(__p0, __p1) __extension__ ({ \ + float16x4x3_t __s1 = __p1; \ + float16x4x3_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \ + __builtin_neon_vst1_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 8); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1_s32_x3(__p0, __p1) __extension__ ({ \ + int32x2x3_t __s1 = __p1; \ + __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 2); \ +}) +#else +#define vst1_s32_x3(__p0, __p1) __extension__ ({ \ + int32x2x3_t __s1 = __p1; \ + int32x2x3_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \ + __builtin_neon_vst1_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 2); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1_s64_x3(__p0, __p1) __extension__ ({ \ + int64x1x3_t __s1 = __p1; \ + __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 3); \ +}) +#else +#define vst1_s64_x3(__p0, __p1) __extension__ ({ \ + int64x1x3_t __s1 = __p1; \ + __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 3); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1_s16_x3(__p0, __p1) __extension__ ({ \ + int16x4x3_t __s1 = __p1; \ + __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 1); \ +}) +#else +#define vst1_s16_x3(__p0, __p1) __extension__ ({ \ + int16x4x3_t __s1 = __p1; \ + int16x4x3_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \ + __builtin_neon_vst1_x3_v(__p0, __rev1.val[0], __rev1.val[1], 
__rev1.val[2], 1); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1_p8_x4(__p0, __p1) __extension__ ({ \ + poly8x8x4_t __s1 = __p1; \ + __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 4); \ +}) +#else +#define vst1_p8_x4(__p0, __p1) __extension__ ({ \ + poly8x8x4_t __s1 = __p1; \ + poly8x8x4_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \ + __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 4); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1_p16_x4(__p0, __p1) __extension__ ({ \ + poly16x4x4_t __s1 = __p1; \ + __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 5); \ +}) +#else +#define vst1_p16_x4(__p0, __p1) __extension__ ({ \ + poly16x4x4_t __s1 = __p1; \ + poly16x4x4_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \ + __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \ + __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 5); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1q_p8_x4(__p0, __p1) __extension__ ({ \ + poly8x16x4_t __s1 = __p1; \ + __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 36); \ +}) +#else +#define vst1q_p8_x4(__p0, __p1) __extension__ ({ \ + poly8x16x4_t __s1 = __p1; \ + poly8x16x4_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 36); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1q_p16_x4(__p0, __p1) __extension__ ({ \ + poly16x8x4_t __s1 = __p1; \ + __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 37); \ +}) +#else +#define vst1q_p16_x4(__p0, __p1) __extension__ ({ \ + poly16x8x4_t __s1 = __p1; \ + poly16x8x4_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \ + 
__builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 37); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1q_u8_x4(__p0, __p1) __extension__ ({ \ + uint8x16x4_t __s1 = __p1; \ + __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 48); \ +}) +#else +#define vst1q_u8_x4(__p0, __p1) __extension__ ({ \ + uint8x16x4_t __s1 = __p1; \ + uint8x16x4_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 48); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1q_u32_x4(__p0, __p1) __extension__ ({ \ + uint32x4x4_t __s1 = __p1; \ + __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 50); \ +}) +#else +#define vst1q_u32_x4(__p0, __p1) __extension__ ({ \ + uint32x4x4_t __s1 = __p1; \ + uint32x4x4_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \ + __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \ + __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 50); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1q_u64_x4(__p0, __p1) __extension__ ({ \ + uint64x2x4_t __s1 = __p1; \ + __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 51); \ +}) +#else +#define vst1q_u64_x4(__p0, __p1) __extension__ ({ \ + uint64x2x4_t __s1 = __p1; \ + uint64x2x4_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \ + __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \ + __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 51); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1q_u16_x4(__p0, __p1) __extension__ ({ \ + uint16x8x4_t __s1 = __p1; \ + __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 49); \ +}) +#else +#define vst1q_u16_x4(__p0, __p1) __extension__ ({ \ + uint16x8x4_t __s1 = __p1; \ + uint16x8x4_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[3] = 
__builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \ + __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 49); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1q_s8_x4(__p0, __p1) __extension__ ({ \ + int8x16x4_t __s1 = __p1; \ + __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 32); \ +}) +#else +#define vst1q_s8_x4(__p0, __p1) __extension__ ({ \ + int8x16x4_t __s1 = __p1; \ + int8x16x4_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 32); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1q_f32_x4(__p0, __p1) __extension__ ({ \ + float32x4x4_t __s1 = __p1; \ + __builtin_neon_vst1q_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 41); \ +}) +#else +#define vst1q_f32_x4(__p0, __p1) __extension__ ({ \ + float32x4x4_t __s1 = __p1; \ + float32x4x4_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \ + __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \ + __builtin_neon_vst1q_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 41); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1q_f16_x4(__p0, __p1) __extension__ ({ \ + float16x8x4_t __s1 = __p1; \ + __builtin_neon_vst1q_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 40); \ +}) +#else +#define vst1q_f16_x4(__p0, __p1) __extension__ ({ \ + float16x8x4_t __s1 = __p1; \ + float16x8x4_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \ + __builtin_neon_vst1q_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 40); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1q_s32_x4(__p0, __p1) __extension__ ({ \ + int32x4x4_t __s1 = __p1; \ + __builtin_neon_vst1q_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 34); \ +}) +#else +#define vst1q_s32_x4(__p0, __p1) __extension__ ({ \ + int32x4x4_t __s1 = __p1; \ + int32x4x4_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \ + __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \ + __builtin_neon_vst1q_x4_v(__p0, __rev1.val[0], 
__rev1.val[1], __rev1.val[2], __rev1.val[3], 34); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1q_s64_x4(__p0, __p1) __extension__ ({ \ + int64x2x4_t __s1 = __p1; \ + __builtin_neon_vst1q_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 35); \ +}) +#else +#define vst1q_s64_x4(__p0, __p1) __extension__ ({ \ + int64x2x4_t __s1 = __p1; \ + int64x2x4_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \ + __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \ + __builtin_neon_vst1q_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 35); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1q_s16_x4(__p0, __p1) __extension__ ({ \ + int16x8x4_t __s1 = __p1; \ + __builtin_neon_vst1q_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 33); \ +}) +#else +#define vst1q_s16_x4(__p0, __p1) __extension__ ({ \ + int16x8x4_t __s1 = __p1; \ + int16x8x4_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \ + __builtin_neon_vst1q_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 33); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1_u8_x4(__p0, __p1) __extension__ ({ \ + uint8x8x4_t __s1 = __p1; \ + __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 16); \ +}) +#else +#define vst1_u8_x4(__p0, __p1) __extension__ ({ \ + uint8x8x4_t __s1 = __p1; \ + uint8x8x4_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \ + __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 16); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1_u32_x4(__p0, __p1) __extension__ ({ \ + uint32x2x4_t __s1 = __p1; \ + __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 18); \ +}) +#else +#define vst1_u32_x4(__p0, __p1) __extension__ ({ \ + uint32x2x4_t __s1 = __p1; \ + uint32x2x4_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \ + __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \ + __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 18); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1_u64_x4(__p0, __p1) __extension__ ({ \ + uint64x1x4_t __s1 = __p1; \ + __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 
(int8x8_t)__s1.val[3], 19); \ +}) +#else +#define vst1_u64_x4(__p0, __p1) __extension__ ({ \ + uint64x1x4_t __s1 = __p1; \ + __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 19); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1_u16_x4(__p0, __p1) __extension__ ({ \ + uint16x4x4_t __s1 = __p1; \ + __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 17); \ +}) +#else +#define vst1_u16_x4(__p0, __p1) __extension__ ({ \ + uint16x4x4_t __s1 = __p1; \ + uint16x4x4_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \ + __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \ + __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 17); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1_s8_x4(__p0, __p1) __extension__ ({ \ + int8x8x4_t __s1 = __p1; \ + __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 0); \ +}) +#else +#define vst1_s8_x4(__p0, __p1) __extension__ ({ \ + int8x8x4_t __s1 = __p1; \ + int8x8x4_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ + __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \ + __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 0); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1_f32_x4(__p0, __p1) __extension__ ({ \ + float32x2x4_t __s1 = __p1; \ + __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 9); \ +}) +#else +#define vst1_f32_x4(__p0, __p1) __extension__ ({ \ + float32x2x4_t __s1 = __p1; \ + float32x2x4_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \ + __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \ + __builtin_neon_vst1_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 9); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1_f16_x4(__p0, __p1) __extension__ ({ \ + float16x4x4_t __s1 = __p1; \ + __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 8); \ +}) +#else +#define vst1_f16_x4(__p0, __p1) __extension__ ({ \ + float16x4x4_t __s1 = __p1; \ + float16x4x4_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \ + __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \ + __builtin_neon_vst1_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 8); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1_s32_x4(__p0, 
__p1) __extension__ ({ \ + int32x2x4_t __s1 = __p1; \ + __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 2); \ +}) +#else +#define vst1_s32_x4(__p0, __p1) __extension__ ({ \ + int32x2x4_t __s1 = __p1; \ + int32x2x4_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \ + __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \ + __builtin_neon_vst1_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 2); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1_s64_x4(__p0, __p1) __extension__ ({ \ + int64x1x4_t __s1 = __p1; \ + __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 3); \ +}) +#else +#define vst1_s64_x4(__p0, __p1) __extension__ ({ \ + int64x1x4_t __s1 = __p1; \ + __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 3); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vst1_s16_x4(__p0, __p1) __extension__ ({ \ + int16x4x4_t __s1 = __p1; \ + __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 1); \ +}) +#else +#define vst1_s16_x4(__p0, __p1) __extension__ ({ \ + int16x4x4_t __s1 = __p1; \ + int16x4x4_t __rev1; \ + __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ + __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ + __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \ + __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \ + __builtin_neon_vst1_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 1); \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ #define vst2_p8(__p0, __p1) __extension__ ({ \ poly8x8x2_t __s1 = __p1; \ __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 4); \ @@ -29838,6 +32886,110 @@ __ai int16x4x2_t vzip_s16(int16x4_t __p0, int16x4_t __p1) { #if !defined(__aarch64__) #ifdef __LITTLE_ENDIAN__ +#define vdupq_lane_f16(__p0, __p1) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x8_t __ret; \ + __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \ + __ret; \ +}) +#else +#define vdupq_lane_f16(__p0, __p1) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16x8_t __ret; \ + __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1, __p1, __p1, __p1, __p1); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vdup_lane_f16(__p0, __p1) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __ret; \ + __ret = __builtin_shufflevector(__s0, __s0, __p1, __p1, __p1, __p1); \ + __ret; \ +}) +#else +#define vdup_lane_f16(__p0, __p1) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16x4_t __ret; \ + __ret = __builtin_shufflevector(__rev0, __rev0, __p1, __p1, __p1, __p1); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vdupq_n_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16x8_t __ret; \ + __ret = (float16x8_t) {__s0, __s0, __s0, __s0, __s0, __s0, __s0, __s0}; \ + 
__ret; \ +}) +#else +#define vdupq_n_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16x8_t __ret; \ + __ret = (float16x8_t) {__s0, __s0, __s0, __s0, __s0, __s0, __s0, __s0}; \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vdup_n_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16x4_t __ret; \ + __ret = (float16x4_t) {__s0, __s0, __s0, __s0}; \ + __ret; \ +}) +#else +#define vdup_n_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16x4_t __ret; \ + __ret = (float16x4_t) {__s0, __s0, __s0, __s0}; \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmovq_n_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16x8_t __ret; \ + __ret = (float16x8_t) {__s0, __s0, __s0, __s0, __s0, __s0, __s0, __s0}; \ + __ret; \ +}) +#else +#define vmovq_n_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16x8_t __ret; \ + __ret = (float16x8_t) {__s0, __s0, __s0, __s0, __s0, __s0, __s0, __s0}; \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmov_n_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16x4_t __ret; \ + __ret = (float16x4_t) {__s0, __s0, __s0, __s0}; \ + __ret; \ +}) +#else +#define vmov_n_f16(__p0) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16x4_t __ret; \ + __ret = (float16x4_t) {__s0, __s0, __s0, __s0}; \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ __ai poly8x8_t vreinterpret_p8_p16(poly16x4_t __p0) { poly8x8_t __ret; __ret = (poly8x8_t)(__p0); @@ -33836,6 +36988,245 @@ __ai uint32x2_t vcvtp_u32_f32(float32x2_t __p0) { #endif #endif +#if __ARM_ARCH >= 8 && defined(__ARM_FEATURE_CRYPTO) +#ifdef __LITTLE_ENDIAN__ +__ai uint8x16_t vaesdq_u8(uint8x16_t __p0, uint8x16_t __p1) { + uint8x16_t __ret; + __ret = (uint8x16_t) __builtin_neon_vaesdq_v((int8x16_t)__p0, (int8x16_t)__p1, 48); + return __ret; +} +#else +__ai uint8x16_t vaesdq_u8(uint8x16_t __p0, uint8x16_t __p1) { + uint8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x16_t __ret; + __ret = (uint8x16_t) __builtin_neon_vaesdq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48); + __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint8x16_t vaeseq_u8(uint8x16_t __p0, uint8x16_t __p1) { + uint8x16_t __ret; + __ret = (uint8x16_t) __builtin_neon_vaeseq_v((int8x16_t)__p0, (int8x16_t)__p1, 48); + return __ret; +} +#else +__ai uint8x16_t vaeseq_u8(uint8x16_t __p0, uint8x16_t __p1) { + uint8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x16_t __ret; + __ret = (uint8x16_t) __builtin_neon_vaeseq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48); + __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint8x16_t vaesimcq_u8(uint8x16_t __p0) { + uint8x16_t __ret; + __ret = 
(uint8x16_t) __builtin_neon_vaesimcq_v((int8x16_t)__p0, 48); + return __ret; +} +#else +__ai uint8x16_t vaesimcq_u8(uint8x16_t __p0) { + uint8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x16_t __ret; + __ret = (uint8x16_t) __builtin_neon_vaesimcq_v((int8x16_t)__rev0, 48); + __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint8x16_t vaesmcq_u8(uint8x16_t __p0) { + uint8x16_t __ret; + __ret = (uint8x16_t) __builtin_neon_vaesmcq_v((int8x16_t)__p0, 48); + return __ret; +} +#else +__ai uint8x16_t vaesmcq_u8(uint8x16_t __p0) { + uint8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x16_t __ret; + __ret = (uint8x16_t) __builtin_neon_vaesmcq_v((int8x16_t)__rev0, 48); + __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint32x4_t vsha1cq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) { + uint32x4_t __ret; + __ret = (uint32x4_t) __builtin_neon_vsha1cq_u32((int8x16_t)__p0, __p1, (int8x16_t)__p2); + return __ret; +} +#else +__ai uint32x4_t vsha1cq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) { + uint32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + uint32x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); + uint32x4_t __ret; + __ret = (uint32x4_t) __builtin_neon_vsha1cq_u32((int8x16_t)__rev0, __p1, (int8x16_t)__rev2); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint32_t vsha1h_u32(uint32_t __p0) { + uint32_t __ret; + __ret = (uint32_t) __builtin_neon_vsha1h_u32(__p0); + return __ret; +} +#else +__ai uint32_t vsha1h_u32(uint32_t __p0) { + uint32_t __ret; + __ret = (uint32_t) __builtin_neon_vsha1h_u32(__p0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint32x4_t vsha1mq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) { + uint32x4_t __ret; + __ret = (uint32x4_t) __builtin_neon_vsha1mq_u32((int8x16_t)__p0, __p1, (int8x16_t)__p2); + return __ret; +} +#else +__ai uint32x4_t vsha1mq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) { + uint32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + uint32x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); + uint32x4_t __ret; + __ret = (uint32x4_t) __builtin_neon_vsha1mq_u32((int8x16_t)__rev0, __p1, (int8x16_t)__rev2); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint32x4_t vsha1pq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) { + uint32x4_t __ret; + __ret = (uint32x4_t) __builtin_neon_vsha1pq_u32((int8x16_t)__p0, __p1, (int8x16_t)__p2); + return __ret; +} +#else +__ai uint32x4_t vsha1pq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) { + uint32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + uint32x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); + uint32x4_t __ret; + __ret = (uint32x4_t) __builtin_neon_vsha1pq_u32((int8x16_t)__rev0, __p1, (int8x16_t)__rev2); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint32x4_t vsha1su0q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) { + uint32x4_t 
__ret; + __ret = (uint32x4_t) __builtin_neon_vsha1su0q_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50); + return __ret; +} +#else +__ai uint32x4_t vsha1su0q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) { + uint32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + uint32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + uint32x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); + uint32x4_t __ret; + __ret = (uint32x4_t) __builtin_neon_vsha1su0q_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint32x4_t vsha1su1q_u32(uint32x4_t __p0, uint32x4_t __p1) { + uint32x4_t __ret; + __ret = (uint32x4_t) __builtin_neon_vsha1su1q_v((int8x16_t)__p0, (int8x16_t)__p1, 50); + return __ret; +} +#else +__ai uint32x4_t vsha1su1q_u32(uint32x4_t __p0, uint32x4_t __p1) { + uint32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + uint32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + uint32x4_t __ret; + __ret = (uint32x4_t) __builtin_neon_vsha1su1q_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint32x4_t vsha256hq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) { + uint32x4_t __ret; + __ret = (uint32x4_t) __builtin_neon_vsha256hq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50); + return __ret; +} +#else +__ai uint32x4_t vsha256hq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) { + uint32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + uint32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + uint32x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); + uint32x4_t __ret; + __ret = (uint32x4_t) __builtin_neon_vsha256hq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint32x4_t vsha256h2q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) { + uint32x4_t __ret; + __ret = (uint32x4_t) __builtin_neon_vsha256h2q_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50); + return __ret; +} +#else +__ai uint32x4_t vsha256h2q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) { + uint32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + uint32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + uint32x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); + uint32x4_t __ret; + __ret = (uint32x4_t) __builtin_neon_vsha256h2q_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint32x4_t vsha256su0q_u32(uint32x4_t __p0, uint32x4_t __p1) { + uint32x4_t __ret; + __ret = (uint32x4_t) __builtin_neon_vsha256su0q_v((int8x16_t)__p0, (int8x16_t)__p1, 50); + return __ret; +} +#else +__ai uint32x4_t vsha256su0q_u32(uint32x4_t __p0, uint32x4_t __p1) { + uint32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + uint32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + uint32x4_t __ret; + __ret = (uint32x4_t) __builtin_neon_vsha256su0q_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50); + __ret = 
__builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai uint32x4_t vsha256su1q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) { + uint32x4_t __ret; + __ret = (uint32x4_t) __builtin_neon_vsha256su1q_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50); + return __ret; +} +#else +__ai uint32x4_t vsha256su1q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) { + uint32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + uint32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + uint32x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); + uint32x4_t __ret; + __ret = (uint32x4_t) __builtin_neon_vsha256su1q_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#endif #if __ARM_ARCH >= 8 && defined(__ARM_FEATURE_DIRECTED_ROUNDING) #ifdef __LITTLE_ENDIAN__ __ai float32x4_t vrndq_f32(float32x4_t __p0) { @@ -33902,6 +37293,38 @@ __ai float32x2_t vrnda_f32(float32x2_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ +__ai float32x4_t vrndiq_f32(float32x4_t __p0) { + float32x4_t __ret; + __ret = (float32x4_t) __builtin_neon_vrndiq_v((int8x16_t)__p0, 41); + return __ret; +} +#else +__ai float32x4_t vrndiq_f32(float32x4_t __p0) { + float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float32x4_t __ret; + __ret = (float32x4_t) __builtin_neon_vrndiq_v((int8x16_t)__rev0, 41); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float32x2_t vrndi_f32(float32x2_t __p0) { + float32x2_t __ret; + __ret = (float32x2_t) __builtin_neon_vrndi_v((int8x8_t)__p0, 9); + return __ret; +} +#else +__ai float32x2_t vrndi_f32(float32x2_t __p0) { + float32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); + float32x2_t __ret; + __ret = (float32x2_t) __builtin_neon_vrndi_v((int8x8_t)__rev0, 9); + __ret = __builtin_shufflevector(__ret, __ret, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ __ai float32x4_t vrndmq_f32(float32x4_t __p0) { float32x4_t __ret; __ret = (float32x4_t) __builtin_neon_vrndmq_v((int8x16_t)__p0, 41); @@ -33966,6 +37389,20 @@ __ai float32x2_t vrndn_f32(float32x2_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ +__ai float32_t vrndns_f32(float32_t __p0) { + float32_t __ret; + __ret = (float32_t) __builtin_neon_vrndns_f32(__p0); + return __ret; +} +#else +__ai float32_t vrndns_f32(float32_t __p0) { + float32_t __ret; + __ret = (float32_t) __builtin_neon_vrndns_f32(__p0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ __ai float32x4_t vrndpq_f32(float32x4_t __p0) { float32x4_t __ret; __ret = (float32x4_t) __builtin_neon_vrndpq_v((int8x16_t)__p0, 41); @@ -34030,6 +37467,200 @@ __ai float32x2_t vrndx_f32(float32x2_t __p0) { #endif #endif +#if __ARM_ARCH >= 8 && defined(__ARM_FEATURE_DIRECTED_ROUNDING) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vrndq_f16(float16x8_t __p0) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndq_v((int8x16_t)__p0, 40); + return __ret; +} +#else +__ai float16x8_t vrndq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndq_v((int8x16_t)__rev0, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef 
__LITTLE_ENDIAN__ +__ai float16x4_t vrnd_f16(float16x4_t __p0) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrnd_v((int8x8_t)__p0, 8); + return __ret; +} +#else +__ai float16x4_t vrnd_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrnd_v((int8x8_t)__rev0, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vrndaq_f16(float16x8_t __p0) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndaq_v((int8x16_t)__p0, 40); + return __ret; +} +#else +__ai float16x8_t vrndaq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndaq_v((int8x16_t)__rev0, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vrnda_f16(float16x4_t __p0) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrnda_v((int8x8_t)__p0, 8); + return __ret; +} +#else +__ai float16x4_t vrnda_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrnda_v((int8x8_t)__rev0, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vrndmq_f16(float16x8_t __p0) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndmq_v((int8x16_t)__p0, 40); + return __ret; +} +#else +__ai float16x8_t vrndmq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndmq_v((int8x16_t)__rev0, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vrndm_f16(float16x4_t __p0) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrndm_v((int8x8_t)__p0, 8); + return __ret; +} +#else +__ai float16x4_t vrndm_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrndm_v((int8x8_t)__rev0, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vrndnq_f16(float16x8_t __p0) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndnq_v((int8x16_t)__p0, 40); + return __ret; +} +#else +__ai float16x8_t vrndnq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndnq_v((int8x16_t)__rev0, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vrndn_f16(float16x4_t __p0) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrndn_v((int8x8_t)__p0, 8); + return __ret; +} +#else +__ai float16x4_t vrndn_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrndn_v((int8x8_t)__rev0, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif 
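(Editor's illustrative aside, not part of the diff: the FP16 directed-rounding intrinsics added in this hunk mirror their float32 counterparts, with the big-endian variants reversing lane order before and after the underlying builtin. A minimal usage sketch follows; the helper name round_half4_to_nearest_even is hypothetical and assumes a target compiled with __ARM_FEATURE_FP16_VECTOR_ARITHMETIC and __ARM_FEATURE_DIRECTED_ROUNDING.)

/* Illustrative sketch only; the helper name is hypothetical and not part of
 * arm_neon.h. Assumes FP16 vector arithmetic and directed rounding are
 * available on the target. */
#include <arm_neon.h>

static inline float16x4_t round_half4_to_nearest_even(float16x4_t v) {
  /* vrndn_f16 (defined above) rounds each half-precision lane to the
   * nearest integral value, ties to even. */
  return vrndn_f16(v);
}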
+ +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vrndpq_f16(float16x8_t __p0) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndpq_v((int8x16_t)__p0, 40); + return __ret; +} +#else +__ai float16x8_t vrndpq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndpq_v((int8x16_t)__rev0, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vrndp_f16(float16x4_t __p0) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrndp_v((int8x8_t)__p0, 8); + return __ret; +} +#else +__ai float16x4_t vrndp_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrndp_v((int8x8_t)__rev0, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vrndxq_f16(float16x8_t __p0) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndxq_v((int8x16_t)__p0, 40); + return __ret; +} +#else +__ai float16x8_t vrndxq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrndxq_v((int8x16_t)__rev0, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vrndx_f16(float16x4_t __p0) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrndx_v((int8x8_t)__p0, 8); + return __ret; +} +#else +__ai float16x4_t vrndx_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrndx_v((int8x8_t)__rev0, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#endif #if __ARM_ARCH >= 8 && defined(__ARM_FEATURE_NUMERIC_MAXMIN) #ifdef __LITTLE_ENDIAN__ __ai float32x4_t vmaxnmq_f32(float32x4_t __p0, float32x4_t __p1) { @@ -34100,6 +37731,76 @@ __ai float32x2_t vminnm_f32(float32x2_t __p0, float32x2_t __p1) { #endif #endif +#if __ARM_ARCH >= 8 && defined(__ARM_FEATURE_NUMERIC_MAXMIN) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vmaxnmq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vmaxnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai float16x8_t vmaxnmq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vmaxnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vmaxnm_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vmaxnm_v((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4_t vmaxnm_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, 
__p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vmaxnm_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vminnmq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vminnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai float16x8_t vminnmq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vminnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vminnm_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vminnm_v((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4_t vminnm_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vminnm_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#endif #if __ARM_ARCH >= 8 && defined(__aarch64__) #ifdef __LITTLE_ENDIAN__ __ai int64x2_t vcvtaq_s64_f64(float64x2_t __p0) { @@ -39908,22 +43609,6 @@ __ai float64x2_t vrndiq_f64(float64x2_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -__ai float32x4_t vrndiq_f32(float32x4_t __p0) { - float32x4_t __ret; - __ret = (float32x4_t) __builtin_neon_vrndiq_v((int8x16_t)__p0, 41); - return __ret; -} -#else -__ai float32x4_t vrndiq_f32(float32x4_t __p0) { - float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - float32x4_t __ret; - __ret = (float32x4_t) __builtin_neon_vrndiq_v((int8x16_t)__rev0, 41); - __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ __ai float64x1_t vrndi_f64(float64x1_t __p0) { float64x1_t __ret; __ret = (float64x1_t) __builtin_neon_vrndi_v((int8x8_t)__p0, 10); @@ -39938,22 +43623,6 @@ __ai float64x1_t vrndi_f64(float64x1_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -__ai float32x2_t vrndi_f32(float32x2_t __p0) { - float32x2_t __ret; - __ret = (float32x2_t) __builtin_neon_vrndi_v((int8x8_t)__p0, 9); - return __ret; -} -#else -__ai float32x2_t vrndi_f32(float32x2_t __p0) { - float32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); - float32x2_t __ret; - __ret = (float32x2_t) __builtin_neon_vrndi_v((int8x8_t)__rev0, 9); - __ret = __builtin_shufflevector(__ret, __ret, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ __ai float64x2_t vrndmq_f64(float64x2_t __p0) { float64x2_t __ret; __ret = (float64x2_t) __builtin_neon_vrndmq_v((int8x16_t)__p0, 42); @@ -40138,242 +43807,323 @@ __ai float64x1_t vminnm_f64(float64x1_t __p0, float64x1_t __p1) { #endif #endif -#if __ARM_FEATURE_CRYPTO +#if defined(__ARM_FEATURE_DOTPROD) #ifdef __LITTLE_ENDIAN__ -__ai uint8x16_t vaesdq_u8(uint8x16_t __p0, uint8x16_t __p1) { - uint8x16_t __ret; - __ret = (uint8x16_t) __builtin_neon_vaesdq_v((int8x16_t)__p0, (int8x16_t)__p1, 48); +__ai uint32x4_t 
vdotq_u32(uint32x4_t __p0, uint8x16_t __p1, uint8x16_t __p2) { + uint32x4_t __ret; + __ret = (uint32x4_t) __builtin_neon_vdotq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50); return __ret; } #else -__ai uint8x16_t vaesdq_u8(uint8x16_t __p0, uint8x16_t __p1) { - uint8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); +__ai uint32x4_t vdotq_u32(uint32x4_t __p0, uint8x16_t __p1, uint8x16_t __p2) { + uint32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - uint8x16_t __ret; - __ret = (uint8x16_t) __builtin_neon_vaesdq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48); - __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai uint8x16_t vaeseq_u8(uint8x16_t __p0, uint8x16_t __p1) { - uint8x16_t __ret; - __ret = (uint8x16_t) __builtin_neon_vaeseq_v((int8x16_t)__p0, (int8x16_t)__p1, 48); + uint8x16_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + uint32x4_t __ret; + __ret = (uint32x4_t) __builtin_neon_vdotq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); return __ret; } -#else -__ai uint8x16_t vaeseq_u8(uint8x16_t __p0, uint8x16_t __p1) { - uint8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - uint8x16_t __ret; - __ret = (uint8x16_t) __builtin_neon_vaeseq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 48); - __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); +__ai uint32x4_t __noswap_vdotq_u32(uint32x4_t __p0, uint8x16_t __p1, uint8x16_t __p2) { + uint32x4_t __ret; + __ret = (uint32x4_t) __builtin_neon_vdotq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50); return __ret; } #endif #ifdef __LITTLE_ENDIAN__ -__ai uint8x16_t vaesimcq_u8(uint8x16_t __p0) { - uint8x16_t __ret; - __ret = (uint8x16_t) __builtin_neon_vaesimcq_v((int8x16_t)__p0, 48); +__ai int32x4_t vdotq_s32(int32x4_t __p0, int8x16_t __p1, int8x16_t __p2) { + int32x4_t __ret; + __ret = (int32x4_t) __builtin_neon_vdotq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 34); return __ret; } #else -__ai uint8x16_t vaesimcq_u8(uint8x16_t __p0) { - uint8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - uint8x16_t __ret; - __ret = (uint8x16_t) __builtin_neon_vaesimcq_v((int8x16_t)__rev0, 48); - __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai uint8x16_t vaesmcq_u8(uint8x16_t __p0) { - uint8x16_t __ret; - __ret = (uint8x16_t) __builtin_neon_vaesmcq_v((int8x16_t)__p0, 48); +__ai int32x4_t vdotq_s32(int32x4_t __p0, int8x16_t __p1, int8x16_t __p2) { + int32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + int8x16_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + int8x16_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + int32x4_t __ret; + __ret = 
(int32x4_t) __builtin_neon_vdotq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 34); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); return __ret; } -#else -__ai uint8x16_t vaesmcq_u8(uint8x16_t __p0) { - uint8x16_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); - uint8x16_t __ret; - __ret = (uint8x16_t) __builtin_neon_vaesmcq_v((int8x16_t)__rev0, 48); - __ret = __builtin_shufflevector(__ret, __ret, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); +__ai int32x4_t __noswap_vdotq_s32(int32x4_t __p0, int8x16_t __p1, int8x16_t __p2) { + int32x4_t __ret; + __ret = (int32x4_t) __builtin_neon_vdotq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 34); return __ret; } #endif #ifdef __LITTLE_ENDIAN__ -__ai uint32x4_t vsha1cq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) { - uint32x4_t __ret; - __ret = (uint32x4_t) __builtin_neon_vsha1cq_u32((int8x16_t)__p0, __p1, (int8x16_t)__p2); +__ai uint32x2_t vdot_u32(uint32x2_t __p0, uint8x8_t __p1, uint8x8_t __p2) { + uint32x2_t __ret; + __ret = (uint32x2_t) __builtin_neon_vdot_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 18); return __ret; } #else -__ai uint32x4_t vsha1cq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) { - uint32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - uint32x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); - uint32x4_t __ret; - __ret = (uint32x4_t) __builtin_neon_vsha1cq_u32((int8x16_t)__rev0, __p1, (int8x16_t)__rev2); - __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); +__ai uint32x2_t vdot_u32(uint32x2_t __p0, uint8x8_t __p1, uint8x8_t __p2) { + uint32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); + uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + uint8x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); + uint32x2_t __ret; + __ret = (uint32x2_t) __builtin_neon_vdot_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 18); + __ret = __builtin_shufflevector(__ret, __ret, 1, 0); + return __ret; +} +__ai uint32x2_t __noswap_vdot_u32(uint32x2_t __p0, uint8x8_t __p1, uint8x8_t __p2) { + uint32x2_t __ret; + __ret = (uint32x2_t) __builtin_neon_vdot_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 18); return __ret; } #endif #ifdef __LITTLE_ENDIAN__ -__ai uint32_t vsha1h_u32(uint32_t __p0) { - uint32_t __ret; - __ret = (uint32_t) __builtin_neon_vsha1h_u32(__p0); +__ai int32x2_t vdot_s32(int32x2_t __p0, int8x8_t __p1, int8x8_t __p2) { + int32x2_t __ret; + __ret = (int32x2_t) __builtin_neon_vdot_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 2); return __ret; } #else -__ai uint32_t vsha1h_u32(uint32_t __p0) { - uint32_t __ret; - __ret = (uint32_t) __builtin_neon_vsha1h_u32(__p0); +__ai int32x2_t vdot_s32(int32x2_t __p0, int8x8_t __p1, int8x8_t __p2) { + int32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); + int8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + int8x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); + int32x2_t __ret; + __ret = (int32x2_t) __builtin_neon_vdot_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 2); + __ret = __builtin_shufflevector(__ret, __ret, 1, 0); + return __ret; +} +__ai int32x2_t __noswap_vdot_s32(int32x2_t __p0, int8x8_t __p1, int8x8_t __p2) { + int32x2_t __ret; + __ret = (int32x2_t) __builtin_neon_vdot_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 2); 
return __ret; } #endif #ifdef __LITTLE_ENDIAN__ -__ai uint32x4_t vsha1mq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) { - uint32x4_t __ret; - __ret = (uint32x4_t) __builtin_neon_vsha1mq_u32((int8x16_t)__p0, __p1, (int8x16_t)__p2); - return __ret; -} +#define vdotq_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \ + uint32x4_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + uint8x8_t __s2 = __p2; \ + uint32x4_t __ret; \ +uint8x8_t __reint = __s2; \ +uint32x4_t __reint1 = __builtin_shufflevector(*(uint32x2_t *) &__reint, *(uint32x2_t *) &__reint, __p3, __p3, __p3, __p3); \ + __ret = vdotq_u32(__s0, __s1, *(uint8x16_t *) &__reint1); \ + __ret; \ +}) #else -__ai uint32x4_t vsha1mq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) { - uint32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - uint32x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); - uint32x4_t __ret; - __ret = (uint32x4_t) __builtin_neon_vsha1mq_u32((int8x16_t)__rev0, __p1, (int8x16_t)__rev2); - __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); - return __ret; -} +#define vdotq_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \ + uint32x4_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + uint8x8_t __s2 = __p2; \ + uint32x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint32x4_t __ret; \ +uint8x8_t __reint = __rev2; \ +uint32x4_t __reint1 = __builtin_shufflevector(*(uint32x2_t *) &__reint, *(uint32x2_t *) &__reint, __p3, __p3, __p3, __p3); \ + __ret = __noswap_vdotq_u32(__rev0, __rev1, *(uint8x16_t *) &__reint1); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) #endif #ifdef __LITTLE_ENDIAN__ -__ai uint32x4_t vsha1pq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) { - uint32x4_t __ret; - __ret = (uint32x4_t) __builtin_neon_vsha1pq_u32((int8x16_t)__p0, __p1, (int8x16_t)__p2); - return __ret; -} +#define vdotq_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \ + int32x4_t __s0 = __p0; \ + int8x16_t __s1 = __p1; \ + int8x8_t __s2 = __p2; \ + int32x4_t __ret; \ +int8x8_t __reint = __s2; \ +int32x4_t __reint1 = __builtin_shufflevector(*(uint32x2_t *) &__reint, *(uint32x2_t *) &__reint, __p3, __p3, __p3, __p3); \ + __ret = vdotq_s32(__s0, __s1, *(int8x16_t *) &__reint1); \ + __ret; \ +}) #else -__ai uint32x4_t vsha1pq_u32(uint32x4_t __p0, uint32_t __p1, uint32x4_t __p2) { - uint32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - uint32x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); - uint32x4_t __ret; - __ret = (uint32x4_t) __builtin_neon_vsha1pq_u32((int8x16_t)__rev0, __p1, (int8x16_t)__rev2); - __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); - return __ret; -} +#define vdotq_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \ + int32x4_t __s0 = __p0; \ + int8x16_t __s1 = __p1; \ + int8x8_t __s2 = __p2; \ + int32x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + int8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x8_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \ + int32x4_t __ret; \ +int8x8_t __reint = __rev2; \ +int32x4_t __reint1 = __builtin_shufflevector(*(uint32x2_t *) &__reint, *(uint32x2_t *) &__reint, __p3, 
__p3, __p3, __p3); \ + __ret = __noswap_vdotq_s32(__rev0, __rev1, *(int8x16_t *) &__reint1); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) #endif #ifdef __LITTLE_ENDIAN__ -__ai uint32x4_t vsha1su0q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) { - uint32x4_t __ret; - __ret = (uint32x4_t) __builtin_neon_vsha1su0q_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50); - return __ret; -} +#define vdot_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \ + uint32x2_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + uint8x8_t __s2 = __p2; \ + uint32x2_t __ret; \ +uint8x8_t __reint = __s2; \ +uint32x2_t __reint1 = __builtin_shufflevector(*(uint32x2_t *) &__reint, *(uint32x2_t *) &__reint, __p3, __p3); \ + __ret = vdot_u32(__s0, __s1, *(uint8x8_t *) &__reint1); \ + __ret; \ +}) #else -__ai uint32x4_t vsha1su0q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) { - uint32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - uint32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); - uint32x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); - uint32x4_t __ret; - __ret = (uint32x4_t) __builtin_neon_vsha1su0q_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50); - __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); - return __ret; -} +#define vdot_lane_u32(__p0, __p1, __p2, __p3) __extension__ ({ \ + uint32x2_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + uint8x8_t __s2 = __p2; \ + uint32x2_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \ + uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x8_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint32x2_t __ret; \ +uint8x8_t __reint = __rev2; \ +uint32x2_t __reint1 = __builtin_shufflevector(*(uint32x2_t *) &__reint, *(uint32x2_t *) &__reint, __p3, __p3); \ + __ret = __noswap_vdot_u32(__rev0, __rev1, *(uint8x8_t *) &__reint1); \ + __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \ + __ret; \ +}) #endif #ifdef __LITTLE_ENDIAN__ -__ai uint32x4_t vsha1su1q_u32(uint32x4_t __p0, uint32x4_t __p1) { - uint32x4_t __ret; - __ret = (uint32x4_t) __builtin_neon_vsha1su1q_v((int8x16_t)__p0, (int8x16_t)__p1, 50); - return __ret; -} +#define vdot_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \ + int32x2_t __s0 = __p0; \ + int8x8_t __s1 = __p1; \ + int8x8_t __s2 = __p2; \ + int32x2_t __ret; \ +int8x8_t __reint = __s2; \ +int32x2_t __reint1 = __builtin_shufflevector(*(uint32x2_t *) &__reint, *(uint32x2_t *) &__reint, __p3, __p3); \ + __ret = vdot_s32(__s0, __s1, *(int8x8_t *) &__reint1); \ + __ret; \ +}) #else -__ai uint32x4_t vsha1su1q_u32(uint32x4_t __p0, uint32x4_t __p1) { - uint32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - uint32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); - uint32x4_t __ret; - __ret = (uint32x4_t) __builtin_neon_vsha1su1q_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50); - __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); - return __ret; -} +#define vdot_lane_s32(__p0, __p1, __p2, __p3) __extension__ ({ \ + int32x2_t __s0 = __p0; \ + int8x8_t __s1 = __p1; \ + int8x8_t __s2 = __p2; \ + int32x2_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \ + int8x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x8_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 7, 6, 5, 4, 3, 2, 1, 0); \ + int32x2_t __ret; \ +int8x8_t __reint = 
__rev2; \ +int32x2_t __reint1 = __builtin_shufflevector(*(uint32x2_t *) &__reint, *(uint32x2_t *) &__reint, __p3, __p3); \ + __ret = __noswap_vdot_s32(__rev0, __rev1, *(int8x8_t *) &__reint1); \ + __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \ + __ret; \ +}) #endif +#endif +#if defined(__ARM_FEATURE_DOTPROD) && defined(__aarch64__) #ifdef __LITTLE_ENDIAN__ -__ai uint32x4_t vsha256hq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) { - uint32x4_t __ret; - __ret = (uint32x4_t) __builtin_neon_vsha256hq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50); - return __ret; -} +#define vdotq_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \ + uint32x4_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + uint8x16_t __s2 = __p2; \ + uint32x4_t __ret; \ +uint8x16_t __reint = __s2; \ +uint32x4_t __reint1 = __builtin_shufflevector(*(uint32x4_t *) &__reint, *(uint32x4_t *) &__reint, __p3, __p3, __p3, __p3); \ + __ret = vdotq_u32(__s0, __s1, *(uint8x16_t *) &__reint1); \ + __ret; \ +}) #else -__ai uint32x4_t vsha256hq_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) { - uint32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - uint32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); - uint32x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); - uint32x4_t __ret; - __ret = (uint32x4_t) __builtin_neon_vsha256hq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50); - __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); - return __ret; -} +#define vdotq_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \ + uint32x4_t __s0 = __p0; \ + uint8x16_t __s1 = __p1; \ + uint8x16_t __s2 = __p2; \ + uint32x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + uint8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint32x4_t __ret; \ +uint8x16_t __reint = __rev2; \ +uint32x4_t __reint1 = __builtin_shufflevector(*(uint32x4_t *) &__reint, *(uint32x4_t *) &__reint, __p3, __p3, __p3, __p3); \ + __ret = __noswap_vdotq_u32(__rev0, __rev1, *(uint8x16_t *) &__reint1); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) #endif #ifdef __LITTLE_ENDIAN__ -__ai uint32x4_t vsha256h2q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) { - uint32x4_t __ret; - __ret = (uint32x4_t) __builtin_neon_vsha256h2q_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50); - return __ret; -} +#define vdotq_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \ + int32x4_t __s0 = __p0; \ + int8x16_t __s1 = __p1; \ + int8x16_t __s2 = __p2; \ + int32x4_t __ret; \ +int8x16_t __reint = __s2; \ +int32x4_t __reint1 = __builtin_shufflevector(*(uint32x4_t *) &__reint, *(uint32x4_t *) &__reint, __p3, __p3, __p3, __p3); \ + __ret = vdotq_s32(__s0, __s1, *(int8x16_t *) &__reint1); \ + __ret; \ +}) #else -__ai uint32x4_t vsha256h2q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) { - uint32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - uint32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); - uint32x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); - uint32x4_t __ret; - __ret = (uint32x4_t) __builtin_neon_vsha256h2q_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50); - __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); - return __ret; 
-} +#define vdotq_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \ + int32x4_t __s0 = __p0; \ + int8x16_t __s1 = __p1; \ + int8x16_t __s2 = __p2; \ + int32x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + int8x16_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x16_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + int32x4_t __ret; \ +int8x16_t __reint = __rev2; \ +int32x4_t __reint1 = __builtin_shufflevector(*(uint32x4_t *) &__reint, *(uint32x4_t *) &__reint, __p3, __p3, __p3, __p3); \ + __ret = __noswap_vdotq_s32(__rev0, __rev1, *(int8x16_t *) &__reint1); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) #endif #ifdef __LITTLE_ENDIAN__ -__ai uint32x4_t vsha256su0q_u32(uint32x4_t __p0, uint32x4_t __p1) { - uint32x4_t __ret; - __ret = (uint32x4_t) __builtin_neon_vsha256su0q_v((int8x16_t)__p0, (int8x16_t)__p1, 50); - return __ret; -} +#define vdot_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \ + uint32x2_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + uint8x16_t __s2 = __p2; \ + uint32x2_t __ret; \ +uint8x16_t __reint = __s2; \ +uint32x2_t __reint1 = __builtin_shufflevector(*(uint32x4_t *) &__reint, *(uint32x4_t *) &__reint, __p3, __p3); \ + __ret = vdot_u32(__s0, __s1, *(uint8x8_t *) &__reint1); \ + __ret; \ +}) #else -__ai uint32x4_t vsha256su0q_u32(uint32x4_t __p0, uint32x4_t __p1) { - uint32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - uint32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); - uint32x4_t __ret; - __ret = (uint32x4_t) __builtin_neon_vsha256su0q_v((int8x16_t)__rev0, (int8x16_t)__rev1, 50); - __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); - return __ret; -} +#define vdot_laneq_u32(__p0, __p1, __p2, __p3) __extension__ ({ \ + uint32x2_t __s0 = __p0; \ + uint8x8_t __s1 = __p1; \ + uint8x16_t __s2 = __p2; \ + uint32x2_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \ + uint8x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint8x16_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + uint32x2_t __ret; \ +uint8x16_t __reint = __rev2; \ +uint32x2_t __reint1 = __builtin_shufflevector(*(uint32x4_t *) &__reint, *(uint32x4_t *) &__reint, __p3, __p3); \ + __ret = __noswap_vdot_u32(__rev0, __rev1, *(uint8x8_t *) &__reint1); \ + __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \ + __ret; \ +}) #endif #ifdef __LITTLE_ENDIAN__ -__ai uint32x4_t vsha256su1q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) { - uint32x4_t __ret; - __ret = (uint32x4_t) __builtin_neon_vsha256su1q_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 50); - return __ret; -} +#define vdot_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \ + int32x2_t __s0 = __p0; \ + int8x8_t __s1 = __p1; \ + int8x16_t __s2 = __p2; \ + int32x2_t __ret; \ +int8x16_t __reint = __s2; \ +int32x2_t __reint1 = __builtin_shufflevector(*(uint32x4_t *) &__reint, *(uint32x4_t *) &__reint, __p3, __p3); \ + __ret = vdot_s32(__s0, __s1, *(int8x8_t *) &__reint1); \ + __ret; \ +}) #else -__ai uint32x4_t vsha256su1q_u32(uint32x4_t __p0, uint32x4_t __p1, uint32x4_t __p2) { - uint32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - uint32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); - uint32x4_t __rev2; __rev2 = 
__builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); - uint32x4_t __ret; - __ret = (uint32x4_t) __builtin_neon_vsha256su1q_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 50); - __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); - return __ret; -} +#define vdot_laneq_s32(__p0, __p1, __p2, __p3) __extension__ ({ \ + int32x2_t __s0 = __p0; \ + int8x8_t __s1 = __p1; \ + int8x16_t __s2 = __p2; \ + int32x2_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 1, 0); \ + int8x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + int8x16_t __rev2; __rev2 = __builtin_shufflevector(__s2, __s2, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ + int32x2_t __ret; \ +int8x16_t __reint = __rev2; \ +int32x2_t __reint1 = __builtin_shufflevector(*(uint32x4_t *) &__reint, *(uint32x4_t *) &__reint, __p3, __p3); \ + __ret = __noswap_vdot_s32(__rev0, __rev1, *(int8x8_t *) &__reint1); \ + __ret = __builtin_shufflevector(__ret, __ret, 1, 0); \ + __ret; \ +}) #endif #endif @@ -40425,6 +44175,40 @@ __ai float32x2_t __noswap_vfma_f32(float32x2_t __p0, float32x2_t __p1, float32x2 #endif #ifdef __LITTLE_ENDIAN__ +__ai float32x4_t vfmaq_n_f32(float32x4_t __p0, float32x4_t __p1, float32_t __p2) { + float32x4_t __ret; + __ret = vfmaq_f32(__p0, __p1, (float32x4_t) {__p2, __p2, __p2, __p2}); + return __ret; +} +#else +__ai float32x4_t vfmaq_n_f32(float32x4_t __p0, float32x4_t __p1, float32_t __p2) { + float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float32x4_t __ret; + __ret = __noswap_vfmaq_f32(__rev0, __rev1, (float32x4_t) {__p2, __p2, __p2, __p2}); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float32x2_t vfma_n_f32(float32x2_t __p0, float32x2_t __p1, float32_t __p2) { + float32x2_t __ret; + __ret = vfma_f32(__p0, __p1, (float32x2_t) {__p2, __p2}); + return __ret; +} +#else +__ai float32x2_t vfma_n_f32(float32x2_t __p0, float32x2_t __p1, float32_t __p2) { + float32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); + float32x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); + float32x2_t __ret; + __ret = __noswap_vfma_f32(__rev0, __rev1, (float32x2_t) {__p2, __p2}); + __ret = __builtin_shufflevector(__ret, __ret, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ __ai float32x4_t vfmsq_f32(float32x4_t __p0, float32x4_t __p1, float32x4_t __p2) { float32x4_t __ret; __ret = vfmaq_f32(__p0, -__p1, __p2); @@ -40461,7 +44245,7 @@ __ai float32x2_t vfms_f32(float32x2_t __p0, float32x2_t __p1, float32x2_t __p2) #endif #endif -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(__aarch64__) +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) #ifdef __LITTLE_ENDIAN__ __ai float16x8_t vabdq_f16(float16x8_t __p0, float16x8_t __p1) { float16x8_t __ret; @@ -41593,162 +45377,779 @@ __ai uint16x4_t vcvtp_u16_f16(float16x4_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -__ai float16x8_t vdivq_f16(float16x8_t __p0, float16x8_t __p1) { +#define vextq_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16x8_t __ret; \ + __ret = (float16x8_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 40); \ + __ret; \ +}) +#else +#define vextq_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, 
__s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x8_t __ret; \ + __ret = (float16x8_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 40); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vext_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x4_t __ret; \ + __ret = (float16x4_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 8); \ + __ret; \ +}) +#else +#define vext_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \ + float16x4_t __ret; \ + __ret = (float16x4_t) __builtin_neon_vext_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 8); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vfmaq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { float16x8_t __ret; - __ret = __p0 / __p1; + __ret = (float16x8_t) __builtin_neon_vfmaq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40); return __ret; } #else -__ai float16x8_t vdivq_f16(float16x8_t __p0, float16x8_t __p1) { +__ai float16x8_t vfmaq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); float16x8_t __ret; - __ret = __rev0 / __rev1; + __ret = (float16x8_t) __builtin_neon_vfmaq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +__ai float16x8_t __noswap_vfmaq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vfmaq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vfma_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vfma_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8); + return __ret; +} +#else +__ai float16x4_t vfma_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vfma_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +__ai float16x4_t __noswap_vfma_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vfma_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vfmsq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { + float16x8_t __ret; + __ret = vfmaq_f16(__p0, -__p1, __p2); + return __ret; +} +#else +__ai float16x8_t vfmsq_f16(float16x8_t 
__p0, float16x8_t __p1, float16x8_t __p2) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = __noswap_vfmaq_f16(__rev0, -__rev1, __rev2); __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); return __ret; } #endif #ifdef __LITTLE_ENDIAN__ -__ai float16x4_t vdiv_f16(float16x4_t __p0, float16x4_t __p1) { +__ai float16x4_t vfms_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { float16x4_t __ret; - __ret = __p0 / __p1; + __ret = vfma_f16(__p0, -__p1, __p2); return __ret; } #else -__ai float16x4_t vdiv_f16(float16x4_t __p0, float16x4_t __p1) { +__ai float16x4_t vfms_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); float16x4_t __ret; - __ret = __rev0 / __rev1; + __ret = __noswap_vfma_f16(__rev0, -__rev1, __rev2); __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); return __ret; } #endif #ifdef __LITTLE_ENDIAN__ -#define vduph_lane_f16(__p0, __p1) __extension__ ({ \ - float16x4_t __s0 = __p0; \ - float16_t __ret; \ - __ret = (float16_t) __builtin_neon_vduph_lane_f16((int8x8_t)__s0, __p1); \ +__ai float16x8_t vmaxq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai float16x8_t vmaxq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vmax_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vmax_v((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4_t vmax_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vminq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vminq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai float16x8_t vminq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef 
__LITTLE_ENDIAN__ +__ai float16x4_t vmin_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vmin_v((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4_t vmin_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vmulq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = __p0 * __p1; + return __ret; +} +#else +__ai float16x8_t vmulq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = __rev0 * __rev1; + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vmul_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = __p0 * __p1; + return __ret; +} +#else +__ai float16x4_t vmul_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = __rev0 * __rev1; + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vmulq_lane_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x8_t __ret; \ + __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2); \ __ret; \ }) #else -#define vduph_lane_f16(__p0, __p1) __extension__ ({ \ - float16x4_t __s0 = __p0; \ - float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ - float16_t __ret; \ - __ret = (float16_t) __builtin_neon_vduph_lane_f16((int8x8_t)__rev0, __p1); \ +#define vmulq_lane_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \ + float16x8_t __ret; \ + __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2); \ + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vduph_laneq_f16(__p0, __p1) __extension__ ({ \ - float16x8_t __s0 = __p0; \ - float16_t __ret; \ - __ret = (float16_t) __builtin_neon_vduph_laneq_f16((int8x16_t)__s0, __p1); \ +#define vmul_lane_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x4_t __ret; \ + __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2); \ __ret; \ }) #else -#define vduph_laneq_f16(__p0, __p1) __extension__ ({ \ - float16x8_t __s0 = __p0; \ - float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16_t __ret; \ - __ret = (float16_t) __builtin_neon_vduph_laneq_f16((int8x16_t)__rev0, __p1); \ +#define 
vmul_lane_f16(__p0, __p1, __p2) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \ + float16x4_t __ret; \ + __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2); \ + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ __ret; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vextq_f16(__p0, __p1, __p2) __extension__ ({ \ +#define vmulq_n_f16(__p0, __p1) __extension__ ({ \ float16x8_t __s0 = __p0; \ - float16x8_t __s1 = __p1; \ + float16_t __s1 = __p1; \ float16x8_t __ret; \ - __ret = (float16x8_t) __builtin_neon_vextq_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 40); \ + __ret = __s0 * (float16x8_t) {__s1, __s1, __s1, __s1, __s1, __s1, __s1, __s1}; \ __ret; \ }) #else -#define vextq_f16(__p0, __p1, __p2) __extension__ ({ \ +#define vmulq_n_f16(__p0, __p1) __extension__ ({ \ float16x8_t __s0 = __p0; \ - float16x8_t __s1 = __p1; \ + float16_t __s1 = __p1; \ float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ float16x8_t __ret; \ - __ret = (float16x8_t) __builtin_neon_vextq_v((int8x16_t)__rev0, (int8x16_t)__rev1, __p2, 40); \ + __ret = __rev0 * (float16x8_t) {__s1, __s1, __s1, __s1, __s1, __s1, __s1, __s1}; \ __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ __ret; \ }) #endif #ifdef __LITTLE_ENDIAN__ -#define vext_f16(__p0, __p1, __p2) __extension__ ({ \ +#define vmul_n_f16(__p0, __p1) __extension__ ({ \ float16x4_t __s0 = __p0; \ - float16x4_t __s1 = __p1; \ + float16_t __s1 = __p1; \ float16x4_t __ret; \ - __ret = (float16x4_t) __builtin_neon_vext_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 8); \ + __ret = __s0 * (float16x4_t) {__s1, __s1, __s1, __s1}; \ __ret; \ }) #else -#define vext_f16(__p0, __p1, __p2) __extension__ ({ \ +#define vmul_n_f16(__p0, __p1) __extension__ ({ \ float16x4_t __s0 = __p0; \ - float16x4_t __s1 = __p1; \ + float16_t __s1 = __p1; \ float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ - float16x4_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \ float16x4_t __ret; \ - __ret = (float16x4_t) __builtin_neon_vext_v((int8x8_t)__rev0, (int8x8_t)__rev1, __p2, 8); \ + __ret = __rev0 * (float16x4_t) {__s1, __s1, __s1, __s1}; \ __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ __ret; \ }) #endif #ifdef __LITTLE_ENDIAN__ -__ai float16x8_t vfmaq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { +__ai float16x8_t vnegq_f16(float16x8_t __p0) { float16x8_t __ret; - __ret = (float16x8_t) __builtin_neon_vfmaq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40); + __ret = -__p0; return __ret; } #else -__ai float16x8_t vfmaq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { +__ai float16x8_t vnegq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = -__rev0; + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vneg_f16(float16x4_t __p0) { + float16x4_t __ret; + __ret = -__p0; + return __ret; +} +#else +__ai float16x4_t vneg_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret 
= -__rev0; + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vpadd_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vpadd_v((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4_t vpadd_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vpadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vpmax_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vpmax_v((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4_t vpmax_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vpmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vpmin_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vpmin_v((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4_t vpmin_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vpmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vrecpeq_f16(float16x8_t __p0) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrecpeq_v((int8x16_t)__p0, 40); + return __ret; +} +#else +__ai float16x8_t vrecpeq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrecpeq_v((int8x16_t)__rev0, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vrecpe_f16(float16x4_t __p0) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrecpe_v((int8x8_t)__p0, 8); + return __ret; +} +#else +__ai float16x4_t vrecpe_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrecpe_v((int8x8_t)__rev0, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vrecpsq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrecpsq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai float16x8_t vrecpsq_f16(float16x8_t __p0, float16x8_t __p1) { float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); - 
float16x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); float16x8_t __ret; - __ret = (float16x8_t) __builtin_neon_vfmaq_v((int8x16_t)__rev0, (int8x16_t)__rev1, (int8x16_t)__rev2, 40); + __ret = (float16x8_t) __builtin_neon_vrecpsq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40); __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); return __ret; } -__ai float16x8_t __noswap_vfmaq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vrecps_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrecps_v((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4_t vrecps_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrecps_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vrev64q_f16(float16x8_t __p0) { float16x8_t __ret; - __ret = (float16x8_t) __builtin_neon_vfmaq_v((int8x16_t)__p0, (int8x16_t)__p1, (int8x16_t)__p2, 40); + __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0, 7, 6, 5, 4); + return __ret; +} +#else +__ai float16x8_t vrev64q_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0, 7, 6, 5, 4); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); return __ret; } #endif #ifdef __LITTLE_ENDIAN__ -__ai float16x4_t vfma_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { +__ai float16x4_t vrev64_f16(float16x4_t __p0) { float16x4_t __ret; - __ret = (float16x4_t) __builtin_neon_vfma_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8); + __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); return __ret; } #else -__ai float16x4_t vfma_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { +__ai float16x4_t vrev64_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vrsqrteq_f16(float16x8_t __p0) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrsqrteq_v((int8x16_t)__p0, 40); + return __ret; +} +#else +__ai float16x8_t vrsqrteq_f16(float16x8_t __p0) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrsqrteq_v((int8x16_t)__rev0, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vrsqrte_f16(float16x4_t __p0) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrsqrte_v((int8x8_t)__p0, 8); + return __ret; +} +#else +__ai float16x4_t vrsqrte_f16(float16x4_t __p0) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrsqrte_v((int8x8_t)__rev0, 8); + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} 
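Editor's note (illustrative only, not part of the generated header): the vrecpeq_f16/vrecpsq_f16 pair defined just above is normally used together, since FRECPS returns the Newton-Raphson factor (2 - a*x) and each multiply-refine step roughly doubles the accuracy of the initial FRECPE estimate. A minimal usage sketch, assuming user code that includes <arm_neon.h> on a target where __ARM_FEATURE_FP16_VECTOR_ARITHMETIC is defined; the helper name fast_recipq_f16 is hypothetical.

static inline float16x8_t fast_recipq_f16(float16x8_t a) {
  float16x8_t e = vrecpeq_f16(a);        /* coarse initial estimate of 1/a      */
  e = vmulq_f16(e, vrecpsq_f16(a, e));   /* vrecpsq_f16(a, e) computes 2 - a*e  */
  e = vmulq_f16(e, vrecpsq_f16(a, e));   /* second refinement step              */
  return e;
}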
+#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vrsqrtsq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrsqrtsq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai float16x8_t vrsqrtsq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = (float16x8_t) __builtin_neon_vrsqrtsq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40); + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vrsqrts_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = (float16x4_t) __builtin_neon_vrsqrts_v((int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4_t vrsqrts_f16(float16x4_t __p0, float16x4_t __p1) { float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); - float16x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); float16x4_t __ret; - __ret = (float16x4_t) __builtin_neon_vfma_v((int8x8_t)__rev0, (int8x8_t)__rev1, (int8x8_t)__rev2, 8); + __ret = (float16x4_t) __builtin_neon_vrsqrts_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); return __ret; } -__ai float16x4_t __noswap_vfma_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vsubq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = __p0 - __p1; + return __ret; +} +#else +__ai float16x8_t vsubq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = __rev0 - __rev1; + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vsub_f16(float16x4_t __p0, float16x4_t __p1) { float16x4_t __ret; - __ret = (float16x4_t) __builtin_neon_vfma_v((int8x8_t)__p0, (int8x8_t)__p1, (int8x8_t)__p2, 8); + __ret = __p0 - __p1; + return __ret; +} +#else +__ai float16x4_t vsub_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = __rev0 - __rev1; + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8x2_t vtrnq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8x2_t __ret; + __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai float16x8x2_t vtrnq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8x2_t __ret; + __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 40); + + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 
3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4x2_t vtrn_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4x2_t __ret; + __builtin_neon_vtrn_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4x2_t vtrn_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4x2_t __ret; + __builtin_neon_vtrn_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 8); + + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8x2_t vuzpq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8x2_t __ret; + __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai float16x8x2_t vuzpq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8x2_t __ret; + __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 40); + + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); return __ret; } #endif #ifdef __LITTLE_ENDIAN__ +__ai float16x4x2_t vuzp_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4x2_t __ret; + __builtin_neon_vuzp_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4x2_t vuzp_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4x2_t __ret; + __builtin_neon_vuzp_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 8); + + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x8x2_t vzipq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8x2_t __ret; + __builtin_neon_vzipq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 40); + return __ret; +} +#else +__ai float16x8x2_t vzipq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8x2_t __ret; + __builtin_neon_vzipq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 40); + + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4x2_t vzip_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4x2_t __ret; + __builtin_neon_vzip_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 8); + return __ret; +} +#else +__ai float16x4x2_t vzip_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4x2_t __ret; + __builtin_neon_vzip_v(&__ret, 
(int8x8_t)__rev0, (int8x8_t)__rev1, 8); + + __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); + __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); + return __ret; +} +#endif + +#endif +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(__aarch64__) +#ifdef __LITTLE_ENDIAN__ +__ai float16x8_t vdivq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __ret; + __ret = __p0 / __p1; + return __ret; +} +#else +__ai float16x8_t vdivq_f16(float16x8_t __p0, float16x8_t __p1) { + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); + float16x8_t __ret; + __ret = __rev0 / __rev1; + __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +__ai float16x4_t vdiv_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __ret; + __ret = __p0 / __p1; + return __ret; +} +#else +__ai float16x4_t vdiv_f16(float16x4_t __p0, float16x4_t __p1) { + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); + float16x4_t __ret; + __ret = __rev0 / __rev1; + __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vduph_lane_f16(__p0, __p1) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vduph_lane_f16((int8x8_t)__s0, __p1); \ + __ret; \ +}) +#else +#define vduph_lane_f16(__p0, __p1) __extension__ ({ \ + float16x4_t __s0 = __p0; \ + float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vduph_lane_f16((int8x8_t)__rev0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ +#define vduph_laneq_f16(__p0, __p1) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vduph_laneq_f16((int8x16_t)__s0, __p1); \ + __ret; \ +}) +#else +#define vduph_laneq_f16(__p0, __p1) __extension__ ({ \ + float16x8_t __s0 = __p0; \ + float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vduph_laneq_f16((int8x16_t)__rev0, __p1); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ #define vfmah_lane_f16(__p0, __p1, __p2, __p3) __extension__ ({ \ float16_t __s0 = __p0; \ float16_t __s1 = __p1; \ @@ -41981,42 +46382,6 @@ __ai float16x4_t __noswap_vfma_f16(float16x4_t __p0, float16x4_t __p1, float16x4 #endif #ifdef __LITTLE_ENDIAN__ -__ai float16x8_t vfmsq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { - float16x8_t __ret; - __ret = vfmaq_f16(__p0, -__p1, __p2); - return __ret; -} -#else -__ai float16x8_t vfmsq_f16(float16x8_t __p0, float16x8_t __p1, float16x8_t __p2) { - float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); - float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); - float16x8_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 7, 6, 5, 4, 3, 2, 1, 0); - float16x8_t __ret; - __ret = __noswap_vfmaq_f16(__rev0, -__rev1, __rev2); - __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai float16x4_t vfms_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t 
__p2) { - float16x4_t __ret; - __ret = vfma_f16(__p0, -__p1, __p2); - return __ret; -} -#else -__ai float16x4_t vfms_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) { - float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); - float16x4_t __rev2; __rev2 = __builtin_shufflevector(__p2, __p2, 3, 2, 1, 0); - float16x4_t __ret; - __ret = __noswap_vfma_f16(__rev0, -__rev1, __rev2); - __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ #define vfmsh_lane_f16(__p0_0, __p1_0, __p2_0, __p3_0) __extension__ ({ \ float16_t __s0_0 = __p0_0; \ float16_t __s1_0 = __p1_0; \ @@ -42201,74 +46566,6 @@ __ai float16x4_t vfms_f16(float16x4_t __p0, float16x4_t __p1, float16x4_t __p2) #endif #ifdef __LITTLE_ENDIAN__ -__ai float16x8_t vmaxq_f16(float16x8_t __p0, float16x8_t __p1) { - float16x8_t __ret; - __ret = (float16x8_t) __builtin_neon_vmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); - return __ret; -} -#else -__ai float16x8_t vmaxq_f16(float16x8_t __p0, float16x8_t __p1) { - float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); - float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); - float16x8_t __ret; - __ret = (float16x8_t) __builtin_neon_vmaxq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40); - __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai float16x4_t vmax_f16(float16x4_t __p0, float16x4_t __p1) { - float16x4_t __ret; - __ret = (float16x4_t) __builtin_neon_vmax_v((int8x8_t)__p0, (int8x8_t)__p1, 8); - return __ret; -} -#else -__ai float16x4_t vmax_f16(float16x4_t __p0, float16x4_t __p1) { - float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); - float16x4_t __ret; - __ret = (float16x4_t) __builtin_neon_vmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); - __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai float16x8_t vmaxnmq_f16(float16x8_t __p0, float16x8_t __p1) { - float16x8_t __ret; - __ret = (float16x8_t) __builtin_neon_vmaxnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); - return __ret; -} -#else -__ai float16x8_t vmaxnmq_f16(float16x8_t __p0, float16x8_t __p1) { - float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); - float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); - float16x8_t __ret; - __ret = (float16x8_t) __builtin_neon_vmaxnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40); - __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai float16x4_t vmaxnm_f16(float16x4_t __p0, float16x4_t __p1) { - float16x4_t __ret; - __ret = (float16x4_t) __builtin_neon_vmaxnm_v((int8x8_t)__p0, (int8x8_t)__p1, 8); - return __ret; -} -#else -__ai float16x4_t vmaxnm_f16(float16x4_t __p0, float16x4_t __p1) { - float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); - float16x4_t __ret; - __ret = (float16x4_t) __builtin_neon_vmaxnm_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); - __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ 
#define vmaxnmvq_f16(__p0) __extension__ ({ \ float16x8_t __s0 = __p0; \ float16_t __ret; \ @@ -42337,74 +46634,6 @@ __ai float16x4_t vmaxnm_f16(float16x4_t __p0, float16x4_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai float16x8_t vminq_f16(float16x8_t __p0, float16x8_t __p1) { - float16x8_t __ret; - __ret = (float16x8_t) __builtin_neon_vminq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); - return __ret; -} -#else -__ai float16x8_t vminq_f16(float16x8_t __p0, float16x8_t __p1) { - float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); - float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); - float16x8_t __ret; - __ret = (float16x8_t) __builtin_neon_vminq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40); - __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai float16x4_t vmin_f16(float16x4_t __p0, float16x4_t __p1) { - float16x4_t __ret; - __ret = (float16x4_t) __builtin_neon_vmin_v((int8x8_t)__p0, (int8x8_t)__p1, 8); - return __ret; -} -#else -__ai float16x4_t vmin_f16(float16x4_t __p0, float16x4_t __p1) { - float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); - float16x4_t __ret; - __ret = (float16x4_t) __builtin_neon_vmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); - __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai float16x8_t vminnmq_f16(float16x8_t __p0, float16x8_t __p1) { - float16x8_t __ret; - __ret = (float16x8_t) __builtin_neon_vminnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); - return __ret; -} -#else -__ai float16x8_t vminnmq_f16(float16x8_t __p0, float16x8_t __p1) { - float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); - float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); - float16x8_t __ret; - __ret = (float16x8_t) __builtin_neon_vminnmq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40); - __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai float16x4_t vminnm_f16(float16x4_t __p0, float16x4_t __p1) { - float16x4_t __ret; - __ret = (float16x4_t) __builtin_neon_vminnm_v((int8x8_t)__p0, (int8x8_t)__p1, 8); - return __ret; -} -#else -__ai float16x4_t vminnm_f16(float16x4_t __p0, float16x4_t __p1) { - float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); - float16x4_t __ret; - __ret = (float16x4_t) __builtin_neon_vminnm_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); - __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ #define vminnmvq_f16(__p0) __extension__ ({ \ float16x8_t __s0 = __p0; \ float16_t __ret; \ @@ -42473,82 +46702,6 @@ __ai float16x4_t vminnm_f16(float16x4_t __p0, float16x4_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai float16x8_t vmulq_f16(float16x8_t __p0, float16x8_t __p1) { - float16x8_t __ret; - __ret = __p0 * __p1; - return __ret; -} -#else -__ai float16x8_t vmulq_f16(float16x8_t __p0, float16x8_t __p1) { - float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); - float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); - float16x8_t __ret; - __ret = __rev0 * __rev1; - __ret = 
__builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai float16x4_t vmul_f16(float16x4_t __p0, float16x4_t __p1) { - float16x4_t __ret; - __ret = __p0 * __p1; - return __ret; -} -#else -__ai float16x4_t vmul_f16(float16x4_t __p0, float16x4_t __p1) { - float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); - float16x4_t __ret; - __ret = __rev0 * __rev1; - __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vmulq_lane_f16(__p0, __p1, __p2) __extension__ ({ \ - float16x8_t __s0 = __p0; \ - float16x4_t __s1 = __p1; \ - float16x8_t __ret; \ - __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2); \ - __ret; \ -}) -#else -#define vmulq_lane_f16(__p0, __p1, __p2) __extension__ ({ \ - float16x8_t __s0 = __p0; \ - float16x4_t __s1 = __p1; \ - float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16x4_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \ - float16x8_t __ret; \ - __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2, __p2, __p2, __p2, __p2); \ - __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vmul_lane_f16(__p0, __p1, __p2) __extension__ ({ \ - float16x4_t __s0 = __p0; \ - float16x4_t __s1 = __p1; \ - float16x4_t __ret; \ - __ret = __s0 * __builtin_shufflevector(__s1, __s1, __p2, __p2, __p2, __p2); \ - __ret; \ -}) -#else -#define vmul_lane_f16(__p0, __p1, __p2) __extension__ ({ \ - float16x4_t __s0 = __p0; \ - float16x4_t __s1 = __p1; \ - float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ - float16x4_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \ - float16x4_t __ret; \ - __ret = __rev0 * __builtin_shufflevector(__rev1, __rev1, __p2, __p2, __p2, __p2); \ - __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ #define vmulq_laneq_f16(__p0, __p1, __p2) __extension__ ({ \ float16x8_t __s0 = __p0; \ float16x8_t __s1 = __p1; \ @@ -42591,46 +46744,6 @@ __ai float16x4_t vmul_f16(float16x4_t __p0, float16x4_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -#define vmulq_n_f16(__p0, __p1) __extension__ ({ \ - float16x8_t __s0 = __p0; \ - float16_t __s1 = __p1; \ - float16x8_t __ret; \ - __ret = __s0 * (float16x8_t) {__s1, __s1, __s1, __s1, __s1, __s1, __s1, __s1}; \ - __ret; \ -}) -#else -#define vmulq_n_f16(__p0, __p1) __extension__ ({ \ - float16x8_t __s0 = __p0; \ - float16_t __s1 = __p1; \ - float16x8_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 7, 6, 5, 4, 3, 2, 1, 0); \ - float16x8_t __ret; \ - __ret = __rev0 * (float16x8_t) {__s1, __s1, __s1, __s1, __s1, __s1, __s1, __s1}; \ - __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vmul_n_f16(__p0, __p1) __extension__ ({ \ - float16x4_t __s0 = __p0; \ - float16_t __s1 = __p1; \ - float16x4_t __ret; \ - __ret = __s0 * (float16x4_t) {__s1, __s1, __s1, __s1}; \ - __ret; \ -}) -#else -#define vmul_n_f16(__p0, __p1) __extension__ ({ \ - float16x4_t __s0 = __p0; \ - float16_t __s1 = __p1; \ - float16x4_t __rev0; __rev0 = __builtin_shufflevector(__s0, __s0, 3, 2, 1, 0); \ - float16x4_t __ret; \ - __ret = 
__rev0 * (float16x4_t) {__s1, __s1, __s1, __s1}; \ - __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ __ai float16x8_t vmulxq_f16(float16x8_t __p0, float16x8_t __p1) { float16x8_t __ret; __ret = (float16x8_t) __builtin_neon_vmulxq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); @@ -42675,6 +46788,25 @@ __ai float16x4_t __noswap_vmulx_f16(float16x4_t __p0, float16x4_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ +#define vmulxh_lane_f16(__p0, __p1, __p2) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vmulxh_lane_f16(__s0, (int8x8_t)__s1, __p2); \ + __ret; \ +}) +#else +#define vmulxh_lane_f16(__p0, __p1, __p2) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16x4_t __s1 = __p1; \ + float16x4_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 3, 2, 1, 0); \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vmulxh_lane_f16(__s0, (int8x8_t)__rev1, __p2); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ #define vmulxq_lane_f16(__p0, __p1, __p2) __extension__ ({ \ float16x8_t __s0 = __p0; \ float16x4_t __s1 = __p1; \ @@ -42717,6 +46849,25 @@ __ai float16x4_t __noswap_vmulx_f16(float16x4_t __p0, float16x4_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ +#define vmulxh_laneq_f16(__p0, __p1, __p2) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vmulxh_laneq_f16(__s0, (int8x16_t)__s1, __p2); \ + __ret; \ +}) +#else +#define vmulxh_laneq_f16(__p0, __p1, __p2) __extension__ ({ \ + float16_t __s0 = __p0; \ + float16x8_t __s1 = __p1; \ + float16x8_t __rev1; __rev1 = __builtin_shufflevector(__s1, __s1, 7, 6, 5, 4, 3, 2, 1, 0); \ + float16_t __ret; \ + __ret = (float16_t) __builtin_neon_vmulxh_laneq_f16(__s0, (int8x16_t)__rev1, __p2); \ + __ret; \ +}) +#endif + +#ifdef __LITTLE_ENDIAN__ #define vmulxq_laneq_f16(__p0, __p1, __p2) __extension__ ({ \ float16x8_t __s0 = __p0; \ float16x8_t __s1 = __p1; \ @@ -42799,38 +46950,6 @@ __ai float16x4_t __noswap_vmulx_f16(float16x4_t __p0, float16x4_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai float16x8_t vnegq_f16(float16x8_t __p0) { - float16x8_t __ret; - __ret = -__p0; - return __ret; -} -#else -__ai float16x8_t vnegq_f16(float16x8_t __p0) { - float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); - float16x8_t __ret; - __ret = -__rev0; - __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai float16x4_t vneg_f16(float16x4_t __p0) { - float16x4_t __ret; - __ret = -__p0; - return __ret; -} -#else -__ai float16x4_t vneg_f16(float16x4_t __p0) { - float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - float16x4_t __ret; - __ret = -__rev0; - __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ __ai float16x8_t vpaddq_f16(float16x8_t __p0, float16x8_t __p1) { float16x8_t __ret; __ret = (float16x8_t) __builtin_neon_vpaddq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); @@ -42848,23 +46967,6 @@ __ai float16x8_t vpaddq_f16(float16x8_t __p0, float16x8_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai float16x4_t vpadd_f16(float16x4_t __p0, float16x4_t __p1) { - float16x4_t __ret; - __ret = (float16x4_t) __builtin_neon_vpadd_v((int8x8_t)__p0, (int8x8_t)__p1, 8); - return __ret; -} -#else -__ai float16x4_t vpadd_f16(float16x4_t __p0, float16x4_t __p1) { - 
float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); - float16x4_t __ret; - __ret = (float16x4_t) __builtin_neon_vpadd_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); - __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ __ai float16x8_t vpmaxq_f16(float16x8_t __p0, float16x8_t __p1) { float16x8_t __ret; __ret = (float16x8_t) __builtin_neon_vpmaxq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); @@ -42882,23 +46984,6 @@ __ai float16x8_t vpmaxq_f16(float16x8_t __p0, float16x8_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai float16x4_t vpmax_f16(float16x4_t __p0, float16x4_t __p1) { - float16x4_t __ret; - __ret = (float16x4_t) __builtin_neon_vpmax_v((int8x8_t)__p0, (int8x8_t)__p1, 8); - return __ret; -} -#else -__ai float16x4_t vpmax_f16(float16x4_t __p0, float16x4_t __p1) { - float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); - float16x4_t __ret; - __ret = (float16x4_t) __builtin_neon_vpmax_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); - __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ __ai float16x8_t vpmaxnmq_f16(float16x8_t __p0, float16x8_t __p1) { float16x8_t __ret; __ret = (float16x8_t) __builtin_neon_vpmaxnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); @@ -42950,23 +47035,6 @@ __ai float16x8_t vpminq_f16(float16x8_t __p0, float16x8_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai float16x4_t vpmin_f16(float16x4_t __p0, float16x4_t __p1) { - float16x4_t __ret; - __ret = (float16x4_t) __builtin_neon_vpmin_v((int8x8_t)__p0, (int8x8_t)__p1, 8); - return __ret; -} -#else -__ai float16x4_t vpmin_f16(float16x4_t __p0, float16x4_t __p1) { - float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); - float16x4_t __ret; - __ret = (float16x4_t) __builtin_neon_vpmin_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); - __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ __ai float16x8_t vpminnmq_f16(float16x8_t __p0, float16x8_t __p1) { float16x8_t __ret; __ret = (float16x8_t) __builtin_neon_vpminnmq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); @@ -43001,168 +47069,6 @@ __ai float16x4_t vpminnm_f16(float16x4_t __p0, float16x4_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai float16x8_t vrecpeq_f16(float16x8_t __p0) { - float16x8_t __ret; - __ret = (float16x8_t) __builtin_neon_vrecpeq_v((int8x16_t)__p0, 40); - return __ret; -} -#else -__ai float16x8_t vrecpeq_f16(float16x8_t __p0) { - float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); - float16x8_t __ret; - __ret = (float16x8_t) __builtin_neon_vrecpeq_v((int8x16_t)__rev0, 40); - __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai float16x4_t vrecpe_f16(float16x4_t __p0) { - float16x4_t __ret; - __ret = (float16x4_t) __builtin_neon_vrecpe_v((int8x8_t)__p0, 8); - return __ret; -} -#else -__ai float16x4_t vrecpe_f16(float16x4_t __p0) { - float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - float16x4_t __ret; - __ret = (float16x4_t) __builtin_neon_vrecpe_v((int8x8_t)__rev0, 8); - __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); - return 
__ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai float16x8_t vrecpsq_f16(float16x8_t __p0, float16x8_t __p1) { - float16x8_t __ret; - __ret = (float16x8_t) __builtin_neon_vrecpsq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); - return __ret; -} -#else -__ai float16x8_t vrecpsq_f16(float16x8_t __p0, float16x8_t __p1) { - float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); - float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); - float16x8_t __ret; - __ret = (float16x8_t) __builtin_neon_vrecpsq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40); - __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai float16x4_t vrecps_f16(float16x4_t __p0, float16x4_t __p1) { - float16x4_t __ret; - __ret = (float16x4_t) __builtin_neon_vrecps_v((int8x8_t)__p0, (int8x8_t)__p1, 8); - return __ret; -} -#else -__ai float16x4_t vrecps_f16(float16x4_t __p0, float16x4_t __p1) { - float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); - float16x4_t __ret; - __ret = (float16x4_t) __builtin_neon_vrecps_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); - __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai float16x8_t vrev64q_f16(float16x8_t __p0) { - float16x8_t __ret; - __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0, 7, 6, 5, 4); - return __ret; -} -#else -__ai float16x8_t vrev64q_f16(float16x8_t __p0) { - float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); - float16x8_t __ret; - __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0, 7, 6, 5, 4); - __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai float16x4_t vrev64_f16(float16x4_t __p0) { - float16x4_t __ret; - __ret = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - return __ret; -} -#else -__ai float16x4_t vrev64_f16(float16x4_t __p0) { - float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - float16x4_t __ret; - __ret = __builtin_shufflevector(__rev0, __rev0, 3, 2, 1, 0); - __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai float16x8_t vrndq_f16(float16x8_t __p0) { - float16x8_t __ret; - __ret = (float16x8_t) __builtin_neon_vrndq_v((int8x16_t)__p0, 40); - return __ret; -} -#else -__ai float16x8_t vrndq_f16(float16x8_t __p0) { - float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); - float16x8_t __ret; - __ret = (float16x8_t) __builtin_neon_vrndq_v((int8x16_t)__rev0, 40); - __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai float16x4_t vrnd_f16(float16x4_t __p0) { - float16x4_t __ret; - __ret = (float16x4_t) __builtin_neon_vrnd_v((int8x8_t)__p0, 8); - return __ret; -} -#else -__ai float16x4_t vrnd_f16(float16x4_t __p0) { - float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - float16x4_t __ret; - __ret = (float16x4_t) __builtin_neon_vrnd_v((int8x8_t)__rev0, 8); - __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai float16x8_t vrndaq_f16(float16x8_t __p0) { - float16x8_t __ret; - __ret = (float16x8_t) 
__builtin_neon_vrndaq_v((int8x16_t)__p0, 40); - return __ret; -} -#else -__ai float16x8_t vrndaq_f16(float16x8_t __p0) { - float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); - float16x8_t __ret; - __ret = (float16x8_t) __builtin_neon_vrndaq_v((int8x16_t)__rev0, 40); - __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai float16x4_t vrnda_f16(float16x4_t __p0) { - float16x4_t __ret; - __ret = (float16x4_t) __builtin_neon_vrnda_v((int8x8_t)__p0, 8); - return __ret; -} -#else -__ai float16x4_t vrnda_f16(float16x4_t __p0) { - float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - float16x4_t __ret; - __ret = (float16x4_t) __builtin_neon_vrnda_v((int8x8_t)__rev0, 8); - __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ __ai float16x8_t vrndiq_f16(float16x8_t __p0) { float16x8_t __ret; __ret = (float16x8_t) __builtin_neon_vrndiq_v((int8x16_t)__p0, 40); @@ -43195,200 +47101,6 @@ __ai float16x4_t vrndi_f16(float16x4_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -__ai float16x8_t vrndmq_f16(float16x8_t __p0) { - float16x8_t __ret; - __ret = (float16x8_t) __builtin_neon_vrndmq_v((int8x16_t)__p0, 40); - return __ret; -} -#else -__ai float16x8_t vrndmq_f16(float16x8_t __p0) { - float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); - float16x8_t __ret; - __ret = (float16x8_t) __builtin_neon_vrndmq_v((int8x16_t)__rev0, 40); - __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai float16x4_t vrndm_f16(float16x4_t __p0) { - float16x4_t __ret; - __ret = (float16x4_t) __builtin_neon_vrndm_v((int8x8_t)__p0, 8); - return __ret; -} -#else -__ai float16x4_t vrndm_f16(float16x4_t __p0) { - float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - float16x4_t __ret; - __ret = (float16x4_t) __builtin_neon_vrndm_v((int8x8_t)__rev0, 8); - __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai float16x8_t vrndnq_f16(float16x8_t __p0) { - float16x8_t __ret; - __ret = (float16x8_t) __builtin_neon_vrndnq_v((int8x16_t)__p0, 40); - return __ret; -} -#else -__ai float16x8_t vrndnq_f16(float16x8_t __p0) { - float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); - float16x8_t __ret; - __ret = (float16x8_t) __builtin_neon_vrndnq_v((int8x16_t)__rev0, 40); - __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai float16x4_t vrndn_f16(float16x4_t __p0) { - float16x4_t __ret; - __ret = (float16x4_t) __builtin_neon_vrndn_v((int8x8_t)__p0, 8); - return __ret; -} -#else -__ai float16x4_t vrndn_f16(float16x4_t __p0) { - float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - float16x4_t __ret; - __ret = (float16x4_t) __builtin_neon_vrndn_v((int8x8_t)__rev0, 8); - __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai float16x8_t vrndpq_f16(float16x8_t __p0) { - float16x8_t __ret; - __ret = (float16x8_t) __builtin_neon_vrndpq_v((int8x16_t)__p0, 40); - return __ret; -} -#else -__ai float16x8_t vrndpq_f16(float16x8_t __p0) { - float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); - float16x8_t __ret; - __ret 
= (float16x8_t) __builtin_neon_vrndpq_v((int8x16_t)__rev0, 40); - __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai float16x4_t vrndp_f16(float16x4_t __p0) { - float16x4_t __ret; - __ret = (float16x4_t) __builtin_neon_vrndp_v((int8x8_t)__p0, 8); - return __ret; -} -#else -__ai float16x4_t vrndp_f16(float16x4_t __p0) { - float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - float16x4_t __ret; - __ret = (float16x4_t) __builtin_neon_vrndp_v((int8x8_t)__rev0, 8); - __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai float16x8_t vrndxq_f16(float16x8_t __p0) { - float16x8_t __ret; - __ret = (float16x8_t) __builtin_neon_vrndxq_v((int8x16_t)__p0, 40); - return __ret; -} -#else -__ai float16x8_t vrndxq_f16(float16x8_t __p0) { - float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); - float16x8_t __ret; - __ret = (float16x8_t) __builtin_neon_vrndxq_v((int8x16_t)__rev0, 40); - __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai float16x4_t vrndx_f16(float16x4_t __p0) { - float16x4_t __ret; - __ret = (float16x4_t) __builtin_neon_vrndx_v((int8x8_t)__p0, 8); - return __ret; -} -#else -__ai float16x4_t vrndx_f16(float16x4_t __p0) { - float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - float16x4_t __ret; - __ret = (float16x4_t) __builtin_neon_vrndx_v((int8x8_t)__rev0, 8); - __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai float16x8_t vrsqrteq_f16(float16x8_t __p0) { - float16x8_t __ret; - __ret = (float16x8_t) __builtin_neon_vrsqrteq_v((int8x16_t)__p0, 40); - return __ret; -} -#else -__ai float16x8_t vrsqrteq_f16(float16x8_t __p0) { - float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); - float16x8_t __ret; - __ret = (float16x8_t) __builtin_neon_vrsqrteq_v((int8x16_t)__rev0, 40); - __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai float16x4_t vrsqrte_f16(float16x4_t __p0) { - float16x4_t __ret; - __ret = (float16x4_t) __builtin_neon_vrsqrte_v((int8x8_t)__p0, 8); - return __ret; -} -#else -__ai float16x4_t vrsqrte_f16(float16x4_t __p0) { - float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - float16x4_t __ret; - __ret = (float16x4_t) __builtin_neon_vrsqrte_v((int8x8_t)__rev0, 8); - __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai float16x8_t vrsqrtsq_f16(float16x8_t __p0, float16x8_t __p1) { - float16x8_t __ret; - __ret = (float16x8_t) __builtin_neon_vrsqrtsq_v((int8x16_t)__p0, (int8x16_t)__p1, 40); - return __ret; -} -#else -__ai float16x8_t vrsqrtsq_f16(float16x8_t __p0, float16x8_t __p1) { - float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); - float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); - float16x8_t __ret; - __ret = (float16x8_t) __builtin_neon_vrsqrtsq_v((int8x16_t)__rev0, (int8x16_t)__rev1, 40); - __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai float16x4_t vrsqrts_f16(float16x4_t __p0, float16x4_t __p1) { - float16x4_t __ret; - __ret = 
(float16x4_t) __builtin_neon_vrsqrts_v((int8x8_t)__p0, (int8x8_t)__p1, 8); - return __ret; -} -#else -__ai float16x4_t vrsqrts_f16(float16x4_t __p0, float16x4_t __p1) { - float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); - float16x4_t __ret; - __ret = (float16x4_t) __builtin_neon_vrsqrts_v((int8x8_t)__rev0, (int8x8_t)__rev1, 8); - __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ __ai float16x8_t vsqrtq_f16(float16x8_t __p0) { float16x8_t __ret; __ret = (float16x8_t) __builtin_neon_vsqrtq_v((int8x16_t)__p0, 40); @@ -43421,78 +47133,6 @@ __ai float16x4_t vsqrt_f16(float16x4_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -__ai float16x8_t vsubq_f16(float16x8_t __p0, float16x8_t __p1) { - float16x8_t __ret; - __ret = __p0 - __p1; - return __ret; -} -#else -__ai float16x8_t vsubq_f16(float16x8_t __p0, float16x8_t __p1) { - float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); - float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); - float16x8_t __ret; - __ret = __rev0 - __rev1; - __ret = __builtin_shufflevector(__ret, __ret, 7, 6, 5, 4, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai float16x4_t vsub_f16(float16x4_t __p0, float16x4_t __p1) { - float16x4_t __ret; - __ret = __p0 - __p1; - return __ret; -} -#else -__ai float16x4_t vsub_f16(float16x4_t __p0, float16x4_t __p1) { - float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); - float16x4_t __ret; - __ret = __rev0 - __rev1; - __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai float16x8x2_t vtrnq_f16(float16x8_t __p0, float16x8_t __p1) { - float16x8x2_t __ret; - __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 40); - return __ret; -} -#else -__ai float16x8x2_t vtrnq_f16(float16x8_t __p0, float16x8_t __p1) { - float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); - float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); - float16x8x2_t __ret; - __builtin_neon_vtrnq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 40); - - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai float16x4x2_t vtrn_f16(float16x4_t __p0, float16x4_t __p1) { - float16x4x2_t __ret; - __builtin_neon_vtrn_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 8); - return __ret; -} -#else -__ai float16x4x2_t vtrn_f16(float16x4_t __p0, float16x4_t __p1) { - float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); - float16x4x2_t __ret; - __builtin_neon_vtrn_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 8); - - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ __ai float16x8_t vtrn1q_f16(float16x8_t __p0, float16x8_t __p1) { float16x8_t __ret; __ret = __builtin_shufflevector(__p0, __p1, 0, 8, 2, 10, 4, 12, 6, 14); @@ 
-43561,44 +47201,6 @@ __ai float16x4_t vtrn2_f16(float16x4_t __p0, float16x4_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai float16x8x2_t vuzpq_f16(float16x8_t __p0, float16x8_t __p1) { - float16x8x2_t __ret; - __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 40); - return __ret; -} -#else -__ai float16x8x2_t vuzpq_f16(float16x8_t __p0, float16x8_t __p1) { - float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); - float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); - float16x8x2_t __ret; - __builtin_neon_vuzpq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 40); - - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai float16x4x2_t vuzp_f16(float16x4_t __p0, float16x4_t __p1) { - float16x4x2_t __ret; - __builtin_neon_vuzp_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 8); - return __ret; -} -#else -__ai float16x4x2_t vuzp_f16(float16x4_t __p0, float16x4_t __p1) { - float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); - float16x4x2_t __ret; - __builtin_neon_vuzp_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 8); - - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ __ai float16x8_t vuzp1q_f16(float16x8_t __p0, float16x8_t __p1) { float16x8_t __ret; __ret = __builtin_shufflevector(__p0, __p1, 0, 2, 4, 6, 8, 10, 12, 14); @@ -43667,44 +47269,6 @@ __ai float16x4_t vuzp2_f16(float16x4_t __p0, float16x4_t __p1) { #endif #ifdef __LITTLE_ENDIAN__ -__ai float16x8x2_t vzipq_f16(float16x8_t __p0, float16x8_t __p1) { - float16x8x2_t __ret; - __builtin_neon_vzipq_v(&__ret, (int8x16_t)__p0, (int8x16_t)__p1, 40); - return __ret; -} -#else -__ai float16x8x2_t vzipq_f16(float16x8_t __p0, float16x8_t __p1) { - float16x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 7, 6, 5, 4, 3, 2, 1, 0); - float16x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 7, 6, 5, 4, 3, 2, 1, 0); - float16x8x2_t __ret; - __builtin_neon_vzipq_v(&__ret, (int8x16_t)__rev0, (int8x16_t)__rev1, 40); - - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai float16x4x2_t vzip_f16(float16x4_t __p0, float16x4_t __p1) { - float16x4x2_t __ret; - __builtin_neon_vzip_v(&__ret, (int8x8_t)__p0, (int8x8_t)__p1, 8); - return __ret; -} -#else -__ai float16x4x2_t vzip_f16(float16x4_t __p0, float16x4_t __p1) { - float16x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - float16x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); - float16x4x2_t __ret; - __builtin_neon_vzip_v(&__ret, (int8x8_t)__rev0, (int8x8_t)__rev1, 8); - - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ __ai float16x8_t vzip1q_f16(float16x8_t __p0, float16x8_t __p1) { float16x8_t __ret; __ret = __builtin_shufflevector(__p0, __p1, 0, 
8, 1, 9, 2, 10, 3, 11); @@ -50982,35 +54546,15 @@ __ai float64x2_t vfmaq_n_f64(float64x2_t __p0, float64x2_t __p1, float64_t __p2) #endif #ifdef __LITTLE_ENDIAN__ -__ai float32x4_t vfmaq_n_f32(float32x4_t __p0, float32x4_t __p1, float32_t __p2) { - float32x4_t __ret; - __ret = vfmaq_f32(__p0, __p1, (float32x4_t) {__p2, __p2, __p2, __p2}); - return __ret; -} -#else -__ai float32x4_t vfmaq_n_f32(float32x4_t __p0, float32x4_t __p1, float32_t __p2) { - float32x4_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 3, 2, 1, 0); - float32x4_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 3, 2, 1, 0); - float32x4_t __ret; - __ret = __noswap_vfmaq_f32(__rev0, __rev1, (float32x4_t) {__p2, __p2, __p2, __p2}); - __ret = __builtin_shufflevector(__ret, __ret, 3, 2, 1, 0); - return __ret; -} -#endif - -#ifdef __LITTLE_ENDIAN__ -__ai float32x2_t vfma_n_f32(float32x2_t __p0, float32x2_t __p1, float32_t __p2) { - float32x2_t __ret; - __ret = vfma_f32(__p0, __p1, (float32x2_t) {__p2, __p2}); +__ai float64x1_t vfma_n_f64(float64x1_t __p0, float64x1_t __p1, float64_t __p2) { + float64x1_t __ret; + __ret = vfma_f64(__p0, __p1, (float64x1_t) {__p2}); return __ret; } #else -__ai float32x2_t vfma_n_f32(float32x2_t __p0, float32x2_t __p1, float32_t __p2) { - float32x2_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, 1, 0); - float32x2_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, 1, 0); - float32x2_t __ret; - __ret = __noswap_vfma_f32(__rev0, __rev1, (float32x2_t) {__p2, __p2}); - __ret = __builtin_shufflevector(__ret, __ret, 1, 0); +__ai float64x1_t vfma_n_f64(float64x1_t __p0, float64x1_t __p1, float64_t __p2) { + float64x1_t __ret; + __ret = __noswap_vfma_f64(__p0, __p1, (float64x1_t) {__p2}); return __ret; } #endif @@ -51349,6 +54893,20 @@ __ai float32x4_t vfmsq_n_f32(float32x4_t __p0, float32x4_t __p1, float32_t __p2) #endif #ifdef __LITTLE_ENDIAN__ +__ai float64x1_t vfms_n_f64(float64x1_t __p0, float64x1_t __p1, float64_t __p2) { + float64x1_t __ret; + __ret = vfma_f64(__p0, -__p1, (float64x1_t) {__p2}); + return __ret; +} +#else +__ai float64x1_t vfms_n_f64(float64x1_t __p0, float64x1_t __p1, float64_t __p2) { + float64x1_t __ret; + __ret = __noswap_vfma_f64(__p0, -__p1, (float64x1_t) {__p2}); + return __ret; +} +#endif + +#ifdef __LITTLE_ENDIAN__ __ai float32x2_t vfms_n_f32(float32x2_t __p0, float32x2_t __p1, float32_t __p2) { float32x2_t __ret; __ret = vfma_f32(__p0, -__p1, (float32x2_t) {__p2, __p2}); @@ -51705,23 +55263,6 @@ __ai float64x1_t vget_low_f64(float64x2_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -#define vld1_p8_x2(__p0) __extension__ ({ \ - poly8x8x2_t __ret; \ - __builtin_neon_vld1_x2_v(&__ret, __p0, 4); \ - __ret; \ -}) -#else -#define vld1_p8_x2(__p0) __extension__ ({ \ - poly8x8x2_t __ret; \ - __builtin_neon_vld1_x2_v(&__ret, __p0, 4); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ #define vld1_p64_x2(__p0) __extension__ ({ \ poly64x1x2_t __ret; \ __builtin_neon_vld1_x2_v(&__ret, __p0, 6); \ @@ -51736,40 +55277,6 @@ __ai float64x1_t vget_low_f64(float64x2_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -#define vld1_p16_x2(__p0) __extension__ ({ \ - poly16x4x2_t __ret; \ - __builtin_neon_vld1_x2_v(&__ret, __p0, 5); \ - __ret; \ -}) -#else -#define vld1_p16_x2(__p0) __extension__ ({ \ - poly16x4x2_t __ret; \ - __builtin_neon_vld1_x2_v(&__ret, __p0, 5); \ - \ - __ret.val[0] 
= __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1q_p8_x2(__p0) __extension__ ({ \ - poly8x16x2_t __ret; \ - __builtin_neon_vld1q_x2_v(&__ret, __p0, 36); \ - __ret; \ -}) -#else -#define vld1q_p8_x2(__p0) __extension__ ({ \ - poly8x16x2_t __ret; \ - __builtin_neon_vld1q_x2_v(&__ret, __p0, 36); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ #define vld1q_p64_x2(__p0) __extension__ ({ \ poly64x2x2_t __ret; \ __builtin_neon_vld1q_x2_v(&__ret, __p0, 38); \ @@ -51787,108 +55294,6 @@ __ai float64x1_t vget_low_f64(float64x2_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -#define vld1q_p16_x2(__p0) __extension__ ({ \ - poly16x8x2_t __ret; \ - __builtin_neon_vld1q_x2_v(&__ret, __p0, 37); \ - __ret; \ -}) -#else -#define vld1q_p16_x2(__p0) __extension__ ({ \ - poly16x8x2_t __ret; \ - __builtin_neon_vld1q_x2_v(&__ret, __p0, 37); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1q_u8_x2(__p0) __extension__ ({ \ - uint8x16x2_t __ret; \ - __builtin_neon_vld1q_x2_v(&__ret, __p0, 48); \ - __ret; \ -}) -#else -#define vld1q_u8_x2(__p0) __extension__ ({ \ - uint8x16x2_t __ret; \ - __builtin_neon_vld1q_x2_v(&__ret, __p0, 48); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1q_u32_x2(__p0) __extension__ ({ \ - uint32x4x2_t __ret; \ - __builtin_neon_vld1q_x2_v(&__ret, __p0, 50); \ - __ret; \ -}) -#else -#define vld1q_u32_x2(__p0) __extension__ ({ \ - uint32x4x2_t __ret; \ - __builtin_neon_vld1q_x2_v(&__ret, __p0, 50); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1q_u64_x2(__p0) __extension__ ({ \ - uint64x2x2_t __ret; \ - __builtin_neon_vld1q_x2_v(&__ret, __p0, 51); \ - __ret; \ -}) -#else -#define vld1q_u64_x2(__p0) __extension__ ({ \ - uint64x2x2_t __ret; \ - __builtin_neon_vld1q_x2_v(&__ret, __p0, 51); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1q_u16_x2(__p0) __extension__ ({ \ - uint16x8x2_t __ret; \ - __builtin_neon_vld1q_x2_v(&__ret, __p0, 49); \ - __ret; \ -}) -#else -#define vld1q_u16_x2(__p0) __extension__ ({ \ - uint16x8x2_t __ret; \ - __builtin_neon_vld1q_x2_v(&__ret, __p0, 49); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define 
vld1q_s8_x2(__p0) __extension__ ({ \ - int8x16x2_t __ret; \ - __builtin_neon_vld1q_x2_v(&__ret, __p0, 32); \ - __ret; \ -}) -#else -#define vld1q_s8_x2(__p0) __extension__ ({ \ - int8x16x2_t __ret; \ - __builtin_neon_vld1q_x2_v(&__ret, __p0, 32); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ #define vld1q_f64_x2(__p0) __extension__ ({ \ float64x2x2_t __ret; \ __builtin_neon_vld1q_x2_v(&__ret, __p0, 42); \ @@ -51906,173 +55311,6 @@ __ai float64x1_t vget_low_f64(float64x2_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -#define vld1q_f32_x2(__p0) __extension__ ({ \ - float32x4x2_t __ret; \ - __builtin_neon_vld1q_x2_v(&__ret, __p0, 41); \ - __ret; \ -}) -#else -#define vld1q_f32_x2(__p0) __extension__ ({ \ - float32x4x2_t __ret; \ - __builtin_neon_vld1q_x2_v(&__ret, __p0, 41); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1q_f16_x2(__p0) __extension__ ({ \ - float16x8x2_t __ret; \ - __builtin_neon_vld1q_x2_v(&__ret, __p0, 40); \ - __ret; \ -}) -#else -#define vld1q_f16_x2(__p0) __extension__ ({ \ - float16x8x2_t __ret; \ - __builtin_neon_vld1q_x2_v(&__ret, __p0, 40); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1q_s32_x2(__p0) __extension__ ({ \ - int32x4x2_t __ret; \ - __builtin_neon_vld1q_x2_v(&__ret, __p0, 34); \ - __ret; \ -}) -#else -#define vld1q_s32_x2(__p0) __extension__ ({ \ - int32x4x2_t __ret; \ - __builtin_neon_vld1q_x2_v(&__ret, __p0, 34); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1q_s64_x2(__p0) __extension__ ({ \ - int64x2x2_t __ret; \ - __builtin_neon_vld1q_x2_v(&__ret, __p0, 35); \ - __ret; \ -}) -#else -#define vld1q_s64_x2(__p0) __extension__ ({ \ - int64x2x2_t __ret; \ - __builtin_neon_vld1q_x2_v(&__ret, __p0, 35); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1q_s16_x2(__p0) __extension__ ({ \ - int16x8x2_t __ret; \ - __builtin_neon_vld1q_x2_v(&__ret, __p0, 33); \ - __ret; \ -}) -#else -#define vld1q_s16_x2(__p0) __extension__ ({ \ - int16x8x2_t __ret; \ - __builtin_neon_vld1q_x2_v(&__ret, __p0, 33); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1_u8_x2(__p0) __extension__ ({ \ - uint8x8x2_t __ret; \ - __builtin_neon_vld1_x2_v(&__ret, __p0, 16); \ - __ret; \ -}) -#else -#define vld1_u8_x2(__p0) __extension__ ({ \ - uint8x8x2_t __ret; \ - __builtin_neon_vld1_x2_v(&__ret, __p0, 16); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], 
__ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1_u32_x2(__p0) __extension__ ({ \ - uint32x2x2_t __ret; \ - __builtin_neon_vld1_x2_v(&__ret, __p0, 18); \ - __ret; \ -}) -#else -#define vld1_u32_x2(__p0) __extension__ ({ \ - uint32x2x2_t __ret; \ - __builtin_neon_vld1_x2_v(&__ret, __p0, 18); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1_u64_x2(__p0) __extension__ ({ \ - uint64x1x2_t __ret; \ - __builtin_neon_vld1_x2_v(&__ret, __p0, 19); \ - __ret; \ -}) -#else -#define vld1_u64_x2(__p0) __extension__ ({ \ - uint64x1x2_t __ret; \ - __builtin_neon_vld1_x2_v(&__ret, __p0, 19); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1_u16_x2(__p0) __extension__ ({ \ - uint16x4x2_t __ret; \ - __builtin_neon_vld1_x2_v(&__ret, __p0, 17); \ - __ret; \ -}) -#else -#define vld1_u16_x2(__p0) __extension__ ({ \ - uint16x4x2_t __ret; \ - __builtin_neon_vld1_x2_v(&__ret, __p0, 17); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1_s8_x2(__p0) __extension__ ({ \ - int8x8x2_t __ret; \ - __builtin_neon_vld1_x2_v(&__ret, __p0, 0); \ - __ret; \ -}) -#else -#define vld1_s8_x2(__p0) __extension__ ({ \ - int8x8x2_t __ret; \ - __builtin_neon_vld1_x2_v(&__ret, __p0, 0); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ #define vld1_f64_x2(__p0) __extension__ ({ \ float64x1x2_t __ret; \ __builtin_neon_vld1_x2_v(&__ret, __p0, 10); \ @@ -52087,106 +55325,6 @@ __ai float64x1_t vget_low_f64(float64x2_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -#define vld1_f32_x2(__p0) __extension__ ({ \ - float32x2x2_t __ret; \ - __builtin_neon_vld1_x2_v(&__ret, __p0, 9); \ - __ret; \ -}) -#else -#define vld1_f32_x2(__p0) __extension__ ({ \ - float32x2x2_t __ret; \ - __builtin_neon_vld1_x2_v(&__ret, __p0, 9); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1_f16_x2(__p0) __extension__ ({ \ - float16x4x2_t __ret; \ - __builtin_neon_vld1_x2_v(&__ret, __p0, 8); \ - __ret; \ -}) -#else -#define vld1_f16_x2(__p0) __extension__ ({ \ - float16x4x2_t __ret; \ - __builtin_neon_vld1_x2_v(&__ret, __p0, 8); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1_s32_x2(__p0) __extension__ ({ \ - int32x2x2_t __ret; \ - __builtin_neon_vld1_x2_v(&__ret, __p0, 2); \ - __ret; \ -}) -#else -#define vld1_s32_x2(__p0) __extension__ ({ \ - int32x2x2_t __ret; \ - __builtin_neon_vld1_x2_v(&__ret, __p0, 2); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ - __ret; \ -}) -#endif - 
-#ifdef __LITTLE_ENDIAN__ -#define vld1_s64_x2(__p0) __extension__ ({ \ - int64x1x2_t __ret; \ - __builtin_neon_vld1_x2_v(&__ret, __p0, 3); \ - __ret; \ -}) -#else -#define vld1_s64_x2(__p0) __extension__ ({ \ - int64x1x2_t __ret; \ - __builtin_neon_vld1_x2_v(&__ret, __p0, 3); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1_s16_x2(__p0) __extension__ ({ \ - int16x4x2_t __ret; \ - __builtin_neon_vld1_x2_v(&__ret, __p0, 1); \ - __ret; \ -}) -#else -#define vld1_s16_x2(__p0) __extension__ ({ \ - int16x4x2_t __ret; \ - __builtin_neon_vld1_x2_v(&__ret, __p0, 1); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1_p8_x3(__p0) __extension__ ({ \ - poly8x8x3_t __ret; \ - __builtin_neon_vld1_x3_v(&__ret, __p0, 4); \ - __ret; \ -}) -#else -#define vld1_p8_x3(__p0) __extension__ ({ \ - poly8x8x3_t __ret; \ - __builtin_neon_vld1_x3_v(&__ret, __p0, 4); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ #define vld1_p64_x3(__p0) __extension__ ({ \ poly64x1x3_t __ret; \ __builtin_neon_vld1_x3_v(&__ret, __p0, 6); \ @@ -52201,42 +55339,6 @@ __ai float64x1_t vget_low_f64(float64x2_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -#define vld1_p16_x3(__p0) __extension__ ({ \ - poly16x4x3_t __ret; \ - __builtin_neon_vld1_x3_v(&__ret, __p0, 5); \ - __ret; \ -}) -#else -#define vld1_p16_x3(__p0) __extension__ ({ \ - poly16x4x3_t __ret; \ - __builtin_neon_vld1_x3_v(&__ret, __p0, 5); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1q_p8_x3(__p0) __extension__ ({ \ - poly8x16x3_t __ret; \ - __builtin_neon_vld1q_x3_v(&__ret, __p0, 36); \ - __ret; \ -}) -#else -#define vld1q_p8_x3(__p0) __extension__ ({ \ - poly8x16x3_t __ret; \ - __builtin_neon_vld1q_x3_v(&__ret, __p0, 36); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ #define vld1q_p64_x3(__p0) __extension__ ({ \ poly64x2x3_t __ret; \ __builtin_neon_vld1q_x3_v(&__ret, __p0, 38); \ @@ -52255,114 +55357,6 @@ __ai float64x1_t vget_low_f64(float64x2_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -#define vld1q_p16_x3(__p0) __extension__ ({ \ - poly16x8x3_t __ret; \ - __builtin_neon_vld1q_x3_v(&__ret, __p0, 37); \ - __ret; \ -}) -#else -#define vld1q_p16_x3(__p0) __extension__ ({ \ - poly16x8x3_t __ret; \ - __builtin_neon_vld1q_x3_v(&__ret, __p0, 37); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 
0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1q_u8_x3(__p0) __extension__ ({ \ - uint8x16x3_t __ret; \ - __builtin_neon_vld1q_x3_v(&__ret, __p0, 48); \ - __ret; \ -}) -#else -#define vld1q_u8_x3(__p0) __extension__ ({ \ - uint8x16x3_t __ret; \ - __builtin_neon_vld1q_x3_v(&__ret, __p0, 48); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1q_u32_x3(__p0) __extension__ ({ \ - uint32x4x3_t __ret; \ - __builtin_neon_vld1q_x3_v(&__ret, __p0, 50); \ - __ret; \ -}) -#else -#define vld1q_u32_x3(__p0) __extension__ ({ \ - uint32x4x3_t __ret; \ - __builtin_neon_vld1q_x3_v(&__ret, __p0, 50); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1q_u64_x3(__p0) __extension__ ({ \ - uint64x2x3_t __ret; \ - __builtin_neon_vld1q_x3_v(&__ret, __p0, 51); \ - __ret; \ -}) -#else -#define vld1q_u64_x3(__p0) __extension__ ({ \ - uint64x2x3_t __ret; \ - __builtin_neon_vld1q_x3_v(&__ret, __p0, 51); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1q_u16_x3(__p0) __extension__ ({ \ - uint16x8x3_t __ret; \ - __builtin_neon_vld1q_x3_v(&__ret, __p0, 49); \ - __ret; \ -}) -#else -#define vld1q_u16_x3(__p0) __extension__ ({ \ - uint16x8x3_t __ret; \ - __builtin_neon_vld1q_x3_v(&__ret, __p0, 49); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1q_s8_x3(__p0) __extension__ ({ \ - int8x16x3_t __ret; \ - __builtin_neon_vld1q_x3_v(&__ret, __p0, 32); \ - __ret; \ -}) -#else -#define vld1q_s8_x3(__p0) __extension__ ({ \ - int8x16x3_t __ret; \ - __builtin_neon_vld1q_x3_v(&__ret, __p0, 32); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ #define vld1q_f64_x3(__p0) __extension__ ({ \ float64x2x3_t __ret; \ __builtin_neon_vld1q_x3_v(&__ret, __p0, 42); \ @@ -52381,182 +55375,6 @@ __ai float64x1_t vget_low_f64(float64x2_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -#define vld1q_f32_x3(__p0) __extension__ ({ \ - float32x4x3_t 
__ret; \ - __builtin_neon_vld1q_x3_v(&__ret, __p0, 41); \ - __ret; \ -}) -#else -#define vld1q_f32_x3(__p0) __extension__ ({ \ - float32x4x3_t __ret; \ - __builtin_neon_vld1q_x3_v(&__ret, __p0, 41); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1q_f16_x3(__p0) __extension__ ({ \ - float16x8x3_t __ret; \ - __builtin_neon_vld1q_x3_v(&__ret, __p0, 40); \ - __ret; \ -}) -#else -#define vld1q_f16_x3(__p0) __extension__ ({ \ - float16x8x3_t __ret; \ - __builtin_neon_vld1q_x3_v(&__ret, __p0, 40); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1q_s32_x3(__p0) __extension__ ({ \ - int32x4x3_t __ret; \ - __builtin_neon_vld1q_x3_v(&__ret, __p0, 34); \ - __ret; \ -}) -#else -#define vld1q_s32_x3(__p0) __extension__ ({ \ - int32x4x3_t __ret; \ - __builtin_neon_vld1q_x3_v(&__ret, __p0, 34); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1q_s64_x3(__p0) __extension__ ({ \ - int64x2x3_t __ret; \ - __builtin_neon_vld1q_x3_v(&__ret, __p0, 35); \ - __ret; \ -}) -#else -#define vld1q_s64_x3(__p0) __extension__ ({ \ - int64x2x3_t __ret; \ - __builtin_neon_vld1q_x3_v(&__ret, __p0, 35); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1q_s16_x3(__p0) __extension__ ({ \ - int16x8x3_t __ret; \ - __builtin_neon_vld1q_x3_v(&__ret, __p0, 33); \ - __ret; \ -}) -#else -#define vld1q_s16_x3(__p0) __extension__ ({ \ - int16x8x3_t __ret; \ - __builtin_neon_vld1q_x3_v(&__ret, __p0, 33); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1_u8_x3(__p0) __extension__ ({ \ - uint8x8x3_t __ret; \ - __builtin_neon_vld1_x3_v(&__ret, __p0, 16); \ - __ret; \ -}) -#else -#define vld1_u8_x3(__p0) __extension__ ({ \ - uint8x8x3_t __ret; \ - __builtin_neon_vld1_x3_v(&__ret, __p0, 16); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1_u32_x3(__p0) __extension__ ({ \ - uint32x2x3_t __ret; \ - __builtin_neon_vld1_x3_v(&__ret, __p0, 18); \ - 
__ret; \ -}) -#else -#define vld1_u32_x3(__p0) __extension__ ({ \ - uint32x2x3_t __ret; \ - __builtin_neon_vld1_x3_v(&__ret, __p0, 18); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1_u64_x3(__p0) __extension__ ({ \ - uint64x1x3_t __ret; \ - __builtin_neon_vld1_x3_v(&__ret, __p0, 19); \ - __ret; \ -}) -#else -#define vld1_u64_x3(__p0) __extension__ ({ \ - uint64x1x3_t __ret; \ - __builtin_neon_vld1_x3_v(&__ret, __p0, 19); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1_u16_x3(__p0) __extension__ ({ \ - uint16x4x3_t __ret; \ - __builtin_neon_vld1_x3_v(&__ret, __p0, 17); \ - __ret; \ -}) -#else -#define vld1_u16_x3(__p0) __extension__ ({ \ - uint16x4x3_t __ret; \ - __builtin_neon_vld1_x3_v(&__ret, __p0, 17); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1_s8_x3(__p0) __extension__ ({ \ - int8x8x3_t __ret; \ - __builtin_neon_vld1_x3_v(&__ret, __p0, 0); \ - __ret; \ -}) -#else -#define vld1_s8_x3(__p0) __extension__ ({ \ - int8x8x3_t __ret; \ - __builtin_neon_vld1_x3_v(&__ret, __p0, 0); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ #define vld1_f64_x3(__p0) __extension__ ({ \ float64x1x3_t __ret; \ __builtin_neon_vld1_x3_v(&__ret, __p0, 10); \ @@ -52571,111 +55389,6 @@ __ai float64x1_t vget_low_f64(float64x2_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -#define vld1_f32_x3(__p0) __extension__ ({ \ - float32x2x3_t __ret; \ - __builtin_neon_vld1_x3_v(&__ret, __p0, 9); \ - __ret; \ -}) -#else -#define vld1_f32_x3(__p0) __extension__ ({ \ - float32x2x3_t __ret; \ - __builtin_neon_vld1_x3_v(&__ret, __p0, 9); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1_f16_x3(__p0) __extension__ ({ \ - float16x4x3_t __ret; \ - __builtin_neon_vld1_x3_v(&__ret, __p0, 8); \ - __ret; \ -}) -#else -#define vld1_f16_x3(__p0) __extension__ ({ \ - float16x4x3_t __ret; \ - __builtin_neon_vld1_x3_v(&__ret, __p0, 8); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1_s32_x3(__p0) __extension__ ({ \ - int32x2x3_t __ret; \ - __builtin_neon_vld1_x3_v(&__ret, __p0, 2); \ - __ret; \ -}) -#else -#define vld1_s32_x3(__p0) __extension__ ({ \ - int32x2x3_t __ret; \ - __builtin_neon_vld1_x3_v(&__ret, __p0, 2); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 
1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1_s64_x3(__p0) __extension__ ({ \ - int64x1x3_t __ret; \ - __builtin_neon_vld1_x3_v(&__ret, __p0, 3); \ - __ret; \ -}) -#else -#define vld1_s64_x3(__p0) __extension__ ({ \ - int64x1x3_t __ret; \ - __builtin_neon_vld1_x3_v(&__ret, __p0, 3); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1_s16_x3(__p0) __extension__ ({ \ - int16x4x3_t __ret; \ - __builtin_neon_vld1_x3_v(&__ret, __p0, 1); \ - __ret; \ -}) -#else -#define vld1_s16_x3(__p0) __extension__ ({ \ - int16x4x3_t __ret; \ - __builtin_neon_vld1_x3_v(&__ret, __p0, 1); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1_p8_x4(__p0) __extension__ ({ \ - poly8x8x4_t __ret; \ - __builtin_neon_vld1_x4_v(&__ret, __p0, 4); \ - __ret; \ -}) -#else -#define vld1_p8_x4(__p0) __extension__ ({ \ - poly8x8x4_t __ret; \ - __builtin_neon_vld1_x4_v(&__ret, __p0, 4); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ #define vld1_p64_x4(__p0) __extension__ ({ \ poly64x1x4_t __ret; \ __builtin_neon_vld1_x4_v(&__ret, __p0, 6); \ @@ -52690,44 +55403,6 @@ __ai float64x1_t vget_low_f64(float64x2_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -#define vld1_p16_x4(__p0) __extension__ ({ \ - poly16x4x4_t __ret; \ - __builtin_neon_vld1_x4_v(&__ret, __p0, 5); \ - __ret; \ -}) -#else -#define vld1_p16_x4(__p0) __extension__ ({ \ - poly16x4x4_t __ret; \ - __builtin_neon_vld1_x4_v(&__ret, __p0, 5); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ - __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1q_p8_x4(__p0) __extension__ ({ \ - poly8x16x4_t __ret; \ - __builtin_neon_vld1q_x4_v(&__ret, __p0, 36); \ - __ret; \ -}) -#else -#define vld1q_p8_x4(__p0) __extension__ ({ \ - poly8x16x4_t __ret; \ - __builtin_neon_vld1q_x4_v(&__ret, __p0, 36); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ #define vld1q_p64_x4(__p0) __extension__ ({ \ poly64x2x4_t __ret; \ 
__builtin_neon_vld1q_x4_v(&__ret, __p0, 38); \ @@ -52747,120 +55422,6 @@ __ai float64x1_t vget_low_f64(float64x2_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -#define vld1q_p16_x4(__p0) __extension__ ({ \ - poly16x8x4_t __ret; \ - __builtin_neon_vld1q_x4_v(&__ret, __p0, 37); \ - __ret; \ -}) -#else -#define vld1q_p16_x4(__p0) __extension__ ({ \ - poly16x8x4_t __ret; \ - __builtin_neon_vld1q_x4_v(&__ret, __p0, 37); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1q_u8_x4(__p0) __extension__ ({ \ - uint8x16x4_t __ret; \ - __builtin_neon_vld1q_x4_v(&__ret, __p0, 48); \ - __ret; \ -}) -#else -#define vld1q_u8_x4(__p0) __extension__ ({ \ - uint8x16x4_t __ret; \ - __builtin_neon_vld1q_x4_v(&__ret, __p0, 48); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1q_u32_x4(__p0) __extension__ ({ \ - uint32x4x4_t __ret; \ - __builtin_neon_vld1q_x4_v(&__ret, __p0, 50); \ - __ret; \ -}) -#else -#define vld1q_u32_x4(__p0) __extension__ ({ \ - uint32x4x4_t __ret; \ - __builtin_neon_vld1q_x4_v(&__ret, __p0, 50); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ - __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1q_u64_x4(__p0) __extension__ ({ \ - uint64x2x4_t __ret; \ - __builtin_neon_vld1q_x4_v(&__ret, __p0, 51); \ - __ret; \ -}) -#else -#define vld1q_u64_x4(__p0) __extension__ ({ \ - uint64x2x4_t __ret; \ - __builtin_neon_vld1q_x4_v(&__ret, __p0, 51); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \ - __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1q_u16_x4(__p0) __extension__ ({ \ - uint16x8x4_t __ret; \ - __builtin_neon_vld1q_x4_v(&__ret, __p0, 49); \ - __ret; \ -}) -#else -#define vld1q_u16_x4(__p0) __extension__ ({ \ - uint16x8x4_t __ret; \ - __builtin_neon_vld1q_x4_v(&__ret, __p0, 49); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[3] = 
__builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1q_s8_x4(__p0) __extension__ ({ \ - int8x16x4_t __ret; \ - __builtin_neon_vld1q_x4_v(&__ret, __p0, 32); \ - __ret; \ -}) -#else -#define vld1q_s8_x4(__p0) __extension__ ({ \ - int8x16x4_t __ret; \ - __builtin_neon_vld1q_x4_v(&__ret, __p0, 32); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ #define vld1q_f64_x4(__p0) __extension__ ({ \ float64x2x4_t __ret; \ __builtin_neon_vld1q_x4_v(&__ret, __p0, 42); \ @@ -52880,191 +55441,6 @@ __ai float64x1_t vget_low_f64(float64x2_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -#define vld1q_f32_x4(__p0) __extension__ ({ \ - float32x4x4_t __ret; \ - __builtin_neon_vld1q_x4_v(&__ret, __p0, 41); \ - __ret; \ -}) -#else -#define vld1q_f32_x4(__p0) __extension__ ({ \ - float32x4x4_t __ret; \ - __builtin_neon_vld1q_x4_v(&__ret, __p0, 41); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ - __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1q_f16_x4(__p0) __extension__ ({ \ - float16x8x4_t __ret; \ - __builtin_neon_vld1q_x4_v(&__ret, __p0, 40); \ - __ret; \ -}) -#else -#define vld1q_f16_x4(__p0) __extension__ ({ \ - float16x8x4_t __ret; \ - __builtin_neon_vld1q_x4_v(&__ret, __p0, 40); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1q_s32_x4(__p0) __extension__ ({ \ - int32x4x4_t __ret; \ - __builtin_neon_vld1q_x4_v(&__ret, __p0, 34); \ - __ret; \ -}) -#else -#define vld1q_s32_x4(__p0) __extension__ ({ \ - int32x4x4_t __ret; \ - __builtin_neon_vld1q_x4_v(&__ret, __p0, 34); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ - __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1q_s64_x4(__p0) __extension__ ({ \ - int64x2x4_t __ret; \ - __builtin_neon_vld1q_x4_v(&__ret, __p0, 35); \ - __ret; \ -}) -#else -#define vld1q_s64_x4(__p0) __extension__ ({ \ - int64x2x4_t __ret; \ - __builtin_neon_vld1q_x4_v(&__ret, __p0, 35); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ - __ret.val[1] = 
__builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \ - __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1q_s16_x4(__p0) __extension__ ({ \ - int16x8x4_t __ret; \ - __builtin_neon_vld1q_x4_v(&__ret, __p0, 33); \ - __ret; \ -}) -#else -#define vld1q_s16_x4(__p0) __extension__ ({ \ - int16x8x4_t __ret; \ - __builtin_neon_vld1q_x4_v(&__ret, __p0, 33); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1_u8_x4(__p0) __extension__ ({ \ - uint8x8x4_t __ret; \ - __builtin_neon_vld1_x4_v(&__ret, __p0, 16); \ - __ret; \ -}) -#else -#define vld1_u8_x4(__p0) __extension__ ({ \ - uint8x8x4_t __ret; \ - __builtin_neon_vld1_x4_v(&__ret, __p0, 16); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1_u32_x4(__p0) __extension__ ({ \ - uint32x2x4_t __ret; \ - __builtin_neon_vld1_x4_v(&__ret, __p0, 18); \ - __ret; \ -}) -#else -#define vld1_u32_x4(__p0) __extension__ ({ \ - uint32x2x4_t __ret; \ - __builtin_neon_vld1_x4_v(&__ret, __p0, 18); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \ - __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1_u64_x4(__p0) __extension__ ({ \ - uint64x1x4_t __ret; \ - __builtin_neon_vld1_x4_v(&__ret, __p0, 19); \ - __ret; \ -}) -#else -#define vld1_u64_x4(__p0) __extension__ ({ \ - uint64x1x4_t __ret; \ - __builtin_neon_vld1_x4_v(&__ret, __p0, 19); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1_u16_x4(__p0) __extension__ ({ \ - uint16x4x4_t __ret; \ - __builtin_neon_vld1_x4_v(&__ret, __p0, 17); \ - __ret; \ -}) -#else -#define vld1_u16_x4(__p0) __extension__ ({ \ - uint16x4x4_t __ret; \ - __builtin_neon_vld1_x4_v(&__ret, __p0, 17); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ - __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1_s8_x4(__p0) __extension__ ({ \ - int8x8x4_t __ret; \ - __builtin_neon_vld1_x4_v(&__ret, __p0, 0); \ - __ret; \ -}) -#else -#define vld1_s8_x4(__p0) __extension__ ({ \ - int8x8x4_t __ret; \ - __builtin_neon_vld1_x4_v(&__ret, __p0, 0); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], 
__ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ #define vld1_f64_x4(__p0) __extension__ ({ \ float64x1x4_t __ret; \ __builtin_neon_vld1_x4_v(&__ret, __p0, 10); \ @@ -53079,96 +55455,6 @@ __ai float64x1_t vget_low_f64(float64x2_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -#define vld1_f32_x4(__p0) __extension__ ({ \ - float32x2x4_t __ret; \ - __builtin_neon_vld1_x4_v(&__ret, __p0, 9); \ - __ret; \ -}) -#else -#define vld1_f32_x4(__p0) __extension__ ({ \ - float32x2x4_t __ret; \ - __builtin_neon_vld1_x4_v(&__ret, __p0, 9); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \ - __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1_f16_x4(__p0) __extension__ ({ \ - float16x4x4_t __ret; \ - __builtin_neon_vld1_x4_v(&__ret, __p0, 8); \ - __ret; \ -}) -#else -#define vld1_f16_x4(__p0) __extension__ ({ \ - float16x4x4_t __ret; \ - __builtin_neon_vld1_x4_v(&__ret, __p0, 8); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ - __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1_s32_x4(__p0) __extension__ ({ \ - int32x2x4_t __ret; \ - __builtin_neon_vld1_x4_v(&__ret, __p0, 2); \ - __ret; \ -}) -#else -#define vld1_s32_x4(__p0) __extension__ ({ \ - int32x2x4_t __ret; \ - __builtin_neon_vld1_x4_v(&__ret, __p0, 2); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \ - __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1_s64_x4(__p0) __extension__ ({ \ - int64x1x4_t __ret; \ - __builtin_neon_vld1_x4_v(&__ret, __p0, 3); \ - __ret; \ -}) -#else -#define vld1_s64_x4(__p0) __extension__ ({ \ - int64x1x4_t __ret; \ - __builtin_neon_vld1_x4_v(&__ret, __p0, 3); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld1_s16_x4(__p0) __extension__ ({ \ - int16x4x4_t __ret; \ - __builtin_neon_vld1_x4_v(&__ret, __p0, 1); \ - __ret; \ -}) -#else -#define vld1_s16_x4(__p0) __extension__ ({ \ - int16x4x4_t __ret; \ - __builtin_neon_vld1_x4_v(&__ret, __p0, 1); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ - __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ #define vld2_p64(__p0) __extension__ ({ \ poly64x1x2_t __ret; \ __builtin_neon_vld2_v(&__ret, __p0, 6); \ @@ 
-53279,23 +55565,6 @@ __ai float64x1_t vget_low_f64(float64x2_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -#define vld2q_dup_p8(__p0) __extension__ ({ \ - poly8x16x2_t __ret; \ - __builtin_neon_vld2q_dup_v(&__ret, __p0, 36); \ - __ret; \ -}) -#else -#define vld2q_dup_p8(__p0) __extension__ ({ \ - poly8x16x2_t __ret; \ - __builtin_neon_vld2q_dup_v(&__ret, __p0, 36); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ #define vld2q_dup_p64(__p0) __extension__ ({ \ poly64x2x2_t __ret; \ __builtin_neon_vld2q_dup_v(&__ret, __p0, 38); \ @@ -53313,108 +55582,6 @@ __ai float64x1_t vget_low_f64(float64x2_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -#define vld2q_dup_p16(__p0) __extension__ ({ \ - poly16x8x2_t __ret; \ - __builtin_neon_vld2q_dup_v(&__ret, __p0, 37); \ - __ret; \ -}) -#else -#define vld2q_dup_p16(__p0) __extension__ ({ \ - poly16x8x2_t __ret; \ - __builtin_neon_vld2q_dup_v(&__ret, __p0, 37); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld2q_dup_u8(__p0) __extension__ ({ \ - uint8x16x2_t __ret; \ - __builtin_neon_vld2q_dup_v(&__ret, __p0, 48); \ - __ret; \ -}) -#else -#define vld2q_dup_u8(__p0) __extension__ ({ \ - uint8x16x2_t __ret; \ - __builtin_neon_vld2q_dup_v(&__ret, __p0, 48); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld2q_dup_u32(__p0) __extension__ ({ \ - uint32x4x2_t __ret; \ - __builtin_neon_vld2q_dup_v(&__ret, __p0, 50); \ - __ret; \ -}) -#else -#define vld2q_dup_u32(__p0) __extension__ ({ \ - uint32x4x2_t __ret; \ - __builtin_neon_vld2q_dup_v(&__ret, __p0, 50); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld2q_dup_u64(__p0) __extension__ ({ \ - uint64x2x2_t __ret; \ - __builtin_neon_vld2q_dup_v(&__ret, __p0, 51); \ - __ret; \ -}) -#else -#define vld2q_dup_u64(__p0) __extension__ ({ \ - uint64x2x2_t __ret; \ - __builtin_neon_vld2q_dup_v(&__ret, __p0, 51); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld2q_dup_u16(__p0) __extension__ ({ \ - uint16x8x2_t __ret; \ - __builtin_neon_vld2q_dup_v(&__ret, __p0, 49); \ - __ret; \ -}) -#else -#define vld2q_dup_u16(__p0) __extension__ ({ \ - uint16x8x2_t __ret; \ - __builtin_neon_vld2q_dup_v(&__ret, __p0, 49); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld2q_dup_s8(__p0) __extension__ ({ \ - int8x16x2_t __ret; \ - 
__builtin_neon_vld2q_dup_v(&__ret, __p0, 32); \ - __ret; \ -}) -#else -#define vld2q_dup_s8(__p0) __extension__ ({ \ - int8x16x2_t __ret; \ - __builtin_neon_vld2q_dup_v(&__ret, __p0, 32); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ #define vld2q_dup_f64(__p0) __extension__ ({ \ float64x2x2_t __ret; \ __builtin_neon_vld2q_dup_v(&__ret, __p0, 42); \ @@ -53432,91 +55599,6 @@ __ai float64x1_t vget_low_f64(float64x2_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -#define vld2q_dup_f32(__p0) __extension__ ({ \ - float32x4x2_t __ret; \ - __builtin_neon_vld2q_dup_v(&__ret, __p0, 41); \ - __ret; \ -}) -#else -#define vld2q_dup_f32(__p0) __extension__ ({ \ - float32x4x2_t __ret; \ - __builtin_neon_vld2q_dup_v(&__ret, __p0, 41); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld2q_dup_f16(__p0) __extension__ ({ \ - float16x8x2_t __ret; \ - __builtin_neon_vld2q_dup_v(&__ret, __p0, 40); \ - __ret; \ -}) -#else -#define vld2q_dup_f16(__p0) __extension__ ({ \ - float16x8x2_t __ret; \ - __builtin_neon_vld2q_dup_v(&__ret, __p0, 40); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld2q_dup_s32(__p0) __extension__ ({ \ - int32x4x2_t __ret; \ - __builtin_neon_vld2q_dup_v(&__ret, __p0, 34); \ - __ret; \ -}) -#else -#define vld2q_dup_s32(__p0) __extension__ ({ \ - int32x4x2_t __ret; \ - __builtin_neon_vld2q_dup_v(&__ret, __p0, 34); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld2q_dup_s64(__p0) __extension__ ({ \ - int64x2x2_t __ret; \ - __builtin_neon_vld2q_dup_v(&__ret, __p0, 35); \ - __ret; \ -}) -#else -#define vld2q_dup_s64(__p0) __extension__ ({ \ - int64x2x2_t __ret; \ - __builtin_neon_vld2q_dup_v(&__ret, __p0, 35); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld2q_dup_s16(__p0) __extension__ ({ \ - int16x8x2_t __ret; \ - __builtin_neon_vld2q_dup_v(&__ret, __p0, 33); \ - __ret; \ -}) -#else -#define vld2q_dup_s16(__p0) __extension__ ({ \ - int16x8x2_t __ret; \ - __builtin_neon_vld2q_dup_v(&__ret, __p0, 33); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ #define vld2_dup_f64(__p0) __extension__ ({ \ float64x1x2_t __ret; \ __builtin_neon_vld2_dup_v(&__ret, __p0, 10); \ @@ -53863,24 +55945,6 @@ __ai float64x1_t vget_low_f64(float64x2_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -#define vld3q_dup_p8(__p0) __extension__ ({ \ - poly8x16x3_t __ret; \ - __builtin_neon_vld3q_dup_v(&__ret, __p0, 36); \ - __ret; 
\ -}) -#else -#define vld3q_dup_p8(__p0) __extension__ ({ \ - poly8x16x3_t __ret; \ - __builtin_neon_vld3q_dup_v(&__ret, __p0, 36); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ #define vld3q_dup_p64(__p0) __extension__ ({ \ poly64x2x3_t __ret; \ __builtin_neon_vld3q_dup_v(&__ret, __p0, 38); \ @@ -53899,114 +55963,6 @@ __ai float64x1_t vget_low_f64(float64x2_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -#define vld3q_dup_p16(__p0) __extension__ ({ \ - poly16x8x3_t __ret; \ - __builtin_neon_vld3q_dup_v(&__ret, __p0, 37); \ - __ret; \ -}) -#else -#define vld3q_dup_p16(__p0) __extension__ ({ \ - poly16x8x3_t __ret; \ - __builtin_neon_vld3q_dup_v(&__ret, __p0, 37); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld3q_dup_u8(__p0) __extension__ ({ \ - uint8x16x3_t __ret; \ - __builtin_neon_vld3q_dup_v(&__ret, __p0, 48); \ - __ret; \ -}) -#else -#define vld3q_dup_u8(__p0) __extension__ ({ \ - uint8x16x3_t __ret; \ - __builtin_neon_vld3q_dup_v(&__ret, __p0, 48); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld3q_dup_u32(__p0) __extension__ ({ \ - uint32x4x3_t __ret; \ - __builtin_neon_vld3q_dup_v(&__ret, __p0, 50); \ - __ret; \ -}) -#else -#define vld3q_dup_u32(__p0) __extension__ ({ \ - uint32x4x3_t __ret; \ - __builtin_neon_vld3q_dup_v(&__ret, __p0, 50); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld3q_dup_u64(__p0) __extension__ ({ \ - uint64x2x3_t __ret; \ - __builtin_neon_vld3q_dup_v(&__ret, __p0, 51); \ - __ret; \ -}) -#else -#define vld3q_dup_u64(__p0) __extension__ ({ \ - uint64x2x3_t __ret; \ - __builtin_neon_vld3q_dup_v(&__ret, __p0, 51); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld3q_dup_u16(__p0) __extension__ ({ \ - uint16x8x3_t __ret; \ - __builtin_neon_vld3q_dup_v(&__ret, __p0, 49); \ - __ret; \ -}) -#else -#define vld3q_dup_u16(__p0) __extension__ ({ \ - uint16x8x3_t __ret; \ - __builtin_neon_vld3q_dup_v(&__ret, __p0, 49); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], 
__ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld3q_dup_s8(__p0) __extension__ ({ \ - int8x16x3_t __ret; \ - __builtin_neon_vld3q_dup_v(&__ret, __p0, 32); \ - __ret; \ -}) -#else -#define vld3q_dup_s8(__p0) __extension__ ({ \ - int8x16x3_t __ret; \ - __builtin_neon_vld3q_dup_v(&__ret, __p0, 32); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ #define vld3q_dup_f64(__p0) __extension__ ({ \ float64x2x3_t __ret; \ __builtin_neon_vld3q_dup_v(&__ret, __p0, 42); \ @@ -54025,96 +55981,6 @@ __ai float64x1_t vget_low_f64(float64x2_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -#define vld3q_dup_f32(__p0) __extension__ ({ \ - float32x4x3_t __ret; \ - __builtin_neon_vld3q_dup_v(&__ret, __p0, 41); \ - __ret; \ -}) -#else -#define vld3q_dup_f32(__p0) __extension__ ({ \ - float32x4x3_t __ret; \ - __builtin_neon_vld3q_dup_v(&__ret, __p0, 41); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld3q_dup_f16(__p0) __extension__ ({ \ - float16x8x3_t __ret; \ - __builtin_neon_vld3q_dup_v(&__ret, __p0, 40); \ - __ret; \ -}) -#else -#define vld3q_dup_f16(__p0) __extension__ ({ \ - float16x8x3_t __ret; \ - __builtin_neon_vld3q_dup_v(&__ret, __p0, 40); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld3q_dup_s32(__p0) __extension__ ({ \ - int32x4x3_t __ret; \ - __builtin_neon_vld3q_dup_v(&__ret, __p0, 34); \ - __ret; \ -}) -#else -#define vld3q_dup_s32(__p0) __extension__ ({ \ - int32x4x3_t __ret; \ - __builtin_neon_vld3q_dup_v(&__ret, __p0, 34); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld3q_dup_s64(__p0) __extension__ ({ \ - int64x2x3_t __ret; \ - __builtin_neon_vld3q_dup_v(&__ret, __p0, 35); \ - __ret; \ -}) -#else -#define vld3q_dup_s64(__p0) __extension__ ({ \ - int64x2x3_t __ret; \ - __builtin_neon_vld3q_dup_v(&__ret, __p0, 35); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld3q_dup_s16(__p0) 
__extension__ ({ \ - int16x8x3_t __ret; \ - __builtin_neon_vld3q_dup_v(&__ret, __p0, 33); \ - __ret; \ -}) -#else -#define vld3q_dup_s16(__p0) __extension__ ({ \ - int16x8x3_t __ret; \ - __builtin_neon_vld3q_dup_v(&__ret, __p0, 33); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ #define vld3_dup_f64(__p0) __extension__ ({ \ float64x1x3_t __ret; \ __builtin_neon_vld3_dup_v(&__ret, __p0, 10); \ @@ -54479,25 +56345,6 @@ __ai float64x1_t vget_low_f64(float64x2_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -#define vld4q_dup_p8(__p0) __extension__ ({ \ - poly8x16x4_t __ret; \ - __builtin_neon_vld4q_dup_v(&__ret, __p0, 36); \ - __ret; \ -}) -#else -#define vld4q_dup_p8(__p0) __extension__ ({ \ - poly8x16x4_t __ret; \ - __builtin_neon_vld4q_dup_v(&__ret, __p0, 36); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ #define vld4q_dup_p64(__p0) __extension__ ({ \ poly64x2x4_t __ret; \ __builtin_neon_vld4q_dup_v(&__ret, __p0, 38); \ @@ -54517,120 +56364,6 @@ __ai float64x1_t vget_low_f64(float64x2_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -#define vld4q_dup_p16(__p0) __extension__ ({ \ - poly16x8x4_t __ret; \ - __builtin_neon_vld4q_dup_v(&__ret, __p0, 37); \ - __ret; \ -}) -#else -#define vld4q_dup_p16(__p0) __extension__ ({ \ - poly16x8x4_t __ret; \ - __builtin_neon_vld4q_dup_v(&__ret, __p0, 37); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld4q_dup_u8(__p0) __extension__ ({ \ - uint8x16x4_t __ret; \ - __builtin_neon_vld4q_dup_v(&__ret, __p0, 48); \ - __ret; \ -}) -#else -#define vld4q_dup_u8(__p0) __extension__ ({ \ - uint8x16x4_t __ret; \ - __builtin_neon_vld4q_dup_v(&__ret, __p0, 48); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld4q_dup_u32(__p0) __extension__ ({ \ - uint32x4x4_t __ret; \ - __builtin_neon_vld4q_dup_v(&__ret, __p0, 50); \ - __ret; \ -}) -#else -#define vld4q_dup_u32(__p0) 
__extension__ ({ \ - uint32x4x4_t __ret; \ - __builtin_neon_vld4q_dup_v(&__ret, __p0, 50); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ - __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld4q_dup_u64(__p0) __extension__ ({ \ - uint64x2x4_t __ret; \ - __builtin_neon_vld4q_dup_v(&__ret, __p0, 51); \ - __ret; \ -}) -#else -#define vld4q_dup_u64(__p0) __extension__ ({ \ - uint64x2x4_t __ret; \ - __builtin_neon_vld4q_dup_v(&__ret, __p0, 51); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \ - __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld4q_dup_u16(__p0) __extension__ ({ \ - uint16x8x4_t __ret; \ - __builtin_neon_vld4q_dup_v(&__ret, __p0, 49); \ - __ret; \ -}) -#else -#define vld4q_dup_u16(__p0) __extension__ ({ \ - uint16x8x4_t __ret; \ - __builtin_neon_vld4q_dup_v(&__ret, __p0, 49); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld4q_dup_s8(__p0) __extension__ ({ \ - int8x16x4_t __ret; \ - __builtin_neon_vld4q_dup_v(&__ret, __p0, 32); \ - __ret; \ -}) -#else -#define vld4q_dup_s8(__p0) __extension__ ({ \ - int8x16x4_t __ret; \ - __builtin_neon_vld4q_dup_v(&__ret, __p0, 32); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ #define vld4q_dup_f64(__p0) __extension__ ({ \ float64x2x4_t __ret; \ __builtin_neon_vld4q_dup_v(&__ret, __p0, 42); \ @@ -54650,101 +56383,6 @@ __ai float64x1_t vget_low_f64(float64x2_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -#define vld4q_dup_f32(__p0) __extension__ ({ \ - float32x4x4_t __ret; \ - __builtin_neon_vld4q_dup_v(&__ret, __p0, 41); \ - __ret; \ -}) -#else -#define vld4q_dup_f32(__p0) __extension__ ({ \ - float32x4x4_t __ret; \ - __builtin_neon_vld4q_dup_v(&__ret, __p0, 41); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ - __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define 
vld4q_dup_f16(__p0) __extension__ ({ \ - float16x8x4_t __ret; \ - __builtin_neon_vld4q_dup_v(&__ret, __p0, 40); \ - __ret; \ -}) -#else -#define vld4q_dup_f16(__p0) __extension__ ({ \ - float16x8x4_t __ret; \ - __builtin_neon_vld4q_dup_v(&__ret, __p0, 40); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld4q_dup_s32(__p0) __extension__ ({ \ - int32x4x4_t __ret; \ - __builtin_neon_vld4q_dup_v(&__ret, __p0, 34); \ - __ret; \ -}) -#else -#define vld4q_dup_s32(__p0) __extension__ ({ \ - int32x4x4_t __ret; \ - __builtin_neon_vld4q_dup_v(&__ret, __p0, 34); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 3, 2, 1, 0); \ - __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld4q_dup_s64(__p0) __extension__ ({ \ - int64x2x4_t __ret; \ - __builtin_neon_vld4q_dup_v(&__ret, __p0, 35); \ - __ret; \ -}) -#else -#define vld4q_dup_s64(__p0) __extension__ ({ \ - int64x2x4_t __ret; \ - __builtin_neon_vld4q_dup_v(&__ret, __p0, 35); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 1, 0); \ - __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vld4q_dup_s16(__p0) __extension__ ({ \ - int16x8x4_t __ret; \ - __builtin_neon_vld4q_dup_v(&__ret, __p0, 33); \ - __ret; \ -}) -#else -#define vld4q_dup_s16(__p0) __extension__ ({ \ - int16x8x4_t __ret; \ - __builtin_neon_vld4q_dup_v(&__ret, __p0, 33); \ - \ - __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[2] = __builtin_shufflevector(__ret.val[2], __ret.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret.val[3] = __builtin_shufflevector(__ret.val[3], __ret.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \ - __ret; \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ #define vld4_dup_f64(__p0) __extension__ ({ \ float64x1x4_t __ret; \ __builtin_neon_vld4_dup_v(&__ret, __p0, 10); \ @@ -64931,21 +66569,6 @@ __ai float32x2_t vsqrt_f32(float32x2_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -#define vst1_p8_x2(__p0, __p1) __extension__ ({ \ - poly8x8x2_t __s1 = __p1; \ - __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 4); \ -}) -#else -#define vst1_p8_x2(__p0, __p1) __extension__ ({ \ - poly8x8x2_t __s1 = __p1; \ - poly8x8x2_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 4); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ #define vst1_p64_x2(__p0, __p1) __extension__ ({ 
\ poly64x1x2_t __s1 = __p1; \ __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 6); \ @@ -64958,36 +66581,6 @@ __ai float32x2_t vsqrt_f32(float32x2_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -#define vst1_p16_x2(__p0, __p1) __extension__ ({ \ - poly16x4x2_t __s1 = __p1; \ - __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 5); \ -}) -#else -#define vst1_p16_x2(__p0, __p1) __extension__ ({ \ - poly16x4x2_t __s1 = __p1; \ - poly16x4x2_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ - __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 5); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1q_p8_x2(__p0, __p1) __extension__ ({ \ - poly8x16x2_t __s1 = __p1; \ - __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 36); \ -}) -#else -#define vst1q_p8_x2(__p0, __p1) __extension__ ({ \ - poly8x16x2_t __s1 = __p1; \ - poly8x16x2_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 36); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ #define vst1q_p64_x2(__p0, __p1) __extension__ ({ \ poly64x2x2_t __s1 = __p1; \ __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 38); \ @@ -65003,96 +66596,6 @@ __ai float32x2_t vsqrt_f32(float32x2_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -#define vst1q_p16_x2(__p0, __p1) __extension__ ({ \ - poly16x8x2_t __s1 = __p1; \ - __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 37); \ -}) -#else -#define vst1q_p16_x2(__p0, __p1) __extension__ ({ \ - poly16x8x2_t __s1 = __p1; \ - poly16x8x2_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 37); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1q_u8_x2(__p0, __p1) __extension__ ({ \ - uint8x16x2_t __s1 = __p1; \ - __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 48); \ -}) -#else -#define vst1q_u8_x2(__p0, __p1) __extension__ ({ \ - uint8x16x2_t __s1 = __p1; \ - uint8x16x2_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 48); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1q_u32_x2(__p0, __p1) __extension__ ({ \ - uint32x4x2_t __s1 = __p1; \ - __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 50); \ -}) -#else -#define vst1q_u32_x2(__p0, __p1) __extension__ ({ \ - uint32x4x2_t __s1 = __p1; \ - uint32x4x2_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ - __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], 
(int8x16_t)__rev1.val[1], 50); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1q_u64_x2(__p0, __p1) __extension__ ({ \ - uint64x2x2_t __s1 = __p1; \ - __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 51); \ -}) -#else -#define vst1q_u64_x2(__p0, __p1) __extension__ ({ \ - uint64x2x2_t __s1 = __p1; \ - uint64x2x2_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \ - __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 51); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1q_u16_x2(__p0, __p1) __extension__ ({ \ - uint16x8x2_t __s1 = __p1; \ - __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 49); \ -}) -#else -#define vst1q_u16_x2(__p0, __p1) __extension__ ({ \ - uint16x8x2_t __s1 = __p1; \ - uint16x8x2_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 49); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1q_s8_x2(__p0, __p1) __extension__ ({ \ - int8x16x2_t __s1 = __p1; \ - __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 32); \ -}) -#else -#define vst1q_s8_x2(__p0, __p1) __extension__ ({ \ - int8x16x2_t __s1 = __p1; \ - int8x16x2_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __builtin_neon_vst1q_x2_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], 32); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ #define vst1q_f64_x2(__p0, __p1) __extension__ ({ \ float64x2x2_t __s1 = __p1; \ __builtin_neon_vst1q_x2_v(__p0, __s1.val[0], __s1.val[1], 42); \ @@ -65108,153 +66611,6 @@ __ai float32x2_t vsqrt_f32(float32x2_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -#define vst1q_f32_x2(__p0, __p1) __extension__ ({ \ - float32x4x2_t __s1 = __p1; \ - __builtin_neon_vst1q_x2_v(__p0, __s1.val[0], __s1.val[1], 41); \ -}) -#else -#define vst1q_f32_x2(__p0, __p1) __extension__ ({ \ - float32x4x2_t __s1 = __p1; \ - float32x4x2_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ - __builtin_neon_vst1q_x2_v(__p0, __rev1.val[0], __rev1.val[1], 41); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1q_f16_x2(__p0, __p1) __extension__ ({ \ - float16x8x2_t __s1 = __p1; \ - __builtin_neon_vst1q_x2_v(__p0, __s1.val[0], __s1.val[1], 40); \ -}) -#else -#define vst1q_f16_x2(__p0, __p1) __extension__ ({ \ - float16x8x2_t __s1 = __p1; \ - float16x8x2_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __builtin_neon_vst1q_x2_v(__p0, __rev1.val[0], __rev1.val[1], 40); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1q_s32_x2(__p0, __p1) __extension__ ({ \ - int32x4x2_t __s1 = __p1; \ - __builtin_neon_vst1q_x2_v(__p0, __s1.val[0], __s1.val[1], 34); \ -}) -#else -#define vst1q_s32_x2(__p0, __p1) __extension__ ({ \ - int32x4x2_t __s1 = __p1; \ - 
int32x4x2_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ - __builtin_neon_vst1q_x2_v(__p0, __rev1.val[0], __rev1.val[1], 34); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1q_s64_x2(__p0, __p1) __extension__ ({ \ - int64x2x2_t __s1 = __p1; \ - __builtin_neon_vst1q_x2_v(__p0, __s1.val[0], __s1.val[1], 35); \ -}) -#else -#define vst1q_s64_x2(__p0, __p1) __extension__ ({ \ - int64x2x2_t __s1 = __p1; \ - int64x2x2_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \ - __builtin_neon_vst1q_x2_v(__p0, __rev1.val[0], __rev1.val[1], 35); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1q_s16_x2(__p0, __p1) __extension__ ({ \ - int16x8x2_t __s1 = __p1; \ - __builtin_neon_vst1q_x2_v(__p0, __s1.val[0], __s1.val[1], 33); \ -}) -#else -#define vst1q_s16_x2(__p0, __p1) __extension__ ({ \ - int16x8x2_t __s1 = __p1; \ - int16x8x2_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __builtin_neon_vst1q_x2_v(__p0, __rev1.val[0], __rev1.val[1], 33); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1_u8_x2(__p0, __p1) __extension__ ({ \ - uint8x8x2_t __s1 = __p1; \ - __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 16); \ -}) -#else -#define vst1_u8_x2(__p0, __p1) __extension__ ({ \ - uint8x8x2_t __s1 = __p1; \ - uint8x8x2_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 16); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1_u32_x2(__p0, __p1) __extension__ ({ \ - uint32x2x2_t __s1 = __p1; \ - __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 18); \ -}) -#else -#define vst1_u32_x2(__p0, __p1) __extension__ ({ \ - uint32x2x2_t __s1 = __p1; \ - uint32x2x2_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \ - __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 18); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1_u64_x2(__p0, __p1) __extension__ ({ \ - uint64x1x2_t __s1 = __p1; \ - __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 19); \ -}) -#else -#define vst1_u64_x2(__p0, __p1) __extension__ ({ \ - uint64x1x2_t __s1 = __p1; \ - __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 19); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1_u16_x2(__p0, __p1) __extension__ ({ \ - uint16x4x2_t __s1 = __p1; \ - __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 17); \ -}) -#else -#define vst1_u16_x2(__p0, __p1) __extension__ ({ \ - uint16x4x2_t __s1 = __p1; \ - uint16x4x2_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ - __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 17); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define 
vst1_s8_x2(__p0, __p1) __extension__ ({ \ - int8x8x2_t __s1 = __p1; \ - __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 0); \ -}) -#else -#define vst1_s8_x2(__p0, __p1) __extension__ ({ \ - int8x8x2_t __s1 = __p1; \ - int8x8x2_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __builtin_neon_vst1_x2_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], 0); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ #define vst1_f64_x2(__p0, __p1) __extension__ ({ \ float64x1x2_t __s1 = __p1; \ __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 10); \ @@ -65267,94 +66623,6 @@ __ai float32x2_t vsqrt_f32(float32x2_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -#define vst1_f32_x2(__p0, __p1) __extension__ ({ \ - float32x2x2_t __s1 = __p1; \ - __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 9); \ -}) -#else -#define vst1_f32_x2(__p0, __p1) __extension__ ({ \ - float32x2x2_t __s1 = __p1; \ - float32x2x2_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \ - __builtin_neon_vst1_x2_v(__p0, __rev1.val[0], __rev1.val[1], 9); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1_f16_x2(__p0, __p1) __extension__ ({ \ - float16x4x2_t __s1 = __p1; \ - __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 8); \ -}) -#else -#define vst1_f16_x2(__p0, __p1) __extension__ ({ \ - float16x4x2_t __s1 = __p1; \ - float16x4x2_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ - __builtin_neon_vst1_x2_v(__p0, __rev1.val[0], __rev1.val[1], 8); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1_s32_x2(__p0, __p1) __extension__ ({ \ - int32x2x2_t __s1 = __p1; \ - __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 2); \ -}) -#else -#define vst1_s32_x2(__p0, __p1) __extension__ ({ \ - int32x2x2_t __s1 = __p1; \ - int32x2x2_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \ - __builtin_neon_vst1_x2_v(__p0, __rev1.val[0], __rev1.val[1], 2); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1_s64_x2(__p0, __p1) __extension__ ({ \ - int64x1x2_t __s1 = __p1; \ - __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 3); \ -}) -#else -#define vst1_s64_x2(__p0, __p1) __extension__ ({ \ - int64x1x2_t __s1 = __p1; \ - __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 3); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1_s16_x2(__p0, __p1) __extension__ ({ \ - int16x4x2_t __s1 = __p1; \ - __builtin_neon_vst1_x2_v(__p0, __s1.val[0], __s1.val[1], 1); \ -}) -#else -#define vst1_s16_x2(__p0, __p1) __extension__ ({ \ - int16x4x2_t __s1 = __p1; \ - int16x4x2_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ - __builtin_neon_vst1_x2_v(__p0, __rev1.val[0], __rev1.val[1], 1); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1_p8_x3(__p0, __p1) __extension__ ({ \ - poly8x8x3_t __s1 = __p1; \ - __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 4); \ -}) -#else -#define vst1_p8_x3(__p0, __p1) 
__extension__ ({ \ - poly8x8x3_t __s1 = __p1; \ - poly8x8x3_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ - __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 4); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ #define vst1_p64_x3(__p0, __p1) __extension__ ({ \ poly64x1x3_t __s1 = __p1; \ __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 6); \ @@ -65367,38 +66635,6 @@ __ai float32x2_t vsqrt_f32(float32x2_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -#define vst1_p16_x3(__p0, __p1) __extension__ ({ \ - poly16x4x3_t __s1 = __p1; \ - __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 5); \ -}) -#else -#define vst1_p16_x3(__p0, __p1) __extension__ ({ \ - poly16x4x3_t __s1 = __p1; \ - poly16x4x3_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \ - __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 5); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1q_p8_x3(__p0, __p1) __extension__ ({ \ - poly8x16x3_t __s1 = __p1; \ - __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 36); \ -}) -#else -#define vst1q_p8_x3(__p0, __p1) __extension__ ({ \ - poly8x16x3_t __s1 = __p1; \ - poly8x16x3_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 36); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ #define vst1q_p64_x3(__p0, __p1) __extension__ ({ \ poly64x2x3_t __s1 = __p1; \ __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 38); \ @@ -65415,102 +66651,6 @@ __ai float32x2_t vsqrt_f32(float32x2_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -#define vst1q_p16_x3(__p0, __p1) __extension__ ({ \ - poly16x8x3_t __s1 = __p1; \ - __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 37); \ -}) -#else -#define vst1q_p16_x3(__p0, __p1) __extension__ ({ \ - poly16x8x3_t __s1 = __p1; \ - poly16x8x3_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ - __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 37); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1q_u8_x3(__p0, __p1) __extension__ ({ \ - uint8x16x3_t __s1 = __p1; \ - __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], 
(int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 48); \ -}) -#else -#define vst1q_u8_x3(__p0, __p1) __extension__ ({ \ - uint8x16x3_t __s1 = __p1; \ - uint8x16x3_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 48); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1q_u32_x3(__p0, __p1) __extension__ ({ \ - uint32x4x3_t __s1 = __p1; \ - __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 50); \ -}) -#else -#define vst1q_u32_x3(__p0, __p1) __extension__ ({ \ - uint32x4x3_t __s1 = __p1; \ - uint32x4x3_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \ - __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 50); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1q_u64_x3(__p0, __p1) __extension__ ({ \ - uint64x2x3_t __s1 = __p1; \ - __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 51); \ -}) -#else -#define vst1q_u64_x3(__p0, __p1) __extension__ ({ \ - uint64x2x3_t __s1 = __p1; \ - uint64x2x3_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \ - __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 51); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1q_u16_x3(__p0, __p1) __extension__ ({ \ - uint16x8x3_t __s1 = __p1; \ - __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 49); \ -}) -#else -#define vst1q_u16_x3(__p0, __p1) __extension__ ({ \ - uint16x8x3_t __s1 = __p1; \ - uint16x8x3_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ - __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 49); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1q_s8_x3(__p0, __p1) __extension__ ({ \ - int8x16x3_t __s1 = __p1; \ - __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], 32); \ -}) -#else -#define vst1q_s8_x3(__p0, __p1) __extension__ ({ \ - int8x16x3_t __s1 = __p1; \ - int8x16x3_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 
6, 5, 4, 3, 2, 1, 0); \ - __builtin_neon_vst1q_x3_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], 32); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ #define vst1q_f64_x3(__p0, __p1) __extension__ ({ \ float64x2x3_t __s1 = __p1; \ __builtin_neon_vst1q_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 42); \ @@ -65527,162 +66667,6 @@ __ai float32x2_t vsqrt_f32(float32x2_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -#define vst1q_f32_x3(__p0, __p1) __extension__ ({ \ - float32x4x3_t __s1 = __p1; \ - __builtin_neon_vst1q_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 41); \ -}) -#else -#define vst1q_f32_x3(__p0, __p1) __extension__ ({ \ - float32x4x3_t __s1 = __p1; \ - float32x4x3_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \ - __builtin_neon_vst1q_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 41); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1q_f16_x3(__p0, __p1) __extension__ ({ \ - float16x8x3_t __s1 = __p1; \ - __builtin_neon_vst1q_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 40); \ -}) -#else -#define vst1q_f16_x3(__p0, __p1) __extension__ ({ \ - float16x8x3_t __s1 = __p1; \ - float16x8x3_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ - __builtin_neon_vst1q_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 40); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1q_s32_x3(__p0, __p1) __extension__ ({ \ - int32x4x3_t __s1 = __p1; \ - __builtin_neon_vst1q_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 34); \ -}) -#else -#define vst1q_s32_x3(__p0, __p1) __extension__ ({ \ - int32x4x3_t __s1 = __p1; \ - int32x4x3_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \ - __builtin_neon_vst1q_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 34); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1q_s64_x3(__p0, __p1) __extension__ ({ \ - int64x2x3_t __s1 = __p1; \ - __builtin_neon_vst1q_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 35); \ -}) -#else -#define vst1q_s64_x3(__p0, __p1) __extension__ ({ \ - int64x2x3_t __s1 = __p1; \ - int64x2x3_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \ - __builtin_neon_vst1q_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 35); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1q_s16_x3(__p0, __p1) __extension__ ({ \ - int16x8x3_t __s1 = __p1; \ - __builtin_neon_vst1q_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 33); \ -}) -#else -#define vst1q_s16_x3(__p0, __p1) __extension__ ({ \ - int16x8x3_t __s1 = __p1; \ - int16x8x3_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 
1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ - __builtin_neon_vst1q_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 33); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1_u8_x3(__p0, __p1) __extension__ ({ \ - uint8x8x3_t __s1 = __p1; \ - __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 16); \ -}) -#else -#define vst1_u8_x3(__p0, __p1) __extension__ ({ \ - uint8x8x3_t __s1 = __p1; \ - uint8x8x3_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ - __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 16); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1_u32_x3(__p0, __p1) __extension__ ({ \ - uint32x2x3_t __s1 = __p1; \ - __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 18); \ -}) -#else -#define vst1_u32_x3(__p0, __p1) __extension__ ({ \ - uint32x2x3_t __s1 = __p1; \ - uint32x2x3_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \ - __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 18); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1_u64_x3(__p0, __p1) __extension__ ({ \ - uint64x1x3_t __s1 = __p1; \ - __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 19); \ -}) -#else -#define vst1_u64_x3(__p0, __p1) __extension__ ({ \ - uint64x1x3_t __s1 = __p1; \ - __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 19); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1_u16_x3(__p0, __p1) __extension__ ({ \ - uint16x4x3_t __s1 = __p1; \ - __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 17); \ -}) -#else -#define vst1_u16_x3(__p0, __p1) __extension__ ({ \ - uint16x4x3_t __s1 = __p1; \ - uint16x4x3_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \ - __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 17); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1_s8_x3(__p0, __p1) __extension__ ({ \ - int8x8x3_t __s1 = __p1; \ - __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], 0); \ -}) -#else -#define vst1_s8_x3(__p0, __p1) __extension__ ({ \ - int8x8x3_t __s1 = __p1; \ - int8x8x3_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ - __builtin_neon_vst1_x3_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], 0); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ 
#define vst1_f64_x3(__p0, __p1) __extension__ ({ \ float64x1x3_t __s1 = __p1; \ __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 10); \ @@ -65695,99 +66679,6 @@ __ai float32x2_t vsqrt_f32(float32x2_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -#define vst1_f32_x3(__p0, __p1) __extension__ ({ \ - float32x2x3_t __s1 = __p1; \ - __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 9); \ -}) -#else -#define vst1_f32_x3(__p0, __p1) __extension__ ({ \ - float32x2x3_t __s1 = __p1; \ - float32x2x3_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \ - __builtin_neon_vst1_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 9); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1_f16_x3(__p0, __p1) __extension__ ({ \ - float16x4x3_t __s1 = __p1; \ - __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 8); \ -}) -#else -#define vst1_f16_x3(__p0, __p1) __extension__ ({ \ - float16x4x3_t __s1 = __p1; \ - float16x4x3_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \ - __builtin_neon_vst1_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 8); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1_s32_x3(__p0, __p1) __extension__ ({ \ - int32x2x3_t __s1 = __p1; \ - __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 2); \ -}) -#else -#define vst1_s32_x3(__p0, __p1) __extension__ ({ \ - int32x2x3_t __s1 = __p1; \ - int32x2x3_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \ - __builtin_neon_vst1_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 2); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1_s64_x3(__p0, __p1) __extension__ ({ \ - int64x1x3_t __s1 = __p1; \ - __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 3); \ -}) -#else -#define vst1_s64_x3(__p0, __p1) __extension__ ({ \ - int64x1x3_t __s1 = __p1; \ - __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 3); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1_s16_x3(__p0, __p1) __extension__ ({ \ - int16x4x3_t __s1 = __p1; \ - __builtin_neon_vst1_x3_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], 1); \ -}) -#else -#define vst1_s16_x3(__p0, __p1) __extension__ ({ \ - int16x4x3_t __s1 = __p1; \ - int16x4x3_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \ - __builtin_neon_vst1_x3_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], 1); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1_p8_x4(__p0, __p1) __extension__ ({ \ - poly8x8x4_t __s1 = __p1; \ - __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 4); \ -}) -#else -#define vst1_p8_x4(__p0, __p1) __extension__ ({ \ - poly8x8x4_t __s1 = __p1; \ - poly8x8x4_t __rev1; \ - __rev1.val[0] = 
__builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \ - __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 4); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ #define vst1_p64_x4(__p0, __p1) __extension__ ({ \ poly64x1x4_t __s1 = __p1; \ __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 6); \ @@ -65800,40 +66691,6 @@ __ai float32x2_t vsqrt_f32(float32x2_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -#define vst1_p16_x4(__p0, __p1) __extension__ ({ \ - poly16x4x4_t __s1 = __p1; \ - __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 5); \ -}) -#else -#define vst1_p16_x4(__p0, __p1) __extension__ ({ \ - poly16x4x4_t __s1 = __p1; \ - poly16x4x4_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \ - __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \ - __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 5); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1q_p8_x4(__p0, __p1) __extension__ ({ \ - poly8x16x4_t __s1 = __p1; \ - __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 36); \ -}) -#else -#define vst1q_p8_x4(__p0, __p1) __extension__ ({ \ - poly8x16x4_t __s1 = __p1; \ - poly8x16x4_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 36); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ #define vst1q_p64_x4(__p0, __p1) __extension__ ({ \ poly64x2x4_t __s1 = __p1; \ __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 38); \ @@ -65851,108 +66708,6 @@ __ai float32x2_t vsqrt_f32(float32x2_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -#define vst1q_p16_x4(__p0, __p1) __extension__ ({ \ - poly16x8x4_t __s1 = __p1; \ - __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 37); \ -}) -#else -#define vst1q_p16_x4(__p0, __p1) __extension__ ({ \ - poly16x8x4_t __s1 = __p1; \ - poly16x8x4_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 
2, 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \ - __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 37); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1q_u8_x4(__p0, __p1) __extension__ ({ \ - uint8x16x4_t __s1 = __p1; \ - __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 48); \ -}) -#else -#define vst1q_u8_x4(__p0, __p1) __extension__ ({ \ - uint8x16x4_t __s1 = __p1; \ - uint8x16x4_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 48); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1q_u32_x4(__p0, __p1) __extension__ ({ \ - uint32x4x4_t __s1 = __p1; \ - __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 50); \ -}) -#else -#define vst1q_u32_x4(__p0, __p1) __extension__ ({ \ - uint32x4x4_t __s1 = __p1; \ - uint32x4x4_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \ - __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \ - __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 50); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1q_u64_x4(__p0, __p1) __extension__ ({ \ - uint64x2x4_t __s1 = __p1; \ - __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 51); \ -}) -#else -#define vst1q_u64_x4(__p0, __p1) __extension__ ({ \ - uint64x2x4_t __s1 = __p1; \ - uint64x2x4_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \ - __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \ - __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 51); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1q_u16_x4(__p0, __p1) __extension__ ({ \ - uint16x8x4_t __s1 = __p1; \ - __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 49); \ -}) -#else -#define vst1q_u16_x4(__p0, __p1) __extension__ ({ \ - uint16x8x4_t __s1 = __p1; \ - uint16x8x4_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[1] = 
__builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \ - __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 49); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1q_s8_x4(__p0, __p1) __extension__ ({ \ - int8x16x4_t __s1 = __p1; \ - __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], (int8x16_t)__s1.val[2], (int8x16_t)__s1.val[3], 32); \ -}) -#else -#define vst1q_s8_x4(__p0, __p1) __extension__ ({ \ - int8x16x4_t __s1 = __p1; \ - int8x16x4_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \ - __builtin_neon_vst1q_x4_v(__p0, (int8x16_t)__rev1.val[0], (int8x16_t)__rev1.val[1], (int8x16_t)__rev1.val[2], (int8x16_t)__rev1.val[3], 32); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ #define vst1q_f64_x4(__p0, __p1) __extension__ ({ \ float64x2x4_t __s1 = __p1; \ __builtin_neon_vst1q_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 42); \ @@ -65970,171 +66725,6 @@ __ai float32x2_t vsqrt_f32(float32x2_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -#define vst1q_f32_x4(__p0, __p1) __extension__ ({ \ - float32x4x4_t __s1 = __p1; \ - __builtin_neon_vst1q_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 41); \ -}) -#else -#define vst1q_f32_x4(__p0, __p1) __extension__ ({ \ - float32x4x4_t __s1 = __p1; \ - float32x4x4_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \ - __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \ - __builtin_neon_vst1q_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 41); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1q_f16_x4(__p0, __p1) __extension__ ({ \ - float16x8x4_t __s1 = __p1; \ - __builtin_neon_vst1q_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 40); \ -}) -#else -#define vst1q_f16_x4(__p0, __p1) __extension__ ({ \ - float16x8x4_t __s1 = __p1; \ - float16x8x4_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \ - __builtin_neon_vst1q_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 40); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1q_s32_x4(__p0, __p1) __extension__ ({ \ - int32x4x4_t __s1 = __p1; \ - __builtin_neon_vst1q_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 34); \ -}) -#else -#define vst1q_s32_x4(__p0, 
__p1) __extension__ ({ \ - int32x4x4_t __s1 = __p1; \ - int32x4x4_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \ - __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \ - __builtin_neon_vst1q_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 34); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1q_s64_x4(__p0, __p1) __extension__ ({ \ - int64x2x4_t __s1 = __p1; \ - __builtin_neon_vst1q_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 35); \ -}) -#else -#define vst1q_s64_x4(__p0, __p1) __extension__ ({ \ - int64x2x4_t __s1 = __p1; \ - int64x2x4_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \ - __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \ - __builtin_neon_vst1q_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 35); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1q_s16_x4(__p0, __p1) __extension__ ({ \ - int16x8x4_t __s1 = __p1; \ - __builtin_neon_vst1q_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 33); \ -}) -#else -#define vst1q_s16_x4(__p0, __p1) __extension__ ({ \ - int16x8x4_t __s1 = __p1; \ - int16x8x4_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \ - __builtin_neon_vst1q_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 33); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1_u8_x4(__p0, __p1) __extension__ ({ \ - uint8x8x4_t __s1 = __p1; \ - __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 16); \ -}) -#else -#define vst1_u8_x4(__p0, __p1) __extension__ ({ \ - uint8x8x4_t __s1 = __p1; \ - uint8x8x4_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \ - __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 16); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1_u32_x4(__p0, __p1) __extension__ ({ \ - uint32x2x4_t __s1 = __p1; \ - __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 18); \ -}) -#else -#define vst1_u32_x4(__p0, __p1) __extension__ ({ \ - uint32x2x4_t __s1 = __p1; \ - uint32x2x4_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \ 
- __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \ - __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 18); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1_u64_x4(__p0, __p1) __extension__ ({ \ - uint64x1x4_t __s1 = __p1; \ - __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 19); \ -}) -#else -#define vst1_u64_x4(__p0, __p1) __extension__ ({ \ - uint64x1x4_t __s1 = __p1; \ - __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 19); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1_u16_x4(__p0, __p1) __extension__ ({ \ - uint16x4x4_t __s1 = __p1; \ - __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 17); \ -}) -#else -#define vst1_u16_x4(__p0, __p1) __extension__ ({ \ - uint16x4x4_t __s1 = __p1; \ - uint16x4x4_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \ - __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \ - __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 17); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1_s8_x4(__p0, __p1) __extension__ ({ \ - int8x8x4_t __s1 = __p1; \ - __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], (int8x8_t)__s1.val[2], (int8x8_t)__s1.val[3], 0); \ -}) -#else -#define vst1_s8_x4(__p0, __p1) __extension__ ({ \ - int8x8x4_t __s1 = __p1; \ - int8x8x4_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 7, 6, 5, 4, 3, 2, 1, 0); \ - __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 7, 6, 5, 4, 3, 2, 1, 0); \ - __builtin_neon_vst1_x4_v(__p0, (int8x8_t)__rev1.val[0], (int8x8_t)__rev1.val[1], (int8x8_t)__rev1.val[2], (int8x8_t)__rev1.val[3], 0); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ #define vst1_f64_x4(__p0, __p1) __extension__ ({ \ float64x1x4_t __s1 = __p1; \ __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 10); \ @@ -66147,86 +66737,6 @@ __ai float32x2_t vsqrt_f32(float32x2_t __p0) { #endif #ifdef __LITTLE_ENDIAN__ -#define vst1_f32_x4(__p0, __p1) __extension__ ({ \ - float32x2x4_t __s1 = __p1; \ - __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 9); \ -}) -#else -#define vst1_f32_x4(__p0, __p1) __extension__ ({ \ - float32x2x4_t __s1 = __p1; \ - float32x2x4_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \ - __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \ - __builtin_neon_vst1_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 9); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1_f16_x4(__p0, __p1) __extension__ ({ \ - float16x4x4_t __s1 = 
__p1; \ - __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 8); \ -}) -#else -#define vst1_f16_x4(__p0, __p1) __extension__ ({ \ - float16x4x4_t __s1 = __p1; \ - float16x4x4_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \ - __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \ - __builtin_neon_vst1_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 8); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1_s32_x4(__p0, __p1) __extension__ ({ \ - int32x2x4_t __s1 = __p1; \ - __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 2); \ -}) -#else -#define vst1_s32_x4(__p0, __p1) __extension__ ({ \ - int32x2x4_t __s1 = __p1; \ - int32x2x4_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 1, 0); \ - __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 1, 0); \ - __builtin_neon_vst1_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 2); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1_s64_x4(__p0, __p1) __extension__ ({ \ - int64x1x4_t __s1 = __p1; \ - __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 3); \ -}) -#else -#define vst1_s64_x4(__p0, __p1) __extension__ ({ \ - int64x1x4_t __s1 = __p1; \ - __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 3); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ -#define vst1_s16_x4(__p0, __p1) __extension__ ({ \ - int16x4x4_t __s1 = __p1; \ - __builtin_neon_vst1_x4_v(__p0, __s1.val[0], __s1.val[1], __s1.val[2], __s1.val[3], 1); \ -}) -#else -#define vst1_s16_x4(__p0, __p1) __extension__ ({ \ - int16x4x4_t __s1 = __p1; \ - int16x4x4_t __rev1; \ - __rev1.val[0] = __builtin_shufflevector(__s1.val[0], __s1.val[0], 3, 2, 1, 0); \ - __rev1.val[1] = __builtin_shufflevector(__s1.val[1], __s1.val[1], 3, 2, 1, 0); \ - __rev1.val[2] = __builtin_shufflevector(__s1.val[2], __s1.val[2], 3, 2, 1, 0); \ - __rev1.val[3] = __builtin_shufflevector(__s1.val[3], __s1.val[3], 3, 2, 1, 0); \ - __builtin_neon_vst1_x4_v(__p0, __rev1.val[0], __rev1.val[1], __rev1.val[2], __rev1.val[3], 1); \ -}) -#endif - -#ifdef __LITTLE_ENDIAN__ #define vst2_p64(__p0, __p1) __extension__ ({ \ poly64x1x2_t __s1 = __p1; \ __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 6); \ diff --git a/c_headers/avx2intrin.h b/c_headers/avx2intrin.h index caf4ced920..9688a96fde 100644 --- a/c_headers/avx2intrin.h +++ b/c_headers/avx2intrin.h @@ -29,120 +29,121 @@ #define __AVX2INTRIN_H /* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx2"))) +#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx2"), __min_vector_width__(256))) +#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx2"), __min_vector_width__(128))) /* SSE4 Multiple Packed Sums of Absolute Difference. 
*/ #define _mm256_mpsadbw_epu8(X, Y, M) \ (__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \ (__v32qi)(__m256i)(Y), (int)(M)) -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_abs_epi8(__m256i __a) { return (__m256i)__builtin_ia32_pabsb256((__v32qi)__a); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_abs_epi16(__m256i __a) { return (__m256i)__builtin_ia32_pabsw256((__v16hi)__a); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_abs_epi32(__m256i __a) { return (__m256i)__builtin_ia32_pabsd256((__v8si)__a); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_packs_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_packs_epi32(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_packus_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_packus_epi32(__m256i __V1, __m256i __V2) { return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi8(__m256i __a, __m256i __b) { return (__m256i)((__v32qu)__a + (__v32qu)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi16(__m256i __a, __m256i __b) { return (__m256i)((__v16hu)__a + (__v16hu)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi32(__m256i __a, __m256i __b) { return (__m256i)((__v8su)__a + (__v8su)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi64(__m256i __a, __m256i __b) { return (__m256i)((__v4du)__a + (__v4du)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_adds_epi8(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_paddsb256((__v32qi)__a, (__v32qi)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_adds_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_paddsw256((__v16hi)__a, (__v16hi)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_adds_epu8(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_paddusb256((__v32qi)__a, (__v32qi)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_adds_epu16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_paddusw256((__v16hi)__a, (__v16hi)__b); } -#define _mm256_alignr_epi8(a, b, n) __extension__ ({ \ +#define _mm256_alignr_epi8(a, b, n) \ (__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \ - (__v32qi)(__m256i)(b), (n)); }) + (__v32qi)(__m256i)(b), (n)) -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_and_si256(__m256i __a, __m256i __b) { return (__m256i)((__v4du)__a & 
(__v4du)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_andnot_si256(__m256i __a, __m256i __b) { return (__m256i)(~(__v4du)__a & (__v4du)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_avg_epu8(__m256i __a, __m256i __b) { typedef unsigned short __v32hu __attribute__((__vector_size__(64))); @@ -152,7 +153,7 @@ _mm256_avg_epu8(__m256i __a, __m256i __b) >> 1, __v32qu); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_avg_epu16(__m256i __a, __m256i __b) { typedef unsigned int __v16su __attribute__((__vector_size__(64))); @@ -162,58 +163,42 @@ _mm256_avg_epu16(__m256i __a, __m256i __b) >> 1, __v16hu); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M) { return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2, (__v32qi)__M); } -#define _mm256_blend_epi16(V1, V2, M) __extension__ ({ \ - (__m256i)__builtin_shufflevector((__v16hi)(__m256i)(V1), \ - (__v16hi)(__m256i)(V2), \ - (((M) & 0x01) ? 16 : 0), \ - (((M) & 0x02) ? 17 : 1), \ - (((M) & 0x04) ? 18 : 2), \ - (((M) & 0x08) ? 19 : 3), \ - (((M) & 0x10) ? 20 : 4), \ - (((M) & 0x20) ? 21 : 5), \ - (((M) & 0x40) ? 22 : 6), \ - (((M) & 0x80) ? 23 : 7), \ - (((M) & 0x01) ? 24 : 8), \ - (((M) & 0x02) ? 25 : 9), \ - (((M) & 0x04) ? 26 : 10), \ - (((M) & 0x08) ? 27 : 11), \ - (((M) & 0x10) ? 28 : 12), \ - (((M) & 0x20) ? 29 : 13), \ - (((M) & 0x40) ? 30 : 14), \ - (((M) & 0x80) ? 31 : 15)); }) - -static __inline__ __m256i __DEFAULT_FN_ATTRS +#define _mm256_blend_epi16(V1, V2, M) \ + (__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \ + (__v16hi)(__m256i)(V2), (int)(M)) + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpeq_epi8(__m256i __a, __m256i __b) { return (__m256i)((__v32qi)__a == (__v32qi)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpeq_epi16(__m256i __a, __m256i __b) { return (__m256i)((__v16hi)__a == (__v16hi)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpeq_epi32(__m256i __a, __m256i __b) { return (__m256i)((__v8si)__a == (__v8si)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpeq_epi64(__m256i __a, __m256i __b) { return (__m256i)((__v4di)__a == (__v4di)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpgt_epi8(__m256i __a, __m256i __b) { /* This function always performs a signed comparison, but __v32qi is a char @@ -221,151 +206,151 @@ _mm256_cmpgt_epi8(__m256i __a, __m256i __b) return (__m256i)((__v32qs)__a > (__v32qs)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpgt_epi16(__m256i __a, __m256i __b) { return (__m256i)((__v16hi)__a > (__v16hi)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpgt_epi32(__m256i __a, __m256i __b) { return (__m256i)((__v8si)__a > (__v8si)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmpgt_epi64(__m256i __a, __m256i __b) { return (__m256i)((__v4di)__a > (__v4di)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i 
__DEFAULT_FN_ATTRS256 _mm256_hadd_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hadd_epi32(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hadds_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hsub_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hsub_epi32(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hsubs_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maddubs_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_madd_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epi8(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_pmaxsb256((__v32qi)__a, (__v32qi)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_pmaxsw256((__v16hi)__a, (__v16hi)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epi32(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_pmaxsd256((__v8si)__a, (__v8si)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epu8(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_pmaxub256((__v32qi)__a, (__v32qi)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epu16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_pmaxuw256((__v16hi)__a, (__v16hi)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epu32(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_pmaxud256((__v8si)__a, (__v8si)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epi8(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_pminsb256((__v32qi)__a, (__v32qi)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_pminsw256((__v16hi)__a, (__v16hi)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epi32(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_pminsd256((__v8si)__a, (__v8si)__b); } -static 
__inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epu8(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_pminub256((__v32qi)__a, (__v32qi)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epu16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__a, (__v16hi)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epu32(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_pminud256((__v8si)__a, (__v8si)__b); } -static __inline__ int __DEFAULT_FN_ATTRS +static __inline__ int __DEFAULT_FN_ATTRS256 _mm256_movemask_epi8(__m256i __a) { return __builtin_ia32_pmovmskb256((__v32qi)__a); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepi8_epi16(__m128i __V) { /* This function always performs a signed extension, but __v16qi is a char @@ -373,7 +358,7 @@ _mm256_cvtepi8_epi16(__m128i __V) return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepi8_epi32(__m128i __V) { /* This function always performs a signed extension, but __v16qi is a char @@ -381,7 +366,7 @@ _mm256_cvtepi8_epi32(__m128i __V) return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepi8_epi64(__m128i __V) { /* This function always performs a signed extension, but __v16qi is a char @@ -389,920 +374,795 @@ _mm256_cvtepi8_epi64(__m128i __V) return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepi16_epi32(__m128i __V) { return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepi16_epi64(__m128i __V) { return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepi32_epi64(__m128i __V) { return (__m256i)__builtin_convertvector((__v4si)__V, __v4di); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepu8_epi16(__m128i __V) { return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepu8_epi32(__m128i __V) { return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepu8_epi64(__m128i __V) { return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepu16_epi32(__m128i __V) { return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 
_mm256_cvtepu16_epi64(__m128i __V) { return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtepu32_epi64(__m128i __V) { return (__m256i)__builtin_convertvector((__v4su)__V, __v4di); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mul_epi32(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mulhrs_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mulhi_epu16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mulhi_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mullo_epi16(__m256i __a, __m256i __b) { return (__m256i)((__v16hu)__a * (__v16hu)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mullo_epi32 (__m256i __a, __m256i __b) { return (__m256i)((__v8su)__a * (__v8su)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mul_epu32(__m256i __a, __m256i __b) { return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_or_si256(__m256i __a, __m256i __b) { return (__m256i)((__v4du)__a | (__v4du)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sad_epu8(__m256i __a, __m256i __b) { return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_shuffle_epi8(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b); } -#define _mm256_shuffle_epi32(a, imm) __extension__ ({ \ - (__m256i)__builtin_shufflevector((__v8si)(__m256i)(a), \ - (__v8si)_mm256_undefined_si256(), \ - 0 + (((imm) >> 0) & 0x3), \ - 0 + (((imm) >> 2) & 0x3), \ - 0 + (((imm) >> 4) & 0x3), \ - 0 + (((imm) >> 6) & 0x3), \ - 4 + (((imm) >> 0) & 0x3), \ - 4 + (((imm) >> 2) & 0x3), \ - 4 + (((imm) >> 4) & 0x3), \ - 4 + (((imm) >> 6) & 0x3)); }) - -#define _mm256_shufflehi_epi16(a, imm) __extension__ ({ \ - (__m256i)__builtin_shufflevector((__v16hi)(__m256i)(a), \ - (__v16hi)_mm256_undefined_si256(), \ - 0, 1, 2, 3, \ - 4 + (((imm) >> 0) & 0x3), \ - 4 + (((imm) >> 2) & 0x3), \ - 4 + (((imm) >> 4) & 0x3), \ - 4 + (((imm) >> 6) & 0x3), \ - 8, 9, 10, 11, \ - 12 + (((imm) >> 0) & 0x3), \ - 12 + (((imm) >> 2) & 0x3), \ - 12 + (((imm) >> 4) & 0x3), \ - 12 + (((imm) >> 6) & 0x3)); }) - -#define _mm256_shufflelo_epi16(a, imm) __extension__ ({ \ - (__m256i)__builtin_shufflevector((__v16hi)(__m256i)(a), \ - (__v16hi)_mm256_undefined_si256(), \ - 0 + (((imm) >> 0) & 0x3), \ - 0 + (((imm) >> 2) & 0x3), \ - 0 + (((imm) >> 4) & 0x3), \ - 0 + (((imm) >> 6) & 0x3), \ - 4, 5, 6, 7, \ - 8 + (((imm) >> 0) & 0x3), \ - 8 
+ (((imm) >> 2) & 0x3), \ - 8 + (((imm) >> 4) & 0x3), \ - 8 + (((imm) >> 6) & 0x3), \ - 12, 13, 14, 15); }) - -static __inline__ __m256i __DEFAULT_FN_ATTRS +#define _mm256_shuffle_epi32(a, imm) \ + (__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm)) + +#define _mm256_shufflehi_epi16(a, imm) \ + (__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm)) + +#define _mm256_shufflelo_epi16(a, imm) \ + (__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm)) + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi8(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi32(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b); } -#define _mm256_slli_si256(a, imm) __extension__ ({ \ - (__m256i)__builtin_shufflevector( \ - (__v32qi)_mm256_setzero_si256(), \ - (__v32qi)(__m256i)(a), \ - ((char)(imm)&0xF0) ? 0 : ((char)(imm)>0x0 ? 16 : 32) - (char)(imm), \ - ((char)(imm)&0xF0) ? 1 : ((char)(imm)>0x1 ? 17 : 33) - (char)(imm), \ - ((char)(imm)&0xF0) ? 2 : ((char)(imm)>0x2 ? 18 : 34) - (char)(imm), \ - ((char)(imm)&0xF0) ? 3 : ((char)(imm)>0x3 ? 19 : 35) - (char)(imm), \ - ((char)(imm)&0xF0) ? 4 : ((char)(imm)>0x4 ? 20 : 36) - (char)(imm), \ - ((char)(imm)&0xF0) ? 5 : ((char)(imm)>0x5 ? 21 : 37) - (char)(imm), \ - ((char)(imm)&0xF0) ? 6 : ((char)(imm)>0x6 ? 22 : 38) - (char)(imm), \ - ((char)(imm)&0xF0) ? 7 : ((char)(imm)>0x7 ? 23 : 39) - (char)(imm), \ - ((char)(imm)&0xF0) ? 8 : ((char)(imm)>0x8 ? 24 : 40) - (char)(imm), \ - ((char)(imm)&0xF0) ? 9 : ((char)(imm)>0x9 ? 25 : 41) - (char)(imm), \ - ((char)(imm)&0xF0) ? 10 : ((char)(imm)>0xA ? 26 : 42) - (char)(imm), \ - ((char)(imm)&0xF0) ? 11 : ((char)(imm)>0xB ? 27 : 43) - (char)(imm), \ - ((char)(imm)&0xF0) ? 12 : ((char)(imm)>0xC ? 28 : 44) - (char)(imm), \ - ((char)(imm)&0xF0) ? 13 : ((char)(imm)>0xD ? 29 : 45) - (char)(imm), \ - ((char)(imm)&0xF0) ? 14 : ((char)(imm)>0xE ? 30 : 46) - (char)(imm), \ - ((char)(imm)&0xF0) ? 15 : ((char)(imm)>0xF ? 31 : 47) - (char)(imm), \ - ((char)(imm)&0xF0) ? 16 : ((char)(imm)>0x0 ? 32 : 48) - (char)(imm), \ - ((char)(imm)&0xF0) ? 17 : ((char)(imm)>0x1 ? 33 : 49) - (char)(imm), \ - ((char)(imm)&0xF0) ? 18 : ((char)(imm)>0x2 ? 34 : 50) - (char)(imm), \ - ((char)(imm)&0xF0) ? 19 : ((char)(imm)>0x3 ? 35 : 51) - (char)(imm), \ - ((char)(imm)&0xF0) ? 20 : ((char)(imm)>0x4 ? 36 : 52) - (char)(imm), \ - ((char)(imm)&0xF0) ? 21 : ((char)(imm)>0x5 ? 37 : 53) - (char)(imm), \ - ((char)(imm)&0xF0) ? 22 : ((char)(imm)>0x6 ? 38 : 54) - (char)(imm), \ - ((char)(imm)&0xF0) ? 23 : ((char)(imm)>0x7 ? 39 : 55) - (char)(imm), \ - ((char)(imm)&0xF0) ? 24 : ((char)(imm)>0x8 ? 40 : 56) - (char)(imm), \ - ((char)(imm)&0xF0) ? 25 : ((char)(imm)>0x9 ? 41 : 57) - (char)(imm), \ - ((char)(imm)&0xF0) ? 26 : ((char)(imm)>0xA ? 42 : 58) - (char)(imm), \ - ((char)(imm)&0xF0) ? 27 : ((char)(imm)>0xB ? 43 : 59) - (char)(imm), \ - ((char)(imm)&0xF0) ? 28 : ((char)(imm)>0xC ? 44 : 60) - (char)(imm), \ - ((char)(imm)&0xF0) ? 29 : ((char)(imm)>0xD ? 45 : 61) - (char)(imm), \ - ((char)(imm)&0xF0) ? 30 : ((char)(imm)>0xE ? 46 : 62) - (char)(imm), \ - ((char)(imm)&0xF0) ? 31 : ((char)(imm)>0xF ? 
47 : 63) - (char)(imm)); }) - -#define _mm256_bslli_epi128(a, count) _mm256_slli_si256((a), (count)) - -static __inline__ __m256i __DEFAULT_FN_ATTRS +#define _mm256_slli_si256(a, imm) \ + (__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)) + +#define _mm256_bslli_epi128(a, imm) \ + (__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)) + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_slli_epi16(__m256i __a, int __count) { return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sll_epi16(__m256i __a, __m128i __count) { return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_slli_epi32(__m256i __a, int __count) { return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sll_epi32(__m256i __a, __m128i __count) { return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_slli_epi64(__m256i __a, int __count) { return __builtin_ia32_psllqi256((__v4di)__a, __count); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sll_epi64(__m256i __a, __m128i __count) { return __builtin_ia32_psllq256((__v4di)__a, __count); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srai_epi16(__m256i __a, int __count) { return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sra_epi16(__m256i __a, __m128i __count) { return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srai_epi32(__m256i __a, int __count) { return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sra_epi32(__m256i __a, __m128i __count) { return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count); } -#define _mm256_srli_si256(a, imm) __extension__ ({ \ - (__m256i)__builtin_shufflevector( \ - (__v32qi)(__m256i)(a), \ - (__v32qi)_mm256_setzero_si256(), \ - ((char)(imm)&0xF0) ? 32 : (char)(imm) + ((char)(imm)>0xF ? 16 : 0), \ - ((char)(imm)&0xF0) ? 33 : (char)(imm) + ((char)(imm)>0xE ? 17 : 1), \ - ((char)(imm)&0xF0) ? 34 : (char)(imm) + ((char)(imm)>0xD ? 18 : 2), \ - ((char)(imm)&0xF0) ? 35 : (char)(imm) + ((char)(imm)>0xC ? 19 : 3), \ - ((char)(imm)&0xF0) ? 36 : (char)(imm) + ((char)(imm)>0xB ? 20 : 4), \ - ((char)(imm)&0xF0) ? 37 : (char)(imm) + ((char)(imm)>0xA ? 21 : 5), \ - ((char)(imm)&0xF0) ? 38 : (char)(imm) + ((char)(imm)>0x9 ? 22 : 6), \ - ((char)(imm)&0xF0) ? 39 : (char)(imm) + ((char)(imm)>0x8 ? 23 : 7), \ - ((char)(imm)&0xF0) ? 40 : (char)(imm) + ((char)(imm)>0x7 ? 24 : 8), \ - ((char)(imm)&0xF0) ? 41 : (char)(imm) + ((char)(imm)>0x6 ? 25 : 9), \ - ((char)(imm)&0xF0) ? 42 : (char)(imm) + ((char)(imm)>0x5 ? 26 : 10), \ - ((char)(imm)&0xF0) ? 43 : (char)(imm) + ((char)(imm)>0x4 ? 27 : 11), \ - ((char)(imm)&0xF0) ? 44 : (char)(imm) + ((char)(imm)>0x3 ? 
28 : 12), \ - ((char)(imm)&0xF0) ? 45 : (char)(imm) + ((char)(imm)>0x2 ? 29 : 13), \ - ((char)(imm)&0xF0) ? 46 : (char)(imm) + ((char)(imm)>0x1 ? 30 : 14), \ - ((char)(imm)&0xF0) ? 47 : (char)(imm) + ((char)(imm)>0x0 ? 31 : 15), \ - ((char)(imm)&0xF0) ? 48 : (char)(imm) + ((char)(imm)>0xF ? 32 : 16), \ - ((char)(imm)&0xF0) ? 49 : (char)(imm) + ((char)(imm)>0xE ? 33 : 17), \ - ((char)(imm)&0xF0) ? 50 : (char)(imm) + ((char)(imm)>0xD ? 34 : 18), \ - ((char)(imm)&0xF0) ? 51 : (char)(imm) + ((char)(imm)>0xC ? 35 : 19), \ - ((char)(imm)&0xF0) ? 52 : (char)(imm) + ((char)(imm)>0xB ? 36 : 20), \ - ((char)(imm)&0xF0) ? 53 : (char)(imm) + ((char)(imm)>0xA ? 37 : 21), \ - ((char)(imm)&0xF0) ? 54 : (char)(imm) + ((char)(imm)>0x9 ? 38 : 22), \ - ((char)(imm)&0xF0) ? 55 : (char)(imm) + ((char)(imm)>0x8 ? 39 : 23), \ - ((char)(imm)&0xF0) ? 56 : (char)(imm) + ((char)(imm)>0x7 ? 40 : 24), \ - ((char)(imm)&0xF0) ? 57 : (char)(imm) + ((char)(imm)>0x6 ? 41 : 25), \ - ((char)(imm)&0xF0) ? 58 : (char)(imm) + ((char)(imm)>0x5 ? 42 : 26), \ - ((char)(imm)&0xF0) ? 59 : (char)(imm) + ((char)(imm)>0x4 ? 43 : 27), \ - ((char)(imm)&0xF0) ? 60 : (char)(imm) + ((char)(imm)>0x3 ? 44 : 28), \ - ((char)(imm)&0xF0) ? 61 : (char)(imm) + ((char)(imm)>0x2 ? 45 : 29), \ - ((char)(imm)&0xF0) ? 62 : (char)(imm) + ((char)(imm)>0x1 ? 46 : 30), \ - ((char)(imm)&0xF0) ? 63 : (char)(imm) + ((char)(imm)>0x0 ? 47 : 31)); }) - -#define _mm256_bsrli_epi128(a, count) _mm256_srli_si256((a), (count)) - -static __inline__ __m256i __DEFAULT_FN_ATTRS +#define _mm256_srli_si256(a, imm) \ + (__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)) + +#define _mm256_bsrli_epi128(a, imm) \ + (__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)) + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srli_epi16(__m256i __a, int __count) { return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srl_epi16(__m256i __a, __m128i __count) { return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srli_epi32(__m256i __a, int __count) { return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srl_epi32(__m256i __a, __m128i __count) { return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srli_epi64(__m256i __a, int __count) { return __builtin_ia32_psrlqi256((__v4di)__a, __count); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srl_epi64(__m256i __a, __m128i __count) { return __builtin_ia32_psrlq256((__v4di)__a, __count); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi8(__m256i __a, __m256i __b) { return (__m256i)((__v32qu)__a - (__v32qu)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi16(__m256i __a, __m256i __b) { return (__m256i)((__v16hu)__a - (__v16hu)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi32(__m256i __a, __m256i __b) { return (__m256i)((__v8su)__a - (__v8su)__b); } -static __inline__ __m256i 
__DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi64(__m256i __a, __m256i __b) { return (__m256i)((__v4du)__a - (__v4du)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_subs_epi8(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_psubsb256((__v32qi)__a, (__v32qi)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_subs_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_psubsw256((__v16hi)__a, (__v16hi)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_subs_epu8(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_psubusb256((__v32qi)__a, (__v32qi)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_subs_epu16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_psubusw256((__v16hi)__a, (__v16hi)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpackhi_epi8(__m256i __a, __m256i __b) { return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpackhi_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpackhi_epi32(__m256i __a, __m256i __b) { return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpackhi_epi64(__m256i __a, __m256i __b) { return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpacklo_epi8(__m256i __a, __m256i __b) { return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpacklo_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpacklo_epi32(__m256i __a, __m256i __b) { return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_unpacklo_epi64(__m256i __a, __m256i __b) { return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_xor_si256(__m256i __a, __m256i __b) { return (__m256i)((__v4du)__a ^ (__v4du)__b); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static 
__inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_stream_load_si256(__m256i const *__V) { typedef __v4di __v4di_aligned __attribute__((aligned(32))); return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_broadcastss_ps(__m128 __X) { return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_broadcastsd_pd(__m128d __a) { return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_broadcastss_ps(__m128 __X) { return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_broadcastsd_pd(__m128d __X) { return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_broadcastsi128_si256(__m128i __X) { return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1); } -#define _mm_blend_epi32(V1, V2, M) __extension__ ({ \ - (__m128i)__builtin_shufflevector((__v4si)(__m128i)(V1), \ - (__v4si)(__m128i)(V2), \ - (((M) & 0x01) ? 4 : 0), \ - (((M) & 0x02) ? 5 : 1), \ - (((M) & 0x04) ? 6 : 2), \ - (((M) & 0x08) ? 7 : 3)); }) - -#define _mm256_blend_epi32(V1, V2, M) __extension__ ({ \ - (__m256i)__builtin_shufflevector((__v8si)(__m256i)(V1), \ - (__v8si)(__m256i)(V2), \ - (((M) & 0x01) ? 8 : 0), \ - (((M) & 0x02) ? 9 : 1), \ - (((M) & 0x04) ? 10 : 2), \ - (((M) & 0x08) ? 11 : 3), \ - (((M) & 0x10) ? 12 : 4), \ - (((M) & 0x20) ? 13 : 5), \ - (((M) & 0x40) ? 14 : 6), \ - (((M) & 0x80) ? 
15 : 7)); }) - -static __inline__ __m256i __DEFAULT_FN_ATTRS +#define _mm_blend_epi32(V1, V2, M) \ + (__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \ + (__v4si)(__m128i)(V2), (int)(M)) + +#define _mm256_blend_epi32(V1, V2, M) \ + (__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \ + (__v8si)(__m256i)(V2), (int)(M)) + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_broadcastb_epi8(__m128i __X) { return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_broadcastw_epi16(__m128i __X) { return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_broadcastd_epi32(__m128i __X) { return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_broadcastq_epi64(__m128i __X) { return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_broadcastb_epi8(__m128i __X) { return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_broadcastw_epi16(__m128i __X) { return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_broadcastd_epi32(__m128i __X) { return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_broadcastq_epi64(__m128i __X) { return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b); } -#define _mm256_permute4x64_pd(V, M) __extension__ ({ \ - (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V), \ - (__v4df)_mm256_undefined_pd(), \ - ((M) >> 0) & 0x3, \ - ((M) >> 2) & 0x3, \ - ((M) >> 4) & 0x3, \ - ((M) >> 6) & 0x3); }) +#define _mm256_permute4x64_pd(V, M) \ + (__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M)) -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_permutevar8x32_ps(__m256 __a, __m256i __b) { return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b); } -#define _mm256_permute4x64_epi64(V, M) __extension__ ({ \ - (__m256i)__builtin_shufflevector((__v4di)(__m256i)(V), \ - (__v4di)_mm256_undefined_si256(), \ - ((M) >> 0) & 0x3, \ - ((M) >> 2) & 0x3, \ - ((M) >> 4) & 0x3, \ - ((M) >> 6) & 0x3); }) - -#define _mm256_permute2x128_si256(V1, V2, M) __extension__ ({ \ - (__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (M)); }) - -#define _mm256_extracti128_si256(V, M) __extension__ ({ \ - (__m128i)__builtin_shufflevector((__v4di)(__m256i)(V), \ - (__v4di)_mm256_undefined_si256(), \ - (((M) & 1) ? 2 : 0), \ - (((M) & 1) ? 
3 : 1) ); }) - -#define _mm256_inserti128_si256(V1, V2, M) __extension__ ({ \ - (__m256i)__builtin_shufflevector((__v4di)(__m256i)(V1), \ - (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \ - (((M) & 1) ? 0 : 4), \ - (((M) & 1) ? 1 : 5), \ - (((M) & 1) ? 4 : 2), \ - (((M) & 1) ? 5 : 3) ); }) - -static __inline__ __m256i __DEFAULT_FN_ATTRS +#define _mm256_permute4x64_epi64(V, M) \ + (__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M)) + +#define _mm256_permute2x128_si256(V1, V2, M) \ + (__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M)) + +#define _mm256_extracti128_si256(V, M) \ + (__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M)) + +#define _mm256_inserti128_si256(V1, V2, M) \ + (__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \ + (__v2di)(__m128i)(V2), (int)(M)) + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskload_epi32(int const *__X, __m256i __M) { return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskload_epi64(long long const *__X, __m256i __M) { return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskload_epi32(int const *__X, __m128i __M) { return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskload_epi64(long long const *__X, __m128i __M) { return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y) { __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y) { __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y) { __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y) { __builtin_ia32_maskstoreq(( __v2di *)__X, (__v2di)__M, (__v2di)__Y); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sllv_epi32(__m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_sllv_epi32(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sllv_epi64(__m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_sllv_epi64(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 
_mm256_srav_epi32(__m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_srav_epi32(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srlv_epi32(__m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_srlv_epi32(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srlv_epi64(__m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_srlv_epi64(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y); } -#define _mm_mask_i32gather_pd(a, m, i, mask, s) __extension__ ({ \ +#define _mm_mask_i32gather_pd(a, m, i, mask, s) \ (__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128i)(a), \ (double const *)(m), \ (__v4si)(__m128i)(i), \ - (__v2df)(__m128d)(mask), (s)); }) + (__v2df)(__m128d)(mask), (s)) -#define _mm256_mask_i32gather_pd(a, m, i, mask, s) __extension__ ({ \ +#define _mm256_mask_i32gather_pd(a, m, i, mask, s) \ (__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \ (double const *)(m), \ (__v4si)(__m128i)(i), \ - (__v4df)(__m256d)(mask), (s)); }) + (__v4df)(__m256d)(mask), (s)) -#define _mm_mask_i64gather_pd(a, m, i, mask, s) __extension__ ({ \ +#define _mm_mask_i64gather_pd(a, m, i, mask, s) \ (__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \ (double const *)(m), \ (__v2di)(__m128i)(i), \ - (__v2df)(__m128d)(mask), (s)); }) + (__v2df)(__m128d)(mask), (s)) -#define _mm256_mask_i64gather_pd(a, m, i, mask, s) __extension__ ({ \ +#define _mm256_mask_i64gather_pd(a, m, i, mask, s) \ (__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \ (double const *)(m), \ (__v4di)(__m256i)(i), \ - (__v4df)(__m256d)(mask), (s)); }) + (__v4df)(__m256d)(mask), (s)) -#define _mm_mask_i32gather_ps(a, m, i, mask, s) __extension__ ({ \ +#define _mm_mask_i32gather_ps(a, m, i, mask, s) \ (__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \ (float const *)(m), \ (__v4si)(__m128i)(i), \ - (__v4sf)(__m128)(mask), (s)); }) + (__v4sf)(__m128)(mask), (s)) -#define _mm256_mask_i32gather_ps(a, m, i, mask, s) __extension__ ({ \ +#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \ (__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \ (float const *)(m), \ (__v8si)(__m256i)(i), \ - (__v8sf)(__m256)(mask), (s)); }) + (__v8sf)(__m256)(mask), (s)) -#define _mm_mask_i64gather_ps(a, m, i, mask, s) __extension__ ({ \ +#define _mm_mask_i64gather_ps(a, m, i, mask, s) \ (__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \ (float const *)(m), \ (__v2di)(__m128i)(i), \ - (__v4sf)(__m128)(mask), (s)); }) + (__v4sf)(__m128)(mask), (s)) -#define _mm256_mask_i64gather_ps(a, m, i, mask, s) __extension__ ({ \ +#define _mm256_mask_i64gather_ps(a, m, i, mask, s) \ (__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \ (float const *)(m), \ (__v4di)(__m256i)(i), \ - (__v4sf)(__m128)(mask), (s)); }) + (__v4sf)(__m128)(mask), (s)) -#define _mm_mask_i32gather_epi32(a, 
m, i, mask, s) __extension__ ({ \ +#define _mm_mask_i32gather_epi32(a, m, i, mask, s) \ (__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \ (int const *)(m), \ (__v4si)(__m128i)(i), \ - (__v4si)(__m128i)(mask), (s)); }) + (__v4si)(__m128i)(mask), (s)) -#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) __extension__ ({ \ +#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \ (__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \ (int const *)(m), \ (__v8si)(__m256i)(i), \ - (__v8si)(__m256i)(mask), (s)); }) + (__v8si)(__m256i)(mask), (s)) -#define _mm_mask_i64gather_epi32(a, m, i, mask, s) __extension__ ({ \ +#define _mm_mask_i64gather_epi32(a, m, i, mask, s) \ (__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \ (int const *)(m), \ (__v2di)(__m128i)(i), \ - (__v4si)(__m128i)(mask), (s)); }) + (__v4si)(__m128i)(mask), (s)) -#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) __extension__ ({ \ +#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \ (__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \ (int const *)(m), \ (__v4di)(__m256i)(i), \ - (__v4si)(__m128i)(mask), (s)); }) + (__v4si)(__m128i)(mask), (s)) -#define _mm_mask_i32gather_epi64(a, m, i, mask, s) __extension__ ({ \ +#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \ (__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \ (long long const *)(m), \ (__v4si)(__m128i)(i), \ - (__v2di)(__m128i)(mask), (s)); }) + (__v2di)(__m128i)(mask), (s)) -#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) __extension__ ({ \ +#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \ (__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \ (long long const *)(m), \ (__v4si)(__m128i)(i), \ - (__v4di)(__m256i)(mask), (s)); }) + (__v4di)(__m256i)(mask), (s)) -#define _mm_mask_i64gather_epi64(a, m, i, mask, s) __extension__ ({ \ +#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \ (__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \ (long long const *)(m), \ (__v2di)(__m128i)(i), \ - (__v2di)(__m128i)(mask), (s)); }) + (__v2di)(__m128i)(mask), (s)) -#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) __extension__ ({ \ +#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \ (__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \ (long long const *)(m), \ (__v4di)(__m256i)(i), \ - (__v4di)(__m256i)(mask), (s)); }) + (__v4di)(__m256i)(mask), (s)) -#define _mm_i32gather_pd(m, i, s) __extension__ ({ \ +#define _mm_i32gather_pd(m, i, s) \ (__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \ (double const *)(m), \ (__v4si)(__m128i)(i), \ (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \ _mm_setzero_pd()), \ - (s)); }) + (s)) -#define _mm256_i32gather_pd(m, i, s) __extension__ ({ \ +#define _mm256_i32gather_pd(m, i, s) \ (__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \ (double const *)(m), \ (__v4si)(__m128i)(i), \ (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \ _mm256_setzero_pd(), \ _CMP_EQ_OQ), \ - (s)); }) + (s)) -#define _mm_i64gather_pd(m, i, s) __extension__ ({ \ +#define _mm_i64gather_pd(m, i, s) \ (__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \ (double const *)(m), \ (__v2di)(__m128i)(i), \ (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \ _mm_setzero_pd()), \ - (s)); }) + (s)) -#define _mm256_i64gather_pd(m, i, s) __extension__ ({ \ +#define _mm256_i64gather_pd(m, i, s) \ (__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \ (double const *)(m), \ (__v4di)(__m256i)(i), \ (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \ _mm256_setzero_pd(), \ 
_CMP_EQ_OQ), \ - (s)); }) + (s)) -#define _mm_i32gather_ps(m, i, s) __extension__ ({ \ +#define _mm_i32gather_ps(m, i, s) \ (__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \ (float const *)(m), \ (__v4si)(__m128i)(i), \ (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \ _mm_setzero_ps()), \ - (s)); }) + (s)) -#define _mm256_i32gather_ps(m, i, s) __extension__ ({ \ +#define _mm256_i32gather_ps(m, i, s) \ (__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \ (float const *)(m), \ (__v8si)(__m256i)(i), \ (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \ _mm256_setzero_ps(), \ _CMP_EQ_OQ), \ - (s)); }) + (s)) -#define _mm_i64gather_ps(m, i, s) __extension__ ({ \ +#define _mm_i64gather_ps(m, i, s) \ (__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \ (float const *)(m), \ (__v2di)(__m128i)(i), \ (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \ _mm_setzero_ps()), \ - (s)); }) + (s)) -#define _mm256_i64gather_ps(m, i, s) __extension__ ({ \ +#define _mm256_i64gather_ps(m, i, s) \ (__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \ (float const *)(m), \ (__v4di)(__m256i)(i), \ (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \ _mm_setzero_ps()), \ - (s)); }) + (s)) -#define _mm_i32gather_epi32(m, i, s) __extension__ ({ \ +#define _mm_i32gather_epi32(m, i, s) \ (__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \ (int const *)(m), (__v4si)(__m128i)(i), \ - (__v4si)_mm_set1_epi32(-1), (s)); }) + (__v4si)_mm_set1_epi32(-1), (s)) -#define _mm256_i32gather_epi32(m, i, s) __extension__ ({ \ +#define _mm256_i32gather_epi32(m, i, s) \ (__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \ (int const *)(m), (__v8si)(__m256i)(i), \ - (__v8si)_mm256_set1_epi32(-1), (s)); }) + (__v8si)_mm256_set1_epi32(-1), (s)) -#define _mm_i64gather_epi32(m, i, s) __extension__ ({ \ +#define _mm_i64gather_epi32(m, i, s) \ (__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \ (int const *)(m), (__v2di)(__m128i)(i), \ - (__v4si)_mm_set1_epi32(-1), (s)); }) + (__v4si)_mm_set1_epi32(-1), (s)) -#define _mm256_i64gather_epi32(m, i, s) __extension__ ({ \ +#define _mm256_i64gather_epi32(m, i, s) \ (__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \ (int const *)(m), (__v4di)(__m256i)(i), \ - (__v4si)_mm_set1_epi32(-1), (s)); }) + (__v4si)_mm_set1_epi32(-1), (s)) -#define _mm_i32gather_epi64(m, i, s) __extension__ ({ \ +#define _mm_i32gather_epi64(m, i, s) \ (__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \ (long long const *)(m), \ (__v4si)(__m128i)(i), \ - (__v2di)_mm_set1_epi64x(-1), (s)); }) + (__v2di)_mm_set1_epi64x(-1), (s)) -#define _mm256_i32gather_epi64(m, i, s) __extension__ ({ \ +#define _mm256_i32gather_epi64(m, i, s) \ (__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \ (long long const *)(m), \ (__v4si)(__m128i)(i), \ - (__v4di)_mm256_set1_epi64x(-1), (s)); }) + (__v4di)_mm256_set1_epi64x(-1), (s)) -#define _mm_i64gather_epi64(m, i, s) __extension__ ({ \ +#define _mm_i64gather_epi64(m, i, s) \ (__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \ (long long const *)(m), \ (__v2di)(__m128i)(i), \ - (__v2di)_mm_set1_epi64x(-1), (s)); }) + (__v2di)_mm_set1_epi64x(-1), (s)) -#define _mm256_i64gather_epi64(m, i, s) __extension__ ({ \ +#define _mm256_i64gather_epi64(m, i, s) \ (__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \ (long long const *)(m), \ (__v4di)(__m256i)(i), \ - (__v4di)_mm256_set1_epi64x(-1), (s)); }) + (__v4di)_mm256_set1_epi64x(-1), (s)) -#undef 
__DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS256 +#undef __DEFAULT_FN_ATTRS128 #endif /* __AVX2INTRIN_H */ diff --git a/c_headers/avx512bitalgintrin.h b/c_headers/avx512bitalgintrin.h index 2dd1471d2f..56046f8c49 100644 --- a/c_headers/avx512bitalgintrin.h +++ b/c_headers/avx512bitalgintrin.h @@ -29,7 +29,7 @@ #define __AVX512BITALGINTRIN_H /* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bitalg"))) +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bitalg"), __min_vector_width__(512))) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi16(__m512i __A) @@ -48,7 +48,7 @@ _mm512_mask_popcnt_epi16(__m512i __A, __mmask32 __U, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __B) { - return _mm512_mask_popcnt_epi16((__m512i) _mm512_setzero_hi(), + return _mm512_mask_popcnt_epi16((__m512i) _mm512_setzero_si512(), __U, __B); } @@ -70,7 +70,7 @@ _mm512_mask_popcnt_epi8(__m512i __A, __mmask64 __U, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_popcnt_epi8(__mmask64 __U, __m512i __B) { - return _mm512_mask_popcnt_epi8((__m512i) _mm512_setzero_qi(), + return _mm512_mask_popcnt_epi8((__m512i) _mm512_setzero_si512(), __U, __B); } diff --git a/c_headers/avx512bwintrin.h b/c_headers/avx512bwintrin.h index 3ff0e3aafd..fc46323749 100644 --- a/c_headers/avx512bwintrin.h +++ b/c_headers/avx512bwintrin.h @@ -32,69 +32,49 @@ typedef unsigned int __mmask32; typedef unsigned long long __mmask64; /* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bw"))) - -static __inline __m512i __DEFAULT_FN_ATTRS -_mm512_setzero_qi(void) { - return (__m512i)(__v64qi){ 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0 }; -} - -static __inline __m512i __DEFAULT_FN_ATTRS -_mm512_setzero_hi(void) { - return (__m512i)(__v32hi){ 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0 }; -} +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bw"), __min_vector_width__(512))) /* Integer compare */ -#define _mm512_cmp_epi8_mask(a, b, p) __extension__ ({ \ +#define _mm512_cmp_epi8_mask(a, b, p) \ (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \ (__v64qi)(__m512i)(b), (int)(p), \ - (__mmask64)-1); }) + (__mmask64)-1) -#define _mm512_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \ +#define _mm512_mask_cmp_epi8_mask(m, a, b, p) \ (__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \ (__v64qi)(__m512i)(b), (int)(p), \ - (__mmask64)(m)); }) + (__mmask64)(m)) -#define _mm512_cmp_epu8_mask(a, b, p) __extension__ ({ \ +#define _mm512_cmp_epu8_mask(a, b, p) \ (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \ (__v64qi)(__m512i)(b), (int)(p), \ - (__mmask64)-1); }) + (__mmask64)-1) -#define _mm512_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \ +#define _mm512_mask_cmp_epu8_mask(m, a, b, p) \ (__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \ (__v64qi)(__m512i)(b), (int)(p), \ - (__mmask64)(m)); }) + (__mmask64)(m)) -#define _mm512_cmp_epi16_mask(a, b, p) __extension__ ({ \ +#define 
_mm512_cmp_epi16_mask(a, b, p) \ (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \ (__v32hi)(__m512i)(b), (int)(p), \ - (__mmask32)-1); }) + (__mmask32)-1) -#define _mm512_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \ +#define _mm512_mask_cmp_epi16_mask(m, a, b, p) \ (__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \ (__v32hi)(__m512i)(b), (int)(p), \ - (__mmask32)(m)); }) + (__mmask32)(m)) -#define _mm512_cmp_epu16_mask(a, b, p) __extension__ ({ \ +#define _mm512_cmp_epu16_mask(a, b, p) \ (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \ (__v32hi)(__m512i)(b), (int)(p), \ - (__mmask32)-1); }) + (__mmask32)-1) -#define _mm512_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \ +#define _mm512_mask_cmp_epu16_mask(m, a, b, p) \ (__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \ (__v32hi)(__m512i)(b), (int)(p), \ - (__mmask32)(m)); }) + (__mmask32)(m)) #define _mm512_cmpeq_epi8_mask(A, B) \ _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ) @@ -212,7 +192,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_add_epi8(__mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, (__v64qi)_mm512_add_epi8(__A, __B), - (__v64qi)_mm512_setzero_qi()); + (__v64qi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -231,7 +211,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_sub_epi8(__mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, (__v64qi)_mm512_sub_epi8(__A, __B), - (__v64qi)_mm512_setzero_qi()); + (__v64qi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -250,7 +230,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_add_epi16(__mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_add_epi16(__A, __B), - (__v32hi)_mm512_setzero_hi()); + (__v32hi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -269,7 +249,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_sub_epi16(__mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_sub_epi16(__A, __B), - (__v32hi)_mm512_setzero_hi()); + (__v32hi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -288,7 +268,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_mullo_epi16(__mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_mullo_epi16(__A, __B), - (__v32hi)_mm512_setzero_hi()); + (__v32hi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -310,49 +290,45 @@ _mm512_mask_blend_epi16 (__mmask32 __U, __m512i __A, __m512i __W) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_abs_epi8 (__m512i __A) { - return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A, - (__v64qi) _mm512_setzero_qi(), - (__mmask64) -1); + return (__m512i)__builtin_ia32_pabsb512((__v64qi)__A); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_abs_epi8 (__m512i __W, __mmask64 __U, __m512i __A) { - return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A, - (__v64qi) __W, - (__mmask64) __U); + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_abs_epi8(__A), + (__v64qi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_abs_epi8 (__mmask64 __U, __m512i __A) { - return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A, - 
(__v64qi) _mm512_setzero_qi(), - (__mmask64) __U); + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_abs_epi8(__A), + (__v64qi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_abs_epi16 (__m512i __A) { - return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A, - (__v32hi) _mm512_setzero_hi(), - (__mmask32) -1); + return (__m512i)__builtin_ia32_pabsw512((__v32hi)__A); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_abs_epi16 (__m512i __W, __mmask32 __U, __m512i __A) { - return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A, - (__v32hi) __W, - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_abs_epi16(__A), + (__v32hi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_abs_epi16 (__mmask32 __U, __m512i __A) { - return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A, - (__v32hi) _mm512_setzero_hi(), - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_abs_epi16(__A), + (__v32hi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -366,7 +342,7 @@ _mm512_maskz_packs_epi32(__mmask32 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, (__v32hi)_mm512_packs_epi32(__A, __B), - (__v32hi)_mm512_setzero_hi()); + (__v32hi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -396,7 +372,7 @@ _mm512_maskz_packs_epi16(__mmask64 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, (__v64qi)_mm512_packs_epi16(__A, __B), - (__v64qi)_mm512_setzero_qi()); + (__v64qi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -410,7 +386,7 @@ _mm512_maskz_packus_epi32(__mmask32 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, (__v32hi)_mm512_packus_epi32(__A, __B), - (__v32hi)_mm512_setzero_hi()); + (__v32hi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -440,7 +416,7 @@ _mm512_maskz_packus_epi16(__mmask64 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, (__v64qi)_mm512_packus_epi16(__A, __B), - (__v64qi)_mm512_setzero_qi()); + (__v64qi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -448,7 +424,7 @@ _mm512_adds_epi8 (__m512i __A, __m512i __B) { return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A, (__v64qi) __B, - (__v64qi) _mm512_setzero_qi(), + (__v64qi) _mm512_setzero_si512(), (__mmask64) -1); } @@ -467,7 +443,7 @@ _mm512_maskz_adds_epi8 (__mmask64 __U, __m512i __A, __m512i __B) { return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A, (__v64qi) __B, - (__v64qi) _mm512_setzero_qi(), + (__v64qi) _mm512_setzero_si512(), (__mmask64) __U); } @@ -476,7 +452,7 @@ _mm512_adds_epi16 (__m512i __A, __m512i __B) { return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A, (__v32hi) __B, - (__v32hi) _mm512_setzero_hi(), + (__v32hi) _mm512_setzero_si512(), (__mmask32) -1); } @@ -495,7 +471,7 @@ _mm512_maskz_adds_epi16 (__mmask32 __U, __m512i __A, __m512i __B) { return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A, (__v32hi) __B, - (__v32hi) _mm512_setzero_hi(), + (__v32hi) _mm512_setzero_si512(), (__mmask32) __U); } @@ -504,7 +480,7 @@ _mm512_adds_epu8 (__m512i __A, __m512i __B) { return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A, (__v64qi) __B, - (__v64qi) _mm512_setzero_qi(), + (__v64qi) 
_mm512_setzero_si512(), (__mmask64) -1); } @@ -523,7 +499,7 @@ _mm512_maskz_adds_epu8 (__mmask64 __U, __m512i __A, __m512i __B) { return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A, (__v64qi) __B, - (__v64qi) _mm512_setzero_qi(), + (__v64qi) _mm512_setzero_si512(), (__mmask64) __U); } @@ -532,7 +508,7 @@ _mm512_adds_epu16 (__m512i __A, __m512i __B) { return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A, (__v32hi) __B, - (__v32hi) _mm512_setzero_hi(), + (__v32hi) _mm512_setzero_si512(), (__mmask32) -1); } @@ -551,7 +527,7 @@ _mm512_maskz_adds_epu16 (__mmask32 __U, __m512i __A, __m512i __B) { return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A, (__v32hi) __B, - (__v32hi) _mm512_setzero_hi(), + (__v32hi) _mm512_setzero_si512(), (__mmask32) __U); } @@ -579,7 +555,7 @@ _mm512_maskz_avg_epu8 (__mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, (__v64qi)_mm512_avg_epu8(__A, __B), - (__v64qi)_mm512_setzero_qi()); + (__v64qi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -606,231 +582,184 @@ _mm512_maskz_avg_epu16 (__mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_avg_epu16(__A, __B), - (__v32hi) _mm512_setzero_hi()); + (__v32hi) _mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_max_epi8 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) _mm512_setzero_qi(), - (__mmask64) -1); + return (__m512i)__builtin_ia32_pmaxsb512((__v64qi) __A, (__v64qi) __B); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_max_epi8 (__mmask64 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) _mm512_setzero_qi(), - (__mmask64) __M); + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, + (__v64qi)_mm512_max_epi8(__A, __B), + (__v64qi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_max_epi8 (__m512i __W, __mmask64 __M, __m512i __A, - __m512i __B) +_mm512_mask_max_epi8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) __W, - (__mmask64) __M); + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, + (__v64qi)_mm512_max_epi8(__A, __B), + (__v64qi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_max_epi16 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) _mm512_setzero_hi(), - (__mmask32) -1); + return (__m512i)__builtin_ia32_pmaxsw512((__v32hi) __A, (__v32hi) __B); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_max_epi16 (__mmask32 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) _mm512_setzero_hi(), - (__mmask32) __M); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, + (__v32hi)_mm512_max_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_max_epi16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) __W, - (__mmask32) __M); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, + (__v32hi)_mm512_max_epi16(__A, __B), + (__v32hi)__W); } static __inline__ __m512i 
__DEFAULT_FN_ATTRS _mm512_max_epu8 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) _mm512_setzero_qi(), - (__mmask64) -1); + return (__m512i)__builtin_ia32_pmaxub512((__v64qi)__A, (__v64qi)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_max_epu8 (__mmask64 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) _mm512_setzero_qi(), - (__mmask64) __M); + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, + (__v64qi)_mm512_max_epu8(__A, __B), + (__v64qi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_max_epu8 (__m512i __W, __mmask64 __M, __m512i __A, - __m512i __B) +_mm512_mask_max_epu8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) __W, - (__mmask64) __M); + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, + (__v64qi)_mm512_max_epu8(__A, __B), + (__v64qi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_max_epu16 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) _mm512_setzero_hi(), - (__mmask32) -1); + return (__m512i)__builtin_ia32_pmaxuw512((__v32hi)__A, (__v32hi)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_max_epu16 (__mmask32 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) _mm512_setzero_hi(), - (__mmask32) __M); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, + (__v32hi)_mm512_max_epu16(__A, __B), + (__v32hi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_max_epu16 (__m512i __W, __mmask32 __M, __m512i __A, - __m512i __B) +_mm512_mask_max_epu16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) __W, - (__mmask32) __M); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, + (__v32hi)_mm512_max_epu16(__A, __B), + (__v32hi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_min_epi8 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) _mm512_setzero_qi(), - (__mmask64) -1); + return (__m512i)__builtin_ia32_pminsb512((__v64qi) __A, (__v64qi) __B); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_min_epi8 (__mmask64 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) _mm512_setzero_qi(), - (__mmask64) __M); + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, + (__v64qi)_mm512_min_epi8(__A, __B), + (__v64qi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_min_epi8 (__m512i __W, __mmask64 __M, __m512i __A, - __m512i __B) +_mm512_mask_min_epi8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) __W, - (__mmask64) __M); + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, + (__v64qi)_mm512_min_epi8(__A, __B), + (__v64qi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_min_epi16 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) _mm512_setzero_hi(), - 
(__mmask32) -1); + return (__m512i)__builtin_ia32_pminsw512((__v32hi) __A, (__v32hi) __B); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_min_epi16 (__mmask32 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) _mm512_setzero_hi(), - (__mmask32) __M); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, + (__v32hi)_mm512_min_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_min_epi16 (__m512i __W, __mmask32 __M, __m512i __A, - __m512i __B) +_mm512_mask_min_epi16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) __W, - (__mmask32) __M); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, + (__v32hi)_mm512_min_epi16(__A, __B), + (__v32hi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_min_epu8 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) _mm512_setzero_qi(), - (__mmask64) -1); + return (__m512i)__builtin_ia32_pminub512((__v64qi)__A, (__v64qi)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_min_epu8 (__mmask64 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) _mm512_setzero_qi(), - (__mmask64) __M); + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, + (__v64qi)_mm512_min_epu8(__A, __B), + (__v64qi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_min_epu8 (__m512i __W, __mmask64 __M, __m512i __A, - __m512i __B) +_mm512_mask_min_epu8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A, - (__v64qi) __B, - (__v64qi) __W, - (__mmask64) __M); + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, + (__v64qi)_mm512_min_epu8(__A, __B), + (__v64qi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_min_epu16 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) _mm512_setzero_hi(), - (__mmask32) -1); + return (__m512i)__builtin_ia32_pminuw512((__v32hi)__A, (__v32hi)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_min_epu16 (__mmask32 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) _mm512_setzero_hi(), - (__mmask32) __M); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, + (__v32hi)_mm512_min_epu16(__A, __B), + (__v32hi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_min_epu16 (__m512i __W, __mmask32 __M, __m512i __A, - __m512i __B) +_mm512_mask_min_epu16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) __W, - (__mmask32) __M); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, + (__v32hi)_mm512_min_epu16(__A, __B), + (__v32hi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -852,7 +781,7 @@ _mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, (__v64qi)_mm512_shuffle_epi8(__A, __B), - (__v64qi)_mm512_setzero_qi()); + (__v64qi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -860,7 +789,7 @@ _mm512_subs_epi8 
(__m512i __A, __m512i __B) { return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A, (__v64qi) __B, - (__v64qi) _mm512_setzero_qi(), + (__v64qi) _mm512_setzero_si512(), (__mmask64) -1); } @@ -879,7 +808,7 @@ _mm512_maskz_subs_epi8 (__mmask64 __U, __m512i __A, __m512i __B) { return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A, (__v64qi) __B, - (__v64qi) _mm512_setzero_qi(), + (__v64qi) _mm512_setzero_si512(), (__mmask64) __U); } @@ -888,7 +817,7 @@ _mm512_subs_epi16 (__m512i __A, __m512i __B) { return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A, (__v32hi) __B, - (__v32hi) _mm512_setzero_hi(), + (__v32hi) _mm512_setzero_si512(), (__mmask32) -1); } @@ -907,7 +836,7 @@ _mm512_maskz_subs_epi16 (__mmask32 __U, __m512i __A, __m512i __B) { return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A, (__v32hi) __B, - (__v32hi) _mm512_setzero_hi(), + (__v32hi) _mm512_setzero_si512(), (__mmask32) __U); } @@ -916,7 +845,7 @@ _mm512_subs_epu8 (__m512i __A, __m512i __B) { return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A, (__v64qi) __B, - (__v64qi) _mm512_setzero_qi(), + (__v64qi) _mm512_setzero_si512(), (__mmask64) -1); } @@ -935,7 +864,7 @@ _mm512_maskz_subs_epu8 (__mmask64 __U, __m512i __A, __m512i __B) { return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A, (__v64qi) __B, - (__v64qi) _mm512_setzero_qi(), + (__v64qi) _mm512_setzero_si512(), (__mmask64) __U); } @@ -944,7 +873,7 @@ _mm512_subs_epu16 (__m512i __A, __m512i __B) { return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A, (__v32hi) __B, - (__v32hi) _mm512_setzero_hi(), + (__v32hi) _mm512_setzero_si512(), (__mmask32) -1); } @@ -963,182 +892,148 @@ _mm512_maskz_subs_epu16 (__mmask32 __U, __m512i __A, __m512i __B) { return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A, (__v32hi) __B, - (__v32hi) _mm512_setzero_hi(), + (__v32hi) _mm512_setzero_si512(), (__mmask32) __U); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask2_permutex2var_epi16 (__m512i __A, __m512i __I, - __mmask32 __U, __m512i __B) +_mm512_permutex2var_epi16(__m512i __A, __m512i __I, __m512i __B) { - return (__m512i) __builtin_ia32_vpermi2varhi512_mask ((__v32hi) __A, - (__v32hi) __I /* idx */ , - (__v32hi) __B, - (__mmask32) __U); + return (__m512i)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I, + (__v32hi)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_permutex2var_epi16 (__m512i __A, __m512i __I, __m512i __B) +_mm512_mask_permutex2var_epi16(__m512i __A, __mmask32 __U, __m512i __I, + __m512i __B) { - return (__m512i) __builtin_ia32_vpermt2varhi512_mask ((__v32hi) __I /* idx */, - (__v32hi) __A, - (__v32hi) __B, - (__mmask32) -1); + return (__m512i)__builtin_ia32_selectw_512(__U, + (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B), + (__v32hi)__A); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_permutex2var_epi16 (__m512i __A, __mmask32 __U, - __m512i __I, __m512i __B) +_mm512_mask2_permutex2var_epi16(__m512i __A, __m512i __I, __mmask32 __U, + __m512i __B) { - return (__m512i) __builtin_ia32_vpermt2varhi512_mask ((__v32hi) __I /* idx */, - (__v32hi) __A, - (__v32hi) __B, - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512(__U, + (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B), + (__v32hi)__I); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_permutex2var_epi16 (__mmask32 __U, __m512i __A, - __m512i __I, __m512i __B) +_mm512_maskz_permutex2var_epi16(__mmask32 __U, __m512i __A, __m512i __I, + __m512i __B) { - return (__m512i) 
__builtin_ia32_vpermt2varhi512_maskz ((__v32hi) __I - /* idx */ , - (__v32hi) __A, - (__v32hi) __B, - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512(__U, + (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B), + (__v32hi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mulhrs_epi16 (__m512i __A, __m512i __B) +_mm512_mulhrs_epi16(__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) _mm512_setzero_hi(), - (__mmask32) -1); + return (__m512i)__builtin_ia32_pmulhrsw512((__v32hi)__A, (__v32hi)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_mulhrs_epi16 (__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) +_mm512_mask_mulhrs_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) __W, - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_mulhrs_epi16(__A, __B), + (__v32hi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_mulhrs_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +_mm512_maskz_mulhrs_epi16(__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) _mm512_setzero_hi(), - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_mulhrs_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mulhi_epi16 (__m512i __A, __m512i __B) +_mm512_mulhi_epi16(__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) _mm512_setzero_hi(), - (__mmask32) -1); + return (__m512i)__builtin_ia32_pmulhw512((__v32hi) __A, (__v32hi) __B); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_mulhi_epi16 (__m512i __W, __mmask32 __U, __m512i __A, +_mm512_mask_mulhi_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) __W, - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_mulhi_epi16(__A, __B), + (__v32hi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_mulhi_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +_mm512_maskz_mulhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) _mm512_setzero_hi(), - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_mulhi_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mulhi_epu16 (__m512i __A, __m512i __B) +_mm512_mulhi_epu16(__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) _mm512_setzero_hi(), - (__mmask32) -1); + return (__m512i)__builtin_ia32_pmulhuw512((__v32hi) __A, (__v32hi) __B); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_mulhi_epu16 (__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) +_mm512_mask_mulhi_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) __W, - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_mulhi_epu16(__A, __B), + 
(__v32hi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_mulhi_epu16 (__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v32hi) _mm512_setzero_hi(), - (__mmask32) __U); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_mulhi_epu16(__A, __B), + (__v32hi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maddubs_epi16 (__m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X, - (__v64qi) __Y, - (__v32hi) _mm512_setzero_hi(), - (__mmask32) -1); +_mm512_maddubs_epi16(__m512i __X, __m512i __Y) { + return (__m512i)__builtin_ia32_pmaddubsw512((__v64qi)__X, (__v64qi)__Y); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_maddubs_epi16 (__m512i __W, __mmask32 __U, __m512i __X, - __m512i __Y) { - return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X, - (__v64qi) __Y, - (__v32hi) __W, - (__mmask32) __U); +_mm512_mask_maddubs_epi16(__m512i __W, __mmask32 __U, __m512i __X, + __m512i __Y) { + return (__m512i)__builtin_ia32_selectw_512((__mmask32) __U, + (__v32hi)_mm512_maddubs_epi16(__X, __Y), + (__v32hi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_maddubs_epi16 (__mmask32 __U, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X, - (__v64qi) __Y, - (__v32hi) _mm512_setzero_hi(), - (__mmask32) __U); +_mm512_maskz_maddubs_epi16(__mmask32 __U, __m512i __X, __m512i __Y) { + return (__m512i)__builtin_ia32_selectw_512((__mmask32) __U, + (__v32hi)_mm512_maddubs_epi16(__X, __Y), + (__v32hi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_madd_epi16 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v16si) _mm512_setzero_si512(), - (__mmask16) -1); +_mm512_madd_epi16(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pmaddwd512((__v32hi)__A, (__v32hi)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_madd_epi16 (__m512i __W, __mmask16 __U, __m512i __A, - __m512i __B) { - return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v16si) __W, - (__mmask16) __U); +_mm512_mask_madd_epi16(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_madd_epi16(__A, __B), + (__v16si)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_madd_epi16 (__mmask16 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A, - (__v32hi) __B, - (__v16si) _mm512_setzero_si512(), - (__mmask16) __U); +_mm512_maskz_madd_epi16(__mmask16 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_madd_epi16(__A, __B), + (__v16si)_mm512_setzero_si512()); } static __inline__ __m256i __DEFAULT_FN_ATTRS @@ -1186,7 +1081,7 @@ _mm512_maskz_cvtusepi16_epi8 (__mmask32 __M, __m512i __A) { static __inline__ __m256i __DEFAULT_FN_ATTRS _mm512_cvtepi16_epi8 (__m512i __A) { return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A, - (__v32qi) _mm256_setzero_si256(), + (__v32qi) _mm256_undefined_si256(), (__mmask32) -1); } @@ -1254,7 +1149,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_unpackhi_epi8(__mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, (__v64qi)_mm512_unpackhi_epi8(__A, 
__B), - (__v64qi)_mm512_setzero_qi()); + (__v64qi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -1281,7 +1176,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_unpackhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_unpackhi_epi16(__A, __B), - (__v32hi)_mm512_setzero_hi()); + (__v32hi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -1316,7 +1211,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_unpacklo_epi8(__mmask64 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, (__v64qi)_mm512_unpacklo_epi8(__A, __B), - (__v64qi)_mm512_setzero_qi()); + (__v64qi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -1343,7 +1238,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_unpacklo_epi16(__mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_unpacklo_epi16(__A, __B), - (__v32hi)_mm512_setzero_hi()); + (__v32hi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -1367,7 +1262,7 @@ _mm512_maskz_cvtepi8_epi16(__mmask32 __U, __m256i __A) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_cvtepi8_epi16(__A), - (__v32hi)_mm512_setzero_hi()); + (__v32hi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -1389,83 +1284,41 @@ _mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_cvtepu8_epi16(__A), - (__v32hi)_mm512_setzero_hi()); -} - - -#define _mm512_shufflehi_epi16(A, imm) __extension__ ({ \ - (__m512i)__builtin_shufflevector((__v32hi)(__m512i)(A), \ - (__v32hi)_mm512_undefined_epi32(), \ - 0, 1, 2, 3, \ - 4 + (((imm) >> 0) & 0x3), \ - 4 + (((imm) >> 2) & 0x3), \ - 4 + (((imm) >> 4) & 0x3), \ - 4 + (((imm) >> 6) & 0x3), \ - 8, 9, 10, 11, \ - 12 + (((imm) >> 0) & 0x3), \ - 12 + (((imm) >> 2) & 0x3), \ - 12 + (((imm) >> 4) & 0x3), \ - 12 + (((imm) >> 6) & 0x3), \ - 16, 17, 18, 19, \ - 20 + (((imm) >> 0) & 0x3), \ - 20 + (((imm) >> 2) & 0x3), \ - 20 + (((imm) >> 4) & 0x3), \ - 20 + (((imm) >> 6) & 0x3), \ - 24, 25, 26, 27, \ - 28 + (((imm) >> 0) & 0x3), \ - 28 + (((imm) >> 2) & 0x3), \ - 28 + (((imm) >> 4) & 0x3), \ - 28 + (((imm) >> 6) & 0x3)); }) - -#define _mm512_mask_shufflehi_epi16(W, U, A, imm) __extension__ ({ \ + (__v32hi)_mm512_setzero_si512()); +} + + +#define _mm512_shufflehi_epi16(A, imm) \ + (__m512i)__builtin_ia32_pshufhw512((__v32hi)(__m512i)(A), (int)(imm)) + +#define _mm512_mask_shufflehi_epi16(W, U, A, imm) \ (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ (__v32hi)_mm512_shufflehi_epi16((A), \ (imm)), \ - (__v32hi)(__m512i)(W)); }) + (__v32hi)(__m512i)(W)) -#define _mm512_maskz_shufflehi_epi16(U, A, imm) __extension__ ({ \ +#define _mm512_maskz_shufflehi_epi16(U, A, imm) \ (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ (__v32hi)_mm512_shufflehi_epi16((A), \ (imm)), \ - (__v32hi)_mm512_setzero_hi()); }) - -#define _mm512_shufflelo_epi16(A, imm) __extension__ ({ \ - (__m512i)__builtin_shufflevector((__v32hi)(__m512i)(A), \ - (__v32hi)_mm512_undefined_epi32(), \ - 0 + (((imm) >> 0) & 0x3), \ - 0 + (((imm) >> 2) & 0x3), \ - 0 + (((imm) >> 4) & 0x3), \ - 0 + (((imm) >> 6) & 0x3), \ - 4, 5, 6, 7, \ - 8 + (((imm) >> 0) & 0x3), \ - 8 + (((imm) >> 2) & 0x3), \ - 8 + (((imm) >> 4) & 0x3), \ - 8 + (((imm) >> 6) & 0x3), \ - 12, 13, 14, 15, \ - 
16 + (((imm) >> 0) & 0x3), \ - 16 + (((imm) >> 2) & 0x3), \ - 16 + (((imm) >> 4) & 0x3), \ - 16 + (((imm) >> 6) & 0x3), \ - 20, 21, 22, 23, \ - 24 + (((imm) >> 0) & 0x3), \ - 24 + (((imm) >> 2) & 0x3), \ - 24 + (((imm) >> 4) & 0x3), \ - 24 + (((imm) >> 6) & 0x3), \ - 28, 29, 30, 31); }) - - -#define _mm512_mask_shufflelo_epi16(W, U, A, imm) __extension__ ({ \ + (__v32hi)_mm512_setzero_si512()) + +#define _mm512_shufflelo_epi16(A, imm) \ + (__m512i)__builtin_ia32_pshuflw512((__v32hi)(__m512i)(A), (int)(imm)) + + +#define _mm512_mask_shufflelo_epi16(W, U, A, imm) \ (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ (__v32hi)_mm512_shufflelo_epi16((A), \ (imm)), \ - (__v32hi)(__m512i)(W)); }) + (__v32hi)(__m512i)(W)) -#define _mm512_maskz_shufflelo_epi16(U, A, imm) __extension__ ({ \ +#define _mm512_maskz_shufflelo_epi16(U, A, imm) \ (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ (__v32hi)_mm512_shufflelo_epi16((A), \ (imm)), \ - (__v32hi)_mm512_setzero_hi()); }) + (__v32hi)_mm512_setzero_si512()) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_sllv_epi16(__m512i __A, __m512i __B) @@ -1486,7 +1339,7 @@ _mm512_maskz_sllv_epi16(__mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_sllv_epi16(__A, __B), - (__v32hi)_mm512_setzero_hi()); + (__v32hi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -1508,7 +1361,7 @@ _mm512_maskz_sll_epi16(__mmask32 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_sll_epi16(__A, __B), - (__v32hi)_mm512_setzero_hi()); + (__v32hi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -1530,77 +1383,11 @@ _mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A, int __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_slli_epi16(__A, __B), - (__v32hi)_mm512_setzero_hi()); -} - -#define _mm512_bslli_epi128(a, imm) __extension__ ({ \ - (__m512i)__builtin_shufflevector( \ - (__v64qi)_mm512_setzero_si512(), \ - (__v64qi)(__m512i)(a), \ - ((char)(imm)&0xF0) ? 0 : ((char)(imm)>0x0 ? 16 : 64) - (char)(imm), \ - ((char)(imm)&0xF0) ? 1 : ((char)(imm)>0x1 ? 17 : 65) - (char)(imm), \ - ((char)(imm)&0xF0) ? 2 : ((char)(imm)>0x2 ? 18 : 66) - (char)(imm), \ - ((char)(imm)&0xF0) ? 3 : ((char)(imm)>0x3 ? 19 : 67) - (char)(imm), \ - ((char)(imm)&0xF0) ? 4 : ((char)(imm)>0x4 ? 20 : 68) - (char)(imm), \ - ((char)(imm)&0xF0) ? 5 : ((char)(imm)>0x5 ? 21 : 69) - (char)(imm), \ - ((char)(imm)&0xF0) ? 6 : ((char)(imm)>0x6 ? 22 : 70) - (char)(imm), \ - ((char)(imm)&0xF0) ? 7 : ((char)(imm)>0x7 ? 23 : 71) - (char)(imm), \ - ((char)(imm)&0xF0) ? 8 : ((char)(imm)>0x8 ? 24 : 72) - (char)(imm), \ - ((char)(imm)&0xF0) ? 9 : ((char)(imm)>0x9 ? 25 : 73) - (char)(imm), \ - ((char)(imm)&0xF0) ? 10 : ((char)(imm)>0xA ? 26 : 74) - (char)(imm), \ - ((char)(imm)&0xF0) ? 11 : ((char)(imm)>0xB ? 27 : 75) - (char)(imm), \ - ((char)(imm)&0xF0) ? 12 : ((char)(imm)>0xC ? 28 : 76) - (char)(imm), \ - ((char)(imm)&0xF0) ? 13 : ((char)(imm)>0xD ? 29 : 77) - (char)(imm), \ - ((char)(imm)&0xF0) ? 14 : ((char)(imm)>0xE ? 30 : 78) - (char)(imm), \ - ((char)(imm)&0xF0) ? 15 : ((char)(imm)>0xF ? 31 : 79) - (char)(imm), \ - ((char)(imm)&0xF0) ? 16 : ((char)(imm)>0x0 ? 32 : 80) - (char)(imm), \ - ((char)(imm)&0xF0) ? 17 : ((char)(imm)>0x1 ? 33 : 81) - (char)(imm), \ - ((char)(imm)&0xF0) ? 18 : ((char)(imm)>0x2 ? 34 : 82) - (char)(imm), \ - ((char)(imm)&0xF0) ? 19 : ((char)(imm)>0x3 ? 
35 : 83) - (char)(imm), \ - ((char)(imm)&0xF0) ? 20 : ((char)(imm)>0x4 ? 36 : 84) - (char)(imm), \ - ((char)(imm)&0xF0) ? 21 : ((char)(imm)>0x5 ? 37 : 85) - (char)(imm), \ - ((char)(imm)&0xF0) ? 22 : ((char)(imm)>0x6 ? 38 : 86) - (char)(imm), \ - ((char)(imm)&0xF0) ? 23 : ((char)(imm)>0x7 ? 39 : 87) - (char)(imm), \ - ((char)(imm)&0xF0) ? 24 : ((char)(imm)>0x8 ? 40 : 88) - (char)(imm), \ - ((char)(imm)&0xF0) ? 25 : ((char)(imm)>0x9 ? 41 : 89) - (char)(imm), \ - ((char)(imm)&0xF0) ? 26 : ((char)(imm)>0xA ? 42 : 90) - (char)(imm), \ - ((char)(imm)&0xF0) ? 27 : ((char)(imm)>0xB ? 43 : 91) - (char)(imm), \ - ((char)(imm)&0xF0) ? 28 : ((char)(imm)>0xC ? 44 : 92) - (char)(imm), \ - ((char)(imm)&0xF0) ? 29 : ((char)(imm)>0xD ? 45 : 93) - (char)(imm), \ - ((char)(imm)&0xF0) ? 30 : ((char)(imm)>0xE ? 46 : 94) - (char)(imm), \ - ((char)(imm)&0xF0) ? 31 : ((char)(imm)>0xF ? 47 : 95) - (char)(imm), \ - ((char)(imm)&0xF0) ? 32 : ((char)(imm)>0x0 ? 48 : 96) - (char)(imm), \ - ((char)(imm)&0xF0) ? 33 : ((char)(imm)>0x1 ? 49 : 97) - (char)(imm), \ - ((char)(imm)&0xF0) ? 34 : ((char)(imm)>0x2 ? 50 : 98) - (char)(imm), \ - ((char)(imm)&0xF0) ? 35 : ((char)(imm)>0x3 ? 51 : 99) - (char)(imm), \ - ((char)(imm)&0xF0) ? 36 : ((char)(imm)>0x4 ? 52 : 100) - (char)(imm), \ - ((char)(imm)&0xF0) ? 37 : ((char)(imm)>0x5 ? 53 : 101) - (char)(imm), \ - ((char)(imm)&0xF0) ? 38 : ((char)(imm)>0x6 ? 54 : 102) - (char)(imm), \ - ((char)(imm)&0xF0) ? 39 : ((char)(imm)>0x7 ? 55 : 103) - (char)(imm), \ - ((char)(imm)&0xF0) ? 40 : ((char)(imm)>0x8 ? 56 : 104) - (char)(imm), \ - ((char)(imm)&0xF0) ? 41 : ((char)(imm)>0x9 ? 57 : 105) - (char)(imm), \ - ((char)(imm)&0xF0) ? 42 : ((char)(imm)>0xA ? 58 : 106) - (char)(imm), \ - ((char)(imm)&0xF0) ? 43 : ((char)(imm)>0xB ? 59 : 107) - (char)(imm), \ - ((char)(imm)&0xF0) ? 44 : ((char)(imm)>0xC ? 60 : 108) - (char)(imm), \ - ((char)(imm)&0xF0) ? 45 : ((char)(imm)>0xD ? 61 : 109) - (char)(imm), \ - ((char)(imm)&0xF0) ? 46 : ((char)(imm)>0xE ? 62 : 110) - (char)(imm), \ - ((char)(imm)&0xF0) ? 47 : ((char)(imm)>0xF ? 63 : 111) - (char)(imm), \ - ((char)(imm)&0xF0) ? 48 : ((char)(imm)>0x0 ? 64 : 112) - (char)(imm), \ - ((char)(imm)&0xF0) ? 49 : ((char)(imm)>0x1 ? 65 : 113) - (char)(imm), \ - ((char)(imm)&0xF0) ? 50 : ((char)(imm)>0x2 ? 66 : 114) - (char)(imm), \ - ((char)(imm)&0xF0) ? 51 : ((char)(imm)>0x3 ? 67 : 115) - (char)(imm), \ - ((char)(imm)&0xF0) ? 52 : ((char)(imm)>0x4 ? 68 : 116) - (char)(imm), \ - ((char)(imm)&0xF0) ? 53 : ((char)(imm)>0x5 ? 69 : 117) - (char)(imm), \ - ((char)(imm)&0xF0) ? 54 : ((char)(imm)>0x6 ? 70 : 118) - (char)(imm), \ - ((char)(imm)&0xF0) ? 55 : ((char)(imm)>0x7 ? 71 : 119) - (char)(imm), \ - ((char)(imm)&0xF0) ? 56 : ((char)(imm)>0x8 ? 72 : 120) - (char)(imm), \ - ((char)(imm)&0xF0) ? 57 : ((char)(imm)>0x9 ? 73 : 121) - (char)(imm), \ - ((char)(imm)&0xF0) ? 58 : ((char)(imm)>0xA ? 74 : 122) - (char)(imm), \ - ((char)(imm)&0xF0) ? 59 : ((char)(imm)>0xB ? 75 : 123) - (char)(imm), \ - ((char)(imm)&0xF0) ? 60 : ((char)(imm)>0xC ? 76 : 124) - (char)(imm), \ - ((char)(imm)&0xF0) ? 61 : ((char)(imm)>0xD ? 77 : 125) - (char)(imm), \ - ((char)(imm)&0xF0) ? 62 : ((char)(imm)>0xE ? 78 : 126) - (char)(imm), \ - ((char)(imm)&0xF0) ? 63 : ((char)(imm)>0xF ? 
79 : 127) - (char)(imm)); }) + (__v32hi)_mm512_setzero_si512()); +} + +#define _mm512_bslli_epi128(a, imm) \ + (__m512i)__builtin_ia32_pslldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm)) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_srlv_epi16(__m512i __A, __m512i __B) @@ -1621,7 +1408,7 @@ _mm512_maskz_srlv_epi16(__mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_srlv_epi16(__A, __B), - (__v32hi)_mm512_setzero_hi()); + (__v32hi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -1643,7 +1430,7 @@ _mm512_maskz_srav_epi16(__mmask32 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_srav_epi16(__A, __B), - (__v32hi)_mm512_setzero_hi()); + (__v32hi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -1665,7 +1452,7 @@ _mm512_maskz_sra_epi16(__mmask32 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_sra_epi16(__A, __B), - (__v32hi)_mm512_setzero_hi()); + (__v32hi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -1687,7 +1474,7 @@ _mm512_maskz_srai_epi16(__mmask32 __U, __m512i __A, int __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_srai_epi16(__A, __B), - (__v32hi)_mm512_setzero_hi()); + (__v32hi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -1709,7 +1496,7 @@ _mm512_maskz_srl_epi16(__mmask32 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_srl_epi16(__A, __B), - (__v32hi)_mm512_setzero_hi()); + (__v32hi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -1731,77 +1518,11 @@ _mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, int __B) { return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, (__v32hi)_mm512_srli_epi16(__A, __B), - (__v32hi)_mm512_setzero_hi()); -} - -#define _mm512_bsrli_epi128(a, imm) __extension__ ({ \ - (__m512i)__builtin_shufflevector( \ - (__v64qi)(__m512i)(a), \ - (__v64qi)_mm512_setzero_si512(), \ - ((char)(imm)&0xF0) ? 64 : (char)(imm) + ((char)(imm)>0xF ? 48 : 0), \ - ((char)(imm)&0xF0) ? 65 : (char)(imm) + ((char)(imm)>0xE ? 49 : 1), \ - ((char)(imm)&0xF0) ? 66 : (char)(imm) + ((char)(imm)>0xD ? 50 : 2), \ - ((char)(imm)&0xF0) ? 67 : (char)(imm) + ((char)(imm)>0xC ? 51 : 3), \ - ((char)(imm)&0xF0) ? 68 : (char)(imm) + ((char)(imm)>0xB ? 52 : 4), \ - ((char)(imm)&0xF0) ? 69 : (char)(imm) + ((char)(imm)>0xA ? 53 : 5), \ - ((char)(imm)&0xF0) ? 70 : (char)(imm) + ((char)(imm)>0x9 ? 54 : 6), \ - ((char)(imm)&0xF0) ? 71 : (char)(imm) + ((char)(imm)>0x8 ? 55 : 7), \ - ((char)(imm)&0xF0) ? 72 : (char)(imm) + ((char)(imm)>0x7 ? 56 : 8), \ - ((char)(imm)&0xF0) ? 73 : (char)(imm) + ((char)(imm)>0x6 ? 57 : 9), \ - ((char)(imm)&0xF0) ? 74 : (char)(imm) + ((char)(imm)>0x5 ? 58 : 10), \ - ((char)(imm)&0xF0) ? 75 : (char)(imm) + ((char)(imm)>0x4 ? 59 : 11), \ - ((char)(imm)&0xF0) ? 76 : (char)(imm) + ((char)(imm)>0x3 ? 60 : 12), \ - ((char)(imm)&0xF0) ? 77 : (char)(imm) + ((char)(imm)>0x2 ? 61 : 13), \ - ((char)(imm)&0xF0) ? 78 : (char)(imm) + ((char)(imm)>0x1 ? 62 : 14), \ - ((char)(imm)&0xF0) ? 79 : (char)(imm) + ((char)(imm)>0x0 ? 63 : 15), \ - ((char)(imm)&0xF0) ? 80 : (char)(imm) + ((char)(imm)>0xF ? 64 : 16), \ - ((char)(imm)&0xF0) ? 81 : (char)(imm) + ((char)(imm)>0xE ? 65 : 17), \ - ((char)(imm)&0xF0) ? 82 : (char)(imm) + ((char)(imm)>0xD ? 
66 : 18), \ - ((char)(imm)&0xF0) ? 83 : (char)(imm) + ((char)(imm)>0xC ? 67 : 19), \ - ((char)(imm)&0xF0) ? 84 : (char)(imm) + ((char)(imm)>0xB ? 68 : 20), \ - ((char)(imm)&0xF0) ? 85 : (char)(imm) + ((char)(imm)>0xA ? 69 : 21), \ - ((char)(imm)&0xF0) ? 86 : (char)(imm) + ((char)(imm)>0x9 ? 70 : 22), \ - ((char)(imm)&0xF0) ? 87 : (char)(imm) + ((char)(imm)>0x8 ? 71 : 23), \ - ((char)(imm)&0xF0) ? 88 : (char)(imm) + ((char)(imm)>0x7 ? 72 : 24), \ - ((char)(imm)&0xF0) ? 89 : (char)(imm) + ((char)(imm)>0x6 ? 73 : 25), \ - ((char)(imm)&0xF0) ? 90 : (char)(imm) + ((char)(imm)>0x5 ? 74 : 26), \ - ((char)(imm)&0xF0) ? 91 : (char)(imm) + ((char)(imm)>0x4 ? 75 : 27), \ - ((char)(imm)&0xF0) ? 92 : (char)(imm) + ((char)(imm)>0x3 ? 76 : 28), \ - ((char)(imm)&0xF0) ? 93 : (char)(imm) + ((char)(imm)>0x2 ? 77 : 29), \ - ((char)(imm)&0xF0) ? 94 : (char)(imm) + ((char)(imm)>0x1 ? 78 : 30), \ - ((char)(imm)&0xF0) ? 95 : (char)(imm) + ((char)(imm)>0x0 ? 79 : 31), \ - ((char)(imm)&0xF0) ? 96 : (char)(imm) + ((char)(imm)>0xF ? 80 : 32), \ - ((char)(imm)&0xF0) ? 97 : (char)(imm) + ((char)(imm)>0xE ? 81 : 33), \ - ((char)(imm)&0xF0) ? 98 : (char)(imm) + ((char)(imm)>0xD ? 82 : 34), \ - ((char)(imm)&0xF0) ? 99 : (char)(imm) + ((char)(imm)>0xC ? 83 : 35), \ - ((char)(imm)&0xF0) ? 100 : (char)(imm) + ((char)(imm)>0xB ? 84 : 36), \ - ((char)(imm)&0xF0) ? 101 : (char)(imm) + ((char)(imm)>0xA ? 85 : 37), \ - ((char)(imm)&0xF0) ? 102 : (char)(imm) + ((char)(imm)>0x9 ? 86 : 38), \ - ((char)(imm)&0xF0) ? 103 : (char)(imm) + ((char)(imm)>0x8 ? 87 : 39), \ - ((char)(imm)&0xF0) ? 104 : (char)(imm) + ((char)(imm)>0x7 ? 88 : 40), \ - ((char)(imm)&0xF0) ? 105 : (char)(imm) + ((char)(imm)>0x6 ? 89 : 41), \ - ((char)(imm)&0xF0) ? 106 : (char)(imm) + ((char)(imm)>0x5 ? 90 : 42), \ - ((char)(imm)&0xF0) ? 107 : (char)(imm) + ((char)(imm)>0x4 ? 91 : 43), \ - ((char)(imm)&0xF0) ? 108 : (char)(imm) + ((char)(imm)>0x3 ? 92 : 44), \ - ((char)(imm)&0xF0) ? 109 : (char)(imm) + ((char)(imm)>0x2 ? 93 : 45), \ - ((char)(imm)&0xF0) ? 110 : (char)(imm) + ((char)(imm)>0x1 ? 94 : 46), \ - ((char)(imm)&0xF0) ? 111 : (char)(imm) + ((char)(imm)>0x0 ? 95 : 47), \ - ((char)(imm)&0xF0) ? 112 : (char)(imm) + ((char)(imm)>0xF ? 96 : 48), \ - ((char)(imm)&0xF0) ? 113 : (char)(imm) + ((char)(imm)>0xE ? 97 : 49), \ - ((char)(imm)&0xF0) ? 114 : (char)(imm) + ((char)(imm)>0xD ? 98 : 50), \ - ((char)(imm)&0xF0) ? 115 : (char)(imm) + ((char)(imm)>0xC ? 99 : 51), \ - ((char)(imm)&0xF0) ? 116 : (char)(imm) + ((char)(imm)>0xB ? 100 : 52), \ - ((char)(imm)&0xF0) ? 117 : (char)(imm) + ((char)(imm)>0xA ? 101 : 53), \ - ((char)(imm)&0xF0) ? 118 : (char)(imm) + ((char)(imm)>0x9 ? 102 : 54), \ - ((char)(imm)&0xF0) ? 119 : (char)(imm) + ((char)(imm)>0x8 ? 103 : 55), \ - ((char)(imm)&0xF0) ? 120 : (char)(imm) + ((char)(imm)>0x7 ? 104 : 56), \ - ((char)(imm)&0xF0) ? 121 : (char)(imm) + ((char)(imm)>0x6 ? 105 : 57), \ - ((char)(imm)&0xF0) ? 122 : (char)(imm) + ((char)(imm)>0x5 ? 106 : 58), \ - ((char)(imm)&0xF0) ? 123 : (char)(imm) + ((char)(imm)>0x4 ? 107 : 59), \ - ((char)(imm)&0xF0) ? 124 : (char)(imm) + ((char)(imm)>0x3 ? 108 : 60), \ - ((char)(imm)&0xF0) ? 125 : (char)(imm) + ((char)(imm)>0x2 ? 109 : 61), \ - ((char)(imm)&0xF0) ? 126 : (char)(imm) + ((char)(imm)>0x1 ? 110 : 62), \ - ((char)(imm)&0xF0) ? 127 : (char)(imm) + ((char)(imm)>0x0 ? 
111 : 63)); }) + (__v32hi)_mm512_setzero_si512()); +} + +#define _mm512_bsrli_epi128(a, imm) \ + (__m512i)__builtin_ia32_psrldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm)) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A) @@ -1816,7 +1537,7 @@ _mm512_maskz_mov_epi16 (__mmask32 __U, __m512i __A) { return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U, (__v32hi) __A, - (__v32hi) _mm512_setzero_hi ()); + (__v32hi) _mm512_setzero_si512 ()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -1832,7 +1553,7 @@ _mm512_maskz_mov_epi8 (__mmask64 __U, __m512i __A) { return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U, (__v64qi) __A, - (__v64qi) _mm512_setzero_hi ()); + (__v64qi) _mm512_setzero_si512 ()); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -1854,13 +1575,15 @@ _mm512_maskz_set1_epi8 (__mmask64 __M, char __A) static __inline__ __mmask64 __DEFAULT_FN_ATTRS _mm512_kunpackd (__mmask64 __A, __mmask64 __B) { - return (__mmask64) (( __A & 0xFFFFFFFF) | ( __B << 32)); + return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A, + (__mmask64) __B); } static __inline__ __mmask32 __DEFAULT_FN_ATTRS _mm512_kunpackw (__mmask32 __A, __mmask32 __B) { -return (__mmask32) (( __A & 0xFFFF) | ( __B << 16)); + return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A, + (__mmask32) __B); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -1876,7 +1599,7 @@ _mm512_maskz_loadu_epi16 (__mmask32 __U, void const *__P) { return (__m512i) __builtin_ia32_loaddquhi512_mask ((__v32hi *) __P, (__v32hi) - _mm512_setzero_hi (), + _mm512_setzero_si512 (), (__mmask32) __U); } @@ -1893,7 +1616,7 @@ _mm512_maskz_loadu_epi8 (__mmask64 __U, void const *__P) { return (__m512i) __builtin_ia32_loaddquqi512_mask ((__v64qi *) __P, (__v64qi) - _mm512_setzero_hi (), + _mm512_setzero_si512 (), (__mmask64) __U); } static __inline__ void __DEFAULT_FN_ATTRS @@ -1916,55 +1639,55 @@ static __inline__ __mmask64 __DEFAULT_FN_ATTRS _mm512_test_epi8_mask (__m512i __A, __m512i __B) { return _mm512_cmpneq_epi8_mask (_mm512_and_epi32 (__A, __B), - _mm512_setzero_qi()); + _mm512_setzero_si512()); } static __inline__ __mmask64 __DEFAULT_FN_ATTRS _mm512_mask_test_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B) { return _mm512_mask_cmpneq_epi8_mask (__U, _mm512_and_epi32 (__A, __B), - _mm512_setzero_qi()); + _mm512_setzero_si512()); } static __inline__ __mmask32 __DEFAULT_FN_ATTRS _mm512_test_epi16_mask (__m512i __A, __m512i __B) { return _mm512_cmpneq_epi16_mask (_mm512_and_epi32 (__A, __B), - _mm512_setzero_qi()); + _mm512_setzero_si512()); } static __inline__ __mmask32 __DEFAULT_FN_ATTRS _mm512_mask_test_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B) { return _mm512_mask_cmpneq_epi16_mask (__U, _mm512_and_epi32 (__A, __B), - _mm512_setzero_qi()); + _mm512_setzero_si512()); } static __inline__ __mmask64 __DEFAULT_FN_ATTRS _mm512_testn_epi8_mask (__m512i __A, __m512i __B) { - return _mm512_cmpeq_epi8_mask (_mm512_and_epi32 (__A, __B), _mm512_setzero_qi()); + return _mm512_cmpeq_epi8_mask (_mm512_and_epi32 (__A, __B), _mm512_setzero_si512()); } static __inline__ __mmask64 __DEFAULT_FN_ATTRS _mm512_mask_testn_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B) { return _mm512_mask_cmpeq_epi8_mask (__U, _mm512_and_epi32 (__A, __B), - _mm512_setzero_qi()); + _mm512_setzero_si512()); } static __inline__ __mmask32 __DEFAULT_FN_ATTRS _mm512_testn_epi16_mask (__m512i __A, __m512i __B) { return _mm512_cmpeq_epi16_mask (_mm512_and_epi32 (__A, __B), - 
_mm512_setzero_qi()); + _mm512_setzero_si512()); } static __inline__ __mmask32 __DEFAULT_FN_ATTRS _mm512_mask_testn_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B) { return _mm512_mask_cmpeq_epi16_mask (__U, _mm512_and_epi32 (__A, __B), - _mm512_setzero_qi()); + _mm512_setzero_si512()); } static __inline__ __mmask64 __DEFAULT_FN_ATTRS @@ -1994,8 +1717,7 @@ _mm512_movm_epi16 (__mmask32 __A) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_broadcastb_epi8 (__m128i __A) { - return (__m512i)__builtin_shufflevector((__v16qi) __A, - (__v16qi)_mm_undefined_si128(), + return (__m512i)__builtin_shufflevector((__v16qi) __A, (__v16qi) __A, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -2037,8 +1759,7 @@ _mm512_maskz_set1_epi16 (__mmask32 __M, short __A) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_broadcastw_epi16 (__m128i __A) { - return (__m512i)__builtin_shufflevector((__v8hi) __A, - (__v8hi)_mm_undefined_si128(), + return (__m512i)__builtin_shufflevector((__v8hi) __A, (__v8hi) __A, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } @@ -2062,67 +1783,54 @@ _mm512_maskz_broadcastw_epi16 (__mmask32 __M, __m128i __A) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_permutexvar_epi16 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B, - (__v32hi) __A, - (__v32hi) _mm512_undefined_epi32 (), - (__mmask32) -1); + return (__m512i)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_permutexvar_epi16 (__mmask32 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B, - (__v32hi) __A, - (__v32hi) _mm512_setzero_hi(), - (__mmask32) __M); + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, + (__v32hi)_mm512_permutexvar_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_permutexvar_epi16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B, - (__v32hi) __A, - (__v32hi) __W, - (__mmask32) __M); -} - -#define _mm512_alignr_epi8(A, B, N) __extension__ ({\ - (__m512i)__builtin_ia32_palignr512_mask((__v64qi)(__m512i)(A), \ - (__v64qi)(__m512i)(B), (int)(N), \ - (__v64qi)_mm512_undefined_pd(), \ - (__mmask64)-1); }) - -#define _mm512_mask_alignr_epi8(W, U, A, B, N) __extension__({\ - (__m512i)__builtin_ia32_palignr512_mask((__v64qi)(__m512i)(A), \ - (__v64qi)(__m512i)(B), (int)(N), \ - (__v64qi)(__m512i)(W), \ - (__mmask64)(U)); }) - -#define _mm512_maskz_alignr_epi8(U, A, B, N) __extension__({\ - (__m512i)__builtin_ia32_palignr512_mask((__v64qi)(__m512i)(A), \ - (__v64qi)(__m512i)(B), (int)(N), \ - (__v64qi)_mm512_setzero_si512(), \ - (__mmask64)(U)); }) - -#define _mm512_dbsad_epu8(A, B, imm) __extension__ ({\ - (__m512i)__builtin_ia32_dbpsadbw512_mask((__v64qi)(__m512i)(A), \ - (__v64qi)(__m512i)(B), (int)(imm), \ - (__v32hi)_mm512_undefined_epi32(), \ - (__mmask32)-1); }) - -#define _mm512_mask_dbsad_epu8(W, U, A, B, imm) ({\ - (__m512i)__builtin_ia32_dbpsadbw512_mask((__v64qi)(__m512i)(A), \ - (__v64qi)(__m512i)(B), (int)(imm), \ - (__v32hi)(__m512i)(W), \ - (__mmask32)(U)); }) - -#define _mm512_maskz_dbsad_epu8(U, A, B, imm) ({\ - (__m512i)__builtin_ia32_dbpsadbw512_mask((__v64qi)(__m512i)(A), \ - (__v64qi)(__m512i)(B), (int)(imm), \ - 
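/* Illustrative sketch, not part of the patch above: the test/testn wrappers
 * reduce to a compare of (A & B) against zero, so a set mask bit means the
 * lane had at least one common bit set (test) or none set (testn). A minimal
 * standalone example, assuming a compiler targeting AVX512BW (the helper name
 * is hypothetical): */
#include <immintrin.h>
#include <stdint.h>

static uint32_t odd_valued_lanes(__m512i v) {
  /* Bit i of the result is 1 when 16-bit lane i of v is odd, i.e. when
     (v[i] & 1) != 0 -- the same reduction the header performs with
     _mm512_cmpneq_epi16_mask against an all-zero vector. */
  return (uint32_t)_mm512_test_epi16_mask(v, _mm512_set1_epi16(1));
}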
(__v32hi)_mm512_setzero_hi(), \ - (__mmask32)(U)); }) + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, + (__v32hi)_mm512_permutexvar_epi16(__A, __B), + (__v32hi)__W); +} + +#define _mm512_alignr_epi8(A, B, N) \ + (__m512i)__builtin_ia32_palignr512((__v64qi)(__m512i)(A), \ + (__v64qi)(__m512i)(B), (int)(N)) + +#define _mm512_mask_alignr_epi8(W, U, A, B, N) \ + (__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \ + (__v64qi)_mm512_alignr_epi8((A), (B), (int)(N)), \ + (__v64qi)(__m512i)(W)) + +#define _mm512_maskz_alignr_epi8(U, A, B, N) \ + (__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \ + (__v64qi)_mm512_alignr_epi8((A), (B), (int)(N)), \ + (__v64qi)(__m512i)_mm512_setzero_si512()) + +#define _mm512_dbsad_epu8(A, B, imm) \ + (__m512i)__builtin_ia32_dbpsadbw512((__v64qi)(__m512i)(A), \ + (__v64qi)(__m512i)(B), (int)(imm)) + +#define _mm512_mask_dbsad_epu8(W, U, A, B, imm) \ + (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + (__v32hi)_mm512_dbsad_epu8((A), (B), (imm)), \ + (__v32hi)(__m512i)(W)) + +#define _mm512_maskz_dbsad_epu8(U, A, B, imm) \ + (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + (__v32hi)_mm512_dbsad_epu8((A), (B), (imm)), \ + (__v32hi)_mm512_setzero_si512()) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_sad_epu8 (__m512i __A, __m512i __B) diff --git a/c_headers/avx512cdintrin.h b/c_headers/avx512cdintrin.h index ec7e0cd443..e63902743c 100644 --- a/c_headers/avx512cdintrin.h +++ b/c_headers/avx512cdintrin.h @@ -29,7 +29,7 @@ #define __AVX512CDINTRIN_H /* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512cd"))) +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512cd"), __min_vector_width__(512))) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_conflict_epi64 (__m512i __A) @@ -82,49 +82,45 @@ _mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_lzcnt_epi32 (__m512i __A) { - return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A, - (__v16si) _mm512_setzero_si512 (), - (__mmask16) -1); + return (__m512i) __builtin_ia32_vplzcntd_512 ((__v16si) __A); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_lzcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A) { - return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A, - (__v16si) __W, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_lzcnt_epi32(__A), + (__v16si)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_lzcnt_epi32 (__mmask16 __U, __m512i __A) { - return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A, - (__v16si) _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_lzcnt_epi32(__A), + (__v16si)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_lzcnt_epi64 (__m512i __A) { - return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A, - (__v8di) _mm512_setzero_si512 (), - (__mmask8) -1); + return (__m512i) __builtin_ia32_vplzcntq_512 ((__v8di) __A); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_lzcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A) { - return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A, - (__v8di) __W, - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_lzcnt_epi64(__A), + (__v8di)__W); } static 
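/* Illustrative sketch, not part of the patch above: the selectb/selectw/selectd/
 * selectq builtins used in the rewritten wrappers implement ordinary merge and
 * zero masking, i.e. a blend of the unmasked result with either the
 * pass-through operand or an all-zero vector. The same effect expressed with
 * public intrinsics (standard AVX512F/AVX512CD names; the helper functions
 * themselves are hypothetical), assuming an AVX512CD target: */
#include <immintrin.h>

static __m512i lzcnt_merge(__m512i w, __mmask16 u, __m512i a) {
  /* Matches _mm512_mask_lzcnt_epi32(w, u, a): selected lanes get lzcnt(a),
     unselected lanes keep the corresponding lane of w. */
  return _mm512_mask_mov_epi32(w, u, _mm512_lzcnt_epi32(a));
}

static __m512i lzcnt_zero(__mmask16 u, __m512i a) {
  /* Matches _mm512_maskz_lzcnt_epi32(u, a): unselected lanes become zero. */
  return _mm512_maskz_mov_epi32(u, _mm512_lzcnt_epi32(a));
}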
__inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A) { - return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A, - (__v8di) _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_lzcnt_epi64(__A), + (__v8di)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS diff --git a/c_headers/avx512dqintrin.h b/c_headers/avx512dqintrin.h index 2c431d9740..8a00b3afa9 100644 --- a/c_headers/avx512dqintrin.h +++ b/c_headers/avx512dqintrin.h @@ -29,7 +29,7 @@ #define __AVX512DQINTRIN_H /* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512dq"))) +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512dq"), __min_vector_width__(512))) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mullo_epi64 (__m512i __A, __m512i __B) { @@ -226,20 +226,20 @@ _mm512_maskz_cvtpd_epi64 (__mmask8 __U, __m512d __A) { _MM_FROUND_CUR_DIRECTION); } -#define _mm512_cvt_roundpd_epi64(A, R) __extension__ ({ \ +#define _mm512_cvt_roundpd_epi64(A, R) \ (__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \ (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm512_mask_cvt_roundpd_epi64(W, U, A, R) __extension__ ({ \ +#define _mm512_mask_cvt_roundpd_epi64(W, U, A, R) \ (__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \ (__v8di)(__m512i)(W), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm512_maskz_cvt_roundpd_epi64(U, A, R) __extension__ ({ \ +#define _mm512_maskz_cvt_roundpd_epi64(U, A, R) \ (__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \ (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_cvtpd_epu64 (__m512d __A) { @@ -265,20 +265,20 @@ _mm512_maskz_cvtpd_epu64 (__mmask8 __U, __m512d __A) { _MM_FROUND_CUR_DIRECTION); } -#define _mm512_cvt_roundpd_epu64(A, R) __extension__ ({ \ +#define _mm512_cvt_roundpd_epu64(A, R) \ (__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \ (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm512_mask_cvt_roundpd_epu64(W, U, A, R) __extension__ ({ \ +#define _mm512_mask_cvt_roundpd_epu64(W, U, A, R) \ (__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \ (__v8di)(__m512i)(W), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm512_maskz_cvt_roundpd_epu64(U, A, R) __extension__ ({ \ +#define _mm512_maskz_cvt_roundpd_epu64(U, A, R) \ (__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \ (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_cvtps_epi64 (__m256 __A) { @@ -304,20 +304,20 @@ _mm512_maskz_cvtps_epi64 (__mmask8 __U, __m256 __A) { _MM_FROUND_CUR_DIRECTION); } -#define _mm512_cvt_roundps_epi64(A, R) __extension__ ({ \ +#define _mm512_cvt_roundps_epi64(A, R) \ (__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \ (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm512_mask_cvt_roundps_epi64(W, U, A, R) __extension__ ({ \ +#define _mm512_mask_cvt_roundps_epi64(W, U, A, R) \ (__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \ (__v8di)(__m512i)(W), \ - 
(__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm512_maskz_cvt_roundps_epi64(U, A, R) __extension__ ({ \ +#define _mm512_maskz_cvt_roundps_epi64(U, A, R) \ (__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \ (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_cvtps_epu64 (__m256 __A) { @@ -343,60 +343,55 @@ _mm512_maskz_cvtps_epu64 (__mmask8 __U, __m256 __A) { _MM_FROUND_CUR_DIRECTION); } -#define _mm512_cvt_roundps_epu64(A, R) __extension__ ({ \ +#define _mm512_cvt_roundps_epu64(A, R) \ (__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \ (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm512_mask_cvt_roundps_epu64(W, U, A, R) __extension__ ({ \ +#define _mm512_mask_cvt_roundps_epu64(W, U, A, R) \ (__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \ (__v8di)(__m512i)(W), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm512_maskz_cvt_roundps_epu64(U, A, R) __extension__ ({ \ +#define _mm512_maskz_cvt_roundps_epu64(U, A, R) \ (__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \ (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_cvtepi64_pd (__m512i __A) { - return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A, - (__v8df) _mm512_setzero_pd(), - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + return (__m512d)__builtin_convertvector((__v8di)__A, __v8df); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask_cvtepi64_pd (__m512d __W, __mmask8 __U, __m512i __A) { - return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A, - (__v8df) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_cvtepi64_pd(__A), + (__v8df)__W); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_maskz_cvtepi64_pd (__mmask8 __U, __m512i __A) { - return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A, - (__v8df) _mm512_setzero_pd(), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_cvtepi64_pd(__A), + (__v8df)_mm512_setzero_pd()); } -#define _mm512_cvt_roundepi64_pd(A, R) __extension__ ({ \ +#define _mm512_cvt_roundepi64_pd(A, R) \ (__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \ (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm512_mask_cvt_roundepi64_pd(W, U, A, R) __extension__ ({ \ +#define _mm512_mask_cvt_roundepi64_pd(W, U, A, R) \ (__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \ (__v8df)(__m512d)(W), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm512_maskz_cvt_roundepi64_pd(U, A, R) __extension__ ({ \ +#define _mm512_maskz_cvt_roundepi64_pd(U, A, R) \ (__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \ (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) static __inline__ __m256 __DEFAULT_FN_ATTRS _mm512_cvtepi64_ps (__m512i __A) { @@ -422,20 +417,20 @@ _mm512_maskz_cvtepi64_ps (__mmask8 __U, __m512i __A) { _MM_FROUND_CUR_DIRECTION); } -#define _mm512_cvt_roundepi64_ps(A, R) __extension__ ({ \ +#define _mm512_cvt_roundepi64_ps(A, R) \ (__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \ (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)-1, (int)(R)); }) + 
(__mmask8)-1, (int)(R)) -#define _mm512_mask_cvt_roundepi64_ps(W, U, A, R) __extension__ ({ \ +#define _mm512_mask_cvt_roundepi64_ps(W, U, A, R) \ (__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \ (__v8sf)(__m256)(W), (__mmask8)(U), \ - (int)(R)); }) + (int)(R)) -#define _mm512_maskz_cvt_roundepi64_ps(U, A, R) __extension__ ({ \ +#define _mm512_maskz_cvt_roundepi64_ps(U, A, R) \ (__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \ (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -462,20 +457,20 @@ _mm512_maskz_cvttpd_epi64 (__mmask8 __U, __m512d __A) { _MM_FROUND_CUR_DIRECTION); } -#define _mm512_cvtt_roundpd_epi64(A, R) __extension__ ({ \ +#define _mm512_cvtt_roundpd_epi64(A, R) \ (__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \ (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm512_mask_cvtt_roundpd_epi64(W, U, A, R) __extension__ ({ \ +#define _mm512_mask_cvtt_roundpd_epi64(W, U, A, R) \ (__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \ (__v8di)(__m512i)(W), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm512_maskz_cvtt_roundpd_epi64(U, A, R) __extension__ ({ \ +#define _mm512_maskz_cvtt_roundpd_epi64(U, A, R) \ (__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \ (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_cvttpd_epu64 (__m512d __A) { @@ -501,20 +496,20 @@ _mm512_maskz_cvttpd_epu64 (__mmask8 __U, __m512d __A) { _MM_FROUND_CUR_DIRECTION); } -#define _mm512_cvtt_roundpd_epu64(A, R) __extension__ ({ \ +#define _mm512_cvtt_roundpd_epu64(A, R) \ (__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \ (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm512_mask_cvtt_roundpd_epu64(W, U, A, R) __extension__ ({ \ +#define _mm512_mask_cvtt_roundpd_epu64(W, U, A, R) \ (__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \ (__v8di)(__m512i)(W), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm512_maskz_cvtt_roundpd_epu64(U, A, R) __extension__ ({ \ +#define _mm512_maskz_cvtt_roundpd_epu64(U, A, R) \ (__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \ (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_cvttps_epi64 (__m256 __A) { @@ -540,20 +535,20 @@ _mm512_maskz_cvttps_epi64 (__mmask8 __U, __m256 __A) { _MM_FROUND_CUR_DIRECTION); } -#define _mm512_cvtt_roundps_epi64(A, R) __extension__ ({ \ +#define _mm512_cvtt_roundps_epi64(A, R) \ (__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \ (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm512_mask_cvtt_roundps_epi64(W, U, A, R) __extension__ ({ \ +#define _mm512_mask_cvtt_roundps_epi64(W, U, A, R) \ (__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \ (__v8di)(__m512i)(W), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm512_maskz_cvtt_roundps_epi64(U, A, R) __extension__ ({ \ +#define _mm512_maskz_cvtt_roundps_epi64(U, A, R) \ (__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \ (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) static __inline__ __m512i __DEFAULT_FN_ATTRS 
_mm512_cvttps_epu64 (__m256 __A) { @@ -579,60 +574,55 @@ _mm512_maskz_cvttps_epu64 (__mmask8 __U, __m256 __A) { _MM_FROUND_CUR_DIRECTION); } -#define _mm512_cvtt_roundps_epu64(A, R) __extension__ ({ \ +#define _mm512_cvtt_roundps_epu64(A, R) \ (__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \ (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm512_mask_cvtt_roundps_epu64(W, U, A, R) __extension__ ({ \ +#define _mm512_mask_cvtt_roundps_epu64(W, U, A, R) \ (__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \ (__v8di)(__m512i)(W), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm512_maskz_cvtt_roundps_epu64(U, A, R) __extension__ ({ \ +#define _mm512_maskz_cvtt_roundps_epu64(U, A, R) \ (__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \ (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_cvtepu64_pd (__m512i __A) { - return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A, - (__v8df) _mm512_setzero_pd(), - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + return (__m512d)__builtin_convertvector((__v8du)__A, __v8df); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_mask_cvtepu64_pd (__m512d __W, __mmask8 __U, __m512i __A) { - return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A, - (__v8df) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_cvtepu64_pd(__A), + (__v8df)__W); } static __inline__ __m512d __DEFAULT_FN_ATTRS _mm512_maskz_cvtepu64_pd (__mmask8 __U, __m512i __A) { - return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A, - (__v8df) _mm512_setzero_pd(), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_cvtepu64_pd(__A), + (__v8df)_mm512_setzero_pd()); } -#define _mm512_cvt_roundepu64_pd(A, R) __extension__ ({ \ +#define _mm512_cvt_roundepu64_pd(A, R) \ (__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \ (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm512_mask_cvt_roundepu64_pd(W, U, A, R) __extension__ ({ \ +#define _mm512_mask_cvt_roundepu64_pd(W, U, A, R) \ (__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \ (__v8df)(__m512d)(W), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm512_maskz_cvt_roundepu64_pd(U, A, R) __extension__ ({ \ +#define _mm512_maskz_cvt_roundepu64_pd(U, A, R) \ (__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \ (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) static __inline__ __m256 __DEFAULT_FN_ATTRS @@ -659,292 +649,292 @@ _mm512_maskz_cvtepu64_ps (__mmask8 __U, __m512i __A) { _MM_FROUND_CUR_DIRECTION); } -#define _mm512_cvt_roundepu64_ps(A, R) __extension__ ({ \ +#define _mm512_cvt_roundepu64_ps(A, R) \ (__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \ (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm512_mask_cvt_roundepu64_ps(W, U, A, R) __extension__ ({ \ +#define _mm512_mask_cvt_roundepu64_ps(W, U, A, R) \ (__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \ (__v8sf)(__m256)(W), (__mmask8)(U), \ - (int)(R)); }) + (int)(R)) -#define _mm512_maskz_cvt_roundepu64_ps(U, A, R) __extension__ ({ \ +#define _mm512_maskz_cvt_roundepu64_ps(U, A, R) \ 
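/* Illustrative usage sketch, not part of the patch above: the *_round_* macro
 * forms take an explicit rounding/exception-suppression constant instead of
 * the implicit _MM_FROUND_CUR_DIRECTION used by the non-round wrappers. A
 * minimal example, assuming an AVX512DQ target (the helper name is
 * hypothetical): */
#include <immintrin.h>

static __m512d u64_to_double_round_down(__m512i v) {
  /* Convert eight unsigned 64-bit lanes to double, rounding toward negative
     infinity and suppressing floating-point exceptions. */
  return _mm512_cvt_roundepu64_pd(v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
}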
(__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \ (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm512_range_pd(A, B, C) __extension__ ({ \ +#define _mm512_range_pd(A, B, C) \ (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), (int)(C), \ (__v8df)_mm512_setzero_pd(), \ (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm512_mask_range_pd(W, U, A, B, C) __extension__ ({ \ +#define _mm512_mask_range_pd(W, U, A, B, C) \ (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), (int)(C), \ (__v8df)(__m512d)(W), (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm512_maskz_range_pd(U, A, B, C) __extension__ ({ \ +#define _mm512_maskz_range_pd(U, A, B, C) \ (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), (int)(C), \ (__v8df)_mm512_setzero_pd(), \ (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm512_range_round_pd(A, B, C, R) __extension__ ({ \ +#define _mm512_range_round_pd(A, B, C, R) \ (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), (int)(C), \ (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm512_mask_range_round_pd(W, U, A, B, C, R) __extension__ ({ \ +#define _mm512_mask_range_round_pd(W, U, A, B, C, R) \ (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), (int)(C), \ (__v8df)(__m512d)(W), (__mmask8)(U), \ - (int)(R)); }) + (int)(R)) -#define _mm512_maskz_range_round_pd(U, A, B, C, R) __extension__ ({ \ +#define _mm512_maskz_range_round_pd(U, A, B, C, R) \ (__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), (int)(C), \ (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm512_range_ps(A, B, C) __extension__ ({ \ +#define _mm512_range_ps(A, B, C) \ (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), (int)(C), \ (__v16sf)_mm512_setzero_ps(), \ (__mmask16)-1, \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm512_mask_range_ps(W, U, A, B, C) __extension__ ({ \ +#define _mm512_mask_range_ps(W, U, A, B, C) \ (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), (int)(C), \ (__v16sf)(__m512)(W), (__mmask16)(U), \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm512_maskz_range_ps(U, A, B, C) __extension__ ({ \ +#define _mm512_maskz_range_ps(U, A, B, C) \ (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), (int)(C), \ (__v16sf)_mm512_setzero_ps(), \ (__mmask16)(U), \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm512_range_round_ps(A, B, C, R) __extension__ ({ \ +#define _mm512_range_round_ps(A, B, C, R) \ (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), (int)(C), \ (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1, (int)(R)); }) + (__mmask16)-1, (int)(R)) -#define _mm512_mask_range_round_ps(W, U, A, B, C, R) __extension__ ({ \ +#define _mm512_mask_range_round_ps(W, U, A, B, C, R) \ (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), (int)(C), \ (__v16sf)(__m512)(W), (__mmask16)(U), \ - (int)(R)); }) + (int)(R)) -#define _mm512_maskz_range_round_ps(U, A, B, C, R) __extension__ ({ \ +#define 
_mm512_maskz_range_round_ps(U, A, B, C, R) \ (__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), (int)(C), \ (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) -#define _mm_range_round_ss(A, B, C, R) __extension__ ({ \ +#define _mm_range_round_ss(A, B, C, R) \ (__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)_mm_setzero_ps(), \ (__mmask8) -1, (int)(C),\ - (int)(R)); }) + (int)(R)) #define _mm_range_ss(A ,B , C) _mm_range_round_ss(A, B, C ,_MM_FROUND_CUR_DIRECTION) -#define _mm_mask_range_round_ss(W, U, A, B, C, R) __extension__ ({ \ +#define _mm_mask_range_round_ss(W, U, A, B, C, R) \ (__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)(__m128)(W),\ (__mmask8)(U), (int)(C),\ - (int)(R)); }) + (int)(R)) #define _mm_mask_range_ss(W , U, A, B, C) _mm_mask_range_round_ss(W, U, A, B, C , _MM_FROUND_CUR_DIRECTION) -#define _mm_maskz_range_round_ss(U, A, B, C, R) __extension__ ({ \ +#define _mm_maskz_range_round_ss(U, A, B, C, R) \ (__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)_mm_setzero_ps(), \ (__mmask8)(U), (int)(C),\ - (int)(R)); }) + (int)(R)) #define _mm_maskz_range_ss(U, A ,B , C) _mm_maskz_range_round_ss(U, A, B, C ,_MM_FROUND_CUR_DIRECTION) -#define _mm_range_round_sd(A, B, C, R) __extension__ ({ \ +#define _mm_range_round_sd(A, B, C, R) \ (__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)_mm_setzero_pd(), \ (__mmask8) -1, (int)(C),\ - (int)(R)); }) + (int)(R)) #define _mm_range_sd(A ,B , C) _mm_range_round_sd(A, B, C ,_MM_FROUND_CUR_DIRECTION) -#define _mm_mask_range_round_sd(W, U, A, B, C, R) __extension__ ({ \ +#define _mm_mask_range_round_sd(W, U, A, B, C, R) \ (__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)(__m128d)(W),\ (__mmask8)(U), (int)(C),\ - (int)(R)); }) + (int)(R)) #define _mm_mask_range_sd(W, U, A, B, C) _mm_mask_range_round_sd(W, U, A, B, C ,_MM_FROUND_CUR_DIRECTION) -#define _mm_maskz_range_round_sd(U, A, B, C, R) __extension__ ({ \ +#define _mm_maskz_range_round_sd(U, A, B, C, R) \ (__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)_mm_setzero_pd(), \ (__mmask8)(U), (int)(C),\ - (int)(R)); }) + (int)(R)) #define _mm_maskz_range_sd(U, A, B, C) _mm_maskz_range_round_sd(U, A, B, C ,_MM_FROUND_CUR_DIRECTION) -#define _mm512_reduce_pd(A, B) __extension__ ({ \ +#define _mm512_reduce_pd(A, B) \ (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ (__v8df)_mm512_setzero_pd(), \ (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm512_mask_reduce_pd(W, U, A, B) __extension__ ({ \ +#define _mm512_mask_reduce_pd(W, U, A, B) \ (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ (__v8df)(__m512d)(W), \ (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm512_maskz_reduce_pd(U, A, B) __extension__ ({ \ +#define _mm512_maskz_reduce_pd(U, A, B) \ (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ (__v8df)_mm512_setzero_pd(), \ (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm512_reduce_ps(A, B) __extension__ ({ \ +#define _mm512_reduce_ps(A, B) \ (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ 
(__v16sf)_mm512_setzero_ps(), \ (__mmask16)-1, \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm512_mask_reduce_ps(W, U, A, B) __extension__ ({ \ +#define _mm512_mask_reduce_ps(W, U, A, B) \ (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ (__v16sf)(__m512)(W), \ (__mmask16)(U), \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm512_maskz_reduce_ps(U, A, B) __extension__ ({ \ +#define _mm512_maskz_reduce_ps(U, A, B) \ (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ (__v16sf)_mm512_setzero_ps(), \ (__mmask16)(U), \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm512_reduce_round_pd(A, B, R) __extension__ ({\ +#define _mm512_reduce_round_pd(A, B, R) \ (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm512_mask_reduce_round_pd(W, U, A, B, R) __extension__ ({\ +#define _mm512_mask_reduce_round_pd(W, U, A, B, R) \ (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ (__v8df)(__m512d)(W), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm512_maskz_reduce_round_pd(U, A, B, R) __extension__ ({\ +#define _mm512_maskz_reduce_round_pd(U, A, B, R) \ (__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm512_reduce_round_ps(A, B, R) __extension__ ({\ +#define _mm512_reduce_round_ps(A, B, R) \ (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1, (int)(R)); }) + (__mmask16)-1, (int)(R)) -#define _mm512_mask_reduce_round_ps(W, U, A, B, R) __extension__ ({\ +#define _mm512_mask_reduce_round_ps(W, U, A, B, R) \ (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ (__v16sf)(__m512)(W), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) -#define _mm512_maskz_reduce_round_ps(U, A, B, R) __extension__ ({\ +#define _mm512_maskz_reduce_round_ps(U, A, B, R) \ (__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) -#define _mm_reduce_ss(A, B, C) __extension__ ({ \ +#define _mm_reduce_ss(A, B, C) \ (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)_mm_setzero_ps(), (__mmask8)-1, \ - (int)(C), _MM_FROUND_CUR_DIRECTION); }) + (int)(C), _MM_FROUND_CUR_DIRECTION) -#define _mm_mask_reduce_ss(W, U, A, B, C) __extension__ ({ \ +#define _mm_mask_reduce_ss(W, U, A, B, C) \ (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)(__m128)(W), (__mmask8)(U), \ - (int)(C), _MM_FROUND_CUR_DIRECTION); }) + (int)(C), _MM_FROUND_CUR_DIRECTION) -#define _mm_maskz_reduce_ss(U, A, B, C) __extension__ ({ \ +#define _mm_maskz_reduce_ss(U, A, B, C) \ (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)_mm_setzero_ps(), \ (__mmask8)(U), (int)(C), \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm_reduce_round_ss(A, B, C, R) __extension__ ({ \ +#define _mm_reduce_round_ss(A, B, C, R) \ (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)_mm_setzero_ps(), (__mmask8)-1, \ - (int)(C), (int)(R)); }) + (int)(C), (int)(R)) -#define _mm_mask_reduce_round_ss(W, 
U, A, B, C, R) __extension__ ({ \ +#define _mm_mask_reduce_round_ss(W, U, A, B, C, R) \ (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)(__m128)(W), (__mmask8)(U), \ - (int)(C), (int)(R)); }) + (int)(C), (int)(R)) -#define _mm_maskz_reduce_round_ss(U, A, B, C, R) __extension__ ({ \ +#define _mm_maskz_reduce_round_ss(U, A, B, C, R) \ (__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(C), (int)(R)); }) + (__mmask8)(U), (int)(C), (int)(R)) -#define _mm_reduce_sd(A, B, C) __extension__ ({ \ +#define _mm_reduce_sd(A, B, C) \ (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)_mm_setzero_pd(), \ (__mmask8)-1, (int)(C), \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm_mask_reduce_sd(W, U, A, B, C) __extension__ ({ \ +#define _mm_mask_reduce_sd(W, U, A, B, C) \ (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)(__m128d)(W), (__mmask8)(U), \ - (int)(C), _MM_FROUND_CUR_DIRECTION); }) + (int)(C), _MM_FROUND_CUR_DIRECTION) -#define _mm_maskz_reduce_sd(U, A, B, C) __extension__ ({ \ +#define _mm_maskz_reduce_sd(U, A, B, C) \ (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)_mm_setzero_pd(), \ (__mmask8)(U), (int)(C), \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm_reduce_round_sd(A, B, C, R) __extension__ ({ \ +#define _mm_reduce_round_sd(A, B, C, R) \ (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(C), (int)(R)); }) + (__mmask8)-1, (int)(C), (int)(R)) -#define _mm_mask_reduce_round_sd(W, U, A, B, C, R) __extension__ ({ \ +#define _mm_mask_reduce_round_sd(W, U, A, B, C, R) \ (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)(__m128d)(W), (__mmask8)(U), \ - (int)(C), (int)(R)); }) + (int)(C), (int)(R)) -#define _mm_maskz_reduce_round_sd(U, A, B, C, R) __extension__ ({ \ +#define _mm_maskz_reduce_round_sd(U, A, B, C, R) \ (__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(C), (int)(R)); }) - + (__mmask8)(U), (int)(C), (int)(R)) + static __inline__ __mmask16 __DEFAULT_FN_ATTRS _mm512_movepi32_mask (__m512i __A) { @@ -973,8 +963,7 @@ _mm512_movepi64_mask (__m512i __A) static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_broadcast_f32x2 (__m128 __A) { - return (__m512)__builtin_shufflevector((__v4sf)__A, - (__v4sf)_mm_undefined_ps(), + return (__m512)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); } @@ -1006,7 +995,7 @@ _mm512_broadcast_f32x8(__m256 __A) static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, __m256 __A) { - return (__m512)__builtin_ia32_selectps_512((__mmask8)__M, + return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, (__v16sf)_mm512_broadcast_f32x8(__A), (__v16sf)__O); } @@ -1014,7 +1003,7 @@ _mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, __m256 __A) static __inline__ __m512 __DEFAULT_FN_ATTRS _mm512_maskz_broadcast_f32x8(__mmask16 __M, __m256 __A) { - return (__m512)__builtin_ia32_selectps_512((__mmask8)__M, + return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, (__v16sf)_mm512_broadcast_f32x8(__A), (__v16sf)_mm512_setzero_ps()); } @@ -1045,8 +1034,7 @@ 
_mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_broadcast_i32x2 (__m128i __A) { - return (__m512i)__builtin_shufflevector((__v4si)__A, - (__v4si)_mm_undefined_si128(), + return (__m512i)__builtin_shufflevector((__v4si)__A, (__v4si)__A, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); } @@ -1078,7 +1066,7 @@ _mm512_broadcast_i32x8(__m256i __A) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, __m256i __A) { - return (__m512i)__builtin_ia32_selectd_512((__mmask8)__M, + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, (__v16si)_mm512_broadcast_i32x8(__A), (__v16si)__O); } @@ -1086,7 +1074,7 @@ _mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, __m256i __A) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i __A) { - return (__m512i)__builtin_ia32_selectd_512((__mmask8)__M, + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, (__v16si)_mm512_broadcast_i32x8(__A), (__v16si)_mm512_setzero_si512()); } @@ -1114,217 +1102,159 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) (__v8di)_mm512_setzero_si512()); } -#define _mm512_extractf32x8_ps(A, imm) __extension__ ({ \ - (__m256)__builtin_shufflevector((__v16sf)(__m512)(A), \ - (__v16sf)_mm512_undefined_ps(), \ - ((imm) & 1) ? 8 : 0, \ - ((imm) & 1) ? 9 : 1, \ - ((imm) & 1) ? 10 : 2, \ - ((imm) & 1) ? 11 : 3, \ - ((imm) & 1) ? 12 : 4, \ - ((imm) & 1) ? 13 : 5, \ - ((imm) & 1) ? 14 : 6, \ - ((imm) & 1) ? 15 : 7); }) - -#define _mm512_mask_extractf32x8_ps(W, U, A, imm) __extension__ ({ \ - (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ - (__v8sf)_mm512_extractf32x8_ps((A), (imm)), \ - (__v8sf)(W)); }) - -#define _mm512_maskz_extractf32x8_ps(U, A, imm) __extension__ ({ \ - (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ - (__v8sf)_mm512_extractf32x8_ps((A), (imm)), \ - (__v8sf)_mm256_setzero_ps()); }) - -#define _mm512_extractf64x2_pd(A, imm) __extension__ ({ \ - (__m128d)__builtin_shufflevector((__v8df)(__m512d)(A), \ - (__v8df)_mm512_undefined_pd(), \ - 0 + ((imm) & 0x3) * 2, \ - 1 + ((imm) & 0x3) * 2); }) - -#define _mm512_mask_extractf64x2_pd(W, U, A, imm) __extension__ ({ \ - (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ - (__v2df)_mm512_extractf64x2_pd((A), (imm)), \ - (__v2df)(W)); }) - -#define _mm512_maskz_extractf64x2_pd(U, A, imm) __extension__ ({ \ - (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ - (__v2df)_mm512_extractf64x2_pd((A), (imm)), \ - (__v2df)_mm_setzero_pd()); }) - -#define _mm512_extracti32x8_epi32(A, imm) __extension__ ({ \ - (__m256i)__builtin_shufflevector((__v16si)(__m512i)(A), \ - (__v16si)_mm512_undefined_epi32(), \ - ((imm) & 1) ? 8 : 0, \ - ((imm) & 1) ? 9 : 1, \ - ((imm) & 1) ? 10 : 2, \ - ((imm) & 1) ? 11 : 3, \ - ((imm) & 1) ? 12 : 4, \ - ((imm) & 1) ? 13 : 5, \ - ((imm) & 1) ? 14 : 6, \ - ((imm) & 1) ? 
15 : 7); }) - -#define _mm512_mask_extracti32x8_epi32(W, U, A, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ - (__v8si)_mm512_extracti32x8_epi32((A), (imm)), \ - (__v8si)(W)); }) - -#define _mm512_maskz_extracti32x8_epi32(U, A, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ - (__v8si)_mm512_extracti32x8_epi32((A), (imm)), \ - (__v8si)_mm256_setzero_si256()); }) - -#define _mm512_extracti64x2_epi64(A, imm) __extension__ ({ \ - (__m128i)__builtin_shufflevector((__v8di)(__m512i)(A), \ - (__v8di)_mm512_undefined_epi32(), \ - 0 + ((imm) & 0x3) * 2, \ - 1 + ((imm) & 0x3) * 2); }) - -#define _mm512_mask_extracti64x2_epi64(W, U, A, imm) __extension__ ({ \ - (__m128d)__builtin_ia32_selectq_128((__mmask8)(U), \ - (__v2di)_mm512_extracti64x2_epi64((A), (imm)), \ - (__v2di)(W)); }) - -#define _mm512_maskz_extracti64x2_epi64(U, A, imm) __extension__ ({ \ - (__m128d)__builtin_ia32_selectq_128((__mmask8)(U), \ - (__v2di)_mm512_extracti64x2_epi64((A), (imm)), \ - (__v2di)_mm_setzero_di()); }) - -#define _mm512_insertf32x8(A, B, imm) __extension__ ({ \ - (__m512)__builtin_shufflevector((__v16sf)(__m512)(A), \ - (__v16sf)_mm512_castps256_ps512((__m256)(B)),\ - ((imm) & 0x1) ? 0 : 16, \ - ((imm) & 0x1) ? 1 : 17, \ - ((imm) & 0x1) ? 2 : 18, \ - ((imm) & 0x1) ? 3 : 19, \ - ((imm) & 0x1) ? 4 : 20, \ - ((imm) & 0x1) ? 5 : 21, \ - ((imm) & 0x1) ? 6 : 22, \ - ((imm) & 0x1) ? 7 : 23, \ - ((imm) & 0x1) ? 16 : 8, \ - ((imm) & 0x1) ? 17 : 9, \ - ((imm) & 0x1) ? 18 : 10, \ - ((imm) & 0x1) ? 19 : 11, \ - ((imm) & 0x1) ? 20 : 12, \ - ((imm) & 0x1) ? 21 : 13, \ - ((imm) & 0x1) ? 22 : 14, \ - ((imm) & 0x1) ? 23 : 15); }) - -#define _mm512_mask_insertf32x8(W, U, A, B, imm) __extension__ ({ \ +#define _mm512_extractf32x8_ps(A, imm) \ + (__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \ + (__v8sf)_mm256_undefined_ps(), \ + (__mmask8)-1) + +#define _mm512_mask_extractf32x8_ps(W, U, A, imm) \ + (__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \ + (__v8sf)(__m256)(W), \ + (__mmask8)(U)) + +#define _mm512_maskz_extractf32x8_ps(U, A, imm) \ + (__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)(U)) + +#define _mm512_extractf64x2_pd(A, imm) \ + (__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \ + (int)(imm), \ + (__v2df)_mm_undefined_pd(), \ + (__mmask8)-1) + +#define _mm512_mask_extractf64x2_pd(W, U, A, imm) \ + (__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \ + (int)(imm), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U)) + +#define _mm512_maskz_extractf64x2_pd(U, A, imm) \ + (__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \ + (int)(imm), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U)) + +#define _mm512_extracti32x8_epi32(A, imm) \ + (__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \ + (__v8si)_mm256_undefined_si256(), \ + (__mmask8)-1) + +#define _mm512_mask_extracti32x8_epi32(W, U, A, imm) \ + (__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \ + (__v8si)(__m256i)(W), \ + (__mmask8)(U)) + +#define _mm512_maskz_extracti32x8_epi32(U, A, imm) \ + (__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \ + (__v8si)_mm256_setzero_si256(), \ + (__mmask8)(U)) + +#define _mm512_extracti64x2_epi64(A, imm) \ + (__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \ + (int)(imm), \ + 
(__v2di)_mm_undefined_si128(), \ + (__mmask8)-1) + +#define _mm512_mask_extracti64x2_epi64(W, U, A, imm) \ + (__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \ + (int)(imm), \ + (__v2di)(__m128i)(W), \ + (__mmask8)(U)) + +#define _mm512_maskz_extracti64x2_epi64(U, A, imm) \ + (__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \ + (int)(imm), \ + (__v2di)_mm_setzero_si128(), \ + (__mmask8)(U)) + +#define _mm512_insertf32x8(A, B, imm) \ + (__m512)__builtin_ia32_insertf32x8((__v16sf)(__m512)(A), \ + (__v8sf)(__m256)(B), (int)(imm)) + +#define _mm512_mask_insertf32x8(W, U, A, B, imm) \ (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_insertf32x8((A), (B), (imm)), \ - (__v16sf)(W)); }) + (__v16sf)(__m512)(W)) -#define _mm512_maskz_insertf32x8(U, A, B, imm) __extension__ ({ \ +#define _mm512_maskz_insertf32x8(U, A, B, imm) \ (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_insertf32x8((A), (B), (imm)), \ - (__v16sf)_mm512_setzero_ps()); }) - -#define _mm512_insertf64x2(A, B, imm) __extension__ ({ \ - (__m512d)__builtin_shufflevector((__v8df)(__m512d)(A), \ - (__v8df)_mm512_castpd128_pd512((__m128d)(B)),\ - (((imm) & 0x3) == 0) ? 8 : 0, \ - (((imm) & 0x3) == 0) ? 9 : 1, \ - (((imm) & 0x3) == 1) ? 8 : 2, \ - (((imm) & 0x3) == 1) ? 9 : 3, \ - (((imm) & 0x3) == 2) ? 8 : 4, \ - (((imm) & 0x3) == 2) ? 9 : 5, \ - (((imm) & 0x3) == 3) ? 8 : 6, \ - (((imm) & 0x3) == 3) ? 9 : 7); }) - -#define _mm512_mask_insertf64x2(W, U, A, B, imm) __extension__ ({ \ + (__v16sf)_mm512_setzero_ps()) + +#define _mm512_insertf64x2(A, B, imm) \ + (__m512d)__builtin_ia32_insertf64x2_512((__v8df)(__m512d)(A), \ + (__v2df)(__m128d)(B), (int)(imm)) + +#define _mm512_mask_insertf64x2(W, U, A, B, imm) \ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_insertf64x2((A), (B), (imm)), \ - (__v8df)(W)); }) + (__v8df)(__m512d)(W)) -#define _mm512_maskz_insertf64x2(U, A, B, imm) __extension__ ({ \ +#define _mm512_maskz_insertf64x2(U, A, B, imm) \ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_insertf64x2((A), (B), (imm)), \ - (__v8df)_mm512_setzero_pd()); }) - -#define _mm512_inserti32x8(A, B, imm) __extension__ ({ \ - (__m512i)__builtin_shufflevector((__v16si)(__m512i)(A), \ - (__v16si)_mm512_castsi256_si512((__m256i)(B)),\ - ((imm) & 0x1) ? 0 : 16, \ - ((imm) & 0x1) ? 1 : 17, \ - ((imm) & 0x1) ? 2 : 18, \ - ((imm) & 0x1) ? 3 : 19, \ - ((imm) & 0x1) ? 4 : 20, \ - ((imm) & 0x1) ? 5 : 21, \ - ((imm) & 0x1) ? 6 : 22, \ - ((imm) & 0x1) ? 7 : 23, \ - ((imm) & 0x1) ? 16 : 8, \ - ((imm) & 0x1) ? 17 : 9, \ - ((imm) & 0x1) ? 18 : 10, \ - ((imm) & 0x1) ? 19 : 11, \ - ((imm) & 0x1) ? 20 : 12, \ - ((imm) & 0x1) ? 21 : 13, \ - ((imm) & 0x1) ? 22 : 14, \ - ((imm) & 0x1) ? 
23 : 15); }) - -#define _mm512_mask_inserti32x8(W, U, A, B, imm) __extension__ ({ \ + (__v8df)_mm512_setzero_pd()) + +#define _mm512_inserti32x8(A, B, imm) \ + (__m512i)__builtin_ia32_inserti32x8((__v16si)(__m512i)(A), \ + (__v8si)(__m256i)(B), (int)(imm)) + +#define _mm512_mask_inserti32x8(W, U, A, B, imm) \ (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ (__v16si)_mm512_inserti32x8((A), (B), (imm)), \ - (__v16si)(W)); }) + (__v16si)(__m512i)(W)) -#define _mm512_maskz_inserti32x8(U, A, B, imm) __extension__ ({ \ +#define _mm512_maskz_inserti32x8(U, A, B, imm) \ (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ (__v16si)_mm512_inserti32x8((A), (B), (imm)), \ - (__v16si)_mm512_setzero_si512()); }) - -#define _mm512_inserti64x2(A, B, imm) __extension__ ({ \ - (__m512i)__builtin_shufflevector((__v8di)(__m512i)(A), \ - (__v8di)_mm512_castsi128_si512((__m128i)(B)),\ - (((imm) & 0x3) == 0) ? 8 : 0, \ - (((imm) & 0x3) == 0) ? 9 : 1, \ - (((imm) & 0x3) == 1) ? 8 : 2, \ - (((imm) & 0x3) == 1) ? 9 : 3, \ - (((imm) & 0x3) == 2) ? 8 : 4, \ - (((imm) & 0x3) == 2) ? 9 : 5, \ - (((imm) & 0x3) == 3) ? 8 : 6, \ - (((imm) & 0x3) == 3) ? 9 : 7); }) - -#define _mm512_mask_inserti64x2(W, U, A, B, imm) __extension__ ({ \ + (__v16si)_mm512_setzero_si512()) + +#define _mm512_inserti64x2(A, B, imm) \ + (__m512i)__builtin_ia32_inserti64x2_512((__v8di)(__m512i)(A), \ + (__v2di)(__m128i)(B), (int)(imm)) + +#define _mm512_mask_inserti64x2(W, U, A, B, imm) \ (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ (__v8di)_mm512_inserti64x2((A), (B), (imm)), \ - (__v8di)(W)); }) + (__v8di)(__m512i)(W)) -#define _mm512_maskz_inserti64x2(U, A, B, imm) __extension__ ({ \ +#define _mm512_maskz_inserti64x2(U, A, B, imm) \ (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ (__v8di)_mm512_inserti64x2((A), (B), (imm)), \ - (__v8di)_mm512_setzero_si512()); }) + (__v8di)_mm512_setzero_si512()) -#define _mm512_mask_fpclass_ps_mask(U, A, imm) __extension__ ({ \ +#define _mm512_mask_fpclass_ps_mask(U, A, imm) \ (__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \ - (int)(imm), (__mmask16)(U)); }) + (int)(imm), (__mmask16)(U)) -#define _mm512_fpclass_ps_mask(A, imm) __extension__ ({ \ +#define _mm512_fpclass_ps_mask(A, imm) \ (__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \ - (int)(imm), (__mmask16)-1); }) + (int)(imm), (__mmask16)-1) -#define _mm512_mask_fpclass_pd_mask(U, A, imm) __extension__ ({ \ +#define _mm512_mask_fpclass_pd_mask(U, A, imm) \ (__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm512_fpclass_pd_mask(A, imm) __extension__ ({ \ +#define _mm512_fpclass_pd_mask(A, imm) \ (__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm_fpclass_sd_mask(A, imm) __extension__ ({ \ +#define _mm_fpclass_sd_mask(A, imm) \ (__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm_mask_fpclass_sd_mask(U, A, imm) __extension__ ({ \ +#define _mm_mask_fpclass_sd_mask(U, A, imm) \ (__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm_fpclass_ss_mask(A, imm) __extension__ ({ \ +#define _mm_fpclass_ss_mask(A, imm) \ (__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm_mask_fpclass_ss_mask(U, A, imm) __extension__ ({ \ +#define 
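/* Illustrative sketch, not part of the patch above: as the replaced
 * shufflevector forms make explicit (element indices 2*imm and 2*imm+1), the
 * extracti64x2/inserti64x2 family addresses one of the four 128-bit lanes of a
 * 512-bit vector. A minimal example, assuming an AVX512DQ target (helper names
 * are hypothetical): */
#include <immintrin.h>

static __m128i take_third_lane(__m512i v) {
  /* Lane index 2 selects 64-bit elements 4 and 5. */
  return _mm512_extracti64x2_epi64(v, 2);
}

static __m512i put_third_lane(__m512i v, __m128i lane) {
  /* Overwrite the same lane, leaving the other three lanes of v untouched. */
  return _mm512_inserti64x2(v, lane, 2);
}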
_mm_mask_fpclass_ss_mask(U, A, imm) \ (__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \ - (__mmask8)(U)); }) + (__mmask8)(U)) #undef __DEFAULT_FN_ATTRS diff --git a/c_headers/avx512erintrin.h b/c_headers/avx512erintrin.h index 8ff212c422..6348275c8d 100644 --- a/c_headers/avx512erintrin.h +++ b/c_headers/avx512erintrin.h @@ -27,21 +27,21 @@ #ifndef __AVX512ERINTRIN_H #define __AVX512ERINTRIN_H -// exp2a23 -#define _mm512_exp2a23_round_pd(A, R) __extension__ ({ \ +/* exp2a23 */ +#define _mm512_exp2a23_round_pd(A, R) \ (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \ (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm512_mask_exp2a23_round_pd(S, M, A, R) __extension__ ({ \ +#define _mm512_mask_exp2a23_round_pd(S, M, A, R) \ (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(S), (__mmask8)(M), \ - (int)(R)); }) + (int)(R)) -#define _mm512_maskz_exp2a23_round_pd(M, A, R) __extension__ ({ \ +#define _mm512_maskz_exp2a23_round_pd(M, A, R) \ (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \ (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(M), (int)(R)); }) + (__mmask8)(M), (int)(R)) #define _mm512_exp2a23_pd(A) \ _mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION) @@ -52,20 +52,20 @@ #define _mm512_maskz_exp2a23_pd(M, A) \ _mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION) -#define _mm512_exp2a23_round_ps(A, R) __extension__ ({ \ +#define _mm512_exp2a23_round_ps(A, R) \ (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \ (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1, (int)(R)); }) + (__mmask16)-1, (int)(R)) -#define _mm512_mask_exp2a23_round_ps(S, M, A, R) __extension__ ({ \ +#define _mm512_mask_exp2a23_round_ps(S, M, A, R) \ (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(S), (__mmask16)(M), \ - (int)(R)); }) + (int)(R)) -#define _mm512_maskz_exp2a23_round_ps(M, A, R) __extension__ ({ \ +#define _mm512_maskz_exp2a23_round_ps(M, A, R) \ (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \ (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(M), (int)(R)); }) + (__mmask16)(M), (int)(R)) #define _mm512_exp2a23_ps(A) \ _mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION) @@ -76,21 +76,21 @@ #define _mm512_maskz_exp2a23_ps(M, A) \ _mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION) -// rsqrt28 -#define _mm512_rsqrt28_round_pd(A, R) __extension__ ({ \ +/* rsqrt28 */ +#define _mm512_rsqrt28_round_pd(A, R) \ (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \ (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) __extension__ ({ \ +#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) \ (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(S), (__mmask8)(M), \ - (int)(R)); }) + (int)(R)) -#define _mm512_maskz_rsqrt28_round_pd(M, A, R) __extension__ ({ \ +#define _mm512_maskz_rsqrt28_round_pd(M, A, R) \ (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \ (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(M), (int)(R)); }) + (__mmask8)(M), (int)(R)) #define _mm512_rsqrt28_pd(A) \ _mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION) @@ -101,20 +101,20 @@ #define _mm512_maskz_rsqrt28_pd(M, A) \ _mm512_maskz_rsqrt28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION) -#define _mm512_rsqrt28_round_ps(A, R) __extension__ ({ \ +#define _mm512_rsqrt28_round_ps(A, R) \ 
(__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \ (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1, (int)(R)); }) + (__mmask16)-1, (int)(R)) -#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) __extension__ ({ \ +#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) \ (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(S), (__mmask16)(M), \ - (int)(R)); }) + (int)(R)) -#define _mm512_maskz_rsqrt28_round_ps(M, A, R) __extension__ ({ \ +#define _mm512_maskz_rsqrt28_round_ps(M, A, R) \ (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \ (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(M), (int)(R)); }) + (__mmask16)(M), (int)(R)) #define _mm512_rsqrt28_ps(A) \ _mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION) @@ -125,23 +125,23 @@ #define _mm512_maskz_rsqrt28_ps(M, A) \ _mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION) -#define _mm_rsqrt28_round_ss(A, B, R) __extension__ ({ \ +#define _mm_rsqrt28_round_ss(A, B, R) \ (__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) __extension__ ({ \ +#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) \ (__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)(__m128)(S), \ - (__mmask8)(M), (int)(R)); }) + (__mmask8)(M), (int)(R)) -#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) __extension__ ({ \ +#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) \ (__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(M), (int)(R)); }) + (__mmask8)(M), (int)(R)) #define _mm_rsqrt28_ss(A, B) \ _mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION) @@ -152,23 +152,23 @@ #define _mm_maskz_rsqrt28_ss(M, A, B) \ _mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION) -#define _mm_rsqrt28_round_sd(A, B, R) __extension__ ({ \ +#define _mm_rsqrt28_round_sd(A, B, R) \ (__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) __extension__ ({ \ +#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) \ (__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)(__m128d)(S), \ - (__mmask8)(M), (int)(R)); }) + (__mmask8)(M), (int)(R)) -#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) __extension__ ({ \ +#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) \ (__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)_mm_setzero_pd(), \ - (__mmask8)(M), (int)(R)); }) + (__mmask8)(M), (int)(R)) #define _mm_rsqrt28_sd(A, B) \ _mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION) @@ -179,21 +179,21 @@ #define _mm_maskz_rsqrt28_sd(M, A, B) \ _mm_maskz_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION) -// rcp28 -#define _mm512_rcp28_round_pd(A, R) __extension__ ({ \ +/* rcp28 */ +#define _mm512_rcp28_round_pd(A, R) \ (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \ (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm512_mask_rcp28_round_pd(S, M, A, R) __extension__ ({ \ +#define _mm512_mask_rcp28_round_pd(S, M, A, R) \ (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(S), (__mmask8)(M), \ - (int)(R)); }) + 
(int)(R)) -#define _mm512_maskz_rcp28_round_pd(M, A, R) __extension__ ({ \ +#define _mm512_maskz_rcp28_round_pd(M, A, R) \ (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \ (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(M), (int)(R)); }) + (__mmask8)(M), (int)(R)) #define _mm512_rcp28_pd(A) \ _mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION) @@ -204,20 +204,20 @@ #define _mm512_maskz_rcp28_pd(M, A) \ _mm512_maskz_rcp28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION) -#define _mm512_rcp28_round_ps(A, R) __extension__ ({ \ +#define _mm512_rcp28_round_ps(A, R) \ (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \ (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1, (int)(R)); }) + (__mmask16)-1, (int)(R)) -#define _mm512_mask_rcp28_round_ps(S, M, A, R) __extension__ ({ \ +#define _mm512_mask_rcp28_round_ps(S, M, A, R) \ (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(S), (__mmask16)(M), \ - (int)(R)); }) + (int)(R)) -#define _mm512_maskz_rcp28_round_ps(M, A, R) __extension__ ({ \ +#define _mm512_maskz_rcp28_round_ps(M, A, R) \ (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \ (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(M), (int)(R)); }) + (__mmask16)(M), (int)(R)) #define _mm512_rcp28_ps(A) \ _mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION) @@ -228,23 +228,23 @@ #define _mm512_maskz_rcp28_ps(M, A) \ _mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION) -#define _mm_rcp28_round_ss(A, B, R) __extension__ ({ \ +#define _mm_rcp28_round_ss(A, B, R) \ (__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm_mask_rcp28_round_ss(S, M, A, B, R) __extension__ ({ \ +#define _mm_mask_rcp28_round_ss(S, M, A, B, R) \ (__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)(__m128)(S), \ - (__mmask8)(M), (int)(R)); }) + (__mmask8)(M), (int)(R)) -#define _mm_maskz_rcp28_round_ss(M, A, B, R) __extension__ ({ \ +#define _mm_maskz_rcp28_round_ss(M, A, B, R) \ (__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(M), (int)(R)); }) + (__mmask8)(M), (int)(R)) #define _mm_rcp28_ss(A, B) \ _mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION) @@ -255,23 +255,23 @@ #define _mm_maskz_rcp28_ss(M, A, B) \ _mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION) -#define _mm_rcp28_round_sd(A, B, R) __extension__ ({ \ +#define _mm_rcp28_round_sd(A, B, R) \ (__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm_mask_rcp28_round_sd(S, M, A, B, R) __extension__ ({ \ +#define _mm_mask_rcp28_round_sd(S, M, A, B, R) \ (__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)(__m128d)(S), \ - (__mmask8)(M), (int)(R)); }) + (__mmask8)(M), (int)(R)) -#define _mm_maskz_rcp28_round_sd(M, A, B, R) __extension__ ({ \ +#define _mm_maskz_rcp28_round_sd(M, A, B, R) \ (__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)_mm_setzero_pd(), \ - (__mmask8)(M), (int)(R)); }) + (__mmask8)(M), (int)(R)) #define _mm_rcp28_sd(A, B) \ _mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION) @@ -282,4 +282,4 @@ #define _mm_maskz_rcp28_sd(M, A, B) \ _mm_maskz_rcp28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION) 
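The avx512erintrin.h hunks above only change how the macros are spelled: dropping the GNU statement-expression wrapper (__extension__ ({ ... ; })) leaves plain parenthesized cast expressions that expand to the same __builtin_ia32_* calls, so existing call sites should compile to the same code. As an illustrative sketch only (not part of the patch; it assumes an AVX-512ER-capable target, e.g. building with -mavx512f -mavx512er, and the helper name approx_recip is invented for the example):

#include <immintrin.h>

/* Fill the lanes selected by `keep` with a 2^-28-accurate reciprocal
 * approximation of `v`, zeroing the unselected lanes, via the rewritten
 * _mm512_maskz_rcp28_ps macro defined in this header. */
static __m512 approx_recip(__m512 v, __mmask16 keep)
{
    return _mm512_maskz_rcp28_ps(keep, v);
}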
-#endif // __AVX512ERINTRIN_H +#endif /* __AVX512ERINTRIN_H */ diff --git a/c_headers/avx512fintrin.h b/c_headers/avx512fintrin.h index d34f0b1327..8dd4a0a40e 100644 --- a/c_headers/avx512fintrin.h +++ b/c_headers/avx512fintrin.h @@ -173,51 +173,51 @@ typedef enum } _MM_MANTISSA_SIGN_ENUM; /* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512f"))) +#define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512f"), __min_vector_width__(512))) +#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512f"), __min_vector_width__(128))) /* Create vectors with repeated elements */ -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_setzero_si512(void) { - return (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 }; + return __extension__ (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 }; } #define _mm512_setzero_epi32 _mm512_setzero_si512 -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_undefined_pd(void) { return (__m512d)__builtin_ia32_undef512(); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_undefined(void) { return (__m512)__builtin_ia32_undef512(); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_undefined_ps(void) { return (__m512)__builtin_ia32_undef512(); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_undefined_epi32(void) { return (__m512i)__builtin_ia32_undef512(); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_broadcastd_epi32 (__m128i __A) { - return (__m512i)__builtin_shufflevector((__v4si) __A, - (__v4si)_mm_undefined_si128(), + return (__m512i)__builtin_shufflevector((__v4si) __A, (__v4si) __A, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectd_512(__M, @@ -225,7 +225,7 @@ _mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A) (__v16si) __O); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectd_512(__M, @@ -233,15 +233,14 @@ _mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A) (__v16si) _mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_broadcastq_epi64 (__m128i __A) { - return (__m512i)__builtin_shufflevector((__v2di) __A, - (__v2di) _mm_undefined_si128(), + return (__m512i)__builtin_shufflevector((__v2di) __A, (__v2di) __A, 0, 0, 0, 0, 0, 0, 0, 0); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectq_512(__M, @@ -250,7 +249,7 @@ _mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A) } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectq_512(__M, @@ 
-259,122 +258,122 @@ _mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) } -static __inline __m512 __DEFAULT_FN_ATTRS +static __inline __m512 __DEFAULT_FN_ATTRS512 _mm512_setzero_ps(void) { - return (__m512){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; + return __extension__ (__m512){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; } #define _mm512_setzero _mm512_setzero_ps -static __inline __m512d __DEFAULT_FN_ATTRS +static __inline __m512d __DEFAULT_FN_ATTRS512 _mm512_setzero_pd(void) { - return (__m512d){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; + return __extension__ (__m512d){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; } -static __inline __m512 __DEFAULT_FN_ATTRS +static __inline __m512 __DEFAULT_FN_ATTRS512 _mm512_set1_ps(float __w) { - return (__m512){ __w, __w, __w, __w, __w, __w, __w, __w, - __w, __w, __w, __w, __w, __w, __w, __w }; + return __extension__ (__m512){ __w, __w, __w, __w, __w, __w, __w, __w, + __w, __w, __w, __w, __w, __w, __w, __w }; } -static __inline __m512d __DEFAULT_FN_ATTRS +static __inline __m512d __DEFAULT_FN_ATTRS512 _mm512_set1_pd(double __w) { - return (__m512d){ __w, __w, __w, __w, __w, __w, __w, __w }; + return __extension__ (__m512d){ __w, __w, __w, __w, __w, __w, __w, __w }; } -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_set1_epi8(char __w) { - return (__m512i)(__v64qi){ __w, __w, __w, __w, __w, __w, __w, __w, - __w, __w, __w, __w, __w, __w, __w, __w, - __w, __w, __w, __w, __w, __w, __w, __w, - __w, __w, __w, __w, __w, __w, __w, __w, - __w, __w, __w, __w, __w, __w, __w, __w, - __w, __w, __w, __w, __w, __w, __w, __w, - __w, __w, __w, __w, __w, __w, __w, __w, - __w, __w, __w, __w, __w, __w, __w, __w }; + return __extension__ (__m512i)(__v64qi){ + __w, __w, __w, __w, __w, __w, __w, __w, + __w, __w, __w, __w, __w, __w, __w, __w, + __w, __w, __w, __w, __w, __w, __w, __w, + __w, __w, __w, __w, __w, __w, __w, __w, + __w, __w, __w, __w, __w, __w, __w, __w, + __w, __w, __w, __w, __w, __w, __w, __w, + __w, __w, __w, __w, __w, __w, __w, __w, + __w, __w, __w, __w, __w, __w, __w, __w }; } -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_set1_epi16(short __w) { - return (__m512i)(__v32hi){ __w, __w, __w, __w, __w, __w, __w, __w, - __w, __w, __w, __w, __w, __w, __w, __w, - __w, __w, __w, __w, __w, __w, __w, __w, - __w, __w, __w, __w, __w, __w, __w, __w }; + return __extension__ (__m512i)(__v32hi){ + __w, __w, __w, __w, __w, __w, __w, __w, + __w, __w, __w, __w, __w, __w, __w, __w, + __w, __w, __w, __w, __w, __w, __w, __w, + __w, __w, __w, __w, __w, __w, __w, __w }; } -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_set1_epi32(int __s) { - return (__m512i)(__v16si){ __s, __s, __s, __s, __s, __s, __s, __s, - __s, __s, __s, __s, __s, __s, __s, __s }; + return __extension__ (__m512i)(__v16si){ + __s, __s, __s, __s, __s, __s, __s, __s, + __s, __s, __s, __s, __s, __s, __s, __s }; } -static __inline __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_set1_epi32(__mmask16 __M, int __A) +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_set1_epi32(__mmask16 __M, int __A) { - return (__m512i)__builtin_ia32_selectd_512(__M, + return (__m512i)__builtin_ia32_selectd_512(__M, (__v16si)_mm512_set1_epi32(__A), (__v16si)_mm512_setzero_si512()); } -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_set1_epi64(long 
long __d) { - return (__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d }; + return __extension__(__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d }; } -#ifdef __x86_64__ -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_set1_epi64(__mmask8 __M, long long __A) { return (__m512i)__builtin_ia32_selectq_512(__M, (__v8di)_mm512_set1_epi64(__A), (__v8di)_mm512_setzero_si512()); } -#endif -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_broadcastss_ps(__m128 __A) { - return (__m512)__builtin_shufflevector((__v4sf) __A, - (__v4sf)_mm_undefined_ps(), + return (__m512)__builtin_shufflevector((__v4sf) __A, (__v4sf) __A, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); } -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_set4_epi32 (int __A, int __B, int __C, int __D) { - return (__m512i)(__v16si) + return __extension__ (__m512i)(__v16si) { __D, __C, __B, __A, __D, __C, __B, __A, __D, __C, __B, __A, __D, __C, __B, __A }; } -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_set4_epi64 (long long __A, long long __B, long long __C, long long __D) { - return (__m512i) (__v8di) + return __extension__ (__m512i) (__v8di) { __D, __C, __B, __A, __D, __C, __B, __A }; } -static __inline __m512d __DEFAULT_FN_ATTRS +static __inline __m512d __DEFAULT_FN_ATTRS512 _mm512_set4_pd (double __A, double __B, double __C, double __D) { - return (__m512d) + return __extension__ (__m512d) { __D, __C, __B, __A, __D, __C, __B, __A }; } -static __inline __m512 __DEFAULT_FN_ATTRS +static __inline __m512 __DEFAULT_FN_ATTRS512 _mm512_set4_ps (float __A, float __B, float __C, float __D) { - return (__m512) + return __extension__ (__m512) { __D, __C, __B, __A, __D, __C, __B, __A, __D, __C, __B, __A, __D, __C, __B, __A }; } @@ -391,138 +390,137 @@ _mm512_set4_ps (float __A, float __B, float __C, float __D) #define _mm512_setr4_ps(e0,e1,e2,e3) \ _mm512_set4_ps((e3),(e2),(e1),(e0)) -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_broadcastsd_pd(__m128d __A) { - return (__m512d)__builtin_shufflevector((__v2df) __A, - (__v2df) _mm_undefined_pd(), + return (__m512d)__builtin_shufflevector((__v2df) __A, (__v2df) __A, 0, 0, 0, 0, 0, 0, 0, 0); } /* Cast between vector types */ -static __inline __m512d __DEFAULT_FN_ATTRS +static __inline __m512d __DEFAULT_FN_ATTRS512 _mm512_castpd256_pd512(__m256d __a) { return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1); } -static __inline __m512 __DEFAULT_FN_ATTRS +static __inline __m512 __DEFAULT_FN_ATTRS512 _mm512_castps256_ps512(__m256 __a) { return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1); } -static __inline __m128d __DEFAULT_FN_ATTRS +static __inline __m128d __DEFAULT_FN_ATTRS512 _mm512_castpd512_pd128(__m512d __a) { return __builtin_shufflevector(__a, __a, 0, 1); } -static __inline __m256d __DEFAULT_FN_ATTRS +static __inline __m256d __DEFAULT_FN_ATTRS512 _mm512_castpd512_pd256 (__m512d __A) { return __builtin_shufflevector(__A, __A, 0, 1, 2, 3); } -static __inline __m128 __DEFAULT_FN_ATTRS +static __inline __m128 __DEFAULT_FN_ATTRS512 _mm512_castps512_ps128(__m512 __a) { return __builtin_shufflevector(__a, __a, 0, 1, 2, 3); } -static __inline __m256 __DEFAULT_FN_ATTRS +static __inline __m256 __DEFAULT_FN_ATTRS512 _mm512_castps512_ps256 (__m512 __A) { return 
__builtin_shufflevector(__A, __A, 0, 1, 2, 3, 4, 5, 6, 7); } -static __inline __m512 __DEFAULT_FN_ATTRS +static __inline __m512 __DEFAULT_FN_ATTRS512 _mm512_castpd_ps (__m512d __A) { return (__m512) (__A); } -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_castpd_si512 (__m512d __A) { return (__m512i) (__A); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_castpd128_pd512 (__m128d __A) { return __builtin_shufflevector( __A, __A, 0, 1, -1, -1, -1, -1, -1, -1); } -static __inline __m512d __DEFAULT_FN_ATTRS +static __inline __m512d __DEFAULT_FN_ATTRS512 _mm512_castps_pd (__m512 __A) { return (__m512d) (__A); } -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_castps_si512 (__m512 __A) { return (__m512i) (__A); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_castps128_ps512 (__m128 __A) { return __builtin_shufflevector( __A, __A, 0, 1, 2, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_castsi128_si512 (__m128i __A) { return __builtin_shufflevector( __A, __A, 0, 1, -1, -1, -1, -1, -1, -1); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_castsi256_si512 (__m256i __A) { return __builtin_shufflevector( __A, __A, 0, 1, 2, 3, -1, -1, -1, -1); } -static __inline __m512 __DEFAULT_FN_ATTRS +static __inline __m512 __DEFAULT_FN_ATTRS512 _mm512_castsi512_ps (__m512i __A) { return (__m512) (__A); } -static __inline __m512d __DEFAULT_FN_ATTRS +static __inline __m512d __DEFAULT_FN_ATTRS512 _mm512_castsi512_pd (__m512i __A) { return (__m512d) (__A); } -static __inline __m128i __DEFAULT_FN_ATTRS +static __inline __m128i __DEFAULT_FN_ATTRS512 _mm512_castsi512_si128 (__m512i __A) { return (__m128i)__builtin_shufflevector(__A, __A , 0, 1); } -static __inline __m256i __DEFAULT_FN_ATTRS +static __inline __m256i __DEFAULT_FN_ATTRS512 _mm512_castsi512_si256 (__m512i __A) { return (__m256i)__builtin_shufflevector(__A, __A , 0, 1, 2, 3); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS +static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 _mm512_int2mask(int __a) { return (__mmask16)__a; } -static __inline__ int __DEFAULT_FN_ATTRS +static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_mask2int(__mmask16 __a) { return (int)__a; } -/// \brief Constructs a 512-bit floating-point vector of [8 x double] from a +/// Constructs a 512-bit floating-point vector of [8 x double] from a /// 128-bit floating-point vector of [2 x double]. The lower 128 bits /// contain the value of the source vector. The upper 384 bits are set /// to zero. @@ -535,13 +533,13 @@ _mm512_mask2int(__mmask16 __a) /// A 128-bit vector of [2 x double]. /// \returns A 512-bit floating-point vector of [8 x double]. The lower 128 bits /// contain the value of the parameter. The upper 384 bits are set to zero. -static __inline __m512d __DEFAULT_FN_ATTRS +static __inline __m512d __DEFAULT_FN_ATTRS512 _mm512_zextpd128_pd512(__m128d __a) { return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3, 2, 3, 2, 3); } -/// \brief Constructs a 512-bit floating-point vector of [8 x double] from a +/// Constructs a 512-bit floating-point vector of [8 x double] from a /// 256-bit floating-point vector of [4 x double]. The lower 256 bits /// contain the value of the source vector. 
The upper 256 bits are set /// to zero. @@ -554,13 +552,13 @@ _mm512_zextpd128_pd512(__m128d __a) /// A 256-bit vector of [4 x double]. /// \returns A 512-bit floating-point vector of [8 x double]. The lower 256 bits /// contain the value of the parameter. The upper 256 bits are set to zero. -static __inline __m512d __DEFAULT_FN_ATTRS +static __inline __m512d __DEFAULT_FN_ATTRS512 _mm512_zextpd256_pd512(__m256d __a) { return __builtin_shufflevector((__v4df)__a, (__v4df)_mm256_setzero_pd(), 0, 1, 2, 3, 4, 5, 6, 7); } -/// \brief Constructs a 512-bit floating-point vector of [16 x float] from a +/// Constructs a 512-bit floating-point vector of [16 x float] from a /// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain /// the value of the source vector. The upper 384 bits are set to zero. /// @@ -572,13 +570,13 @@ _mm512_zextpd256_pd512(__m256d __a) /// A 128-bit vector of [4 x float]. /// \returns A 512-bit floating-point vector of [16 x float]. The lower 128 bits /// contain the value of the parameter. The upper 384 bits are set to zero. -static __inline __m512 __DEFAULT_FN_ATTRS +static __inline __m512 __DEFAULT_FN_ATTRS512 _mm512_zextps128_ps512(__m128 __a) { return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7); } -/// \brief Constructs a 512-bit floating-point vector of [16 x float] from a +/// Constructs a 512-bit floating-point vector of [16 x float] from a /// 256-bit floating-point vector of [8 x float]. The lower 256 bits contain /// the value of the source vector. The upper 256 bits are set to zero. /// @@ -590,13 +588,13 @@ _mm512_zextps128_ps512(__m128 __a) /// A 256-bit vector of [8 x float]. /// \returns A 512-bit floating-point vector of [16 x float]. The lower 256 bits /// contain the value of the parameter. The upper 256 bits are set to zero. -static __inline __m512 __DEFAULT_FN_ATTRS +static __inline __m512 __DEFAULT_FN_ATTRS512 _mm512_zextps256_ps512(__m256 __a) { return __builtin_shufflevector((__v8sf)__a, (__v8sf)_mm256_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } -/// \brief Constructs a 512-bit integer vector from a 128-bit integer vector. +/// Constructs a 512-bit integer vector from a 128-bit integer vector. /// The lower 128 bits contain the value of the source vector. The upper /// 384 bits are set to zero. /// @@ -608,13 +606,13 @@ _mm512_zextps256_ps512(__m256 __a) /// A 128-bit integer vector. /// \returns A 512-bit integer vector. The lower 128 bits contain the value of /// the parameter. The upper 384 bits are set to zero. -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_zextsi128_si512(__m128i __a) { return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3, 2, 3, 2, 3); } -/// \brief Constructs a 512-bit integer vector from a 256-bit integer vector. +/// Constructs a 512-bit integer vector from a 256-bit integer vector. /// The lower 256 bits contain the value of the source vector. The upper /// 256 bits are set to zero. /// @@ -626,20 +624,20 @@ _mm512_zextsi128_si512(__m128i __a) /// A 256-bit integer vector. /// \returns A 512-bit integer vector. The lower 256 bits contain the value of /// the parameter. The upper 256 bits are set to zero. 
-static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_zextsi256_si512(__m256i __a) { return __builtin_shufflevector((__v4di)__a, (__v4di)_mm256_setzero_si256(), 0, 1, 2, 3, 4, 5, 6, 7); } /* Bitwise operators */ -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_and_epi32(__m512i __a, __m512i __b) { return (__m512i)((__v16su)__a & (__v16su)__b); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_and_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k, @@ -647,20 +645,20 @@ _mm512_mask_and_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b) (__v16si) __src); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_and_epi32(__mmask16 __k, __m512i __a, __m512i __b) { return (__m512i) _mm512_mask_and_epi32(_mm512_setzero_si512 (), __k, __a, __b); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_and_epi64(__m512i __a, __m512i __b) { return (__m512i)((__v8du)__a & (__v8du)__b); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_and_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b) { return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __k, @@ -668,26 +666,26 @@ _mm512_mask_and_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b) (__v8di) __src); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_and_epi64(__mmask8 __k, __m512i __a, __m512i __b) { return (__m512i) _mm512_mask_and_epi64(_mm512_setzero_si512 (), __k, __a, __b); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_andnot_si512 (__m512i __A, __m512i __B) { - return (__m512i)(~(__v8du)(__A) & (__v8du)__B); + return (__m512i)(~(__v8du)__A & (__v8du)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_andnot_epi32 (__m512i __A, __m512i __B) { - return (__m512i)(~(__v16su)(__A) & (__v16su)__B); + return (__m512i)(~(__v16su)__A & (__v16su)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_andnot_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, @@ -695,20 +693,20 @@ _mm512_mask_andnot_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) (__v16si)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_andnot_epi32(__mmask16 __U, __m512i __A, __m512i __B) { return (__m512i)_mm512_mask_andnot_epi32(_mm512_setzero_si512(), __U, __A, __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_andnot_epi64(__m512i __A, __m512i __B) { - return (__m512i)(~(__v8du)(__A) & (__v8du)__B); + return (__m512i)(~(__v8du)__A & (__v8du)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_andnot_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, @@ -716,20 +714,20 @@ _mm512_mask_andnot_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS 
+static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_andnot_epi64(__mmask8 __U, __m512i __A, __m512i __B) { return (__m512i)_mm512_mask_andnot_epi64(_mm512_setzero_si512(), __U, __A, __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_or_epi32(__m512i __a, __m512i __b) { return (__m512i)((__v16su)__a | (__v16su)__b); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_or_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k, @@ -737,19 +735,19 @@ _mm512_mask_or_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b) (__v16si)__src); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_or_epi32(__mmask16 __k, __m512i __a, __m512i __b) { return (__m512i)_mm512_mask_or_epi32(_mm512_setzero_si512(), __k, __a, __b); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_or_epi64(__m512i __a, __m512i __b) { return (__m512i)((__v8du)__a | (__v8du)__b); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_or_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k, @@ -757,19 +755,19 @@ _mm512_mask_or_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b) (__v8di)__src); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_or_epi64(__mmask8 __k, __m512i __a, __m512i __b) { return (__m512i)_mm512_mask_or_epi64(_mm512_setzero_si512(), __k, __a, __b); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_xor_epi32(__m512i __a, __m512i __b) { return (__m512i)((__v16su)__a ^ (__v16su)__b); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_xor_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k, @@ -777,19 +775,19 @@ _mm512_mask_xor_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b) (__v16si)__src); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_xor_epi32(__mmask16 __k, __m512i __a, __m512i __b) { return (__m512i)_mm512_mask_xor_epi32(_mm512_setzero_si512(), __k, __a, __b); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_xor_epi64(__m512i __a, __m512i __b) { return (__m512i)((__v8du)__a ^ (__v8du)__b); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_xor_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k, @@ -797,25 +795,25 @@ _mm512_mask_xor_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b) (__v8di)__src); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_xor_epi64(__mmask8 __k, __m512i __a, __m512i __b) { return (__m512i)_mm512_mask_xor_epi64(_mm512_setzero_si512(), __k, __a, __b); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_and_si512(__m512i __a, __m512i __b) { return (__m512i)((__v8du)__a & (__v8du)__b); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static 
__inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_or_si512(__m512i __a, __m512i __b) { return (__m512i)((__v8du)__a | (__v8du)__b); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_xor_si512(__m512i __a, __m512i __b) { return (__m512i)((__v8du)__a ^ (__v8du)__b); @@ -823,49 +821,49 @@ _mm512_xor_si512(__m512i __a, __m512i __b) /* Arithmetic */ -static __inline __m512d __DEFAULT_FN_ATTRS +static __inline __m512d __DEFAULT_FN_ATTRS512 _mm512_add_pd(__m512d __a, __m512d __b) { return (__m512d)((__v8df)__a + (__v8df)__b); } -static __inline __m512 __DEFAULT_FN_ATTRS +static __inline __m512 __DEFAULT_FN_ATTRS512 _mm512_add_ps(__m512 __a, __m512 __b) { return (__m512)((__v16sf)__a + (__v16sf)__b); } -static __inline __m512d __DEFAULT_FN_ATTRS +static __inline __m512d __DEFAULT_FN_ATTRS512 _mm512_mul_pd(__m512d __a, __m512d __b) { return (__m512d)((__v8df)__a * (__v8df)__b); } -static __inline __m512 __DEFAULT_FN_ATTRS +static __inline __m512 __DEFAULT_FN_ATTRS512 _mm512_mul_ps(__m512 __a, __m512 __b) { return (__m512)((__v16sf)__a * (__v16sf)__b); } -static __inline __m512d __DEFAULT_FN_ATTRS +static __inline __m512d __DEFAULT_FN_ATTRS512 _mm512_sub_pd(__m512d __a, __m512d __b) { return (__m512d)((__v8df)__a - (__v8df)__b); } -static __inline __m512 __DEFAULT_FN_ATTRS +static __inline __m512 __DEFAULT_FN_ATTRS512 _mm512_sub_ps(__m512 __a, __m512 __b) { return (__m512)((__v16sf)__a - (__v16sf)__b); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_add_epi64 (__m512i __A, __m512i __B) { return (__m512i) ((__v8du) __A + (__v8du) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_add_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, @@ -873,7 +871,7 @@ _mm512_mask_add_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_add_epi64(__mmask8 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, @@ -881,13 +879,13 @@ _mm512_maskz_add_epi64(__mmask8 __U, __m512i __A, __m512i __B) (__v8di)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_sub_epi64 (__m512i __A, __m512i __B) { return (__m512i) ((__v8du) __A - (__v8du) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_sub_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, @@ -895,7 +893,7 @@ _mm512_mask_sub_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_sub_epi64(__mmask8 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, @@ -903,13 +901,13 @@ _mm512_maskz_sub_epi64(__mmask8 __U, __m512i __A, __m512i __B) (__v8di)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_add_epi32 (__m512i __A, __m512i __B) { return (__m512i) ((__v16su) __A + (__v16su) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_add_epi32(__m512i __W, __mmask16 __U, 
__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, @@ -917,7 +915,7 @@ _mm512_mask_add_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) (__v16si)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_add_epi32 (__mmask16 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, @@ -925,13 +923,13 @@ _mm512_maskz_add_epi32 (__mmask16 __U, __m512i __A, __m512i __B) (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_sub_epi32 (__m512i __A, __m512i __B) { return (__m512i) ((__v16su) __A - (__v16su) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_sub_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, @@ -939,7 +937,7 @@ _mm512_mask_sub_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) (__v16si)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_sub_epi32(__mmask16 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, @@ -947,107 +945,81 @@ _mm512_maskz_sub_epi32(__mmask16 __U, __m512i __A, __m512i __B) (__v16si)_mm512_setzero_si512()); } -#define _mm512_mask_max_round_pd(W, U, A, B, R) __extension__ ({ \ - (__m512d)__builtin_ia32_maxpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(W), (__mmask8)(U), \ - (int)(R)); }) +#define _mm512_max_round_pd(A, B, R) \ + (__m512d)__builtin_ia32_maxpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(R)) -#define _mm512_maskz_max_round_pd(U, A, B, R) __extension__ ({ \ - (__m512d)__builtin_ia32_maxpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R)); }) +#define _mm512_mask_max_round_pd(W, U, A, B, R) \ + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_max_round_pd((A), (B), (R)), \ + (__v8df)(W)) -#define _mm512_max_round_pd(A, B, R) __extension__ ({ \ - (__m512d)__builtin_ia32_maxpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)_mm512_undefined_pd(), \ - (__mmask8)-1, (int)(R)); }) +#define _mm512_maskz_max_round_pd(U, A, B, R) \ + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_max_round_pd((A), (B), (R)), \ + (__v8df)_mm512_setzero_pd()) -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_max_pd(__m512d __A, __m512d __B) { - return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + return (__m512d) __builtin_ia32_maxpd512((__v8df) __A, (__v8df) __B, + _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_max_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d)__builtin_ia32_selectpd_512(__U, + (__v8df)_mm512_max_pd(__A, __B), + (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_max_pd (__mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d) 
__builtin_ia32_maxpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d)__builtin_ia32_selectpd_512(__U, + (__v8df)_mm512_max_pd(__A, __B), + (__v8df)_mm512_setzero_pd()); } -#define _mm512_mask_max_round_ps(W, U, A, B, R) __extension__ ({ \ - (__m512)__builtin_ia32_maxps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(W), (__mmask16)(U), \ - (int)(R)); }) +#define _mm512_max_round_ps(A, B, R) \ + (__m512)__builtin_ia32_maxps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(R)) -#define _mm512_maskz_max_round_ps(U, A, B, R) __extension__ ({ \ - (__m512)__builtin_ia32_maxps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R)); }) +#define _mm512_mask_max_round_ps(W, U, A, B, R) \ + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_max_round_ps((A), (B), (R)), \ + (__v16sf)(W)) -#define _mm512_max_round_ps(A, B, R) __extension__ ({ \ - (__m512)__builtin_ia32_maxps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)_mm512_undefined_ps(), \ - (__mmask16)-1, (int)(R)); }) +#define _mm512_maskz_max_round_ps(U, A, B, R) \ + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_max_round_ps((A), (B), (R)), \ + (__v16sf)_mm512_setzero_ps()) -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_max_ps(__m512 __A, __m512 __B) { - return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) -1, - _MM_FROUND_CUR_DIRECTION); + return (__m512) __builtin_ia32_maxps512((__v16sf) __A, (__v16sf) __B, + _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_max_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __W, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512)__builtin_ia32_selectps_512(__U, + (__v16sf)_mm512_max_ps(__A, __B), + (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_max_ps (__mmask16 __U, __m512 __A, __m512 __B) { - return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512)__builtin_ia32_selectps_512(__U, + (__v16sf)_mm512_max_ps(__A, __B), + (__v16sf)_mm512_setzero_ps()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_max_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A, (__v4sf) __B, @@ -1056,7 +1028,7 @@ _mm_mask_max_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_max_ss(__mmask8 __U,__m128 __A, __m128 __B) { return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A, (__v4sf) __B, @@ -1065,25 +1037,25 @@ _mm_maskz_max_ss(__mmask8 __U,__m128 __A, __m128 __B) { _MM_FROUND_CUR_DIRECTION); } -#define _mm_max_round_ss(A, B, R) __extension__ ({ \ +#define _mm_max_round_ss(A, B, R) \ (__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)_mm_setzero_ps(), \ - 
(__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm_mask_max_round_ss(W, U, A, B, R) __extension__ ({ \ +#define _mm_mask_max_round_ss(W, U, A, B, R) \ (__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)(__m128)(W), (__mmask8)(U), \ - (int)(R)); }) + (int)(R)) -#define _mm_maskz_max_round_ss(U, A, B, R) __extension__ ({ \ +#define _mm_maskz_max_round_ss(U, A, B, R) \ (__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_max_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A, (__v2df) __B, @@ -1092,7 +1064,7 @@ _mm_mask_max_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_max_sd(__mmask8 __U,__m128d __A, __m128d __B) { return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A, (__v2df) __B, @@ -1101,238 +1073,188 @@ _mm_maskz_max_sd(__mmask8 __U,__m128d __A, __m128d __B) { _MM_FROUND_CUR_DIRECTION); } -#define _mm_max_round_sd(A, B, R) __extension__ ({ \ +#define _mm_max_round_sd(A, B, R) \ (__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm_mask_max_round_sd(W, U, A, B, R) __extension__ ({ \ +#define _mm_mask_max_round_sd(W, U, A, B, R) \ (__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm_maskz_max_round_sd(U, A, B, R) __extension__ ({ \ +#define _mm_maskz_max_round_sd(U, A, B, R) \ (__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) static __inline __m512i -__DEFAULT_FN_ATTRS +__DEFAULT_FN_ATTRS512 _mm512_max_epi32(__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) -1); + return (__m512i)__builtin_ia32_pmaxsd512((__v16si)__A, (__v16si)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_max_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) __W, __M); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_max_epi32(__A, __B), + (__v16si)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_max_epi32 (__mmask16 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_setzero_si512 (), - __M); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_max_epi32(__A, __B), + (__v16si)_mm512_setzero_si512()); } -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_max_epu32(__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_setzero_si512 (), - 
(__mmask16) -1); + return (__m512i)__builtin_ia32_pmaxud512((__v16si)__A, (__v16si)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_max_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) __W, __M); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_max_epu32(__A, __B), + (__v16si)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_max_epu32 (__mmask16 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_setzero_si512 (), - __M); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_max_epu32(__A, __B), + (__v16si)_mm512_setzero_si512()); } -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_max_epi64(__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) -1); + return (__m512i)__builtin_ia32_pmaxsq512((__v8di)__A, (__v8di)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_max_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) __W, __M); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_max_epi64(__A, __B), + (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_max_epi64 (__mmask8 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) - _mm512_setzero_si512 (), - __M); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_max_epi64(__A, __B), + (__v8di)_mm512_setzero_si512()); } -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_max_epu64(__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) -1); + return (__m512i)__builtin_ia32_pmaxuq512((__v8di)__A, (__v8di)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_max_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) __W, __M); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_max_epu64(__A, __B), + (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_max_epu64 (__mmask8 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) - _mm512_setzero_si512 (), - __M); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_max_epu64(__A, __B), + (__v8di)_mm512_setzero_si512()); } -#define _mm512_mask_min_round_pd(W, U, A, B, R) __extension__ ({ \ - (__m512d)__builtin_ia32_minpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(W), (__mmask8)(U), \ - (int)(R)); }) +#define _mm512_min_round_pd(A, B, R) \ + (__m512d)__builtin_ia32_minpd512((__v8df)(__m512d)(A), \ + 
(__v8df)(__m512d)(B), (int)(R)) -#define _mm512_maskz_min_round_pd(U, A, B, R) __extension__ ({ \ - (__m512d)__builtin_ia32_minpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R)); }) +#define _mm512_mask_min_round_pd(W, U, A, B, R) \ + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_min_round_pd((A), (B), (R)), \ + (__v8df)(W)) -#define _mm512_min_round_pd(A, B, R) __extension__ ({ \ - (__m512d)__builtin_ia32_minpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)_mm512_undefined_pd(), \ - (__mmask8)-1, (int)(R)); }) +#define _mm512_maskz_min_round_pd(U, A, B, R) \ + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_min_round_pd((A), (B), (R)), \ + (__v8df)_mm512_setzero_pd()) -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_min_pd(__m512d __A, __m512d __B) { - return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + return (__m512d) __builtin_ia32_minpd512((__v8df) __A, (__v8df) __B, + _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_min_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d)__builtin_ia32_selectpd_512(__U, + (__v8df)_mm512_min_pd(__A, __B), + (__v8df)__W); } -#define _mm512_mask_min_round_ps(W, U, A, B, R) __extension__ ({ \ - (__m512)__builtin_ia32_minps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(W), (__mmask16)(U), \ - (int)(R)); }) - -#define _mm512_maskz_min_round_ps(U, A, B, R) __extension__ ({ \ - (__m512)__builtin_ia32_minps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R)); }) - -#define _mm512_min_round_ps(A, B, R) __extension__ ({ \ - (__m512)__builtin_ia32_minps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)_mm512_undefined_ps(), \ - (__mmask16)-1, (int)(R)); }) - -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_min_pd (__mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d)__builtin_ia32_selectpd_512(__U, + (__v8df)_mm512_min_pd(__A, __B), + (__v8df)_mm512_setzero_pd()); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +#define _mm512_min_round_ps(A, B, R) \ + (__m512)__builtin_ia32_minps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(R)) + +#define _mm512_mask_min_round_ps(W, U, A, B, R) \ + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_min_round_ps((A), (B), (R)), \ + (__v16sf)(W)) + +#define _mm512_maskz_min_round_ps(U, A, B, R) \ + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_min_round_ps((A), (B), (R)), \ + (__v16sf)_mm512_setzero_ps()) + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_min_ps(__m512 __A, __m512 __B) { - return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) -1, - _MM_FROUND_CUR_DIRECTION); + return (__m512) 
__builtin_ia32_minps512((__v16sf) __A, (__v16sf) __B, + _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_min_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __W, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512)__builtin_ia32_selectps_512(__U, + (__v16sf)_mm512_min_ps(__A, __B), + (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_min_ps (__mmask16 __U, __m512 __A, __m512 __B) { - return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512)__builtin_ia32_selectps_512(__U, + (__v16sf)_mm512_min_ps(__A, __B), + (__v16sf)_mm512_setzero_ps()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_min_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A, (__v4sf) __B, @@ -1341,7 +1263,7 @@ _mm_mask_min_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_min_ss(__mmask8 __U,__m128 __A, __m128 __B) { return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A, (__v4sf) __B, @@ -1350,25 +1272,25 @@ _mm_maskz_min_ss(__mmask8 __U,__m128 __A, __m128 __B) { _MM_FROUND_CUR_DIRECTION); } -#define _mm_min_round_ss(A, B, R) __extension__ ({ \ +#define _mm_min_round_ss(A, B, R) \ (__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm_mask_min_round_ss(W, U, A, B, R) __extension__ ({ \ +#define _mm_mask_min_round_ss(W, U, A, B, R) \ (__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)(__m128)(W), (__mmask8)(U), \ - (int)(R)); }) + (int)(R)) -#define _mm_maskz_min_round_ss(U, A, B, R) __extension__ ({ \ +#define _mm_maskz_min_round_ss(U, A, B, R) \ (__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_min_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A, (__v2df) __B, @@ -1377,7 +1299,7 @@ _mm_mask_min_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_min_sd(__mmask8 __U,__m128d __A, __m128d __B) { return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A, (__v2df) __B, @@ -1386,144 +1308,120 @@ _mm_maskz_min_sd(__mmask8 __U,__m128d __A, __m128d __B) { _MM_FROUND_CUR_DIRECTION); } -#define _mm_min_round_sd(A, B, R) __extension__ ({ \ +#define _mm_min_round_sd(A, B, R) \ (__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm_mask_min_round_sd(W, U, A, B, R) __extension__ ({ \ +#define _mm_mask_min_round_sd(W, U, A, B, R) \ 
(__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm_maskz_min_round_sd(U, A, B, R) __extension__ ({ \ +#define _mm_maskz_min_round_sd(U, A, B, R) \ (__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) static __inline __m512i -__DEFAULT_FN_ATTRS +__DEFAULT_FN_ATTRS512 _mm512_min_epi32(__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) -1); + return (__m512i)__builtin_ia32_pminsd512((__v16si)__A, (__v16si)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_min_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) __W, __M); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_min_epi32(__A, __B), + (__v16si)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_min_epi32 (__mmask16 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_setzero_si512 (), - __M); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_min_epi32(__A, __B), + (__v16si)_mm512_setzero_si512()); } -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_min_epu32(__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) -1); + return (__m512i)__builtin_ia32_pminud512((__v16si)__A, (__v16si)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_min_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) __W, __M); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_min_epu32(__A, __B), + (__v16si)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_min_epu32 (__mmask16 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_setzero_si512 (), - __M); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_min_epu32(__A, __B), + (__v16si)_mm512_setzero_si512()); } -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_min_epi64(__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) -1); + return (__m512i)__builtin_ia32_pminsq512((__v8di)__A, (__v8di)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_min_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) __W, __M); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_min_epi64(__A, __B), + (__v8di)__W); } -static __inline__ __m512i 
__DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_min_epi64 (__mmask8 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) - _mm512_setzero_si512 (), - __M); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_min_epi64(__A, __B), + (__v8di)_mm512_setzero_si512()); } -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_min_epu64(__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) -1); + return (__m512i)__builtin_ia32_pminuq512((__v8di)__A, (__v8di)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_min_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) __W, __M); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_min_epu64(__A, __B), + (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) - _mm512_setzero_si512 (), - __M); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_min_epu64(__A, __B), + (__v8di)_mm512_setzero_si512()); } -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_mul_epi32(__m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_pmuldq512((__v16si)__X, (__v16si) __Y); } -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_mul_epi32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, @@ -1531,7 +1429,7 @@ _mm512_mask_mul_epi32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) (__v8di)__W); } -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, @@ -1539,13 +1437,13 @@ _mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y) (__v8di)_mm512_setzero_si512 ()); } -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_mul_epu32(__m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_pmuludq512((__v16si)__X, (__v16si)__Y); } -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_mul_epu32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, @@ -1553,7 +1451,7 @@ _mm512_mask_mul_epu32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) (__v8di)__W); } -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_mul_epu32(__mmask8 __M, __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, @@ -1561,13 +1459,13 @@ _mm512_maskz_mul_epu32(__mmask8 __M, __m512i __X, __m512i __Y) (__v8di)_mm512_setzero_si512 ()); } -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_mullo_epi32 (__m512i __A, __m512i __B) { return (__m512i) ((__v16su) __A * (__v16su) __B); } -static __inline __m512i 
__DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_mullo_epi32(__mmask16 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, @@ -1575,7 +1473,7 @@ _mm512_maskz_mullo_epi32(__mmask16 __M, __m512i __A, __m512i __B) (__v16si)_mm512_setzero_si512()); } -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_mullo_epi32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, @@ -1583,92 +1481,91 @@ _mm512_mask_mullo_epi32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) (__v16si)__W); } -#define _mm512_mask_sqrt_round_pd(W, U, A, R) __extension__ ({ \ - (__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(W), (__mmask8)(U), \ - (int)(R)); }) +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mullox_epi64 (__m512i __A, __m512i __B) { + return (__m512i) ((__v8du) __A * (__v8du) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_mullox_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_mullox_epi64(__A, __B), + (__v8di)__W); +} + +#define _mm512_sqrt_round_pd(A, R) \ + (__m512d)__builtin_ia32_sqrtpd512((__v8df)(__m512d)(A), (int)(R)) -#define _mm512_maskz_sqrt_round_pd(U, A, R) __extension__ ({ \ - (__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R)); }) +#define _mm512_mask_sqrt_round_pd(W, U, A, R) \ + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_sqrt_round_pd((A), (R)), \ + (__v8df)(__m512d)(W)) -#define _mm512_sqrt_round_pd(A, R) __extension__ ({ \ - (__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)_mm512_undefined_pd(), \ - (__mmask8)-1, (int)(R)); }) +#define _mm512_maskz_sqrt_round_pd(U, A, R) \ + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_sqrt_round_pd((A), (R)), \ + (__v8df)_mm512_setzero_pd()) -static __inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_sqrt_pd(__m512d __a) +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_sqrt_pd(__m512d __A) { - return (__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)__a, - (__v8df) _mm512_setzero_pd (), - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + return (__m512d)__builtin_ia32_sqrtpd512((__v8df)__A, + _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A) { - return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A, - (__v8df) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d)__builtin_ia32_selectpd_512(__U, + (__v8df)_mm512_sqrt_pd(__A), + (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A) { - return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d)__builtin_ia32_selectpd_512(__U, + (__v8df)_mm512_sqrt_pd(__A), + (__v8df)_mm512_setzero_pd()); } -#define _mm512_mask_sqrt_round_ps(W, U, A, R) __extension__ ({ \ - (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(W), (__mmask16)(U), \ - (int)(R)); }) +#define _mm512_sqrt_round_ps(A, R) \ + (__m512)__builtin_ia32_sqrtps512((__v16sf)(__m512)(A), 
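Besides the mechanical select conversion, this hunk adds _mm512_mullox_epi64 and its masked variant: a 64-bit low multiply written as a plain vector multiply, so it only needs AVX-512F rather than AVX512DQ. A small sketch of its use (illustrative only, not part of the patch; assumes the updated header and -mavx512f):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m512i a = _mm512_set1_epi64(3);
  __m512i b = _mm512_set1_epi64(7);
  __m512i r = _mm512_mullox_epi64(a, b);   /* each 64-bit lane: 3 * 7 = 21 */
  long long out[8];
  _mm512_storeu_si512((void *)out, r);
  printf("%lld\n", out[0]);
  return 0;
}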
(int)(R)) -#define _mm512_maskz_sqrt_round_ps(U, A, R) __extension__ ({ \ - (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R)); }) +#define _mm512_mask_sqrt_round_ps(W, U, A, R) \ + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_sqrt_round_ps((A), (R)), \ + (__v16sf)(__m512)(W)) -#define _mm512_sqrt_round_ps(A, R) __extension__ ({ \ - (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)_mm512_undefined_ps(), \ - (__mmask16)-1, (int)(R)); }) +#define _mm512_maskz_sqrt_round_ps(U, A, R) \ + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_sqrt_round_ps((A), (R)), \ + (__v16sf)_mm512_setzero_ps()) -static __inline__ __m512 __DEFAULT_FN_ATTRS -_mm512_sqrt_ps(__m512 __a) +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_sqrt_ps(__m512 __A) { - return (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)__a, - (__v16sf) _mm512_setzero_ps (), - (__mmask16) -1, - _MM_FROUND_CUR_DIRECTION); + return (__m512)__builtin_ia32_sqrtps512((__v16sf)__A, + _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A) { - return (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)__A, - (__v16sf) __W, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512)__builtin_ia32_selectps_512(__U, + (__v16sf)_mm512_sqrt_ps(__A), + (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_sqrt_ps( __mmask16 __U, __m512 __A) { - return (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)__A, - (__v16sf) _mm512_setzero_ps (), - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512)__builtin_ia32_selectps_512(__U, + (__v16sf)_mm512_sqrt_ps(__A), + (__v16sf)_mm512_setzero_ps()); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_rsqrt14_pd(__m512d __A) { return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A, @@ -1676,7 +1573,7 @@ _mm512_rsqrt14_pd(__m512d __A) _mm512_setzero_pd (), (__mmask8) -1);} -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_rsqrt14_pd (__m512d __W, __mmask8 __U, __m512d __A) { return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A, @@ -1684,7 +1581,7 @@ _mm512_mask_rsqrt14_pd (__m512d __W, __mmask8 __U, __m512d __A) (__mmask8) __U); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_rsqrt14_pd (__mmask8 __U, __m512d __A) { return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A, @@ -1693,7 +1590,7 @@ _mm512_maskz_rsqrt14_pd (__mmask8 __U, __m512d __A) (__mmask8) __U); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_rsqrt14_ps(__m512 __A) { return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A, @@ -1702,7 +1599,7 @@ _mm512_rsqrt14_ps(__m512 __A) (__mmask16) -1); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_rsqrt14_ps (__m512 __W, __mmask16 __U, __m512 __A) { return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A, @@ -1710,7 +1607,7 @@ _mm512_mask_rsqrt14_ps (__m512 __W, __mmask16 __U, __m512 __A) (__mmask16) __U); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_rsqrt14_ps (__mmask16 __U, 
__m512 __A) { return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A, @@ -1719,7 +1616,7 @@ _mm512_maskz_rsqrt14_ps (__mmask16 __U, __m512 __A) (__mmask16) __U); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_rsqrt14_ss(__m128 __A, __m128 __B) { return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A, @@ -1729,7 +1626,7 @@ _mm_rsqrt14_ss(__m128 __A, __m128 __B) (__mmask8) -1); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A, @@ -1738,7 +1635,7 @@ _mm_mask_rsqrt14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) (__mmask8) __U); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_rsqrt14_ss (__mmask8 __U, __m128 __A, __m128 __B) { return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A, @@ -1747,7 +1644,7 @@ _mm_maskz_rsqrt14_ss (__mmask8 __U, __m128 __A, __m128 __B) (__mmask8) __U); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_rsqrt14_sd(__m128d __A, __m128d __B) { return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __A, @@ -1757,7 +1654,7 @@ _mm_rsqrt14_sd(__m128d __A, __m128d __B) (__mmask8) -1); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { return (__m128d) __builtin_ia32_rsqrt14sd_mask ( (__v2df) __A, @@ -1766,7 +1663,7 @@ _mm_mask_rsqrt14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) (__mmask8) __U); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_rsqrt14_sd (__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d) __builtin_ia32_rsqrt14sd_mask ( (__v2df) __A, @@ -1775,7 +1672,7 @@ _mm_maskz_rsqrt14_sd (__mmask8 __U, __m128d __A, __m128d __B) (__mmask8) __U); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_rcp14_pd(__m512d __A) { return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A, @@ -1784,7 +1681,7 @@ _mm512_rcp14_pd(__m512d __A) (__mmask8) -1); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_rcp14_pd (__m512d __W, __mmask8 __U, __m512d __A) { return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A, @@ -1792,7 +1689,7 @@ _mm512_mask_rcp14_pd (__m512d __W, __mmask8 __U, __m512d __A) (__mmask8) __U); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_rcp14_pd (__mmask8 __U, __m512d __A) { return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A, @@ -1801,7 +1698,7 @@ _mm512_maskz_rcp14_pd (__mmask8 __U, __m512d __A) (__mmask8) __U); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_rcp14_ps(__m512 __A) { return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A, @@ -1810,7 +1707,7 @@ _mm512_rcp14_ps(__m512 __A) (__mmask16) -1); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_rcp14_ps (__m512 __W, __mmask16 __U, __m512 __A) { return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A, @@ -1818,7 +1715,7 @@ _mm512_mask_rcp14_ps (__m512 __W, __mmask16 __U, __m512 __A) (__mmask16) __U); } -static __inline__ __m512 
__DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_rcp14_ps (__mmask16 __U, __m512 __A) { return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A, @@ -1827,7 +1724,7 @@ _mm512_maskz_rcp14_ps (__mmask16 __U, __m512 __A) (__mmask16) __U); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_rcp14_ss(__m128 __A, __m128 __B) { return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A, @@ -1837,7 +1734,7 @@ _mm_rcp14_ss(__m128 __A, __m128 __B) (__mmask8) -1); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_rcp14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A, @@ -1846,7 +1743,7 @@ _mm_mask_rcp14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) (__mmask8) __U); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_rcp14_ss (__mmask8 __U, __m128 __A, __m128 __B) { return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A, @@ -1855,7 +1752,7 @@ _mm_maskz_rcp14_ss (__mmask8 __U, __m128 __A, __m128 __B) (__mmask8) __U); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_rcp14_sd(__m128d __A, __m128d __B) { return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __A, @@ -1865,7 +1762,7 @@ _mm_rcp14_sd(__m128d __A, __m128d __B) (__mmask8) -1); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_rcp14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { return (__m128d) __builtin_ia32_rcp14sd_mask ( (__v2df) __A, @@ -1874,7 +1771,7 @@ _mm_mask_rcp14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) (__mmask8) __U); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_rcp14_sd (__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d) __builtin_ia32_rcp14sd_mask ( (__v2df) __A, @@ -1883,7 +1780,7 @@ _mm_maskz_rcp14_sd (__mmask8 __U, __m128d __A, __m128d __B) (__mmask8) __U); } -static __inline __m512 __DEFAULT_FN_ATTRS +static __inline __m512 __DEFAULT_FN_ATTRS512 _mm512_floor_ps(__m512 __A) { return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, @@ -1892,7 +1789,7 @@ _mm512_floor_ps(__m512 __A) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A) { return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, @@ -1901,7 +1798,7 @@ _mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A) _MM_FROUND_CUR_DIRECTION); } -static __inline __m512d __DEFAULT_FN_ATTRS +static __inline __m512d __DEFAULT_FN_ATTRS512 _mm512_floor_pd(__m512d __A) { return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, @@ -1910,7 +1807,7 @@ _mm512_floor_pd(__m512d __A) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A) { return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, @@ -1919,7 +1816,7 @@ _mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A) { return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, @@ -1928,7 +1825,7 @@ 
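The rsqrt14/rcp14 wrappers above only pick up the new __DEFAULT_FN_ATTRS512 / __DEFAULT_FN_ATTRS128 macros; they keep using the *_mask builtins unchanged. These are approximation instructions (documented relative error of at most 2^-14), not exact operations. A rough usage sketch (illustrative only, not part of the patch; -mavx512f assumed):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m512 x = _mm512_set1_ps(4.0f);
  __m512 r = _mm512_rsqrt14_ps(x);   /* approximately 1/sqrt(4) = 0.5 in every lane */
  float out[16];
  _mm512_storeu_ps(out, r);
  printf("rsqrt14(4.0) ~= %f\n", out[0]);
  return 0;
}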
_mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A) _MM_FROUND_CUR_DIRECTION); } -static __inline __m512 __DEFAULT_FN_ATTRS +static __inline __m512 __DEFAULT_FN_ATTRS512 _mm512_ceil_ps(__m512 __A) { return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, @@ -1937,7 +1834,7 @@ _mm512_ceil_ps(__m512 __A) _MM_FROUND_CUR_DIRECTION); } -static __inline __m512d __DEFAULT_FN_ATTRS +static __inline __m512d __DEFAULT_FN_ATTRS512 _mm512_ceil_pd(__m512d __A) { return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, @@ -1946,7 +1843,7 @@ _mm512_ceil_pd(__m512d __A) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_ceil_pd (__m512d __W, __mmask8 __U, __m512d __A) { return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, @@ -1955,758 +1852,672 @@ _mm512_mask_ceil_pd (__m512d __W, __mmask8 __U, __m512d __A) _MM_FROUND_CUR_DIRECTION); } -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_abs_epi64(__m512i __A) { - return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) -1); + return (__m512i)__builtin_ia32_pabsq512((__v8di)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_abs_epi64 (__m512i __W, __mmask8 __U, __m512i __A) { - return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A, - (__v8di) __W, - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_abs_epi64(__A), + (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_abs_epi64 (__mmask8 __U, __m512i __A) { - return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_abs_epi64(__A), + (__v8di)_mm512_setzero_si512()); } -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_abs_epi32(__m512i __A) { - return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) -1); + return (__m512i)__builtin_ia32_pabsd512((__v16si) __A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_abs_epi32 (__m512i __W, __mmask16 __U, __m512i __A) { - return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A, - (__v16si) __W, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512(__U, + (__v16si)_mm512_abs_epi32(__A), + (__v16si)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_abs_epi32 (__mmask16 __U, __m512i __A) { - return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512(__U, + (__v16si)_mm512_abs_epi32(__A), + (__v16si)_mm512_setzero_si512()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_add_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_addss_round_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __A = _mm_add_ss(__A, __B); + return __builtin_ia32_selectss_128(__U, __A, __W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 
__DEFAULT_FN_ATTRS128 _mm_maskz_add_ss(__mmask8 __U,__m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_addss_round_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) _mm_setzero_ps (), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __A = _mm_add_ss(__A, __B); + return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); } -#define _mm_add_round_ss(A, B, R) __extension__ ({ \ +#define _mm_add_round_ss(A, B, R) \ (__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm_mask_add_round_ss(W, U, A, B, R) __extension__ ({ \ +#define _mm_mask_add_round_ss(W, U, A, B, R) \ (__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)(__m128)(W), (__mmask8)(U), \ - (int)(R)); }) + (int)(R)) -#define _mm_maskz_add_round_ss(U, A, B, R) __extension__ ({ \ +#define _mm_maskz_add_round_ss(U, A, B, R) \ (__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_add_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_addsd_round_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __A = _mm_add_sd(__A, __B); + return __builtin_ia32_selectsd_128(__U, __A, __W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_add_sd(__mmask8 __U,__m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_addsd_round_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) _mm_setzero_pd (), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __A = _mm_add_sd(__A, __B); + return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); } -#define _mm_add_round_sd(A, B, R) __extension__ ({ \ +#define _mm_add_round_sd(A, B, R) \ (__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm_mask_add_round_sd(W, U, A, B, R) __extension__ ({ \ +#define _mm_mask_add_round_sd(W, U, A, B, R) \ (__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm_maskz_add_round_sd(U, A, B, R) __extension__ ({ \ +#define _mm_maskz_add_round_sd(U, A, B, R) \ (__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_add_pd(__A, __B), (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_add_pd(__A, __B), (__v8df)_mm512_setzero_pd()); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { return 
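The scalar masked add above is now phrased as a plain _mm_add_ss followed by a selectss_128 on element 0. The observable semantics are unchanged: element 0 is a[0] + b[0] when mask bit 0 is set, otherwise it comes from the src operand, and elements 1..3 are always copied from a. A minimal sketch (illustrative only, not part of the patch; -mavx512f assumed):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m128 a   = _mm_setr_ps(1.0f, 10.0f, 20.0f, 30.0f);
  __m128 b   = _mm_setr_ps(2.0f, 0.0f, 0.0f, 0.0f);
  __m128 src = _mm_setr_ps(-1.0f, -1.0f, -1.0f, -1.0f);
  __m128 r0 = _mm_mask_add_ss(src, 0x0, a, b); /* lane 0 = -1.0 (taken from src) */
  __m128 r1 = _mm_mask_add_ss(src, 0x1, a, b); /* lane 0 =  3.0 (= 1.0 + 2.0)    */
  float o0[4], o1[4];
  _mm_storeu_ps(o0, r0);
  _mm_storeu_ps(o1, r1);
  printf("%f %f\n", o0[0], o1[0]);   /* -1.000000 3.000000 */
  return 0;
}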
(__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_add_ps(__A, __B), (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_add_ps(__A, __B), (__v16sf)_mm512_setzero_ps()); } -#define _mm512_add_round_pd(A, B, R) __extension__ ({ \ - (__m512d)__builtin_ia32_addpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, (int)(R)); }) - -#define _mm512_mask_add_round_pd(W, U, A, B, R) __extension__ ({ \ - (__m512d)__builtin_ia32_addpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(W), (__mmask8)(U), \ - (int)(R)); }) - -#define _mm512_maskz_add_round_pd(U, A, B, R) __extension__ ({ \ - (__m512d)__builtin_ia32_addpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R)); }) - -#define _mm512_add_round_ps(A, B, R) __extension__ ({ \ - (__m512)__builtin_ia32_addps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1, (int)(R)); }) - -#define _mm512_mask_add_round_ps(W, U, A, B, R) __extension__ ({ \ - (__m512)__builtin_ia32_addps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(W), (__mmask16)(U), \ - (int)(R)); }) - -#define _mm512_maskz_add_round_ps(U, A, B, R) __extension__ ({ \ - (__m512)__builtin_ia32_addps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R)); }) - -static __inline__ __m128 __DEFAULT_FN_ATTRS +#define _mm512_add_round_pd(A, B, R) \ + (__m512d)__builtin_ia32_addpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(R)) + +#define _mm512_mask_add_round_pd(W, U, A, B, R) \ + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_add_round_pd((A), (B), (R)), \ + (__v8df)(__m512d)(W)); + +#define _mm512_maskz_add_round_pd(U, A, B, R) \ + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_add_round_pd((A), (B), (R)), \ + (__v8df)_mm512_setzero_pd()); + +#define _mm512_add_round_ps(A, B, R) \ + (__m512)__builtin_ia32_addps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(R)) + +#define _mm512_mask_add_round_ps(W, U, A, B, R) \ + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_add_round_ps((A), (B), (R)), \ + (__v16sf)(__m512)(W)); + +#define _mm512_maskz_add_round_ps(U, A, B, R) \ + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_add_round_ps((A), (B), (R)), \ + (__v16sf)_mm512_setzero_ps()); + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_sub_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_subss_round_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __A = _mm_sub_ss(__A, __B); + return __builtin_ia32_selectss_128(__U, __A, __W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_sub_ss(__mmask8 __U,__m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_subss_round_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) _mm_setzero_ps (), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __A = _mm_sub_ss(__A, __B); + return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); } -#define _mm_sub_round_ss(A, B, R) __extension__ ({ \ +#define 
_mm_sub_round_ss(A, B, R) \ (__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm_mask_sub_round_ss(W, U, A, B, R) __extension__ ({ \ +#define _mm_mask_sub_round_ss(W, U, A, B, R) \ (__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)(__m128)(W), (__mmask8)(U), \ - (int)(R)); }) + (int)(R)) -#define _mm_maskz_sub_round_ss(U, A, B, R) __extension__ ({ \ +#define _mm_maskz_sub_round_ss(U, A, B, R) \ (__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_sub_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_subsd_round_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __A = _mm_sub_sd(__A, __B); + return __builtin_ia32_selectsd_128(__U, __A, __W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_sub_sd(__mmask8 __U,__m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_subsd_round_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) _mm_setzero_pd (), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __A = _mm_sub_sd(__A, __B); + return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); } -#define _mm_sub_round_sd(A, B, R) __extension__ ({ \ +#define _mm_sub_round_sd(A, B, R) \ (__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm_mask_sub_round_sd(W, U, A, B, R) __extension__ ({ \ +#define _mm_mask_sub_round_sd(W, U, A, B, R) \ (__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm_maskz_sub_round_sd(U, A, B, R) __extension__ ({ \ +#define _mm_maskz_sub_round_sd(U, A, B, R) \ (__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_sub_pd(__A, __B), (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_sub_pd(__A, __B), (__v8df)_mm512_setzero_pd()); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_sub_ps(__A, __B), (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_sub_ps(__A, __B), (__v16sf)_mm512_setzero_ps()); } -#define _mm512_sub_round_pd(A, B, R) __extension__ ({ \ - 
(__m512d)__builtin_ia32_subpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, (int)(R)); }) - -#define _mm512_mask_sub_round_pd(W, U, A, B, R) __extension__ ({ \ - (__m512d)__builtin_ia32_subpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(W), (__mmask8)(U), \ - (int)(R)); }) - -#define _mm512_maskz_sub_round_pd(U, A, B, R) __extension__ ({ \ - (__m512d)__builtin_ia32_subpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R)); }) - -#define _mm512_sub_round_ps(A, B, R) __extension__ ({ \ - (__m512)__builtin_ia32_subps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1, (int)(R)); }) - -#define _mm512_mask_sub_round_ps(W, U, A, B, R) __extension__ ({ \ - (__m512)__builtin_ia32_subps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(W), (__mmask16)(U), \ - (int)(R)); }); - -#define _mm512_maskz_sub_round_ps(U, A, B, R) __extension__ ({ \ - (__m512)__builtin_ia32_subps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R)); }); - -static __inline__ __m128 __DEFAULT_FN_ATTRS +#define _mm512_sub_round_pd(A, B, R) \ + (__m512d)__builtin_ia32_subpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(R)) + +#define _mm512_mask_sub_round_pd(W, U, A, B, R) \ + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_sub_round_pd((A), (B), (R)), \ + (__v8df)(__m512d)(W)); + +#define _mm512_maskz_sub_round_pd(U, A, B, R) \ + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_sub_round_pd((A), (B), (R)), \ + (__v8df)_mm512_setzero_pd()); + +#define _mm512_sub_round_ps(A, B, R) \ + (__m512)__builtin_ia32_subps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(R)) + +#define _mm512_mask_sub_round_ps(W, U, A, B, R) \ + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \ + (__v16sf)(__m512)(W)); + +#define _mm512_maskz_sub_round_ps(U, A, B, R) \ + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \ + (__v16sf)_mm512_setzero_ps()); + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_mul_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_mulss_round_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __A = _mm_mul_ss(__A, __B); + return __builtin_ia32_selectss_128(__U, __A, __W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_mul_ss(__mmask8 __U,__m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_mulss_round_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) _mm_setzero_ps (), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __A = _mm_mul_ss(__A, __B); + return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); } -#define _mm_mul_round_ss(A, B, R) __extension__ ({ \ +#define _mm_mul_round_ss(A, B, R) \ (__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm_mask_mul_round_ss(W, U, A, B, R) __extension__ ({ \ +#define _mm_mask_mul_round_ss(W, U, A, B, R) \ (__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)(__m128)(W), (__mmask8)(U), \ - 
(int)(R)); }) + (int)(R)) -#define _mm_maskz_mul_round_ss(U, A, B, R) __extension__ ({ \ +#define _mm_maskz_mul_round_ss(U, A, B, R) \ (__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_mul_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_mulsd_round_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __A = _mm_mul_sd(__A, __B); + return __builtin_ia32_selectsd_128(__U, __A, __W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_mul_sd(__mmask8 __U,__m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_mulsd_round_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) _mm_setzero_pd (), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __A = _mm_mul_sd(__A, __B); + return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); } -#define _mm_mul_round_sd(A, B, R) __extension__ ({ \ +#define _mm_mul_round_sd(A, B, R) \ (__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm_mask_mul_round_sd(W, U, A, B, R) __extension__ ({ \ +#define _mm_mask_mul_round_sd(W, U, A, B, R) \ (__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm_maskz_mul_round_sd(U, A, B, R) __extension__ ({ \ +#define _mm_maskz_mul_round_sd(U, A, B, R) \ (__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_mul_pd(__A, __B), (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_mul_pd(__A, __B), (__v8df)_mm512_setzero_pd()); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_mul_ps(__A, __B), (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_mul_ps(__A, __B), (__v16sf)_mm512_setzero_ps()); } -#define _mm512_mul_round_pd(A, B, R) __extension__ ({ \ - (__m512d)__builtin_ia32_mulpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, (int)(R)); }) - -#define _mm512_mask_mul_round_pd(W, U, A, B, R) __extension__ ({ \ - (__m512d)__builtin_ia32_mulpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(W), (__mmask8)(U), \ - (int)(R)); }) - -#define _mm512_maskz_mul_round_pd(U, A, B, R) __extension__ ({ \ - 
(__m512d)__builtin_ia32_mulpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R)); }) - -#define _mm512_mul_round_ps(A, B, R) __extension__ ({ \ - (__m512)__builtin_ia32_mulps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1, (int)(R)); }) - -#define _mm512_mask_mul_round_ps(W, U, A, B, R) __extension__ ({ \ - (__m512)__builtin_ia32_mulps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(W), (__mmask16)(U), \ - (int)(R)); }); - -#define _mm512_maskz_mul_round_ps(U, A, B, R) __extension__ ({ \ - (__m512)__builtin_ia32_mulps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R)); }); - -static __inline__ __m128 __DEFAULT_FN_ATTRS +#define _mm512_mul_round_pd(A, B, R) \ + (__m512d)__builtin_ia32_mulpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(R)) + +#define _mm512_mask_mul_round_pd(W, U, A, B, R) \ + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_mul_round_pd((A), (B), (R)), \ + (__v8df)(__m512d)(W)); + +#define _mm512_maskz_mul_round_pd(U, A, B, R) \ + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_mul_round_pd((A), (B), (R)), \ + (__v8df)_mm512_setzero_pd()); + +#define _mm512_mul_round_ps(A, B, R) \ + (__m512)__builtin_ia32_mulps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(R)) + +#define _mm512_mask_mul_round_ps(W, U, A, B, R) \ + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \ + (__v16sf)(__m512)(W)); + +#define _mm512_maskz_mul_round_ps(U, A, B, R) \ + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \ + (__v16sf)_mm512_setzero_ps()); + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_div_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_divss_round_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __A = _mm_div_ss(__A, __B); + return __builtin_ia32_selectss_128(__U, __A, __W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_div_ss(__mmask8 __U,__m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_divss_round_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) _mm_setzero_ps (), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __A = _mm_div_ss(__A, __B); + return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); } -#define _mm_div_round_ss(A, B, R) __extension__ ({ \ +#define _mm_div_round_ss(A, B, R) \ (__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm_mask_div_round_ss(W, U, A, B, R) __extension__ ({ \ +#define _mm_mask_div_round_ss(W, U, A, B, R) \ (__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)(__m128)(W), (__mmask8)(U), \ - (int)(R)); }) + (int)(R)) -#define _mm_maskz_div_round_ss(U, A, B, R) __extension__ ({ \ +#define _mm_maskz_div_round_ss(U, A, B, R) \ (__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_div_sd(__m128d __W, __mmask8 
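The *_round_* macros above now expand to an unmasked builtin that carries the rounding argument, wrapped in a select where masking is needed. As before, R has to be a compile-time constant: one of the _MM_FROUND_TO_* modes OR'ed with _MM_FROUND_NO_EXC, or _MM_FROUND_CUR_DIRECTION. A minimal sketch of embedded rounding (illustrative only, not part of the patch; -mavx512f assumed):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m512d a = _mm512_set1_pd(0.1);   /* 0.1 is not exactly representable */
  double down[8], up[8];
  _mm512_storeu_pd(down,
      _mm512_mul_round_pd(a, a, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));
  _mm512_storeu_pd(up,
      _mm512_mul_round_pd(a, a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC));
  /* The two products typically differ in the last bit. */
  printf("%.20g\n%.20g\n", down[0], up[0]);
  return 0;
}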
__U,__m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_divsd_round_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __A = _mm_div_sd(__A, __B); + return __builtin_ia32_selectsd_128(__U, __A, __W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_div_sd(__mmask8 __U,__m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_divsd_round_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) _mm_setzero_pd (), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + __A = _mm_div_sd(__A, __B); + return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); } -#define _mm_div_round_sd(A, B, R) __extension__ ({ \ +#define _mm_div_round_sd(A, B, R) \ (__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm_mask_div_round_sd(W, U, A, B, R) __extension__ ({ \ +#define _mm_mask_div_round_sd(W, U, A, B, R) \ (__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm_maskz_div_round_sd(U, A, B, R) __extension__ ({ \ +#define _mm_maskz_div_round_sd(U, A, B, R) \ (__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -static __inline __m512d __DEFAULT_FN_ATTRS +static __inline __m512d __DEFAULT_FN_ATTRS512 _mm512_div_pd(__m512d __a, __m512d __b) { return (__m512d)((__v8df)__a/(__v8df)__b); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_div_pd(__A, __B), (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)_mm512_div_pd(__A, __B), (__v8df)_mm512_setzero_pd()); } -static __inline __m512 __DEFAULT_FN_ATTRS +static __inline __m512 __DEFAULT_FN_ATTRS512 _mm512_div_ps(__m512 __a, __m512 __b) { return (__m512)((__v16sf)__a/(__v16sf)__b); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_div_ps(__A, __B), (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)_mm512_div_ps(__A, __B), (__v16sf)_mm512_setzero_ps()); } -#define _mm512_div_round_pd(A, B, R) __extension__ ({ \ - (__m512d)__builtin_ia32_divpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, (int)(R)); }) - -#define _mm512_mask_div_round_pd(W, U, A, B, R) __extension__ ({ \ - (__m512d)__builtin_ia32_divpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(W), (__mmask8)(U), \ - (int)(R)); }) - -#define _mm512_maskz_div_round_pd(U, A, B, R) __extension__ ({ \ - (__m512d)__builtin_ia32_divpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ 
- (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R)); }) - -#define _mm512_div_round_ps(A, B, R) __extension__ ({ \ - (__m512)__builtin_ia32_divps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1, (int)(R)); }) - -#define _mm512_mask_div_round_ps(W, U, A, B, R) __extension__ ({ \ - (__m512)__builtin_ia32_divps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(W), (__mmask16)(U), \ - (int)(R)); }); - -#define _mm512_maskz_div_round_ps(U, A, B, R) __extension__ ({ \ - (__m512)__builtin_ia32_divps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R)); }); - -#define _mm512_roundscale_ps(A, B) __extension__ ({ \ +#define _mm512_div_round_pd(A, B, R) \ + (__m512d)__builtin_ia32_divpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(R)) + +#define _mm512_mask_div_round_pd(W, U, A, B, R) \ + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_div_round_pd((A), (B), (R)), \ + (__v8df)(__m512d)(W)); + +#define _mm512_maskz_div_round_pd(U, A, B, R) \ + (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_div_round_pd((A), (B), (R)), \ + (__v8df)_mm512_setzero_pd()); + +#define _mm512_div_round_ps(A, B, R) \ + (__m512)__builtin_ia32_divps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(R)) + +#define _mm512_mask_div_round_ps(W, U, A, B, R) \ + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_div_round_ps((A), (B), (R)), \ + (__v16sf)(__m512)(W)); + +#define _mm512_maskz_div_round_ps(U, A, B, R) \ + (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_div_round_ps((A), (B), (R)), \ + (__v16sf)_mm512_setzero_ps()); + +#define _mm512_roundscale_ps(A, B) \ (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(B), \ - (__v16sf)(__m512)(A), (__mmask16)-1, \ - _MM_FROUND_CUR_DIRECTION); }) + (__v16sf)_mm512_undefined_ps(), \ + (__mmask16)-1, \ + _MM_FROUND_CUR_DIRECTION) -#define _mm512_mask_roundscale_ps(A, B, C, imm) __extension__ ({\ +#define _mm512_mask_roundscale_ps(A, B, C, imm) \ (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \ (__v16sf)(__m512)(A), (__mmask16)(B), \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm512_maskz_roundscale_ps(A, B, imm) __extension__ ({\ +#define _mm512_maskz_roundscale_ps(A, B, imm) \ (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \ (__v16sf)_mm512_setzero_ps(), \ (__mmask16)(A), \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm512_mask_roundscale_round_ps(A, B, C, imm, R) __extension__ ({ \ +#define _mm512_mask_roundscale_round_ps(A, B, C, imm, R) \ (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \ (__v16sf)(__m512)(A), (__mmask16)(B), \ - (int)(R)); }) + (int)(R)) -#define _mm512_maskz_roundscale_round_ps(A, B, imm, R) __extension__ ({ \ +#define _mm512_maskz_roundscale_round_ps(A, B, imm, R) \ (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \ (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(A), (int)(R)); }) + (__mmask16)(A), (int)(R)) -#define _mm512_roundscale_round_ps(A, imm, R) __extension__ ({ \ +#define _mm512_roundscale_round_ps(A, imm, R) \ (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(imm), \ (__v16sf)_mm512_undefined_ps(), \ - (__mmask16)-1, (int)(R)); }) + (__mmask16)-1, (int)(R)) -#define _mm512_roundscale_pd(A, B) 
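_mm512_roundscale_ps above also switches its pass-through operand from (A) to _mm512_undefined_ps(), which is harmless since the mask is all-ones. The immediate combines the rounding behaviour in the low bits with the number of fraction bits to keep (M) in bits 7:4, so M = 0 rounds to integers, which is what the floor/ceil wrappers earlier in this file rely on. A minimal sketch (illustrative only, not part of the patch; -mavx512f assumed):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m512 x = _mm512_set1_ps(2.71828f);
  /* M = 0: round down to an integer.  M = 2: round down to a multiple of 2^-2. */
  __m512 to_int     = _mm512_roundscale_ps(x, _MM_FROUND_TO_NEG_INF);
  __m512 to_quarter = _mm512_roundscale_ps(x, (2 << 4) | _MM_FROUND_TO_NEG_INF);
  float a[16], b[16];
  _mm512_storeu_ps(a, to_int);
  _mm512_storeu_ps(b, to_quarter);
  printf("%f %f\n", a[0], b[0]);   /* 2.000000 2.500000 */
  return 0;
}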
__extension__ ({ \ +#define _mm512_roundscale_pd(A, B) \ (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(B), \ - (__v8df)(__m512d)(A), (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION); }) + (__v8df)_mm512_undefined_pd(), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION) -#define _mm512_mask_roundscale_pd(A, B, C, imm) __extension__ ({\ +#define _mm512_mask_roundscale_pd(A, B, C, imm) \ (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \ (__v8df)(__m512d)(A), (__mmask8)(B), \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm512_maskz_roundscale_pd(A, B, imm) __extension__ ({\ +#define _mm512_maskz_roundscale_pd(A, B, imm) \ (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \ (__v8df)_mm512_setzero_pd(), \ (__mmask8)(A), \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm512_mask_roundscale_round_pd(A, B, C, imm, R) __extension__ ({ \ +#define _mm512_mask_roundscale_round_pd(A, B, C, imm, R) \ (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \ (__v8df)(__m512d)(A), (__mmask8)(B), \ - (int)(R)); }) + (int)(R)) -#define _mm512_maskz_roundscale_round_pd(A, B, imm, R) __extension__ ({ \ +#define _mm512_maskz_roundscale_round_pd(A, B, imm, R) \ (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \ (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(A), (int)(R)); }) + (__mmask8)(A), (int)(R)) -#define _mm512_roundscale_round_pd(A, imm, R) __extension__ ({ \ +#define _mm512_roundscale_round_pd(A, imm, R) \ (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(imm), \ (__v8df)_mm512_undefined_pd(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm512_fmadd_round_pd(A, B, C, R) __extension__ ({ \ +#define _mm512_fmadd_round_pd(A, B, C, R) \ (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), (__mmask8)-1, \ - (int)(R)); }) + (__v8df)(__m512d)(C), \ + (__mmask8)-1, (int)(R)) -#define _mm512_mask_fmadd_round_pd(A, U, B, C, R) __extension__ ({ \ +#define _mm512_mask_fmadd_round_pd(A, U, B, C, R) \ (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), \ (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) __extension__ ({ \ +#define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) \ (__m512d)__builtin_ia32_vfmaddpd512_mask3((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), \ (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) __extension__ ({ \ +#define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) \ (__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), \ (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm512_fmsub_round_pd(A, B, C, R) __extension__ ({ \ +#define _mm512_fmsub_round_pd(A, B, C, R) \ (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), \ -(__v8df)(__m512d)(C), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm512_mask_fmsub_round_pd(A, U, B, C, R) __extension__ ({ \ +#define _mm512_mask_fmsub_round_pd(A, U, B, C, R) \ (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), \ -(__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) 
__extension__ ({ \ +#define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) \ (__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), \ -(__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm512_fnmadd_round_pd(A, B, C, R) __extension__ ({ \ +#define _mm512_fnmadd_round_pd(A, B, C, R) \ (__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), (__mmask8)-1, \ - (int)(R)); }) + (__v8df)(__m512d)(C), \ + (__mmask8)-1, (int)(R)) -#define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) __extension__ ({ \ +#define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) \ (__m512d)__builtin_ia32_vfmaddpd512_mask3(-(__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), \ (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) __extension__ ({ \ +#define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) \ (__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), \ (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm512_fnmsub_round_pd(A, B, C, R) __extension__ ({ \ +#define _mm512_fnmsub_round_pd(A, B, C, R) \ (__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), \ -(__v8df)(__m512d)(C), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) __extension__ ({ \ +#define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) \ (__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), \ -(__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C) { return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, @@ -2716,7 +2527,7 @@ _mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_fmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, @@ -2726,7 +2537,7 @@ _mm512_mask_fmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask3_fmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A, @@ -2736,7 +2547,7 @@ _mm512_mask3_fmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_fmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A, @@ -2746,7 +2557,7 @@ _mm512_maskz_fmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C) { return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, @@ -2756,7 +2567,7 @@ _mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d 
__DEFAULT_FN_ATTRS512 _mm512_mask_fmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, @@ -2766,7 +2577,7 @@ _mm512_mask_fmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_fmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A, @@ -2776,17 +2587,17 @@ _mm512_maskz_fmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) __A, - (__v8df) __B, + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + -(__v8df) __B, (__v8df) __C, (__mmask8) -1, _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask3_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { return (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) __A, @@ -2796,7 +2607,7 @@ _mm512_mask3_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_fnmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A, @@ -2806,17 +2617,17 @@ _mm512_maskz_fnmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_vfmaddpd512_mask (-(__v8df) __A, - (__v8df) __B, + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + -(__v8df) __B, -(__v8df) __C, (__mmask8) -1, _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A, @@ -2826,91 +2637,91 @@ _mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) _MM_FROUND_CUR_DIRECTION); } -#define _mm512_fmadd_round_ps(A, B, C, R) __extension__ ({ \ +#define _mm512_fmadd_round_ps(A, B, C, R) \ (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), (__mmask16)-1, \ - (int)(R)); }) + (__v16sf)(__m512)(C), \ + (__mmask16)-1, (int)(R)) -#define _mm512_mask_fmadd_round_ps(A, U, B, C, R) __extension__ ({ \ +#define _mm512_mask_fmadd_round_ps(A, U, B, C, R) \ (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), \ (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) -#define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) __extension__ ({ \ +#define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) \ (__m512)__builtin_ia32_vfmaddps512_mask3((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), \ (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) -#define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) __extension__ ({ \ +#define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) \ 
(__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), \ (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) -#define _mm512_fmsub_round_ps(A, B, C, R) __extension__ ({ \ +#define _mm512_fmsub_round_ps(A, B, C, R) \ (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), \ -(__v16sf)(__m512)(C), \ - (__mmask16)-1, (int)(R)); }) + (__mmask16)-1, (int)(R)) -#define _mm512_mask_fmsub_round_ps(A, U, B, C, R) __extension__ ({ \ +#define _mm512_mask_fmsub_round_ps(A, U, B, C, R) \ (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), \ -(__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) -#define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) __extension__ ({ \ +#define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) \ (__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), \ -(__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) -#define _mm512_fnmadd_round_ps(A, B, C, R) __extension__ ({ \ - (__m512)__builtin_ia32_vfmaddps512_mask(-(__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), (__mmask16)-1, \ - (int)(R)); }) +#define _mm512_fnmadd_round_ps(A, B, C, R) \ + (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ + -(__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)-1, (int)(R)) -#define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) __extension__ ({ \ +#define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) \ (__m512)__builtin_ia32_vfmaddps512_mask3(-(__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), \ (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) -#define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) __extension__ ({ \ +#define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) \ (__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), \ (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) -#define _mm512_fnmsub_round_ps(A, B, C, R) __extension__ ({ \ - (__m512)__builtin_ia32_vfmaddps512_mask(-(__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ +#define _mm512_fnmsub_round_ps(A, B, C, R) \ + (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ + -(__v16sf)(__m512)(B), \ -(__v16sf)(__m512)(C), \ - (__mmask16)-1, (int)(R)); }) + (__mmask16)-1, (int)(R)) -#define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) __extension__ ({ \ +#define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) \ (__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), \ -(__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C) { return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, @@ -2920,7 +2731,7 @@ _mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, @@ -2930,7 +2741,7 @@ _mm512_mask_fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask3_fmadd_ps(__m512 __A, __m512 __B, __m512 __C, 
__mmask16 __U) { return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A, @@ -2940,7 +2751,7 @@ _mm512_mask3_fmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A, @@ -2950,7 +2761,7 @@ _mm512_maskz_fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C) { return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, @@ -2960,7 +2771,7 @@ _mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_fmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, @@ -2970,7 +2781,7 @@ _mm512_mask_fmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_fmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A, @@ -2980,17 +2791,17 @@ _mm512_maskz_fmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) __A, - (__v16sf) __B, + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + -(__v16sf) __B, (__v16sf) __C, (__mmask16) -1, _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask3_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { return (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) __A, @@ -3000,7 +2811,7 @@ _mm512_mask3_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A, @@ -3010,17 +2821,17 @@ _mm512_maskz_fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_vfmaddps512_mask (-(__v16sf) __A, - (__v16sf) __B, + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + -(__v16sf) __B, -(__v16sf) __C, (__mmask16) -1, _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A, @@ -3030,96 +2841,96 @@ _mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) _MM_FROUND_CUR_DIRECTION); } -#define _mm512_fmaddsub_round_pd(A, B, C, R) __extension__ ({ \ +#define _mm512_fmaddsub_round_pd(A, B, C, R) \ 
(__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), \ (__v8df)(__m512d)(C), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) __extension__ ({ \ +#define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) \ (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), \ (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) __extension__ ({ \ +#define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) \ (__m512d)__builtin_ia32_vfmaddsubpd512_mask3((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), \ (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) __extension__ ({ \ +#define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) \ (__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), \ (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm512_fmsubadd_round_pd(A, B, C, R) __extension__ ({ \ +#define _mm512_fmsubadd_round_pd(A, B, C, R) \ (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), \ -(__v8df)(__m512d)(C), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) __extension__ ({ \ +#define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) \ (__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), \ -(__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) __extension__ ({ \ +#define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) \ (__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), \ -(__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C) { return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + (__v8df) __B, + (__v8df) __C, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_fmaddsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask3_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_fmaddsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + 
_MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C) { return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, @@ -3129,7 +2940,7 @@ _mm512_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_fmsubadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, @@ -3139,7 +2950,7 @@ _mm512_mask_fmsubadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) { return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A, @@ -3149,56 +2960,56 @@ _mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) _MM_FROUND_CUR_DIRECTION); } -#define _mm512_fmaddsub_round_ps(A, B, C, R) __extension__ ({ \ +#define _mm512_fmaddsub_round_ps(A, B, C, R) \ (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), \ (__v16sf)(__m512)(C), \ - (__mmask16)-1, (int)(R)); }) + (__mmask16)-1, (int)(R)) -#define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) __extension__ ({ \ +#define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) \ (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), \ (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) -#define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) __extension__ ({ \ +#define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) \ (__m512)__builtin_ia32_vfmaddsubps512_mask3((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), \ (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) -#define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) __extension__ ({ \ +#define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) \ (__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), \ (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) -#define _mm512_fmsubadd_round_ps(A, B, C, R) __extension__ ({ \ +#define _mm512_fmsubadd_round_ps(A, B, C, R) \ (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), \ -(__v16sf)(__m512)(C), \ - (__mmask16)-1, (int)(R)); }) + (__mmask16)-1, (int)(R)) -#define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) __extension__ ({ \ +#define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) \ (__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), \ -(__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) -#define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) __extension__ ({ \ +#define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) \ (__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), \ -(__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C) { return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, @@ -3208,7 +3019,7 @@ _mm512_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C) _MM_FROUND_CUR_DIRECTION); } -static __inline__ 
__m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_fmaddsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, @@ -3218,7 +3029,7 @@ _mm512_mask_fmaddsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask3_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A, @@ -3228,7 +3039,7 @@ _mm512_mask3_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_fmaddsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A, @@ -3238,7 +3049,7 @@ _mm512_maskz_fmaddsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C) { return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, @@ -3248,7 +3059,7 @@ _mm512_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_fmsubadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, @@ -3258,7 +3069,7 @@ _mm512_mask_fmsubadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) { return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A, @@ -3268,337 +3079,309 @@ _mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) _MM_FROUND_CUR_DIRECTION); } -#define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) __extension__ ({ \ +#define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) \ (__m512d)__builtin_ia32_vfmsubpd512_mask3((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), \ (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask3_fmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { - return (__m512d) __builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d)__builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); } -#define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) __extension__ ({ \ +#define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) \ (__m512)__builtin_ia32_vfmsubps512_mask3((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), \ (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)); }) - + (__mmask16)(U), (int)(R)) -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { - return (__m512) __builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return 
(__m512)__builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); } -#define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) __extension__ ({ \ +#define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) \ (__m512d)__builtin_ia32_vfmsubaddpd512_mask3((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), \ (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask3_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) { - return (__m512d) __builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d)__builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); } -#define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) __extension__ ({ \ +#define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) \ (__m512)__builtin_ia32_vfmsubaddps512_mask3((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), \ (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { - return (__m512) __builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512)__builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); } -#define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) __extension__ ({ \ - (__m512d)__builtin_ia32_vfnmaddpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)); }) +#define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) \ + (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ + -(__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R)) -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + -(__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); } -#define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) __extension__ ({ \ - (__m512)__builtin_ia32_vfnmaddps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)); }) +#define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) \ + (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ + -(__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R)) -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + -(__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); } -#define _mm512_mask_fnmsub_round_pd(A, U, B, C, R) __extension__ ({ \ 
- (__m512d)__builtin_ia32_vfnmsubpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)); }) +#define _mm512_mask_fnmsub_round_pd(A, U, B, C, R) \ + (__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ + -(__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R)) -#define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) __extension__ ({ \ - (__m512d)__builtin_ia32_vfnmsubpd512_mask3((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R)); }) +#define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) \ + (__m512d)__builtin_ia32_vfmsubpd512_mask3(-(__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R)) -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_fnmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) { - return (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) __A, + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + -(__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) +{ + return (__m512d) __builtin_ia32_vfmsubpd512_mask3 (-(__v8df) __A, (__v8df) __B, (__v8df) __C, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) -{ - return (__m512d) __builtin_ia32_vfnmsubpd512_mask3 ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} +#define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) \ + (__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ + -(__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R)) -#define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) __extension__ ({ \ - (__m512)__builtin_ia32_vfnmsubps512_mask((__v16sf)(__m512)(A), \ + +#define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) \ + (__m512)__builtin_ia32_vfmsubps512_mask3(-(__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), \ (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)); }) - - -#define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) __extension__ ({ \ - (__m512)__builtin_ia32_vfnmsubps512_mask3((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_fnmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) { - return (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + -(__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask3_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) { - return (__m512) __builtin_ia32_vfnmsubps512_mask3 ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512) __builtin_ia32_vfmsubps512_mask3 (-(__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); } /* Vector permutations */ -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i 
__DEFAULT_FN_ATTRS512 _mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B) { - return (__m512i) __builtin_ia32_vpermt2vard512_mask ((__v16si) __I - /* idx */ , - (__v16si) __A, - (__v16si) __B, - (__mmask16) -1); + return (__m512i)__builtin_ia32_vpermi2vard512((__v16si)__A, (__v16si) __I, + (__v16si) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_permutex2var_epi32(__m512i __A, __mmask16 __U, __m512i __I, + __m512i __B) +{ + return (__m512i)__builtin_ia32_selectd_512(__U, + (__v16si)_mm512_permutex2var_epi32(__A, __I, __B), + (__v16si)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_permutex2var_epi32 (__m512i __A, __mmask16 __U, - __m512i __I, __m512i __B) +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask2_permutex2var_epi32(__m512i __A, __m512i __I, __mmask16 __U, + __m512i __B) { - return (__m512i) __builtin_ia32_vpermt2vard512_mask ((__v16si) __I - /* idx */ , - (__v16si) __A, - (__v16si) __B, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512(__U, + (__v16si)_mm512_permutex2var_epi32(__A, __I, __B), + (__v16si)__I); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_permutex2var_epi32 (__mmask16 __U, __m512i __A, - __m512i __I, __m512i __B) +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_permutex2var_epi32(__mmask16 __U, __m512i __A, __m512i __I, + __m512i __B) { - return (__m512i) __builtin_ia32_vpermt2vard512_maskz ((__v16si) __I - /* idx */ , - (__v16si) __A, - (__v16si) __B, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512(__U, + (__v16si)_mm512_permutex2var_epi32(__A, __I, __B), + (__v16si)_mm512_setzero_si512()); } -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B) { - return (__m512i) __builtin_ia32_vpermt2varq512_mask ((__v8di) __I - /* idx */ , - (__v8di) __A, - (__v8di) __B, - (__mmask8) -1); + return (__m512i)__builtin_ia32_vpermi2varq512((__v8di)__A, (__v8di) __I, + (__v8di) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_permutex2var_epi64 (__m512i __A, __mmask8 __U, __m512i __I, - __m512i __B) +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_permutex2var_epi64(__m512i __A, __mmask8 __U, __m512i __I, + __m512i __B) { - return (__m512i) __builtin_ia32_vpermt2varq512_mask ((__v8di) __I - /* idx */ , - (__v8di) __A, - (__v8di) __B, - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512(__U, + (__v8di)_mm512_permutex2var_epi64(__A, __I, __B), + (__v8di)__A); } +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask2_permutex2var_epi64(__m512i __A, __m512i __I, __mmask8 __U, + __m512i __B) +{ + return (__m512i)__builtin_ia32_selectq_512(__U, + (__v8di)_mm512_permutex2var_epi64(__A, __I, __B), + (__v8di)__I); +} -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_permutex2var_epi64 (__mmask8 __U, __m512i __A, - __m512i __I, __m512i __B) +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I, + __m512i __B) { - return (__m512i) __builtin_ia32_vpermt2varq512_maskz ((__v8di) __I - /* idx */ , - (__v8di) __A, - (__v8di) __B, - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512(__U, + (__v8di)_mm512_permutex2var_epi64(__A, __I, __B), + (__v8di)_mm512_setzero_si512()); } -#define _mm512_alignr_epi64(A, B, I) __extension__ ({ \ - (__m512i)__builtin_shufflevector((__v8di)(__m512i)(B), \ - (__v8di)(__m512i)(A), \ - ((int)(I) & 0x7) 
+ 0, \ - ((int)(I) & 0x7) + 1, \ - ((int)(I) & 0x7) + 2, \ - ((int)(I) & 0x7) + 3, \ - ((int)(I) & 0x7) + 4, \ - ((int)(I) & 0x7) + 5, \ - ((int)(I) & 0x7) + 6, \ - ((int)(I) & 0x7) + 7); }) +#define _mm512_alignr_epi64(A, B, I) \ + (__m512i)__builtin_ia32_alignq512((__v8di)(__m512i)(A), \ + (__v8di)(__m512i)(B), (int)(I)) -#define _mm512_mask_alignr_epi64(W, U, A, B, imm) __extension__({\ +#define _mm512_mask_alignr_epi64(W, U, A, B, imm) \ (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \ - (__v8di)(__m512i)(W)); }) + (__v8di)(__m512i)(W)) -#define _mm512_maskz_alignr_epi64(U, A, B, imm) __extension__({\ +#define _mm512_maskz_alignr_epi64(U, A, B, imm) \ (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \ - (__v8di)_mm512_setzero_si512()); }) - -#define _mm512_alignr_epi32(A, B, I) __extension__ ({ \ - (__m512i)__builtin_shufflevector((__v16si)(__m512i)(B), \ - (__v16si)(__m512i)(A), \ - ((int)(I) & 0xf) + 0, \ - ((int)(I) & 0xf) + 1, \ - ((int)(I) & 0xf) + 2, \ - ((int)(I) & 0xf) + 3, \ - ((int)(I) & 0xf) + 4, \ - ((int)(I) & 0xf) + 5, \ - ((int)(I) & 0xf) + 6, \ - ((int)(I) & 0xf) + 7, \ - ((int)(I) & 0xf) + 8, \ - ((int)(I) & 0xf) + 9, \ - ((int)(I) & 0xf) + 10, \ - ((int)(I) & 0xf) + 11, \ - ((int)(I) & 0xf) + 12, \ - ((int)(I) & 0xf) + 13, \ - ((int)(I) & 0xf) + 14, \ - ((int)(I) & 0xf) + 15); }) - -#define _mm512_mask_alignr_epi32(W, U, A, B, imm) __extension__ ({\ + (__v8di)_mm512_setzero_si512()) + +#define _mm512_alignr_epi32(A, B, I) \ + (__m512i)__builtin_ia32_alignd512((__v16si)(__m512i)(A), \ + (__v16si)(__m512i)(B), (int)(I)) + +#define _mm512_mask_alignr_epi32(W, U, A, B, imm) \ (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \ - (__v16si)(__m512i)(W)); }) + (__v16si)(__m512i)(W)) -#define _mm512_maskz_alignr_epi32(U, A, B, imm) __extension__({\ +#define _mm512_maskz_alignr_epi32(U, A, B, imm) \ (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \ - (__v16si)_mm512_setzero_si512()); }) + (__v16si)_mm512_setzero_si512()) /* Vector Extract */ -#define _mm512_extractf64x4_pd(A, I) __extension__ ({ \ - (__m256d)__builtin_shufflevector((__v8df)(__m512d)(A), \ - (__v8df)_mm512_undefined_pd(), \ - ((I) & 1) ? 4 : 0, \ - ((I) & 1) ? 5 : 1, \ - ((I) & 1) ? 6 : 2, \ - ((I) & 1) ? 
7 : 3); }) - -#define _mm512_mask_extractf64x4_pd(W, U, A, imm) __extension__ ({\ - (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ - (__v4df)_mm512_extractf64x4_pd((A), (imm)), \ - (__v4df)(W)); }) - -#define _mm512_maskz_extractf64x4_pd(U, A, imm) __extension__ ({\ - (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ - (__v4df)_mm512_extractf64x4_pd((A), (imm)), \ - (__v4df)_mm256_setzero_pd()); }) - -#define _mm512_extractf32x4_ps(A, I) __extension__ ({ \ - (__m128)__builtin_shufflevector((__v16sf)(__m512)(A), \ - (__v16sf)_mm512_undefined_ps(), \ - 0 + ((I) & 0x3) * 4, \ - 1 + ((I) & 0x3) * 4, \ - 2 + ((I) & 0x3) * 4, \ - 3 + ((I) & 0x3) * 4); }) - -#define _mm512_mask_extractf32x4_ps(W, U, A, imm) __extension__ ({\ - (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ - (__v4sf)_mm512_extractf32x4_ps((A), (imm)), \ - (__v4sf)(W)); }) - -#define _mm512_maskz_extractf32x4_ps(U, A, imm) __extension__ ({\ - (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ - (__v4sf)_mm512_extractf32x4_ps((A), (imm)), \ - (__v4sf)_mm_setzero_ps()); }) +#define _mm512_extractf64x4_pd(A, I) \ + (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(I), \ + (__v4df)_mm256_undefined_pd(), \ + (__mmask8)-1) + +#define _mm512_mask_extractf64x4_pd(W, U, A, imm) \ + (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \ + (__v4df)(__m256d)(W), \ + (__mmask8)(U)) + +#define _mm512_maskz_extractf64x4_pd(U, A, imm) \ + (__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \ + (__v4df)_mm256_setzero_pd(), \ + (__mmask8)(U)) + +#define _mm512_extractf32x4_ps(A, I) \ + (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(I), \ + (__v4sf)_mm_undefined_ps(), \ + (__mmask8)-1) + +#define _mm512_mask_extractf32x4_ps(W, U, A, imm) \ + (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U)) + +#define _mm512_maskz_extractf32x4_ps(U, A, imm) \ + (__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U)) /* Vector Blend */ -static __inline __m512d __DEFAULT_FN_ATTRS +static __inline __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_blend_pd(__mmask8 __U, __m512d __A, __m512d __W) { return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U, @@ -3606,7 +3389,7 @@ _mm512_mask_blend_pd(__mmask8 __U, __m512d __A, __m512d __W) (__v8df) __A); } -static __inline __m512 __DEFAULT_FN_ATTRS +static __inline __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_blend_ps(__mmask16 __U, __m512 __A, __m512 __W) { return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U, @@ -3614,7 +3397,7 @@ _mm512_mask_blend_ps(__mmask16 __U, __m512 __A, __m512 __W) (__v16sf) __A); } -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_blend_epi64(__mmask8 __U, __m512i __A, __m512i __W) { return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U, @@ -3622,7 +3405,7 @@ _mm512_mask_blend_epi64(__mmask8 __U, __m512i __A, __m512i __W) (__v8di) __A); } -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W) { return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U, @@ -3632,15 +3415,15 @@ _mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W) /* Compare */ -#define _mm512_cmp_round_ps_mask(A, B, P, R) __extension__ ({ \ +#define _mm512_cmp_round_ps_mask(A, B, P, R) \ 
(__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), (int)(P), \ - (__mmask16)-1, (int)(R)); }) + (__mmask16)-1, (int)(R)) -#define _mm512_mask_cmp_round_ps_mask(U, A, B, P, R) __extension__ ({ \ +#define _mm512_mask_cmp_round_ps_mask(U, A, B, P, R) \ (__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), (int)(P), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) #define _mm512_cmp_ps_mask(A, B, P) \ _mm512_cmp_round_ps_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION) @@ -3687,15 +3470,15 @@ _mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W) #define _mm512_mask_cmpord_ps_mask(k, A, B) \ _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_ORD_Q) -#define _mm512_cmp_round_pd_mask(A, B, P, R) __extension__ ({ \ +#define _mm512_cmp_round_pd_mask(A, B, P, R) \ (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), (int)(P), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm512_mask_cmp_round_pd_mask(U, A, B, P, R) __extension__ ({ \ +#define _mm512_mask_cmp_round_pd_mask(U, A, B, P, R) \ (__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), (int)(P), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) #define _mm512_cmp_pd_mask(A, B, P) \ _mm512_cmp_round_pd_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION) @@ -3744,23 +3527,23 @@ _mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W) /* Conversion */ -#define _mm512_cvtt_roundps_epu32(A, R) __extension__ ({ \ +#define _mm512_cvtt_roundps_epu32(A, R) \ (__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \ (__v16si)_mm512_undefined_epi32(), \ - (__mmask16)-1, (int)(R)); }) + (__mmask16)-1, (int)(R)) -#define _mm512_mask_cvtt_roundps_epu32(W, U, A, R) __extension__ ({ \ +#define _mm512_mask_cvtt_roundps_epu32(W, U, A, R) \ (__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \ (__v16si)(__m512i)(W), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) -#define _mm512_maskz_cvtt_roundps_epu32(U, A, R) __extension__ ({ \ +#define _mm512_maskz_cvtt_roundps_epu32(U, A, R) \ (__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \ (__v16si)_mm512_setzero_si512(), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_cvttps_epu32(__m512 __A) { return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, @@ -3770,7 +3553,7 @@ _mm512_cvttps_epu32(__m512 __A) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_cvttps_epu32 (__m512i __W, __mmask16 __U, __m512 __A) { return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, @@ -3779,7 +3562,7 @@ _mm512_mask_cvttps_epu32 (__m512i __W, __mmask16 __U, __m512 __A) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvttps_epu32 (__mmask16 __U, __m512 __A) { return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, @@ -3788,70 +3571,65 @@ _mm512_maskz_cvttps_epu32 (__mmask16 __U, __m512 __A) _MM_FROUND_CUR_DIRECTION); } -#define _mm512_cvt_roundepi32_ps(A, R) __extension__ ({ \ +#define _mm512_cvt_roundepi32_ps(A, R) \ (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \ (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1, (int)(R)); }) + (__mmask16)-1, (int)(R)) -#define 
_mm512_mask_cvt_roundepi32_ps(W, U, A, R) __extension__ ({ \ +#define _mm512_mask_cvt_roundepi32_ps(W, U, A, R) \ (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \ (__v16sf)(__m512)(W), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) -#define _mm512_maskz_cvt_roundepi32_ps(U, A, R) __extension__ ({ \ +#define _mm512_maskz_cvt_roundepi32_ps(U, A, R) \ (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \ (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) -#define _mm512_cvt_roundepu32_ps(A, R) __extension__ ({ \ +#define _mm512_cvt_roundepu32_ps(A, R) \ (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \ (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1, (int)(R)); }) + (__mmask16)-1, (int)(R)) -#define _mm512_mask_cvt_roundepu32_ps(W, U, A, R) __extension__ ({ \ +#define _mm512_mask_cvt_roundepu32_ps(W, U, A, R) \ (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \ (__v16sf)(__m512)(W), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) -#define _mm512_maskz_cvt_roundepu32_ps(U, A, R) __extension__ ({ \ +#define _mm512_maskz_cvt_roundepu32_ps(U, A, R) \ (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \ (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtepu32_ps (__m512i __A) { - return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A, - (__v16sf) _mm512_undefined_ps (), - (__mmask16) -1, - _MM_FROUND_CUR_DIRECTION); + return (__m512)__builtin_convertvector((__v16su)__A, __v16sf); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_cvtepu32_ps (__m512 __W, __mmask16 __U, __m512i __A) { - return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A, - (__v16sf) __W, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_cvtepu32_ps(__A), + (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A) { - return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A, - (__v16sf) _mm512_setzero_ps (), - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_cvtepu32_ps(__A), + (__v16sf)_mm512_setzero_ps()); } -static __inline __m512d __DEFAULT_FN_ATTRS +static __inline __m512d __DEFAULT_FN_ATTRS512 _mm512_cvtepi32_pd(__m256i __A) { return (__m512d)__builtin_convertvector((__v8si)__A, __v8df); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, @@ -3859,7 +3637,7 @@ _mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A) (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, @@ -3867,52 +3645,47 @@ _mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A) (__v8df)_mm512_setzero_pd()); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_cvtepi32lo_pd(__m512i __A) { return (__m512d) 
_mm512_cvtepi32_pd(_mm512_castsi512_si256(__A)); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_cvtepi32lo_pd(__m512d __W, __mmask8 __U,__m512i __A) { return (__m512d) _mm512_mask_cvtepi32_pd(__W, __U, _mm512_castsi512_si256(__A)); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtepi32_ps (__m512i __A) { - return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A, - (__v16sf) _mm512_undefined_ps (), - (__mmask16) -1, - _MM_FROUND_CUR_DIRECTION); + return (__m512)__builtin_convertvector((__v16si)__A, __v16sf); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_cvtepi32_ps (__m512 __W, __mmask16 __U, __m512i __A) { - return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A, - (__v16sf) __W, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_cvtepi32_ps(__A), + (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A) { - return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A, - (__v16sf) _mm512_setzero_ps (), - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_cvtepi32_ps(__A), + (__v16sf)_mm512_setzero_ps()); } -static __inline __m512d __DEFAULT_FN_ATTRS +static __inline __m512d __DEFAULT_FN_ATTRS512 _mm512_cvtepu32_pd(__m256i __A) { return (__m512d)__builtin_convertvector((__v8su)__A, __v8df); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, @@ -3920,7 +3693,7 @@ _mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A) (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, @@ -3928,34 +3701,34 @@ _mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A) (__v8df)_mm512_setzero_pd()); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_cvtepu32lo_pd(__m512i __A) { return (__m512d) _mm512_cvtepu32_pd(_mm512_castsi512_si256(__A)); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_cvtepu32lo_pd(__m512d __W, __mmask8 __U,__m512i __A) { return (__m512d) _mm512_mask_cvtepu32_pd(__W, __U, _mm512_castsi512_si256(__A)); } -#define _mm512_cvt_roundpd_ps(A, R) __extension__ ({ \ +#define _mm512_cvt_roundpd_ps(A, R) \ (__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \ (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm512_mask_cvt_roundpd_ps(W, U, A, R) __extension__ ({ \ +#define _mm512_mask_cvt_roundpd_ps(W, U, A, R) \ (__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \ (__v8sf)(__m256)(W), (__mmask8)(U), \ - (int)(R)); }) + (int)(R)) -#define _mm512_maskz_cvt_roundpd_ps(U, A, R) __extension__ ({ \ +#define _mm512_maskz_cvt_roundpd_ps(U, A, R) \ (__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \ (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -static __inline__ __m256 
__DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS512 _mm512_cvtpd_ps (__m512d __A) { return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, @@ -3964,7 +3737,7 @@ _mm512_cvtpd_ps (__m512d __A) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS512 _mm512_mask_cvtpd_ps (__m256 __W, __mmask8 __U, __m512d __A) { return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, @@ -3973,7 +3746,7 @@ _mm512_mask_cvtpd_ps (__m256 __W, __mmask8 __U, __m512d __A) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A) { return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, @@ -3982,7 +3755,7 @@ _mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpd_pslo (__m512d __A) { return (__m512) __builtin_shufflevector((__v8sf) _mm512_cvtpd_ps(__A), @@ -3990,7 +3763,7 @@ _mm512_cvtpd_pslo (__m512d __A) 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_cvtpd_pslo (__m512 __W, __mmask8 __U,__m512d __A) { return (__m512) __builtin_shufflevector ( @@ -4000,53 +3773,53 @@ _mm512_mask_cvtpd_pslo (__m512 __W, __mmask8 __U,__m512d __A) 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); } -#define _mm512_cvt_roundps_ph(A, I) __extension__ ({ \ +#define _mm512_cvt_roundps_ph(A, I) \ (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ (__v16hi)_mm256_undefined_si256(), \ - (__mmask16)-1); }) + (__mmask16)-1) -#define _mm512_mask_cvt_roundps_ph(U, W, A, I) __extension__ ({ \ +#define _mm512_mask_cvt_roundps_ph(U, W, A, I) \ (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ (__v16hi)(__m256i)(U), \ - (__mmask16)(W)); }) + (__mmask16)(W)) -#define _mm512_maskz_cvt_roundps_ph(W, A, I) __extension__ ({ \ +#define _mm512_maskz_cvt_roundps_ph(W, A, I) \ (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ (__v16hi)_mm256_setzero_si256(), \ - (__mmask16)(W)); }) + (__mmask16)(W)) -#define _mm512_cvtps_ph(A, I) __extension__ ({ \ +#define _mm512_cvtps_ph(A, I) \ (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ (__v16hi)_mm256_setzero_si256(), \ - (__mmask16)-1); }) + (__mmask16)-1) -#define _mm512_mask_cvtps_ph(U, W, A, I) __extension__ ({ \ +#define _mm512_mask_cvtps_ph(U, W, A, I) \ (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ (__v16hi)(__m256i)(U), \ - (__mmask16)(W)); }) + (__mmask16)(W)) -#define _mm512_maskz_cvtps_ph(W, A, I) __extension__ ({\ +#define _mm512_maskz_cvtps_ph(W, A, I) \ (__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ (__v16hi)_mm256_setzero_si256(), \ - (__mmask16)(W)); }) + (__mmask16)(W)) -#define _mm512_cvt_roundph_ps(A, R) __extension__ ({ \ +#define _mm512_cvt_roundph_ps(A, R) \ (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \ (__v16sf)_mm512_undefined_ps(), \ - (__mmask16)-1, (int)(R)); }) + (__mmask16)-1, (int)(R)) -#define _mm512_mask_cvt_roundph_ps(W, U, A, R) __extension__ ({ \ +#define _mm512_mask_cvt_roundph_ps(W, U, A, R) \ (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \ (__v16sf)(__m512)(W), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) 
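The masked wrappers in this hunk follow one convention: compute the unmasked result, then blend it with the passthrough operand W (merge-masking, _mm512_mask_*) or with zero (zero-masking, _mm512_maskz_*) under the per-lane mask; the __builtin_ia32_select{d,q,ps,pd}_512 rewrites above spell the same thing out. A minimal scalar sketch of that semantics in plain C follows; the 16-lane width and the helper names mask_op/maskz_op are illustrative assumptions, not clang builtins.

/* Scalar model of merge-masking vs. zero-masking as used by the
 * _mm512_mask_* / _mm512_maskz_* wrappers; illustrative only. */
#include <stdint.h>
#include <stdio.h>

#define LANES 16

/* Merge-masking: lanes whose mask bit is 0 keep the passthrough value w. */
static void mask_op(float *dst, uint16_t mask, const float *w,
                    const float *result) {
  for (int i = 0; i < LANES; ++i)
    dst[i] = (mask >> i) & 1 ? result[i] : w[i];
}

/* Zero-masking: lanes whose mask bit is 0 are forced to 0.0f. */
static void maskz_op(float *dst, uint16_t mask, const float *result) {
  for (int i = 0; i < LANES; ++i)
    dst[i] = (mask >> i) & 1 ? result[i] : 0.0f;
}

int main(void) {
  float w[LANES], r[LANES], merged[LANES], zeroed[LANES];
  for (int i = 0; i < LANES; ++i) {
    w[i] = -1.0f;     /* passthrough operand */
    r[i] = (float)i;  /* stand-in for the unmasked intrinsic result */
  }
  mask_op(merged, 0x00FF, w, r);  /* low 8 lanes take r, high 8 keep w */
  maskz_op(zeroed, 0x00FF, r);    /* low 8 lanes take r, high 8 become 0 */
  printf("%g %g %g %g\n", merged[0], merged[15], zeroed[0], zeroed[15]);
  return 0;
}

In these terms, _mm512_mask_cvt_roundph_ps(W, U, A, R) above is the merge form and _mm512_maskz_cvt_roundph_ps(U, A, R) the zero form.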
-#define _mm512_maskz_cvt_roundph_ps(U, A, R) __extension__ ({ \ +#define _mm512_maskz_cvt_roundph_ps(U, A, R) \ (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \ (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) -static __inline __m512 __DEFAULT_FN_ATTRS +static __inline __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtph_ps(__m256i __A) { return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, @@ -4056,7 +3829,7 @@ _mm512_cvtph_ps(__m256i __A) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_cvtph_ps (__m512 __W, __mmask16 __U, __m256i __A) { return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, @@ -4065,7 +3838,7 @@ _mm512_mask_cvtph_ps (__m512 __W, __mmask16 __U, __m256i __A) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A) { return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, @@ -4074,22 +3847,22 @@ _mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A) _MM_FROUND_CUR_DIRECTION); } -#define _mm512_cvtt_roundpd_epi32(A, R) __extension__ ({ \ +#define _mm512_cvtt_roundpd_epi32(A, R) \ (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \ (__v8si)_mm256_setzero_si256(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm512_mask_cvtt_roundpd_epi32(W, U, A, R) __extension__ ({ \ +#define _mm512_mask_cvtt_roundpd_epi32(W, U, A, R) \ (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \ (__v8si)(__m256i)(W), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm512_maskz_cvtt_roundpd_epi32(U, A, R) __extension__ ({ \ +#define _mm512_maskz_cvtt_roundpd_epi32(U, A, R) \ (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \ (__v8si)_mm256_setzero_si256(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -static __inline __m256i __DEFAULT_FN_ATTRS +static __inline __m256i __DEFAULT_FN_ATTRS512 _mm512_cvttpd_epi32(__m512d __a) { return (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df) __a, @@ -4098,7 +3871,7 @@ _mm512_cvttpd_epi32(__m512d __a) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvttpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A) { return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, @@ -4107,7 +3880,7 @@ _mm512_mask_cvttpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvttpd_epi32 (__mmask8 __U, __m512d __A) { return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, @@ -4116,22 +3889,22 @@ _mm512_maskz_cvttpd_epi32 (__mmask8 __U, __m512d __A) _MM_FROUND_CUR_DIRECTION); } -#define _mm512_cvtt_roundps_epi32(A, R) __extension__ ({ \ +#define _mm512_cvtt_roundps_epi32(A, R) \ (__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \ (__v16si)_mm512_setzero_si512(), \ - (__mmask16)-1, (int)(R)); }) + (__mmask16)-1, (int)(R)) -#define _mm512_mask_cvtt_roundps_epi32(W, U, A, R) __extension__ ({ \ +#define _mm512_mask_cvtt_roundps_epi32(W, U, A, R) \ (__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \ (__v16si)(__m512i)(W), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) -#define _mm512_maskz_cvtt_roundps_epi32(U, A, R) __extension__ ({ \ +#define 
_mm512_maskz_cvtt_roundps_epi32(U, A, R) \ (__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \ (__v16si)_mm512_setzero_si512(), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_cvttps_epi32(__m512 __a) { return (__m512i) @@ -4140,7 +3913,7 @@ _mm512_cvttps_epi32(__m512 __a) (__mmask16) -1, _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_cvttps_epi32 (__m512i __W, __mmask16 __U, __m512 __A) { return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, @@ -4149,7 +3922,7 @@ _mm512_mask_cvttps_epi32 (__m512i __W, __mmask16 __U, __m512 __A) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvttps_epi32 (__mmask16 __U, __m512 __A) { return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, @@ -4158,22 +3931,22 @@ _mm512_maskz_cvttps_epi32 (__mmask16 __U, __m512 __A) _MM_FROUND_CUR_DIRECTION); } -#define _mm512_cvt_roundps_epi32(A, R) __extension__ ({ \ +#define _mm512_cvt_roundps_epi32(A, R) \ (__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \ (__v16si)_mm512_setzero_si512(), \ - (__mmask16)-1, (int)(R)); }) + (__mmask16)-1, (int)(R)) -#define _mm512_mask_cvt_roundps_epi32(W, U, A, R) __extension__ ({ \ +#define _mm512_mask_cvt_roundps_epi32(W, U, A, R) \ (__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \ (__v16si)(__m512i)(W), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) -#define _mm512_maskz_cvt_roundps_epi32(U, A, R) __extension__ ({ \ +#define _mm512_maskz_cvt_roundps_epi32(U, A, R) \ (__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \ (__v16si)_mm512_setzero_si512(), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvtps_epi32 (__m512 __A) { return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, @@ -4182,7 +3955,7 @@ _mm512_cvtps_epi32 (__m512 __A) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtps_epi32 (__m512i __W, __mmask16 __U, __m512 __A) { return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, @@ -4191,7 +3964,7 @@ _mm512_mask_cvtps_epi32 (__m512i __W, __mmask16 __U, __m512 __A) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtps_epi32 (__mmask16 __U, __m512 __A) { return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, @@ -4201,22 +3974,22 @@ _mm512_maskz_cvtps_epi32 (__mmask16 __U, __m512 __A) _MM_FROUND_CUR_DIRECTION); } -#define _mm512_cvt_roundpd_epi32(A, R) __extension__ ({ \ +#define _mm512_cvt_roundpd_epi32(A, R) \ (__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \ (__v8si)_mm256_setzero_si256(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm512_mask_cvt_roundpd_epi32(W, U, A, R) __extension__ ({ \ +#define _mm512_mask_cvt_roundpd_epi32(W, U, A, R) \ (__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \ (__v8si)(__m256i)(W), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm512_maskz_cvt_roundpd_epi32(U, A, R) __extension__ ({ \ +#define _mm512_maskz_cvt_roundpd_epi32(U, A, R) \ 
(__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \ (__v8si)_mm256_setzero_si256(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_cvtpd_epi32 (__m512d __A) { return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, @@ -4226,7 +3999,7 @@ _mm512_cvtpd_epi32 (__m512d __A) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A) { return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, @@ -4235,7 +4008,7 @@ _mm512_mask_cvtpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtpd_epi32 (__mmask8 __U, __m512d __A) { return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, @@ -4245,32 +4018,32 @@ _mm512_maskz_cvtpd_epi32 (__mmask8 __U, __m512d __A) _MM_FROUND_CUR_DIRECTION); } -#define _mm512_cvt_roundps_epu32(A, R) __extension__ ({ \ +#define _mm512_cvt_roundps_epu32(A, R) \ (__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \ (__v16si)_mm512_setzero_si512(), \ - (__mmask16)-1, (int)(R)); }) + (__mmask16)-1, (int)(R)) -#define _mm512_mask_cvt_roundps_epu32(W, U, A, R) __extension__ ({ \ +#define _mm512_mask_cvt_roundps_epu32(W, U, A, R) \ (__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \ (__v16si)(__m512i)(W), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) -#define _mm512_maskz_cvt_roundps_epu32(U, A, R) __extension__ ({ \ +#define _mm512_maskz_cvt_roundps_epu32(U, A, R) \ (__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \ (__v16si)_mm512_setzero_si512(), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvtps_epu32 ( __m512 __A) { return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,\ (__v16si)\ - _mm512_undefined_epi32 (),\ + _mm512_undefined_epi32 (), (__mmask16) -1,\ - _MM_FROUND_CUR_DIRECTION);\ + _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtps_epu32 (__m512i __W, __mmask16 __U, __m512 __A) { return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, @@ -4279,7 +4052,7 @@ _mm512_mask_cvtps_epu32 (__m512i __W, __mmask16 __U, __m512 __A) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtps_epu32 ( __mmask16 __U, __m512 __A) { return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, @@ -4289,22 +4062,22 @@ _mm512_maskz_cvtps_epu32 ( __mmask16 __U, __m512 __A) _MM_FROUND_CUR_DIRECTION); } -#define _mm512_cvt_roundpd_epu32(A, R) __extension__ ({ \ +#define _mm512_cvt_roundpd_epu32(A, R) \ (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \ (__v8si)_mm256_setzero_si256(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm512_mask_cvt_roundpd_epu32(W, U, A, R) __extension__ ({ \ +#define _mm512_mask_cvt_roundpd_epu32(W, U, A, R) \ (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \ - (__v8si)(W), \ - (__mmask8)(U), (int)(R)); }) + (__v8si)(__m256i)(W), \ + (__mmask8)(U), (int)(R)) -#define _mm512_maskz_cvt_roundpd_epu32(U, A, R) __extension__ ({ \ +#define 
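/* [Editor's note, not part of the diff] The conversion macros in the hunks above lose the
 * GNU statement-expression wrapper (__extension__ ({ ... })) and become plain cast
 * expressions over the same builtin calls; callers see no difference.  A minimal usage
 * sketch of two of the touched intrinsics, assuming a translation unit built with
 * clang -mavx512f and <immintrin.h> (function names here are illustrative only): */
#include <immintrin.h>

__m512i round_ps_to_epi32(__m512 v) {
  /* explicit round-to-nearest with floating-point exceptions suppressed */
  return _mm512_cvt_roundps_epi32(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

__m256i truncate_pd_to_epi32(__m512d v) {
  /* truncating conversion using the current rounding environment */
  return _mm512_cvttpd_epi32(v);
}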
_mm512_maskz_cvt_roundpd_epu32(U, A, R) \ (__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \ (__v8si)_mm256_setzero_si256(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_cvtpd_epu32 (__m512d __A) { return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, @@ -4314,7 +4087,7 @@ _mm512_cvtpd_epu32 (__m512d __A) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A) { return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, @@ -4323,7 +4096,7 @@ _mm512_mask_cvtpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A) { return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, @@ -4333,13 +4106,13 @@ _mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A) _MM_FROUND_CUR_DIRECTION); } -static __inline__ double __DEFAULT_FN_ATTRS +static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_cvtsd_f64(__m512d __a) { return __a[0]; } -static __inline__ float __DEFAULT_FN_ATTRS +static __inline__ float __DEFAULT_FN_ATTRS512 _mm512_cvtss_f32(__m512 __a) { return __a[0]; @@ -4347,14 +4120,14 @@ _mm512_cvtss_f32(__m512 __a) /* Unpack and Interleave */ -static __inline __m512d __DEFAULT_FN_ATTRS +static __inline __m512d __DEFAULT_FN_ATTRS512 _mm512_unpackhi_pd(__m512d __a, __m512d __b) { return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b, 1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_unpackhi_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, @@ -4362,7 +4135,7 @@ _mm512_mask_unpackhi_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_unpackhi_pd(__mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, @@ -4370,14 +4143,14 @@ _mm512_maskz_unpackhi_pd(__mmask8 __U, __m512d __A, __m512d __B) (__v8df)_mm512_setzero_pd()); } -static __inline __m512d __DEFAULT_FN_ATTRS +static __inline __m512d __DEFAULT_FN_ATTRS512 _mm512_unpacklo_pd(__m512d __a, __m512d __b) { return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b, 0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_unpacklo_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, @@ -4385,7 +4158,7 @@ _mm512_mask_unpacklo_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_unpacklo_pd (__mmask8 __U, __m512d __A, __m512d __B) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, @@ -4393,7 +4166,7 @@ _mm512_maskz_unpacklo_pd (__mmask8 __U, __m512d __A, __m512d __B) (__v8df)_mm512_setzero_pd()); } -static __inline __m512 __DEFAULT_FN_ATTRS +static __inline __m512 __DEFAULT_FN_ATTRS512 _mm512_unpackhi_ps(__m512 __a, __m512 __b) { return 
(__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b, @@ -4403,7 +4176,7 @@ _mm512_unpackhi_ps(__m512 __a, __m512 __b) 2+12, 18+12, 3+12, 19+12); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_unpackhi_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16) __U, @@ -4411,7 +4184,7 @@ _mm512_mask_unpackhi_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_unpackhi_ps (__mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16) __U, @@ -4419,7 +4192,7 @@ _mm512_maskz_unpackhi_ps (__mmask16 __U, __m512 __A, __m512 __B) (__v16sf)_mm512_setzero_ps()); } -static __inline __m512 __DEFAULT_FN_ATTRS +static __inline __m512 __DEFAULT_FN_ATTRS512 _mm512_unpacklo_ps(__m512 __a, __m512 __b) { return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b, @@ -4429,7 +4202,7 @@ _mm512_unpacklo_ps(__m512 __a, __m512 __b) 0+12, 16+12, 1+12, 17+12); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_unpacklo_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16) __U, @@ -4437,7 +4210,7 @@ _mm512_mask_unpacklo_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_unpacklo_ps (__mmask16 __U, __m512 __A, __m512 __B) { return (__m512)__builtin_ia32_selectps_512((__mmask16) __U, @@ -4445,7 +4218,7 @@ _mm512_maskz_unpacklo_ps (__mmask16 __U, __m512 __A, __m512 __B) (__v16sf)_mm512_setzero_ps()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_unpackhi_epi32(__m512i __A, __m512i __B) { return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B, @@ -4455,7 +4228,7 @@ _mm512_unpackhi_epi32(__m512i __A, __m512i __B) 2+12, 18+12, 3+12, 19+12); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_unpackhi_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U, @@ -4463,7 +4236,7 @@ _mm512_mask_unpackhi_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) (__v16si)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_unpackhi_epi32(__mmask16 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U, @@ -4471,7 +4244,7 @@ _mm512_maskz_unpackhi_epi32(__mmask16 __U, __m512i __A, __m512i __B) (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_unpacklo_epi32(__m512i __A, __m512i __B) { return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B, @@ -4481,7 +4254,7 @@ _mm512_unpacklo_epi32(__m512i __A, __m512i __B) 0+12, 16+12, 1+12, 17+12); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_unpacklo_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U, @@ -4489,7 +4262,7 @@ _mm512_mask_unpacklo_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) (__v16si)__W); } -static __inline__ __m512i 
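/* [Editor's note, not part of the diff] The unpackhi/unpacklo bodies above are plain
 * __builtin_shufflevector calls whose index lists interleave the high or low half of
 * each 128-bit lane of the two sources.  A 4-element sketch of the same idea using
 * clang's generic vector extension (type and function names are illustrative only): */
typedef float v4sf_sketch __attribute__((__vector_size__(16)));

static inline v4sf_sketch unpacklo_sketch(v4sf_sketch a, v4sf_sketch b) {
  return __builtin_shufflevector(a, b, 0, 4, 1, 5);  /* a0 b0 a1 b1 */
}

static inline v4sf_sketch unpackhi_sketch(v4sf_sketch a, v4sf_sketch b) {
  return __builtin_shufflevector(a, b, 2, 6, 3, 7);  /* a2 b2 a3 b3 */
}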
__DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_unpacklo_epi32(__mmask16 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U, @@ -4497,14 +4270,14 @@ _mm512_maskz_unpacklo_epi32(__mmask16 __U, __m512i __A, __m512i __B) (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_unpackhi_epi64(__m512i __A, __m512i __B) { return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B, 1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_unpackhi_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U, @@ -4512,7 +4285,7 @@ _mm512_mask_unpackhi_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_unpackhi_epi64(__mmask8 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U, @@ -4520,14 +4293,14 @@ _mm512_maskz_unpackhi_epi64(__mmask8 __U, __m512i __A, __m512i __B) (__v8di)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_unpacklo_epi64 (__m512i __A, __m512i __B) { return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B, 0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_unpacklo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U, @@ -4535,7 +4308,7 @@ _mm512_mask_unpacklo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_unpacklo_epi64 (__mmask8 __U, __m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U, @@ -4546,16 +4319,16 @@ _mm512_maskz_unpacklo_epi64 (__mmask8 __U, __m512i __A, __m512i __B) /* SIMD load ops */ -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_loadu_si512 (void const *__P) { - return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) -1); + struct __loadu_si512 { + __m512i __v; + } __attribute__((__packed__, __may_alias__)); + return ((struct __loadu_si512*)__P)->__v; } -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_loadu_epi32 (__m512i __W, __mmask16 __U, void const *__P) { return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P, @@ -4564,7 +4337,7 @@ _mm512_mask_loadu_epi32 (__m512i __W, __mmask16 __U, void const *__P) } -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_loadu_epi32(__mmask16 __U, void const *__P) { return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *)__P, @@ -4573,7 +4346,7 @@ _mm512_maskz_loadu_epi32(__mmask16 __U, void const *__P) (__mmask16) __U); } -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void const *__P) { return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *) __P, @@ -4581,7 +4354,7 @@ _mm512_mask_loadu_epi64 
(__m512i __W, __mmask8 __U, void const *__P) (__mmask8) __U); } -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_loadu_epi64(__mmask8 __U, void const *__P) { return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *)__P, @@ -4590,7 +4363,7 @@ _mm512_maskz_loadu_epi64(__mmask8 __U, void const *__P) (__mmask8) __U); } -static __inline __m512 __DEFAULT_FN_ATTRS +static __inline __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_loadu_ps (__m512 __W, __mmask16 __U, void const *__P) { return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P, @@ -4598,7 +4371,7 @@ _mm512_mask_loadu_ps (__m512 __W, __mmask16 __U, void const *__P) (__mmask16) __U); } -static __inline __m512 __DEFAULT_FN_ATTRS +static __inline __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_loadu_ps(__mmask16 __U, void const *__P) { return (__m512) __builtin_ia32_loadups512_mask ((const float *)__P, @@ -4607,7 +4380,7 @@ _mm512_maskz_loadu_ps(__mmask16 __U, void const *__P) (__mmask16) __U); } -static __inline __m512d __DEFAULT_FN_ATTRS +static __inline __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_loadu_pd (__m512d __W, __mmask8 __U, void const *__P) { return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P, @@ -4615,7 +4388,7 @@ _mm512_mask_loadu_pd (__m512d __W, __mmask8 __U, void const *__P) (__mmask8) __U); } -static __inline __m512d __DEFAULT_FN_ATTRS +static __inline __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_loadu_pd(__mmask8 __U, void const *__P) { return (__m512d) __builtin_ia32_loadupd512_mask ((const double *)__P, @@ -4624,7 +4397,7 @@ _mm512_maskz_loadu_pd(__mmask8 __U, void const *__P) (__mmask8) __U); } -static __inline __m512d __DEFAULT_FN_ATTRS +static __inline __m512d __DEFAULT_FN_ATTRS512 _mm512_loadu_pd(void const *__p) { struct __loadu_pd { @@ -4633,7 +4406,7 @@ _mm512_loadu_pd(void const *__p) return ((struct __loadu_pd*)__p)->__v; } -static __inline __m512 __DEFAULT_FN_ATTRS +static __inline __m512 __DEFAULT_FN_ATTRS512 _mm512_loadu_ps(void const *__p) { struct __loadu_ps { @@ -4642,16 +4415,13 @@ _mm512_loadu_ps(void const *__p) return ((struct __loadu_ps*)__p)->__v; } -static __inline __m512 __DEFAULT_FN_ATTRS +static __inline __m512 __DEFAULT_FN_ATTRS512 _mm512_load_ps(void const *__p) { - return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__p, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) -1); + return *(__m512*)__p; } -static __inline __m512 __DEFAULT_FN_ATTRS +static __inline __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_load_ps (__m512 __W, __mmask16 __U, void const *__P) { return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *) __P, @@ -4659,7 +4429,7 @@ _mm512_mask_load_ps (__m512 __W, __mmask16 __U, void const *__P) (__mmask16) __U); } -static __inline __m512 __DEFAULT_FN_ATTRS +static __inline __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_load_ps(__mmask16 __U, void const *__P) { return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__P, @@ -4668,16 +4438,13 @@ _mm512_maskz_load_ps(__mmask16 __U, void const *__P) (__mmask16) __U); } -static __inline __m512d __DEFAULT_FN_ATTRS +static __inline __m512d __DEFAULT_FN_ATTRS512 _mm512_load_pd(void const *__p) { - return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__p, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) -1); + return *(__m512d*)__p; } -static __inline __m512d __DEFAULT_FN_ATTRS +static __inline __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_load_pd (__m512d __W, __mmask8 __U, void const *__P) { return (__m512d) 
__builtin_ia32_loadapd512_mask ((const __v8df *) __P, @@ -4685,7 +4452,7 @@ _mm512_mask_load_pd (__m512d __W, __mmask8 __U, void const *__P) (__mmask8) __U); } -static __inline __m512d __DEFAULT_FN_ATTRS +static __inline __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_load_pd(__mmask8 __U, void const *__P) { return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__P, @@ -4694,19 +4461,19 @@ _mm512_maskz_load_pd(__mmask8 __U, void const *__P) (__mmask8) __U); } -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_load_si512 (void const *__P) { return *(__m512i *) __P; } -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_load_epi32 (void const *__P) { return *(__m512i *) __P; } -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_load_epi64 (void const *__P) { return *(__m512i *) __P; @@ -4714,90 +4481,98 @@ _mm512_load_epi64 (void const *__P) /* SIMD store ops */ -static __inline void __DEFAULT_FN_ATTRS +static __inline void __DEFAULT_FN_ATTRS512 _mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A) { __builtin_ia32_storedqudi512_mask ((long long *)__P, (__v8di) __A, (__mmask8) __U); } -static __inline void __DEFAULT_FN_ATTRS +static __inline void __DEFAULT_FN_ATTRS512 _mm512_storeu_si512 (void *__P, __m512i __A) { - __builtin_ia32_storedqusi512_mask ((int *) __P, (__v16si) __A, - (__mmask16) -1); + struct __storeu_si512 { + __m512i __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_si512*)__P)->__v = __A; } -static __inline void __DEFAULT_FN_ATTRS +static __inline void __DEFAULT_FN_ATTRS512 _mm512_mask_storeu_epi32(void *__P, __mmask16 __U, __m512i __A) { __builtin_ia32_storedqusi512_mask ((int *)__P, (__v16si) __A, (__mmask16) __U); } -static __inline void __DEFAULT_FN_ATTRS +static __inline void __DEFAULT_FN_ATTRS512 _mm512_mask_storeu_pd(void *__P, __mmask8 __U, __m512d __A) { __builtin_ia32_storeupd512_mask ((double *)__P, (__v8df) __A, (__mmask8) __U); } -static __inline void __DEFAULT_FN_ATTRS +static __inline void __DEFAULT_FN_ATTRS512 _mm512_storeu_pd(void *__P, __m512d __A) { - __builtin_ia32_storeupd512_mask((double *)__P, (__v8df)__A, (__mmask8)-1); + struct __storeu_pd { + __m512d __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_pd*)__P)->__v = __A; } -static __inline void __DEFAULT_FN_ATTRS +static __inline void __DEFAULT_FN_ATTRS512 _mm512_mask_storeu_ps(void *__P, __mmask16 __U, __m512 __A) { __builtin_ia32_storeups512_mask ((float *)__P, (__v16sf) __A, (__mmask16) __U); } -static __inline void __DEFAULT_FN_ATTRS +static __inline void __DEFAULT_FN_ATTRS512 _mm512_storeu_ps(void *__P, __m512 __A) { - __builtin_ia32_storeups512_mask((float *)__P, (__v16sf)__A, (__mmask16)-1); + struct __storeu_ps { + __m512 __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_ps*)__P)->__v = __A; } -static __inline void __DEFAULT_FN_ATTRS +static __inline void __DEFAULT_FN_ATTRS512 _mm512_mask_store_pd(void *__P, __mmask8 __U, __m512d __A) { __builtin_ia32_storeapd512_mask ((__v8df *)__P, (__v8df) __A, (__mmask8) __U); } -static __inline void __DEFAULT_FN_ATTRS +static __inline void __DEFAULT_FN_ATTRS512 _mm512_store_pd(void *__P, __m512d __A) { *(__m512d*)__P = __A; } -static __inline void __DEFAULT_FN_ATTRS +static __inline void __DEFAULT_FN_ATTRS512 _mm512_mask_store_ps(void *__P, __mmask16 __U, __m512 __A) { __builtin_ia32_storeaps512_mask ((__v16sf *)__P, (__v16sf) __A, 
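/* [Editor's note, not part of the diff] The loadu/storeu hunks above replace the masked
 * load/store builtins with a read or write through a one-member struct marked packed and
 * may_alias: packed drops the alignment requirement, may_alias exempts the access from
 * strict-aliasing analysis.  A generic sketch of that pattern on a small vector type
 * (types and helper names here are illustrative, not taken from the header): */
#include <stdint.h>

typedef int32_t v4si_sketch __attribute__((__vector_size__(16)));

static inline v4si_sketch loadu_sketch(const void *p) {
  struct __loadu_sketch {
    v4si_sketch v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_sketch *)p)->v;   /* unaligned, aliasing-safe load */
}

static inline void storeu_sketch(void *p, v4si_sketch a) {
  struct __storeu_sketch {
    v4si_sketch v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_sketch *)p)->v = a;           /* unaligned, aliasing-safe store */
}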
(__mmask16) __U); } -static __inline void __DEFAULT_FN_ATTRS +static __inline void __DEFAULT_FN_ATTRS512 _mm512_store_ps(void *__P, __m512 __A) { *(__m512*)__P = __A; } -static __inline void __DEFAULT_FN_ATTRS +static __inline void __DEFAULT_FN_ATTRS512 _mm512_store_si512 (void *__P, __m512i __A) { *(__m512i *) __P = __A; } -static __inline void __DEFAULT_FN_ATTRS +static __inline void __DEFAULT_FN_ATTRS512 _mm512_store_epi32 (void *__P, __m512i __A) { *(__m512i *) __P = __A; } -static __inline void __DEFAULT_FN_ATTRS +static __inline void __DEFAULT_FN_ATTRS512 _mm512_store_epi64 (void *__P, __m512i __A) { *(__m512i *) __P = __A; @@ -4805,7 +4580,7 @@ _mm512_store_epi64 (void *__P, __m512i __A) /* Mask ops */ -static __inline __mmask16 __DEFAULT_FN_ATTRS +static __inline __mmask16 __DEFAULT_FN_ATTRS512 _mm512_knot(__mmask16 __M) { return __builtin_ia32_knothi(__M); @@ -4913,7 +4688,7 @@ _mm512_knot(__mmask16 __M) #define _mm512_mask_cmpneq_epu64_mask(k, A, B) \ _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE) -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvtepi8_epi32(__m128i __A) { /* This function always performs a signed extension, but __v16qi is a char @@ -4921,7 +4696,7 @@ _mm512_cvtepi8_epi32(__m128i __A) return (__m512i)__builtin_convertvector((__v16qs)__A, __v16si); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtepi8_epi32(__m512i __W, __mmask16 __U, __m128i __A) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, @@ -4929,7 +4704,7 @@ _mm512_mask_cvtepi8_epi32(__m512i __W, __mmask16 __U, __m128i __A) (__v16si)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtepi8_epi32(__mmask16 __U, __m128i __A) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, @@ -4937,7 +4712,7 @@ _mm512_maskz_cvtepi8_epi32(__mmask16 __U, __m128i __A) (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvtepi8_epi64(__m128i __A) { /* This function always performs a signed extension, but __v16qi is a char @@ -4945,7 +4720,7 @@ _mm512_cvtepi8_epi64(__m128i __A) return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__A, (__v16qs)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtepi8_epi64(__m512i __W, __mmask8 __U, __m128i __A) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, @@ -4953,7 +4728,7 @@ _mm512_mask_cvtepi8_epi64(__m512i __W, __mmask8 __U, __m128i __A) (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, @@ -4961,13 +4736,13 @@ _mm512_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) (__v8di)_mm512_setzero_si512 ()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvtepi32_epi64(__m256i __X) { return (__m512i)__builtin_convertvector((__v8si)__X, __v8di); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtepi32_epi64(__m512i __W, __mmask8 __U, __m256i __X) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, @@ -4975,7 +4750,7 @@ _mm512_mask_cvtepi32_epi64(__m512i __W, __mmask8 __U, __m256i 
__X) (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtepi32_epi64(__mmask8 __U, __m256i __X) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, @@ -4983,13 +4758,13 @@ _mm512_maskz_cvtepi32_epi64(__mmask8 __U, __m256i __X) (__v8di)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvtepi16_epi32(__m256i __A) { return (__m512i)__builtin_convertvector((__v16hi)__A, __v16si); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtepi16_epi32(__m512i __W, __mmask16 __U, __m256i __A) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, @@ -4997,7 +4772,7 @@ _mm512_mask_cvtepi16_epi32(__m512i __W, __mmask16 __U, __m256i __A) (__v16si)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, @@ -5005,13 +4780,13 @@ _mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A) (__v16si)_mm512_setzero_si512 ()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvtepi16_epi64(__m128i __A) { return (__m512i)__builtin_convertvector((__v8hi)__A, __v8di); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtepi16_epi64(__m512i __W, __mmask8 __U, __m128i __A) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, @@ -5019,7 +4794,7 @@ _mm512_mask_cvtepi16_epi64(__m512i __W, __mmask8 __U, __m128i __A) (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, @@ -5027,13 +4802,13 @@ _mm512_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) (__v8di)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvtepu8_epi32(__m128i __A) { return (__m512i)__builtin_convertvector((__v16qu)__A, __v16si); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtepu8_epi32(__m512i __W, __mmask16 __U, __m128i __A) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, @@ -5041,7 +4816,7 @@ _mm512_mask_cvtepu8_epi32(__m512i __W, __mmask16 __U, __m128i __A) (__v16si)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtepu8_epi32(__mmask16 __U, __m128i __A) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, @@ -5049,13 +4824,13 @@ _mm512_maskz_cvtepu8_epi32(__mmask16 __U, __m128i __A) (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvtepu8_epi64(__m128i __A) { return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__A, (__v16qu)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtepu8_epi64(__m512i __W, __mmask8 __U, __m128i __A) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, @@ -5063,7 +4838,7 @@ _mm512_mask_cvtepu8_epi64(__m512i __W, __mmask8 __U, __m128i __A) (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ 
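/* [Editor's note, not part of the diff] The cvtepi8/cvtepu8 widenings above rely on
 * __builtin_convertvector, with the signedness of the source element type (__v16qs
 * versus __v16qu) selecting sign- versus zero-extension.  A smaller-scale sketch of the
 * same mechanism (clang vector extension, illustrative type and function names): */
typedef signed char   v8qs_sketch __attribute__((__vector_size__(8)));
typedef unsigned char v8qu_sketch __attribute__((__vector_size__(8)));
typedef int           v8si_sketch __attribute__((__vector_size__(32)));

static inline v8si_sketch sext8to32_sketch(v8qs_sketch a) {
  return __builtin_convertvector(a, v8si_sketch);  /* signed source: sign-extend */
}

static inline v8si_sketch zext8to32_sketch(v8qu_sketch a) {
  return __builtin_convertvector(a, v8si_sketch);  /* unsigned source: zero-extend */
}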
__m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, @@ -5071,13 +4846,13 @@ _mm512_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) (__v8di)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvtepu32_epi64(__m256i __X) { return (__m512i)__builtin_convertvector((__v8su)__X, __v8di); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtepu32_epi64(__m512i __W, __mmask8 __U, __m256i __X) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, @@ -5085,7 +4860,7 @@ _mm512_mask_cvtepu32_epi64(__m512i __W, __mmask8 __U, __m256i __X) (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtepu32_epi64(__mmask8 __U, __m256i __X) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, @@ -5093,13 +4868,13 @@ _mm512_maskz_cvtepu32_epi64(__mmask8 __U, __m256i __X) (__v8di)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvtepu16_epi32(__m256i __A) { return (__m512i)__builtin_convertvector((__v16hu)__A, __v16si); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtepu16_epi32(__m512i __W, __mmask16 __U, __m256i __A) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, @@ -5107,7 +4882,7 @@ _mm512_mask_cvtepu16_epi32(__m512i __W, __mmask16 __U, __m256i __A) (__v16si)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtepu16_epi32(__mmask16 __U, __m256i __A) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, @@ -5115,13 +4890,13 @@ _mm512_maskz_cvtepu16_epi32(__mmask16 __U, __m256i __A) (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_cvtepu16_epi64(__m128i __A) { return (__m512i)__builtin_convertvector((__v8hu)__A, __v8di); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtepu16_epi64(__m512i __W, __mmask8 __U, __m128i __A) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, @@ -5129,7 +4904,7 @@ _mm512_mask_cvtepu16_epi64(__m512i __W, __mmask8 __U, __m128i __A) (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, @@ -5137,228 +4912,195 @@ _mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) (__v8di)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_rorv_epi32 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) -1); + return (__m512i)__builtin_ia32_prorvd512((__v16si)__A, (__v16si)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_rorv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) __W, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512(__U, + (__v16si)_mm512_rorv_epi32(__A, __B), + 
(__v16si)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_rorv_epi32 (__mmask16 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512(__U, + (__v16si)_mm512_rorv_epi32(__A, __B), + (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_rorv_epi64 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) -1); + return (__m512i)__builtin_ia32_prorvq512((__v8di)__A, (__v8di)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_rorv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) __W, - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512(__U, + (__v8di)_mm512_rorv_epi64(__A, __B), + (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_rorv_epi64 (__mmask8 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512(__U, + (__v8di)_mm512_rorv_epi64(__A, __B), + (__v8di)_mm512_setzero_si512()); } -#define _mm512_cmp_epi32_mask(a, b, p) __extension__ ({ \ +#define _mm512_cmp_epi32_mask(a, b, p) \ (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \ (__v16si)(__m512i)(b), (int)(p), \ - (__mmask16)-1); }) + (__mmask16)-1) -#define _mm512_cmp_epu32_mask(a, b, p) __extension__ ({ \ +#define _mm512_cmp_epu32_mask(a, b, p) \ (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \ (__v16si)(__m512i)(b), (int)(p), \ - (__mmask16)-1); }) + (__mmask16)-1) -#define _mm512_cmp_epi64_mask(a, b, p) __extension__ ({ \ +#define _mm512_cmp_epi64_mask(a, b, p) \ (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \ (__v8di)(__m512i)(b), (int)(p), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm512_cmp_epu64_mask(a, b, p) __extension__ ({ \ +#define _mm512_cmp_epu64_mask(a, b, p) \ (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \ (__v8di)(__m512i)(b), (int)(p), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm512_mask_cmp_epi32_mask(m, a, b, p) __extension__ ({ \ +#define _mm512_mask_cmp_epi32_mask(m, a, b, p) \ (__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \ (__v16si)(__m512i)(b), (int)(p), \ - (__mmask16)(m)); }) + (__mmask16)(m)) -#define _mm512_mask_cmp_epu32_mask(m, a, b, p) __extension__ ({ \ +#define _mm512_mask_cmp_epu32_mask(m, a, b, p) \ (__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \ (__v16si)(__m512i)(b), (int)(p), \ - (__mmask16)(m)); }) + (__mmask16)(m)) -#define _mm512_mask_cmp_epi64_mask(m, a, b, p) __extension__ ({ \ +#define _mm512_mask_cmp_epi64_mask(m, a, b, p) \ (__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \ (__v8di)(__m512i)(b), (int)(p), \ - (__mmask8)(m)); }) + (__mmask8)(m)) -#define _mm512_mask_cmp_epu64_mask(m, a, b, p) __extension__ ({ \ +#define _mm512_mask_cmp_epu64_mask(m, a, b, p) \ (__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \ (__v8di)(__m512i)(b), (int)(p), \ - 
(__mmask8)(m)); }) - -#define _mm512_rol_epi32(a, b) __extension__ ({ \ - (__m512i)__builtin_ia32_prold512_mask((__v16si)(__m512i)(a), (int)(b), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)-1); }) - -#define _mm512_mask_rol_epi32(W, U, a, b) __extension__ ({ \ - (__m512i)__builtin_ia32_prold512_mask((__v16si)(__m512i)(a), (int)(b), \ - (__v16si)(__m512i)(W), \ - (__mmask16)(U)); }) - -#define _mm512_maskz_rol_epi32(U, a, b) __extension__ ({ \ - (__m512i)__builtin_ia32_prold512_mask((__v16si)(__m512i)(a), (int)(b), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)(U)); }) - -#define _mm512_rol_epi64(a, b) __extension__ ({ \ - (__m512i)__builtin_ia32_prolq512_mask((__v8di)(__m512i)(a), (int)(b), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1); }) - -#define _mm512_mask_rol_epi64(W, U, a, b) __extension__ ({ \ - (__m512i)__builtin_ia32_prolq512_mask((__v8di)(__m512i)(a), (int)(b), \ - (__v8di)(__m512i)(W), (__mmask8)(U)); }) - -#define _mm512_maskz_rol_epi64(U, a, b) __extension__ ({ \ - (__m512i)__builtin_ia32_prolq512_mask((__v8di)(__m512i)(a), (int)(b), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U)); }) -static __inline__ __m512i __DEFAULT_FN_ATTRS + (__mmask8)(m)) + +#define _mm512_rol_epi32(a, b) \ + (__m512i)__builtin_ia32_prold512((__v16si)(__m512i)(a), (int)(b)) + +#define _mm512_mask_rol_epi32(W, U, a, b) \ + (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_rol_epi32((a), (b)), \ + (__v16si)(__m512i)(W)) + +#define _mm512_maskz_rol_epi32(U, a, b) \ + (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_rol_epi32((a), (b)), \ + (__v16si)_mm512_setzero_si512()) + +#define _mm512_rol_epi64(a, b) \ + (__m512i)__builtin_ia32_prolq512((__v8di)(__m512i)(a), (int)(b)) + +#define _mm512_mask_rol_epi64(W, U, a, b) \ + (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_rol_epi64((a), (b)), \ + (__v8di)(__m512i)(W)) + +#define _mm512_maskz_rol_epi64(U, a, b) \ + (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_rol_epi64((a), (b)), \ + (__v8di)_mm512_setzero_si512()) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_rolv_epi32 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) -1); + return (__m512i)__builtin_ia32_prolvd512((__v16si)__A, (__v16si)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_rolv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) __W, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512(__U, + (__v16si)_mm512_rolv_epi32(__A, __B), + (__v16si)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_rolv_epi32 (__mmask16 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A, - (__v16si) __B, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512(__U, + (__v16si)_mm512_rolv_epi32(__A, __B), + (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_rolv_epi64 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) -1); + return 
(__m512i)__builtin_ia32_prolvq512((__v8di)__A, (__v8di)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_rolv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) __W, - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512(__U, + (__v8di)_mm512_rolv_epi64(__A, __B), + (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A, - (__v8di) __B, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); + return (__m512i)__builtin_ia32_selectq_512(__U, + (__v8di)_mm512_rolv_epi64(__A, __B), + (__v8di)_mm512_setzero_si512()); } -#define _mm512_ror_epi32(A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_prord512_mask((__v16si)(__m512i)(A), (int)(B), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)-1); }) +#define _mm512_ror_epi32(A, B) \ + (__m512i)__builtin_ia32_prord512((__v16si)(__m512i)(A), (int)(B)) -#define _mm512_mask_ror_epi32(W, U, A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_prord512_mask((__v16si)(__m512i)(A), (int)(B), \ - (__v16si)(__m512i)(W), \ - (__mmask16)(U)); }) +#define _mm512_mask_ror_epi32(W, U, A, B) \ + (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_ror_epi32((A), (B)), \ + (__v16si)(__m512i)(W)) -#define _mm512_maskz_ror_epi32(U, A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_prord512_mask((__v16si)(__m512i)(A), (int)(B), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)(U)); }) +#define _mm512_maskz_ror_epi32(U, A, B) \ + (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_ror_epi32((A), (B)), \ + (__v16si)_mm512_setzero_si512()) -#define _mm512_ror_epi64(A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_prorq512_mask((__v8di)(__m512i)(A), (int)(B), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1); }) +#define _mm512_ror_epi64(A, B) \ + (__m512i)__builtin_ia32_prorq512((__v8di)(__m512i)(A), (int)(B)) -#define _mm512_mask_ror_epi64(W, U, A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_prorq512_mask((__v8di)(__m512i)(A), (int)(B), \ - (__v8di)(__m512i)(W), (__mmask8)(U)); }) +#define _mm512_mask_ror_epi64(W, U, A, B) \ + (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_ror_epi64((A), (B)), \ + (__v8di)(__m512i)(W)) -#define _mm512_maskz_ror_epi64(U, A, B) __extension__ ({ \ - (__m512i)__builtin_ia32_prorq512_mask((__v8di)(__m512i)(A), (int)(B), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U)); }) +#define _mm512_maskz_ror_epi64(U, A, B) \ + (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_ror_epi64((A), (B)), \ + (__v8di)_mm512_setzero_si512()) -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_slli_epi32(__m512i __A, int __B) { return (__m512i)__builtin_ia32_pslldi512((__v16si)__A, __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, @@ -5366,20 +5108,20 @@ _mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B) (__v16si)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_slli_epi32(__mmask16 
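/* [Editor's note, not part of the diff] The rotate and shift hunks above all follow one
 * pattern: the unmasked form becomes a single unmasked builtin, and the mask/maskz forms
 * wrap it in __builtin_ia32_select*_512, blending with the pass-through operand or with
 * zero.  Nothing changes for callers; a usage sketch, assuming clang -mavx512f and
 * <immintrin.h> (function names are illustrative only): */
#include <immintrin.h>

__m512i rol_blend(__m512i src, __mmask16 k, __m512i a) {
  /* rotated lanes where k is set, lanes of src elsewhere */
  return _mm512_mask_rol_epi32(src, k, a, 3);
}

__m512i rol_zero(__mmask16 k, __m512i a) {
  /* rotated lanes where k is set, zero elsewhere */
  return _mm512_maskz_rol_epi32(k, a, 3);
}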
__U, __m512i __A, int __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, (__v16si)_mm512_slli_epi32(__A, __B), (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_slli_epi64(__m512i __A, int __B) { return (__m512i)__builtin_ia32_psllqi512((__v8di)__A, __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, @@ -5387,7 +5129,7 @@ _mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B) (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, int __B) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, @@ -5395,13 +5137,13 @@ _mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, int __B) (__v8di)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_srli_epi32(__m512i __A, int __B) { return (__m512i)__builtin_ia32_psrldi512((__v16si)__A, __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, @@ -5409,20 +5151,20 @@ _mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B) (__v16si)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A, int __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, (__v16si)_mm512_srli_epi32(__A, __B), (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_srli_epi64(__m512i __A, int __B) { return (__m512i)__builtin_ia32_psrlqi512((__v8di)__A, __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, @@ -5430,7 +5172,7 @@ _mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B) (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A, int __B) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, @@ -5438,7 +5180,7 @@ _mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A, int __B) (__v8di)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_load_epi32 (__m512i __W, __mmask16 __U, void const *__P) { return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P, @@ -5446,7 +5188,7 @@ _mm512_mask_load_epi32 (__m512i __W, __mmask16 __U, void const *__P) (__mmask16) __U); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_load_epi32 (__mmask16 __U, void const *__P) { return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P, @@ -5455,14 +5197,14 @@ _mm512_maskz_load_epi32 (__mmask16 __U, void const *__P) (__mmask16) __U); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_mask_store_epi32 (void *__P, __mmask16 
__U, __m512i __A) { __builtin_ia32_movdqa32store512_mask ((__v16si *) __P, (__v16si) __A, (__mmask16) __U); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_mov_epi32 (__m512i __W, __mmask16 __U, __m512i __A) { return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U, @@ -5470,7 +5212,7 @@ _mm512_mask_mov_epi32 (__m512i __W, __mmask16 __U, __m512i __A) (__v16si) __W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_mov_epi32 (__mmask16 __U, __m512i __A) { return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U, @@ -5478,7 +5220,7 @@ _mm512_maskz_mov_epi32 (__mmask16 __U, __m512i __A) (__v16si) _mm512_setzero_si512 ()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_mov_epi64 (__m512i __W, __mmask8 __U, __m512i __A) { return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U, @@ -5486,7 +5228,7 @@ _mm512_mask_mov_epi64 (__m512i __W, __mmask8 __U, __m512i __A) (__v8di) __W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_mov_epi64 (__mmask8 __U, __m512i __A) { return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U, @@ -5494,7 +5236,7 @@ _mm512_maskz_mov_epi64 (__mmask8 __U, __m512i __A) (__v8di) _mm512_setzero_si512 ()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_load_epi64 (__m512i __W, __mmask8 __U, void const *__P) { return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P, @@ -5502,7 +5244,7 @@ _mm512_mask_load_epi64 (__m512i __W, __mmask8 __U, void const *__P) (__mmask8) __U); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_load_epi64 (__mmask8 __U, void const *__P) { return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P, @@ -5511,21 +5253,21 @@ _mm512_maskz_load_epi64 (__mmask8 __U, void const *__P) (__mmask8) __U); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_mask_store_epi64 (void *__P, __mmask8 __U, __m512i __A) { __builtin_ia32_movdqa64store512_mask ((__v8di *) __P, (__v8di) __A, (__mmask8) __U); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_movedup_pd (__m512d __A) { return (__m512d)__builtin_shufflevector((__v8df)__A, (__v8df)__A, 0, 0, 2, 2, 4, 4, 6, 6); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_movedup_pd (__m512d __W, __mmask8 __U, __m512d __A) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, @@ -5533,7 +5275,7 @@ _mm512_mask_movedup_pd (__m512d __W, __mmask8 __U, __m512d __A) (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_movedup_pd (__mmask8 __U, __m512d __A) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, @@ -5541,179 +5283,179 @@ _mm512_maskz_movedup_pd (__mmask8 __U, __m512d __A) (__v8df)_mm512_setzero_pd()); } -#define _mm512_fixupimm_round_pd(A, B, C, imm, R) __extension__ ({ \ +#define _mm512_fixupimm_round_pd(A, B, C, imm, R) \ (__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), \ (__v8di)(__m512i)(C), (int)(imm), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm512_mask_fixupimm_round_pd(A, U, B, C, imm, R) 
__extension__ ({ \ +#define _mm512_mask_fixupimm_round_pd(A, U, B, C, imm, R) \ (__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), \ (__v8di)(__m512i)(C), (int)(imm), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm512_fixupimm_pd(A, B, C, imm) __extension__ ({ \ +#define _mm512_fixupimm_pd(A, B, C, imm) \ (__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), \ (__v8di)(__m512i)(C), (int)(imm), \ (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm512_mask_fixupimm_pd(A, U, B, C, imm) __extension__ ({ \ +#define _mm512_mask_fixupimm_pd(A, U, B, C, imm) \ (__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), \ (__v8di)(__m512i)(C), (int)(imm), \ (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm512_maskz_fixupimm_round_pd(U, A, B, C, imm, R) __extension__ ({ \ +#define _mm512_maskz_fixupimm_round_pd(U, A, B, C, imm, R) \ (__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), \ (__v8di)(__m512i)(C), \ (int)(imm), (__mmask8)(U), \ - (int)(R)); }) + (int)(R)) -#define _mm512_maskz_fixupimm_pd(U, A, B, C, imm) __extension__ ({ \ +#define _mm512_maskz_fixupimm_pd(U, A, B, C, imm) \ (__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), \ (__v8di)(__m512i)(C), \ (int)(imm), (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm512_fixupimm_round_ps(A, B, C, imm, R) __extension__ ({ \ +#define _mm512_fixupimm_round_ps(A, B, C, imm, R) \ (__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), \ (__v16si)(__m512i)(C), (int)(imm), \ - (__mmask16)-1, (int)(R)); }) + (__mmask16)-1, (int)(R)) -#define _mm512_mask_fixupimm_round_ps(A, U, B, C, imm, R) __extension__ ({ \ +#define _mm512_mask_fixupimm_round_ps(A, U, B, C, imm, R) \ (__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), \ (__v16si)(__m512i)(C), (int)(imm), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) -#define _mm512_fixupimm_ps(A, B, C, imm) __extension__ ({ \ +#define _mm512_fixupimm_ps(A, B, C, imm) \ (__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), \ (__v16si)(__m512i)(C), (int)(imm), \ (__mmask16)-1, \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm512_mask_fixupimm_ps(A, U, B, C, imm) __extension__ ({ \ +#define _mm512_mask_fixupimm_ps(A, U, B, C, imm) \ (__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), \ (__v16si)(__m512i)(C), (int)(imm), \ (__mmask16)(U), \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm512_maskz_fixupimm_round_ps(U, A, B, C, imm, R) __extension__ ({ \ +#define _mm512_maskz_fixupimm_round_ps(U, A, B, C, imm, R) \ (__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), \ (__v16si)(__m512i)(C), \ (int)(imm), (__mmask16)(U), \ - (int)(R)); }) + (int)(R)) -#define _mm512_maskz_fixupimm_ps(U, A, B, C, imm) __extension__ ({ \ +#define _mm512_maskz_fixupimm_ps(U, A, B, C, imm) \ (__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), \ (__v16si)(__m512i)(C), \ (int)(imm), (__mmask16)(U), \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm_fixupimm_round_sd(A, B, C, imm, R) __extension__ ({ \ +#define _mm_fixupimm_round_sd(A, 
B, C, imm, R) \ (__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm_mask_fixupimm_round_sd(A, U, B, C, imm, R) __extension__ ({ \ +#define _mm_mask_fixupimm_round_sd(A, U, B, C, imm, R) \ (__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm_fixupimm_sd(A, B, C, imm) __extension__ ({ \ +#define _mm_fixupimm_sd(A, B, C, imm) \ (__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2di)(__m128i)(C), (int)(imm), \ (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm_mask_fixupimm_sd(A, U, B, C, imm) __extension__ ({ \ +#define _mm_mask_fixupimm_sd(A, U, B, C, imm) \ (__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2di)(__m128i)(C), (int)(imm), \ (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm_maskz_fixupimm_round_sd(U, A, B, C, imm, R) __extension__ ({ \ +#define _mm_maskz_fixupimm_round_sd(U, A, B, C, imm, R) \ (__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm_maskz_fixupimm_sd(U, A, B, C, imm) __extension__ ({ \ +#define _mm_maskz_fixupimm_sd(U, A, B, C, imm) \ (__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2di)(__m128i)(C), (int)(imm), \ (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm_fixupimm_round_ss(A, B, C, imm, R) __extension__ ({ \ +#define _mm_fixupimm_round_ss(A, B, C, imm, R) \ (__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm_mask_fixupimm_round_ss(A, U, B, C, imm, R) __extension__ ({ \ +#define _mm_mask_fixupimm_round_ss(A, U, B, C, imm, R) \ (__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm_fixupimm_ss(A, B, C, imm) __extension__ ({ \ +#define _mm_fixupimm_ss(A, B, C, imm) \ (__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4si)(__m128i)(C), (int)(imm), \ (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm_mask_fixupimm_ss(A, U, B, C, imm) __extension__ ({ \ +#define _mm_mask_fixupimm_ss(A, U, B, C, imm) \ (__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4si)(__m128i)(C), (int)(imm), \ (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm_maskz_fixupimm_round_ss(U, A, B, C, imm, R) __extension__ ({ \ +#define _mm_maskz_fixupimm_round_ss(U, A, B, C, imm, R) \ (__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm_maskz_fixupimm_ss(U, A, B, C, imm) __extension__ ({ \ +#define _mm_maskz_fixupimm_ss(U, A, B, C, imm) \ (__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4si)(__m128i)(C), (int)(imm), \ (__mmask8)(U), \ - 
_MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm_getexp_round_sd(A, B, R) __extension__ ({ \ +#define _mm_getexp_round_sd(A, B, R) \ (__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_getexp_sd (__m128d __A, __m128d __B) { return (__m128d) __builtin_ia32_getexpsd128_round_mask ((__v2df) __A, (__v2df) __B, (__v2df) _mm_setzero_pd(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_getexp_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { return (__m128d) __builtin_ia32_getexpsd128_round_mask ( (__v2df) __A, @@ -5723,13 +5465,13 @@ _mm_mask_getexp_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) _MM_FROUND_CUR_DIRECTION); } -#define _mm_mask_getexp_round_sd(W, U, A, B, R) __extension__ ({\ +#define _mm_mask_getexp_round_sd(W, U, A, B, R) \ (__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_getexp_sd (__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d) __builtin_ia32_getexpsd128_round_mask ( (__v2df) __A, @@ -5739,26 +5481,26 @@ _mm_maskz_getexp_sd (__mmask8 __U, __m128d __A, __m128d __B) _MM_FROUND_CUR_DIRECTION); } -#define _mm_maskz_getexp_round_sd(U, A, B, R) __extension__ ({\ +#define _mm_maskz_getexp_round_sd(U, A, B, R) \ (__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm_getexp_round_ss(A, B, R) __extension__ ({ \ +#define _mm_getexp_round_ss(A, B, R) \ (__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_getexp_ss (__m128 __A, __m128 __B) { return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A, (__v4sf) __B, (__v4sf) _mm_setzero_ps(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_getexp_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A, @@ -5768,155 +5510,144 @@ _mm_mask_getexp_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) _MM_FROUND_CUR_DIRECTION); } -#define _mm_mask_getexp_round_ss(W, U, A, B, R) __extension__ ({\ +#define _mm_mask_getexp_round_ss(W, U, A, B, R) \ (__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)(__m128)(W), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_getexp_ss (__mmask8 __U, __m128 __A, __m128 __B) { return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A, (__v4sf) __B, - (__v4sf) _mm_setzero_pd (), + (__v4sf) _mm_setzero_ps (), (__mmask8) __U, _MM_FROUND_CUR_DIRECTION); } -#define _mm_maskz_getexp_round_ss(U, A, B, R) __extension__ ({\ +#define 
_mm_maskz_getexp_round_ss(U, A, B, R) \ (__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm_getmant_round_sd(A, B, C, D, R) __extension__ ({ \ +#define _mm_getmant_round_sd(A, B, C, D, R) \ (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (int)(((D)<<2) | (C)), \ (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm_getmant_sd(A, B, C, D) __extension__ ({ \ +#define _mm_getmant_sd(A, B, C, D) \ (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (int)(((D)<<2) | (C)), \ (__v2df)_mm_setzero_pd(), \ (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm_mask_getmant_sd(W, U, A, B, C, D) __extension__ ({\ +#define _mm_mask_getmant_sd(W, U, A, B, C, D) \ (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (int)(((D)<<2) | (C)), \ (__v2df)(__m128d)(W), \ (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm_mask_getmant_round_sd(W, U, A, B, C, D, R)({\ +#define _mm_mask_getmant_round_sd(W, U, A, B, C, D, R) \ (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (int)(((D)<<2) | (C)), \ (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm_maskz_getmant_sd(U, A, B, C, D) __extension__ ({\ +#define _mm_maskz_getmant_sd(U, A, B, C, D) \ (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (int)(((D)<<2) | (C)), \ (__v2df)_mm_setzero_pd(), \ (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm_maskz_getmant_round_sd(U, A, B, C, D, R) __extension__ ({\ +#define _mm_maskz_getmant_round_sd(U, A, B, C, D, R) \ (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (int)(((D)<<2) | (C)), \ (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm_getmant_round_ss(A, B, C, D, R) __extension__ ({ \ +#define _mm_getmant_round_ss(A, B, C, D, R) \ (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (int)(((D)<<2) | (C)), \ (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm_getmant_ss(A, B, C, D) __extension__ ({ \ +#define _mm_getmant_ss(A, B, C, D) \ (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (int)(((D)<<2) | (C)), \ (__v4sf)_mm_setzero_ps(), \ (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm_mask_getmant_ss(W, U, A, B, C, D) __extension__ ({\ +#define _mm_mask_getmant_ss(W, U, A, B, C, D) \ (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (int)(((D)<<2) | (C)), \ (__v4sf)(__m128)(W), \ (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm_mask_getmant_round_ss(W, U, A, B, C, D, R)({\ +#define _mm_mask_getmant_round_ss(W, U, A, B, C, D, R) \ (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (int)(((D)<<2) | (C)), \ (__v4sf)(__m128)(W), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm_maskz_getmant_ss(U, A, B, C, D) __extension__ ({\ +#define _mm_maskz_getmant_ss(U, A, B, C, D) \ 
(__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (int)(((D)<<2) | (C)), \ - (__v4sf)_mm_setzero_pd(), \ + (__v4sf)_mm_setzero_ps(), \ (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm_maskz_getmant_round_ss(U, A, B, C, D, R) __extension__ ({\ +#define _mm_maskz_getmant_round_ss(U, A, B, C, D, R) \ (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (int)(((D)<<2) | (C)), \ (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -static __inline__ __mmask16 __DEFAULT_FN_ATTRS +static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 _mm512_kmov (__mmask16 __A) { return __A; } -#define _mm_comi_round_sd(A, B, P, R) __extension__ ({\ +#define _mm_comi_round_sd(A, B, P, R) \ (int)__builtin_ia32_vcomisd((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \ - (int)(P), (int)(R)); }) + (int)(P), (int)(R)) -#define _mm_comi_round_ss(A, B, P, R) __extension__ ({\ +#define _mm_comi_round_ss(A, B, P, R) \ (int)__builtin_ia32_vcomiss((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \ - (int)(P), (int)(R)); }) + (int)(P), (int)(R)) #ifdef __x86_64__ -#define _mm_cvt_roundsd_si64(A, R) __extension__ ({ \ - (long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)); }) +#define _mm_cvt_roundsd_si64(A, R) \ + (long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)) #endif -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask2_permutex2var_epi32 (__m512i __A, __m512i __I, - __mmask16 __U, __m512i __B) -{ - return (__m512i) __builtin_ia32_vpermi2vard512_mask ((__v16si) __A, - (__v16si) __I - /* idx */ , - (__v16si) __B, - (__mmask16) __U); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_sll_epi32(__m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_pslld512((__v16si) __A, (__v4si)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, @@ -5924,7 +5655,7 @@ _mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) (__v16si)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, @@ -5932,13 +5663,13 @@ _mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, __m128i __B) (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_sll_epi64(__m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_psllq512((__v8di)__A, (__v2di)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, @@ -5946,7 +5677,7 @@ _mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, @@ -5954,13 +5685,13 @@ _mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B) (__v8di)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS 
+static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_sllv_epi32(__m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_psllv16si((__v16si)__X, (__v16si)__Y); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_sllv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, @@ -5968,7 +5699,7 @@ _mm512_mask_sllv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) (__v16si)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_sllv_epi32(__mmask16 __U, __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, @@ -5976,13 +5707,13 @@ _mm512_maskz_sllv_epi32(__mmask16 __U, __m512i __X, __m512i __Y) (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_sllv_epi64(__m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_psllv8di((__v8di)__X, (__v8di)__Y); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_sllv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, @@ -5990,7 +5721,7 @@ _mm512_mask_sllv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_sllv_epi64(__mmask8 __U, __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, @@ -5998,13 +5729,13 @@ _mm512_maskz_sllv_epi64(__mmask8 __U, __m512i __X, __m512i __Y) (__v8di)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_sra_epi32(__m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_psrad512((__v16si) __A, (__v4si)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, @@ -6012,7 +5743,7 @@ _mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) (__v16si)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, @@ -6020,13 +5751,13 @@ _mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, __m128i __B) (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_sra_epi64(__m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_psraq512((__v8di)__A, (__v2di)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, @@ -6034,7 +5765,7 @@ _mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, @@ -6042,13 +5773,13 @@ _mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B) (__v8di)_mm512_setzero_si512()); } 
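[Editor's aside — illustrative only, not part of the patch.] The hunks above move the 512-bit shift intrinsics (_mm512_sll/sllv/sra and friends) to the __DEFAULT_FN_ATTRS512 attribute set and implement their mask/maskz forms via __builtin_ia32_selectd_512 / __builtin_ia32_selectq_512, i.e. the unmasked operation followed by a per-lane select against either the passthrough operand or zero. A minimal caller-side sketch of that merge-masking vs. zero-masking behaviour, assuming a compiler invoked with -mavx512f and AVX-512F hardware, might look like this:

/* Illustrative sketch only -- not part of the patch. Build with -mavx512f. */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m512i a = _mm512_set1_epi32(1);        /* sixteen 32-bit lanes, all 1   */
  __m128i count = _mm_cvtsi32_si128(4);    /* shift amount in an XMM reg    */
  __mmask16 k = 0x00FF;                    /* act on the low eight lanes    */

  /* merge-masking: unselected lanes keep the value of the src operand (a) */
  __m512i merged = _mm512_mask_sll_epi32(a, k, a, count);
  /* zero-masking: unselected lanes become 0                               */
  __m512i zeroed = _mm512_maskz_sll_epi32(k, a, count);

  int m[16], z[16];
  _mm512_storeu_si512((__m512i *)m, merged);
  _mm512_storeu_si512((__m512i *)z, zeroed);
  printf("lane0: merged=%d zeroed=%d  lane15: merged=%d zeroed=%d\n",
         m[0], z[0], m[15], z[15]);        /* expect 16 16 1 0             */
  return 0;
}

The same select-based pattern recurs throughout the remainder of this hunk for the srav/srl/srlv variants.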
-static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_srav_epi32(__m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_psrav16si((__v16si)__X, (__v16si)__Y); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_srav_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, @@ -6056,7 +5787,7 @@ _mm512_mask_srav_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) (__v16si)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_srav_epi32(__mmask16 __U, __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, @@ -6064,13 +5795,13 @@ _mm512_maskz_srav_epi32(__mmask16 __U, __m512i __X, __m512i __Y) (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_srav_epi64(__m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_psrav8di((__v8di)__X, (__v8di)__Y); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_srav_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, @@ -6078,7 +5809,7 @@ _mm512_mask_srav_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_srav_epi64(__mmask8 __U, __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, @@ -6086,13 +5817,13 @@ _mm512_maskz_srav_epi64(__mmask8 __U, __m512i __X, __m512i __Y) (__v8di)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_srl_epi32(__m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_psrld512((__v16si) __A, (__v4si)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, @@ -6100,7 +5831,7 @@ _mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) (__v16si)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, @@ -6108,13 +5839,13 @@ _mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, __m128i __B) (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_srl_epi64(__m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_psrlq512((__v8di)__A, (__v2di)__B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, @@ -6122,7 +5853,7 @@ _mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, @@ -6130,13 +5861,13 @@ _mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, 
__m128i __B) (__v8di)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_srlv_epi32(__m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_psrlv16si((__v16si)__X, (__v16si)__Y); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_srlv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, @@ -6144,7 +5875,7 @@ _mm512_mask_srlv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) (__v16si)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_srlv_epi32(__mmask16 __U, __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, @@ -6152,13 +5883,13 @@ _mm512_maskz_srlv_epi32(__mmask16 __U, __m512i __X, __m512i __Y) (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_srlv_epi64 (__m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_psrlv8di((__v8di)__X, (__v8di)__Y); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_srlv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, @@ -6166,7 +5897,7 @@ _mm512_mask_srlv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, @@ -6174,57 +5905,57 @@ _mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y) (__v8di)_mm512_setzero_si512()); } -#define _mm512_ternarylogic_epi32(A, B, C, imm) __extension__ ({ \ +#define _mm512_ternarylogic_epi32(A, B, C, imm) \ (__m512i)__builtin_ia32_pternlogd512_mask((__v16si)(__m512i)(A), \ (__v16si)(__m512i)(B), \ (__v16si)(__m512i)(C), (int)(imm), \ - (__mmask16)-1); }) + (__mmask16)-1) -#define _mm512_mask_ternarylogic_epi32(A, U, B, C, imm) __extension__ ({ \ +#define _mm512_mask_ternarylogic_epi32(A, U, B, C, imm) \ (__m512i)__builtin_ia32_pternlogd512_mask((__v16si)(__m512i)(A), \ (__v16si)(__m512i)(B), \ (__v16si)(__m512i)(C), (int)(imm), \ - (__mmask16)(U)); }) + (__mmask16)(U)) -#define _mm512_maskz_ternarylogic_epi32(U, A, B, C, imm) __extension__ ({ \ +#define _mm512_maskz_ternarylogic_epi32(U, A, B, C, imm) \ (__m512i)__builtin_ia32_pternlogd512_maskz((__v16si)(__m512i)(A), \ (__v16si)(__m512i)(B), \ (__v16si)(__m512i)(C), \ - (int)(imm), (__mmask16)(U)); }) + (int)(imm), (__mmask16)(U)) -#define _mm512_ternarylogic_epi64(A, B, C, imm) __extension__ ({ \ +#define _mm512_ternarylogic_epi64(A, B, C, imm) \ (__m512i)__builtin_ia32_pternlogq512_mask((__v8di)(__m512i)(A), \ (__v8di)(__m512i)(B), \ (__v8di)(__m512i)(C), (int)(imm), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm512_mask_ternarylogic_epi64(A, U, B, C, imm) __extension__ ({ \ +#define _mm512_mask_ternarylogic_epi64(A, U, B, C, imm) \ (__m512i)__builtin_ia32_pternlogq512_mask((__v8di)(__m512i)(A), \ (__v8di)(__m512i)(B), \ (__v8di)(__m512i)(C), (int)(imm), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, imm) __extension__ ({ \ +#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, imm) \ (__m512i)__builtin_ia32_pternlogq512_maskz((__v8di)(__m512i)(A), \ 
(__v8di)(__m512i)(B), \ (__v8di)(__m512i)(C), (int)(imm), \ - (__mmask8)(U)); }) + (__mmask8)(U)) #ifdef __x86_64__ -#define _mm_cvt_roundsd_i64(A, R) __extension__ ({ \ - (long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)); }) +#define _mm_cvt_roundsd_i64(A, R) \ + (long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)) #endif -#define _mm_cvt_roundsd_si32(A, R) __extension__ ({ \ - (int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)); }) +#define _mm_cvt_roundsd_si32(A, R) \ + (int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)) -#define _mm_cvt_roundsd_i32(A, R) __extension__ ({ \ - (int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)); }) +#define _mm_cvt_roundsd_i32(A, R) \ + (int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)) -#define _mm_cvt_roundsd_u32(A, R) __extension__ ({ \ - (unsigned int)__builtin_ia32_vcvtsd2usi32((__v2df)(__m128d)(A), (int)(R)); }) +#define _mm_cvt_roundsd_u32(A, R) \ + (unsigned int)__builtin_ia32_vcvtsd2usi32((__v2df)(__m128d)(A), (int)(R)) -static __inline__ unsigned __DEFAULT_FN_ATTRS +static __inline__ unsigned __DEFAULT_FN_ATTRS128 _mm_cvtsd_u32 (__m128d __A) { return (unsigned) __builtin_ia32_vcvtsd2usi32 ((__v2df) __A, @@ -6232,11 +5963,11 @@ _mm_cvtsd_u32 (__m128d __A) } #ifdef __x86_64__ -#define _mm_cvt_roundsd_u64(A, R) __extension__ ({ \ +#define _mm_cvt_roundsd_u64(A, R) \ (unsigned long long)__builtin_ia32_vcvtsd2usi64((__v2df)(__m128d)(A), \ - (int)(R)); }) + (int)(R)) -static __inline__ unsigned long long __DEFAULT_FN_ATTRS +static __inline__ unsigned long long __DEFAULT_FN_ATTRS128 _mm_cvtsd_u64 (__m128d __A) { return (unsigned long long) __builtin_ia32_vcvtsd2usi64 ((__v2df) @@ -6245,24 +5976,24 @@ _mm_cvtsd_u64 (__m128d __A) } #endif -#define _mm_cvt_roundss_si32(A, R) __extension__ ({ \ - (int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)); }) +#define _mm_cvt_roundss_si32(A, R) \ + (int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)) -#define _mm_cvt_roundss_i32(A, R) __extension__ ({ \ - (int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)); }) +#define _mm_cvt_roundss_i32(A, R) \ + (int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)) #ifdef __x86_64__ -#define _mm_cvt_roundss_si64(A, R) __extension__ ({ \ - (long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)); }) +#define _mm_cvt_roundss_si64(A, R) \ + (long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)) -#define _mm_cvt_roundss_i64(A, R) __extension__ ({ \ - (long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)); }) +#define _mm_cvt_roundss_i64(A, R) \ + (long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)) #endif -#define _mm_cvt_roundss_u32(A, R) __extension__ ({ \ - (unsigned int)__builtin_ia32_vcvtss2usi32((__v4sf)(__m128)(A), (int)(R)); }) +#define _mm_cvt_roundss_u32(A, R) \ + (unsigned int)__builtin_ia32_vcvtss2usi32((__v4sf)(__m128)(A), (int)(R)) -static __inline__ unsigned __DEFAULT_FN_ATTRS +static __inline__ unsigned __DEFAULT_FN_ATTRS128 _mm_cvtss_u32 (__m128 __A) { return (unsigned) __builtin_ia32_vcvtss2usi32 ((__v4sf) __A, @@ -6270,11 +6001,11 @@ _mm_cvtss_u32 (__m128 __A) } #ifdef __x86_64__ -#define _mm_cvt_roundss_u64(A, R) __extension__ ({ \ +#define _mm_cvt_roundss_u64(A, R) \ (unsigned long long)__builtin_ia32_vcvtss2usi64((__v4sf)(__m128)(A), \ - (int)(R)); }) + (int)(R)) -static __inline__ unsigned long long __DEFAULT_FN_ATTRS +static __inline__ unsigned long long __DEFAULT_FN_ATTRS128 
_mm_cvtss_u64 (__m128 __A) { return (unsigned long long) __builtin_ia32_vcvtss2usi64 ((__v4sf) @@ -6283,13 +6014,13 @@ _mm_cvtss_u64 (__m128 __A) } #endif -#define _mm_cvtt_roundsd_i32(A, R) __extension__ ({ \ - (int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)); }) +#define _mm_cvtt_roundsd_i32(A, R) \ + (int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)) -#define _mm_cvtt_roundsd_si32(A, R) __extension__ ({ \ - (int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)); }) +#define _mm_cvtt_roundsd_si32(A, R) \ + (int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)) -static __inline__ int __DEFAULT_FN_ATTRS +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvttsd_i32 (__m128d __A) { return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A, @@ -6297,13 +6028,13 @@ _mm_cvttsd_i32 (__m128d __A) } #ifdef __x86_64__ -#define _mm_cvtt_roundsd_si64(A, R) __extension__ ({ \ - (long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)); }) +#define _mm_cvtt_roundsd_si64(A, R) \ + (long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)) -#define _mm_cvtt_roundsd_i64(A, R) __extension__ ({ \ - (long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)); }) +#define _mm_cvtt_roundsd_i64(A, R) \ + (long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)) -static __inline__ long long __DEFAULT_FN_ATTRS +static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvttsd_i64 (__m128d __A) { return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A, @@ -6311,10 +6042,10 @@ _mm_cvttsd_i64 (__m128d __A) } #endif -#define _mm_cvtt_roundsd_u32(A, R) __extension__ ({ \ - (unsigned int)__builtin_ia32_vcvttsd2usi32((__v2df)(__m128d)(A), (int)(R)); }) +#define _mm_cvtt_roundsd_u32(A, R) \ + (unsigned int)__builtin_ia32_vcvttsd2usi32((__v2df)(__m128d)(A), (int)(R)) -static __inline__ unsigned __DEFAULT_FN_ATTRS +static __inline__ unsigned __DEFAULT_FN_ATTRS128 _mm_cvttsd_u32 (__m128d __A) { return (unsigned) __builtin_ia32_vcvttsd2usi32 ((__v2df) __A, @@ -6322,11 +6053,11 @@ _mm_cvttsd_u32 (__m128d __A) } #ifdef __x86_64__ -#define _mm_cvtt_roundsd_u64(A, R) __extension__ ({ \ +#define _mm_cvtt_roundsd_u64(A, R) \ (unsigned long long)__builtin_ia32_vcvttsd2usi64((__v2df)(__m128d)(A), \ - (int)(R)); }) + (int)(R)) -static __inline__ unsigned long long __DEFAULT_FN_ATTRS +static __inline__ unsigned long long __DEFAULT_FN_ATTRS128 _mm_cvttsd_u64 (__m128d __A) { return (unsigned long long) __builtin_ia32_vcvttsd2usi64 ((__v2df) @@ -6335,13 +6066,13 @@ _mm_cvttsd_u64 (__m128d __A) } #endif -#define _mm_cvtt_roundss_i32(A, R) __extension__ ({ \ - (int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)); }) +#define _mm_cvtt_roundss_i32(A, R) \ + (int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)) -#define _mm_cvtt_roundss_si32(A, R) __extension__ ({ \ - (int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)); }) +#define _mm_cvtt_roundss_si32(A, R) \ + (int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)) -static __inline__ int __DEFAULT_FN_ATTRS +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvttss_i32 (__m128 __A) { return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A, @@ -6349,13 +6080,13 @@ _mm_cvttss_i32 (__m128 __A) } #ifdef __x86_64__ -#define _mm_cvtt_roundss_i64(A, R) __extension__ ({ \ - (long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)); }) +#define _mm_cvtt_roundss_i64(A, R) \ + (long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)) -#define 
_mm_cvtt_roundss_si64(A, R) __extension__ ({ \ - (long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)); }) +#define _mm_cvtt_roundss_si64(A, R) \ + (long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)) -static __inline__ long long __DEFAULT_FN_ATTRS +static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvttss_i64 (__m128 __A) { return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A, @@ -6363,10 +6094,10 @@ _mm_cvttss_i64 (__m128 __A) } #endif -#define _mm_cvtt_roundss_u32(A, R) __extension__ ({ \ - (unsigned int)__builtin_ia32_vcvttss2usi32((__v4sf)(__m128)(A), (int)(R)); }) +#define _mm_cvtt_roundss_u32(A, R) \ + (unsigned int)__builtin_ia32_vcvttss2usi32((__v4sf)(__m128)(A), (int)(R)) -static __inline__ unsigned __DEFAULT_FN_ATTRS +static __inline__ unsigned __DEFAULT_FN_ATTRS128 _mm_cvttss_u32 (__m128 __A) { return (unsigned) __builtin_ia32_vcvttss2usi32 ((__v4sf) __A, @@ -6374,11 +6105,11 @@ _mm_cvttss_u32 (__m128 __A) } #ifdef __x86_64__ -#define _mm_cvtt_roundss_u64(A, R) __extension__ ({ \ +#define _mm_cvtt_roundss_u64(A, R) \ (unsigned long long)__builtin_ia32_vcvttss2usi64((__v4sf)(__m128)(A), \ - (int)(R)); }) + (int)(R)) -static __inline__ unsigned long long __DEFAULT_FN_ATTRS +static __inline__ unsigned long long __DEFAULT_FN_ATTRS128 _mm_cvttss_u64 (__m128 __A) { return (unsigned long long) __builtin_ia32_vcvttss2usi64 ((__v4sf) @@ -6387,98 +6118,39 @@ _mm_cvttss_u64 (__m128 __A) } #endif -static __inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_mask2_permutex2var_pd (__m512d __A, __m512i __I, __mmask8 __U, - __m512d __B) -{ - return (__m512d) __builtin_ia32_vpermi2varpd512_mask ((__v8df) __A, - (__v8di) __I - /* idx */ , - (__v8df) __B, - (__mmask8) __U); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS -_mm512_mask2_permutex2var_ps (__m512 __A, __m512i __I, __mmask16 __U, - __m512 __B) -{ - return (__m512) __builtin_ia32_vpermi2varps512_mask ((__v16sf) __A, - (__v16si) __I - /* idx */ , - (__v16sf) __B, - (__mmask16) __U); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask2_permutex2var_epi64 (__m512i __A, __m512i __I, - __mmask8 __U, __m512i __B) -{ - return (__m512i) __builtin_ia32_vpermi2varq512_mask ((__v8di) __A, - (__v8di) __I - /* idx */ , - (__v8di) __B, - (__mmask8) __U); -} - -#define _mm512_permute_pd(X, C) __extension__ ({ \ - (__m512d)__builtin_shufflevector((__v8df)(__m512d)(X), \ - (__v8df)_mm512_undefined_pd(), \ - 0 + (((C) >> 0) & 0x1), \ - 0 + (((C) >> 1) & 0x1), \ - 2 + (((C) >> 2) & 0x1), \ - 2 + (((C) >> 3) & 0x1), \ - 4 + (((C) >> 4) & 0x1), \ - 4 + (((C) >> 5) & 0x1), \ - 6 + (((C) >> 6) & 0x1), \ - 6 + (((C) >> 7) & 0x1)); }) +#define _mm512_permute_pd(X, C) \ + (__m512d)__builtin_ia32_vpermilpd512((__v8df)(__m512d)(X), (int)(C)) -#define _mm512_mask_permute_pd(W, U, X, C) __extension__ ({ \ +#define _mm512_mask_permute_pd(W, U, X, C) \ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_permute_pd((X), (C)), \ - (__v8df)(__m512d)(W)); }) + (__v8df)(__m512d)(W)) -#define _mm512_maskz_permute_pd(U, X, C) __extension__ ({ \ +#define _mm512_maskz_permute_pd(U, X, C) \ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_permute_pd((X), (C)), \ - (__v8df)_mm512_setzero_pd()); }) - -#define _mm512_permute_ps(X, C) __extension__ ({ \ - (__m512)__builtin_shufflevector((__v16sf)(__m512)(X), \ - (__v16sf)_mm512_undefined_ps(), \ - 0 + (((C) >> 0) & 0x3), \ - 0 + (((C) >> 2) & 0x3), \ - 0 + (((C) >> 4) & 0x3), \ - 0 + (((C) >> 6) & 0x3), \ - 4 + (((C) >> 0) & 0x3), \ - 4 + (((C) 
>> 2) & 0x3), \ - 4 + (((C) >> 4) & 0x3), \ - 4 + (((C) >> 6) & 0x3), \ - 8 + (((C) >> 0) & 0x3), \ - 8 + (((C) >> 2) & 0x3), \ - 8 + (((C) >> 4) & 0x3), \ - 8 + (((C) >> 6) & 0x3), \ - 12 + (((C) >> 0) & 0x3), \ - 12 + (((C) >> 2) & 0x3), \ - 12 + (((C) >> 4) & 0x3), \ - 12 + (((C) >> 6) & 0x3)); }) - -#define _mm512_mask_permute_ps(W, U, X, C) __extension__ ({ \ + (__v8df)_mm512_setzero_pd()) + +#define _mm512_permute_ps(X, C) \ + (__m512)__builtin_ia32_vpermilps512((__v16sf)(__m512)(X), (int)(C)) + +#define _mm512_mask_permute_ps(W, U, X, C) \ (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_permute_ps((X), (C)), \ - (__v16sf)(__m512)(W)); }) + (__v16sf)(__m512)(W)) -#define _mm512_maskz_permute_ps(U, X, C) __extension__ ({ \ +#define _mm512_maskz_permute_ps(U, X, C) \ (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_permute_ps((X), (C)), \ - (__v16sf)_mm512_setzero_ps()); }) + (__v16sf)_mm512_setzero_ps()) -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_permutevar_pd(__m512d __A, __m512i __C) { return (__m512d)__builtin_ia32_vpermilvarpd512((__v8df)__A, (__v8di)__C); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_permutevar_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512i __C) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, @@ -6486,7 +6158,7 @@ _mm512_mask_permutevar_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512i __C) (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_permutevar_pd(__mmask8 __U, __m512d __A, __m512i __C) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, @@ -6494,13 +6166,13 @@ _mm512_maskz_permutevar_pd(__mmask8 __U, __m512d __A, __m512i __C) (__v8df)_mm512_setzero_pd()); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_permutevar_ps(__m512 __A, __m512i __C) { return (__m512)__builtin_ia32_vpermilvarps512((__v16sf)__A, (__v16si)__C); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_permutevar_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512i __C) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, @@ -6508,7 +6180,7 @@ _mm512_mask_permutevar_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512i __C) (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, @@ -6516,85 +6188,87 @@ _mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C) (__v16sf)_mm512_setzero_ps()); } -static __inline __m512d __DEFAULT_FN_ATTRS +static __inline __m512d __DEFAULT_FN_ATTRS512 _mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B) { - return (__m512d) __builtin_ia32_vpermt2varpd512_mask ((__v8di) __I - /* idx */ , - (__v8df) __A, - (__v8df) __B, - (__mmask8) -1); + return (__m512d)__builtin_ia32_vpermi2varpd512((__v8df)__A, (__v8di)__I, + (__v8df)__B); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_permutex2var_pd(__m512d __A, __mmask8 __U, __m512i __I, __m512d __B) +{ + return (__m512d)__builtin_ia32_selectpd_512(__U, + (__v8df)_mm512_permutex2var_pd(__A, __I, __B), + (__v8df)__A); } -static __inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_mask_permutex2var_pd (__m512d __A, __mmask8 __U, __m512i __I, 
__m512d __B) +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask2_permutex2var_pd(__m512d __A, __m512i __I, __mmask8 __U, + __m512d __B) { - return (__m512d) __builtin_ia32_vpermt2varpd512_mask ((__v8di) __I - /* idx */ , - (__v8df) __A, - (__v8df) __B, - (__mmask8) __U); + return (__m512d)__builtin_ia32_selectpd_512(__U, + (__v8df)_mm512_permutex2var_pd(__A, __I, __B), + (__v8df)(__m512d)__I); } -static __inline__ __m512d __DEFAULT_FN_ATTRS -_mm512_maskz_permutex2var_pd (__mmask8 __U, __m512d __A, __m512i __I, - __m512d __B) +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_permutex2var_pd(__mmask8 __U, __m512d __A, __m512i __I, + __m512d __B) { - return (__m512d) __builtin_ia32_vpermt2varpd512_maskz ((__v8di) __I - /* idx */ , - (__v8df) __A, - (__v8df) __B, - (__mmask8) __U); + return (__m512d)__builtin_ia32_selectpd_512(__U, + (__v8df)_mm512_permutex2var_pd(__A, __I, __B), + (__v8df)_mm512_setzero_pd()); } -static __inline __m512 __DEFAULT_FN_ATTRS +static __inline __m512 __DEFAULT_FN_ATTRS512 _mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B) { - return (__m512) __builtin_ia32_vpermt2varps512_mask ((__v16si) __I - /* idx */ , - (__v16sf) __A, - (__v16sf) __B, - (__mmask16) -1); + return (__m512)__builtin_ia32_vpermi2varps512((__v16sf)__A, (__v16si)__I, + (__v16sf) __B); } -static __inline__ __m512 __DEFAULT_FN_ATTRS -_mm512_mask_permutex2var_ps (__m512 __A, __mmask16 __U, __m512i __I, __m512 __B) +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_permutex2var_ps(__m512 __A, __mmask16 __U, __m512i __I, __m512 __B) { - return (__m512) __builtin_ia32_vpermt2varps512_mask ((__v16si) __I - /* idx */ , - (__v16sf) __A, - (__v16sf) __B, - (__mmask16) __U); + return (__m512)__builtin_ia32_selectps_512(__U, + (__v16sf)_mm512_permutex2var_ps(__A, __I, __B), + (__v16sf)__A); } -static __inline__ __m512 __DEFAULT_FN_ATTRS -_mm512_maskz_permutex2var_ps (__mmask16 __U, __m512 __A, __m512i __I, - __m512 __B) +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U, __m512 __B) { - return (__m512) __builtin_ia32_vpermt2varps512_maskz ((__v16si) __I - /* idx */ , - (__v16sf) __A, - (__v16sf) __B, - (__mmask16) __U); + return (__m512)__builtin_ia32_selectps_512(__U, + (__v16sf)_mm512_permutex2var_ps(__A, __I, __B), + (__v16sf)(__m512)__I); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I, __m512 __B) +{ + return (__m512)__builtin_ia32_selectps_512(__U, + (__v16sf)_mm512_permutex2var_ps(__A, __I, __B), + (__v16sf)_mm512_setzero_ps()); } -#define _mm512_cvtt_roundpd_epu32(A, R) __extension__ ({ \ +#define _mm512_cvtt_roundpd_epu32(A, R) \ (__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \ (__v8si)_mm256_undefined_si256(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm512_mask_cvtt_roundpd_epu32(W, U, A, R) __extension__ ({ \ +#define _mm512_mask_cvtt_roundpd_epu32(W, U, A, R) \ (__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \ (__v8si)(__m256i)(W), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm512_maskz_cvtt_roundpd_epu32(U, A, R) __extension__ ({ \ +#define _mm512_maskz_cvtt_roundpd_epu32(U, A, R) \ (__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \ (__v8si)_mm256_setzero_si256(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS512 
_mm512_cvttpd_epu32 (__m512d __A) { return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, @@ -6604,7 +6278,7 @@ _mm512_cvttpd_epu32 (__m512d __A) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvttpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A) { return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, @@ -6613,7 +6287,7 @@ _mm512_mask_cvttpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvttpd_epu32 (__mmask8 __U, __m512d __A) { return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, @@ -6623,109 +6297,109 @@ _mm512_maskz_cvttpd_epu32 (__mmask8 __U, __m512d __A) _MM_FROUND_CUR_DIRECTION); } -#define _mm_roundscale_round_sd(A, B, imm, R) __extension__ ({ \ +#define _mm_roundscale_round_sd(A, B, imm, R) \ (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)_mm_setzero_pd(), \ (__mmask8)-1, (int)(imm), \ - (int)(R)); }) + (int)(R)) -#define _mm_roundscale_sd(A, B, imm) __extension__ ({ \ +#define _mm_roundscale_sd(A, B, imm) \ (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)_mm_setzero_pd(), \ (__mmask8)-1, (int)(imm), \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm_mask_roundscale_sd(W, U, A, B, imm) __extension__ ({ \ +#define _mm_mask_roundscale_sd(W, U, A, B, imm) \ (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)(__m128d)(W), \ (__mmask8)(U), (int)(imm), \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm_mask_roundscale_round_sd(W, U, A, B, I, R) __extension__ ({ \ +#define _mm_mask_roundscale_round_sd(W, U, A, B, I, R) \ (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)(__m128d)(W), \ (__mmask8)(U), (int)(I), \ - (int)(R)); }) + (int)(R)) -#define _mm_maskz_roundscale_sd(U, A, B, I) __extension__ ({ \ +#define _mm_maskz_roundscale_sd(U, A, B, I) \ (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)_mm_setzero_pd(), \ (__mmask8)(U), (int)(I), \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm_maskz_roundscale_round_sd(U, A, B, I, R) __extension__ ({ \ +#define _mm_maskz_roundscale_round_sd(U, A, B, I, R) \ (__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)_mm_setzero_pd(), \ (__mmask8)(U), (int)(I), \ - (int)(R)); }) + (int)(R)) -#define _mm_roundscale_round_ss(A, B, imm, R) __extension__ ({ \ +#define _mm_roundscale_round_ss(A, B, imm, R) \ (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)_mm_setzero_ps(), \ (__mmask8)-1, (int)(imm), \ - (int)(R)); }) + (int)(R)) -#define _mm_roundscale_ss(A, B, imm) __extension__ ({ \ +#define _mm_roundscale_ss(A, B, imm) \ (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)_mm_setzero_ps(), \ (__mmask8)-1, (int)(imm), \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm_mask_roundscale_ss(W, U, A, B, I) __extension__ ({ \ +#define _mm_mask_roundscale_ss(W, U, A, B, I) \ (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)(__m128)(W), \ (__mmask8)(U), 
(int)(I), \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm_mask_roundscale_round_ss(W, U, A, B, I, R) __extension__ ({ \ +#define _mm_mask_roundscale_round_ss(W, U, A, B, I, R) \ (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)(__m128)(W), \ (__mmask8)(U), (int)(I), \ - (int)(R)); }) + (int)(R)) -#define _mm_maskz_roundscale_ss(U, A, B, I) __extension__ ({ \ +#define _mm_maskz_roundscale_ss(U, A, B, I) \ (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)_mm_setzero_ps(), \ (__mmask8)(U), (int)(I), \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm_maskz_roundscale_round_ss(U, A, B, I, R) __extension__ ({ \ +#define _mm_maskz_roundscale_round_ss(U, A, B, I, R) \ (__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)_mm_setzero_ps(), \ (__mmask8)(U), (int)(I), \ - (int)(R)); }) + (int)(R)) -#define _mm512_scalef_round_pd(A, B, R) __extension__ ({ \ +#define _mm512_scalef_round_pd(A, B, R) \ (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), \ (__v8df)_mm512_undefined_pd(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm512_mask_scalef_round_pd(W, U, A, B, R) __extension__ ({ \ +#define _mm512_mask_scalef_round_pd(W, U, A, B, R) \ (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), \ (__v8df)(__m512d)(W), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm512_maskz_scalef_round_pd(U, A, B, R) __extension__ ({ \ +#define _mm512_maskz_scalef_round_pd(U, A, B, R) \ (__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(B), \ (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_scalef_pd (__m512d __A, __m512d __B) { return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, @@ -6736,7 +6410,7 @@ _mm512_scalef_pd (__m512d __A, __m512d __B) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_scalef_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, @@ -6746,7 +6420,7 @@ _mm512_mask_scalef_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_scalef_pd (__mmask8 __U, __m512d __A, __m512d __B) { return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, @@ -6757,25 +6431,25 @@ _mm512_maskz_scalef_pd (__mmask8 __U, __m512d __A, __m512d __B) _MM_FROUND_CUR_DIRECTION); } -#define _mm512_scalef_round_ps(A, B, R) __extension__ ({ \ +#define _mm512_scalef_round_ps(A, B, R) \ (__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), \ (__v16sf)_mm512_undefined_ps(), \ - (__mmask16)-1, (int)(R)); }) + (__mmask16)-1, (int)(R)) -#define _mm512_mask_scalef_round_ps(W, U, A, B, R) __extension__ ({ \ +#define _mm512_mask_scalef_round_ps(W, U, A, B, R) \ (__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), \ (__v16sf)(__m512)(W), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) -#define _mm512_maskz_scalef_round_ps(U, A, B, R) __extension__ ({ \ +#define 
_mm512_maskz_scalef_round_ps(U, A, B, R) \ (__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(B), \ (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_scalef_ps (__m512 __A, __m512 __B) { return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, @@ -6786,7 +6460,7 @@ _mm512_scalef_ps (__m512 __A, __m512 __B) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_scalef_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, @@ -6796,7 +6470,7 @@ _mm512_mask_scalef_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_scalef_ps (__mmask16 __U, __m512 __A, __m512 __B) { return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, @@ -6807,13 +6481,13 @@ _mm512_maskz_scalef_ps (__mmask16 __U, __m512 __A, __m512 __B) _MM_FROUND_CUR_DIRECTION); } -#define _mm_scalef_round_sd(A, B, R) __extension__ ({ \ +#define _mm_scalef_round_sd(A, B, R) \ (__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_scalef_sd (__m128d __A, __m128d __B) { return (__m128d) __builtin_ia32_scalefsd_round_mask ((__v2df) __A, @@ -6822,7 +6496,7 @@ _mm_scalef_sd (__m128d __A, __m128d __B) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_scalef_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { return (__m128d) __builtin_ia32_scalefsd_round_mask ( (__v2df) __A, @@ -6832,13 +6506,13 @@ _mm_mask_scalef_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) _MM_FROUND_CUR_DIRECTION); } -#define _mm_mask_scalef_round_sd(W, U, A, B, R) __extension__ ({ \ +#define _mm_mask_scalef_round_sd(W, U, A, B, R) \ (__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_scalef_sd (__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d) __builtin_ia32_scalefsd_round_mask ( (__v2df) __A, @@ -6848,19 +6522,19 @@ _mm_maskz_scalef_sd (__mmask8 __U, __m128d __A, __m128d __B) _MM_FROUND_CUR_DIRECTION); } -#define _mm_maskz_scalef_round_sd(U, A, B, R) __extension__ ({ \ +#define _mm_maskz_scalef_round_sd(U, A, B, R) \ (__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm_scalef_round_ss(A, B, R) __extension__ ({ \ +#define _mm_scalef_round_ss(A, B, R) \ (__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_scalef_ss (__m128 __A, __m128 __B) { return (__m128) __builtin_ia32_scalefss_round_mask ((__v4sf) __A, @@ -6869,7 
+6543,7 @@ _mm_scalef_ss (__m128 __A, __m128 __B) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_scalef_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return (__m128) __builtin_ia32_scalefss_round_mask ( (__v4sf) __A, @@ -6879,13 +6553,13 @@ _mm_mask_scalef_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) _MM_FROUND_CUR_DIRECTION); } -#define _mm_mask_scalef_round_ss(W, U, A, B, R) __extension__ ({ \ +#define _mm_mask_scalef_round_ss(W, U, A, B, R) \ (__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)(__m128)(W), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_scalef_ss (__mmask8 __U, __m128 __A, __m128 __B) { return (__m128) __builtin_ia32_scalefss_round_mask ( (__v4sf) __A, @@ -6895,211 +6569,147 @@ _mm_maskz_scalef_ss (__mmask8 __U, __m128 __A, __m128 __B) _MM_FROUND_CUR_DIRECTION); } -#define _mm_maskz_scalef_round_ss(U, A, B, R) __extension__ ({ \ +#define _mm_maskz_scalef_round_ss(U, A, B, R) \ (__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)_mm_setzero_ps(), \ (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION); }) + (int)(R)) -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_srai_epi32(__m512i __A, int __B) { return (__m512i)__builtin_ia32_psradi512((__v16si)__A, __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, \ - (__v16si)_mm512_srai_epi32(__A, __B), \ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_srai_epi32(__A, __B), (__v16si)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A, int __B) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, \ - (__v16si)_mm512_srai_epi32(__A, __B), \ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_srai_epi32(__A, __B), (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_srai_epi64(__m512i __A, int __B) { return (__m512i)__builtin_ia32_psraqi512((__v8di)__A, __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, \ - (__v8di)_mm512_srai_epi64(__A, __B), \ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_srai_epi64(__A, __B), (__v8di)__W); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, int __B) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, \ - (__v8di)_mm512_srai_epi64(__A, __B), \ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_srai_epi64(__A, __B), (__v8di)_mm512_setzero_si512()); } -#define _mm512_shuffle_f32x4(A, B, imm) __extension__ ({ \ - (__m512)__builtin_shufflevector((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - 0 + ((((imm) >> 0) & 0x3) * 4), \ - 1 + ((((imm) >> 0) & 0x3) * 4), \ - 2 + ((((imm) 
>> 0) & 0x3) * 4), \ - 3 + ((((imm) >> 0) & 0x3) * 4), \ - 0 + ((((imm) >> 2) & 0x3) * 4), \ - 1 + ((((imm) >> 2) & 0x3) * 4), \ - 2 + ((((imm) >> 2) & 0x3) * 4), \ - 3 + ((((imm) >> 2) & 0x3) * 4), \ - 16 + ((((imm) >> 4) & 0x3) * 4), \ - 17 + ((((imm) >> 4) & 0x3) * 4), \ - 18 + ((((imm) >> 4) & 0x3) * 4), \ - 19 + ((((imm) >> 4) & 0x3) * 4), \ - 16 + ((((imm) >> 6) & 0x3) * 4), \ - 17 + ((((imm) >> 6) & 0x3) * 4), \ - 18 + ((((imm) >> 6) & 0x3) * 4), \ - 19 + ((((imm) >> 6) & 0x3) * 4)); }) - -#define _mm512_mask_shuffle_f32x4(W, U, A, B, imm) __extension__ ({ \ +#define _mm512_shuffle_f32x4(A, B, imm) \ + (__m512)__builtin_ia32_shuf_f32x4((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(imm)) + +#define _mm512_mask_shuffle_f32x4(W, U, A, B, imm) \ (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \ - (__v16sf)(__m512)(W)); }) + (__v16sf)(__m512)(W)) -#define _mm512_maskz_shuffle_f32x4(U, A, B, imm) __extension__ ({ \ +#define _mm512_maskz_shuffle_f32x4(U, A, B, imm) \ (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \ - (__v16sf)_mm512_setzero_ps()); }) - -#define _mm512_shuffle_f64x2(A, B, imm) __extension__ ({ \ - (__m512d)__builtin_shufflevector((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - 0 + ((((imm) >> 0) & 0x3) * 2), \ - 1 + ((((imm) >> 0) & 0x3) * 2), \ - 0 + ((((imm) >> 2) & 0x3) * 2), \ - 1 + ((((imm) >> 2) & 0x3) * 2), \ - 8 + ((((imm) >> 4) & 0x3) * 2), \ - 9 + ((((imm) >> 4) & 0x3) * 2), \ - 8 + ((((imm) >> 6) & 0x3) * 2), \ - 9 + ((((imm) >> 6) & 0x3) * 2)); }) - -#define _mm512_mask_shuffle_f64x2(W, U, A, B, imm) __extension__ ({ \ + (__v16sf)_mm512_setzero_ps()) + +#define _mm512_shuffle_f64x2(A, B, imm) \ + (__m512d)__builtin_ia32_shuf_f64x2((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(imm)) + +#define _mm512_mask_shuffle_f64x2(W, U, A, B, imm) \ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \ - (__v8df)(__m512d)(W)); }) + (__v8df)(__m512d)(W)) -#define _mm512_maskz_shuffle_f64x2(U, A, B, imm) __extension__ ({ \ +#define _mm512_maskz_shuffle_f64x2(U, A, B, imm) \ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \ - (__v8df)_mm512_setzero_pd()); }) - -#define _mm512_shuffle_i32x4(A, B, imm) __extension__ ({ \ - (__m512i)__builtin_shufflevector((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), \ - 0 + ((((imm) >> 0) & 0x3) * 2), \ - 1 + ((((imm) >> 0) & 0x3) * 2), \ - 0 + ((((imm) >> 2) & 0x3) * 2), \ - 1 + ((((imm) >> 2) & 0x3) * 2), \ - 8 + ((((imm) >> 4) & 0x3) * 2), \ - 9 + ((((imm) >> 4) & 0x3) * 2), \ - 8 + ((((imm) >> 6) & 0x3) * 2), \ - 9 + ((((imm) >> 6) & 0x3) * 2)); }) - -#define _mm512_mask_shuffle_i32x4(W, U, A, B, imm) __extension__ ({ \ + (__v8df)_mm512_setzero_pd()) + +#define _mm512_shuffle_i32x4(A, B, imm) \ + (__m512i)__builtin_ia32_shuf_i32x4((__v16si)(__m512i)(A), \ + (__v16si)(__m512i)(B), (int)(imm)) + +#define _mm512_mask_shuffle_i32x4(W, U, A, B, imm) \ (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \ - (__v16si)(__m512i)(W)); }) + (__v16si)(__m512i)(W)) -#define _mm512_maskz_shuffle_i32x4(U, A, B, imm) __extension__ ({ \ +#define _mm512_maskz_shuffle_i32x4(U, A, B, imm) \ (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \ - (__v16si)_mm512_setzero_si512()); }) - -#define _mm512_shuffle_i64x2(A, B, imm) 
__extension__ ({ \ - (__m512i)__builtin_shufflevector((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), \ - 0 + ((((imm) >> 0) & 0x3) * 2), \ - 1 + ((((imm) >> 0) & 0x3) * 2), \ - 0 + ((((imm) >> 2) & 0x3) * 2), \ - 1 + ((((imm) >> 2) & 0x3) * 2), \ - 8 + ((((imm) >> 4) & 0x3) * 2), \ - 9 + ((((imm) >> 4) & 0x3) * 2), \ - 8 + ((((imm) >> 6) & 0x3) * 2), \ - 9 + ((((imm) >> 6) & 0x3) * 2)); }) - -#define _mm512_mask_shuffle_i64x2(W, U, A, B, imm) __extension__ ({ \ + (__v16si)_mm512_setzero_si512()) + +#define _mm512_shuffle_i64x2(A, B, imm) \ + (__m512i)__builtin_ia32_shuf_i64x2((__v8di)(__m512i)(A), \ + (__v8di)(__m512i)(B), (int)(imm)) + +#define _mm512_mask_shuffle_i64x2(W, U, A, B, imm) \ (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \ - (__v8di)(__m512i)(W)); }) + (__v8di)(__m512i)(W)) -#define _mm512_maskz_shuffle_i64x2(U, A, B, imm) __extension__ ({ \ +#define _mm512_maskz_shuffle_i64x2(U, A, B, imm) \ (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \ - (__v8di)_mm512_setzero_si512()); }) - -#define _mm512_shuffle_pd(A, B, M) __extension__ ({ \ - (__m512d)__builtin_shufflevector((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - 0 + (((M) >> 0) & 0x1), \ - 8 + (((M) >> 1) & 0x1), \ - 2 + (((M) >> 2) & 0x1), \ - 10 + (((M) >> 3) & 0x1), \ - 4 + (((M) >> 4) & 0x1), \ - 12 + (((M) >> 5) & 0x1), \ - 6 + (((M) >> 6) & 0x1), \ - 14 + (((M) >> 7) & 0x1)); }) - -#define _mm512_mask_shuffle_pd(W, U, A, B, M) __extension__ ({ \ + (__v8di)_mm512_setzero_si512()) + +#define _mm512_shuffle_pd(A, B, M) \ + (__m512d)__builtin_ia32_shufpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(M)) + +#define _mm512_mask_shuffle_pd(W, U, A, B, M) \ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_shuffle_pd((A), (B), (M)), \ - (__v8df)(__m512d)(W)); }) + (__v8df)(__m512d)(W)) -#define _mm512_maskz_shuffle_pd(U, A, B, M) __extension__ ({ \ +#define _mm512_maskz_shuffle_pd(U, A, B, M) \ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_shuffle_pd((A), (B), (M)), \ - (__v8df)_mm512_setzero_pd()); }) - -#define _mm512_shuffle_ps(A, B, M) __extension__ ({ \ - (__m512d)__builtin_shufflevector((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - 0 + (((M) >> 0) & 0x3), \ - 0 + (((M) >> 2) & 0x3), \ - 16 + (((M) >> 4) & 0x3), \ - 16 + (((M) >> 6) & 0x3), \ - 4 + (((M) >> 0) & 0x3), \ - 4 + (((M) >> 2) & 0x3), \ - 20 + (((M) >> 4) & 0x3), \ - 20 + (((M) >> 6) & 0x3), \ - 8 + (((M) >> 0) & 0x3), \ - 8 + (((M) >> 2) & 0x3), \ - 24 + (((M) >> 4) & 0x3), \ - 24 + (((M) >> 6) & 0x3), \ - 12 + (((M) >> 0) & 0x3), \ - 12 + (((M) >> 2) & 0x3), \ - 28 + (((M) >> 4) & 0x3), \ - 28 + (((M) >> 6) & 0x3)); }) - -#define _mm512_mask_shuffle_ps(W, U, A, B, M) __extension__ ({ \ + (__v8df)_mm512_setzero_pd()) + +#define _mm512_shuffle_ps(A, B, M) \ + (__m512)__builtin_ia32_shufps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(M)) + +#define _mm512_mask_shuffle_ps(W, U, A, B, M) \ (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \ - (__v16sf)(__m512)(W)); }) + (__v16sf)(__m512)(W)) -#define _mm512_maskz_shuffle_ps(U, A, B, M) __extension__ ({ \ +#define _mm512_maskz_shuffle_ps(U, A, B, M) \ (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \ - (__v16sf)_mm512_setzero_ps()); }) + (__v16sf)_mm512_setzero_ps()) -#define _mm_sqrt_round_sd(A, B, R) __extension__ ({ \ +#define 
_mm_sqrt_round_sd(A, B, R) \ (__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { return (__m128d) __builtin_ia32_sqrtsd_round_mask ( (__v2df) __A, @@ -7109,13 +6719,13 @@ _mm_mask_sqrt_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) _MM_FROUND_CUR_DIRECTION); } -#define _mm_mask_sqrt_round_sd(W, U, A, B, R) __extension__ ({ \ +#define _mm_mask_sqrt_round_sd(W, U, A, B, R) \ (__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_sd (__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d) __builtin_ia32_sqrtsd_round_mask ( (__v2df) __A, @@ -7125,19 +6735,19 @@ _mm_maskz_sqrt_sd (__mmask8 __U, __m128d __A, __m128d __B) _MM_FROUND_CUR_DIRECTION); } -#define _mm_maskz_sqrt_round_sd(U, A, B, R) __extension__ ({ \ +#define _mm_maskz_sqrt_round_sd(U, A, B, R) \ (__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm_sqrt_round_ss(A, B, R) __extension__ ({ \ +#define _mm_sqrt_round_ss(A, B, R) \ (__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return (__m128) __builtin_ia32_sqrtss_round_mask ( (__v4sf) __A, @@ -7147,13 +6757,13 @@ _mm_mask_sqrt_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) _MM_FROUND_CUR_DIRECTION); } -#define _mm_mask_sqrt_round_ss(W, U, A, B, R) __extension__ ({ \ +#define _mm_mask_sqrt_round_ss(W, U, A, B, R) \ (__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)(__m128)(W), (__mmask8)(U), \ - (int)(R)); }) + (int)(R)) -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_ss (__mmask8 __U, __m128 __A, __m128 __B) { return (__m128) __builtin_ia32_sqrtss_round_mask ( (__v4sf) __A, @@ -7163,13 +6773,13 @@ _mm_maskz_sqrt_ss (__mmask8 __U, __m128 __A, __m128 __B) _MM_FROUND_CUR_DIRECTION); } -#define _mm_maskz_sqrt_round_ss(U, A, B, R) __extension__ ({ \ +#define _mm_maskz_sqrt_round_ss(U, A, B, R) \ (__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_broadcast_f32x4(__m128 __A) { return (__m512)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A, @@ -7177,7 +6787,7 @@ _mm512_broadcast_f32x4(__m128 __A) 0, 1, 2, 3, 0, 1, 2, 3); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, @@ -7185,7 +6795,7 @@ _mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A) 
(__v16sf)__O); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, @@ -7193,14 +6803,14 @@ _mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A) (__v16sf)_mm512_setzero_ps()); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_broadcast_f64x4(__m256d __A) { return (__m512d)__builtin_shufflevector((__v4df)__A, (__v4df)__A, 0, 1, 2, 3, 0, 1, 2, 3); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_broadcast_f64x4(__m512d __O, __mmask8 __M, __m256d __A) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, @@ -7208,7 +6818,7 @@ _mm512_mask_broadcast_f64x4(__m512d __O, __mmask8 __M, __m256d __A) (__v8df)__O); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_broadcast_f64x4(__mmask8 __M, __m256d __A) { return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, @@ -7216,7 +6826,7 @@ _mm512_maskz_broadcast_f64x4(__mmask8 __M, __m256d __A) (__v8df)_mm512_setzero_pd()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_broadcast_i32x4(__m128i __A) { return (__m512i)__builtin_shufflevector((__v4si)__A, (__v4si)__A, @@ -7224,7 +6834,7 @@ _mm512_broadcast_i32x4(__m128i __A) 0, 1, 2, 3, 0, 1, 2, 3); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, @@ -7232,7 +6842,7 @@ _mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A) (__v16si)__O); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A) { return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, @@ -7240,14 +6850,14 @@ _mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A) (__v16si)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_broadcast_i64x4(__m256i __A) { return (__m512i)__builtin_shufflevector((__v4di)__A, (__v4di)__A, 0, 1, 2, 3, 0, 1, 2, 3); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_broadcast_i64x4(__m512i __O, __mmask8 __M, __m256i __A) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, @@ -7255,7 +6865,7 @@ _mm512_mask_broadcast_i64x4(__m512i __O, __mmask8 __M, __m256i __A) (__v8di)__O); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i __A) { return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, @@ -7263,7 +6873,7 @@ _mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i __A) (__v8di)_mm512_setzero_si512()); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_broadcastsd_pd (__m512d __O, __mmask8 __M, __m128d __A) { return (__m512d)__builtin_ia32_selectpd_512(__M, @@ -7271,7 +6881,7 @@ _mm512_mask_broadcastsd_pd (__m512d __O, __mmask8 __M, __m128d __A) (__v8df) __O); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A) { return 
(__m512d)__builtin_ia32_selectpd_512(__M, @@ -7279,7 +6889,7 @@ _mm512_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A) (__v8df) _mm512_setzero_pd()); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_broadcastss_ps (__m512 __O, __mmask16 __M, __m128 __A) { return (__m512)__builtin_ia32_selectps_512(__M, @@ -7287,7 +6897,7 @@ _mm512_mask_broadcastss_ps (__m512 __O, __mmask16 __M, __m128 __A) (__v16sf) __O); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_broadcastss_ps (__mmask16 __M, __m128 __A) { return (__m512)__builtin_ia32_selectps_512(__M, @@ -7295,7 +6905,7 @@ _mm512_maskz_broadcastss_ps (__mmask16 __M, __m128 __A) (__v16sf) _mm512_setzero_ps()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS512 _mm512_cvtsepi32_epi8 (__m512i __A) { return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A, @@ -7303,14 +6913,14 @@ _mm512_cvtsepi32_epi8 (__m512i __A) (__mmask16) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtsepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A) { return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A, (__v16qi) __O, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtsepi32_epi8 (__mmask16 __M, __m512i __A) { return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A, @@ -7318,13 +6928,13 @@ _mm512_maskz_cvtsepi32_epi8 (__mmask16 __M, __m512i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A) { __builtin_ia32_pmovsdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_cvtsepi32_epi16 (__m512i __A) { return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A, @@ -7332,14 +6942,14 @@ _mm512_cvtsepi32_epi16 (__m512i __A) (__mmask16) -1); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtsepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A) { return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A, (__v16hi) __O, __M); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtsepi32_epi16 (__mmask16 __M, __m512i __A) { return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A, @@ -7347,13 +6957,13 @@ _mm512_maskz_cvtsepi32_epi16 (__mmask16 __M, __m512i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_mask_cvtsepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A) { __builtin_ia32_pmovsdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS512 _mm512_cvtsepi64_epi8 (__m512i __A) { return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A, @@ -7361,14 +6971,14 @@ _mm512_cvtsepi64_epi8 (__m512i __A) (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A) { return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A, (__v16qi) __O, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i 
__DEFAULT_FN_ATTRS512 _mm512_maskz_cvtsepi64_epi8 (__mmask8 __M, __m512i __A) { return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A, @@ -7376,13 +6986,13 @@ _mm512_maskz_cvtsepi64_epi8 (__mmask8 __M, __m512i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A) { __builtin_ia32_pmovsqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_cvtsepi64_epi32 (__m512i __A) { return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A, @@ -7390,14 +7000,14 @@ _mm512_cvtsepi64_epi32 (__m512i __A) (__mmask8) -1); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtsepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A) { return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A, (__v8si) __O, __M); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtsepi64_epi32 (__mmask8 __M, __m512i __A) { return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A, @@ -7405,13 +7015,13 @@ _mm512_maskz_cvtsepi64_epi32 (__mmask8 __M, __m512i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_mask_cvtsepi64_storeu_epi32 (void *__P, __mmask8 __M, __m512i __A) { __builtin_ia32_pmovsqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS512 _mm512_cvtsepi64_epi16 (__m512i __A) { return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A, @@ -7419,14 +7029,14 @@ _mm512_cvtsepi64_epi16 (__m512i __A) (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A) { return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A, (__v8hi) __O, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtsepi64_epi16 (__mmask8 __M, __m512i __A) { return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A, @@ -7434,13 +7044,13 @@ _mm512_maskz_cvtsepi64_epi16 (__mmask8 __M, __m512i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m512i __A) { __builtin_ia32_pmovsqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS512 _mm512_cvtusepi32_epi8 (__m512i __A) { return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A, @@ -7448,7 +7058,7 @@ _mm512_cvtusepi32_epi8 (__m512i __A) (__mmask16) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtusepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A) { return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A, @@ -7456,7 +7066,7 @@ _mm512_mask_cvtusepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A) __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtusepi32_epi8 (__mmask16 __M, __m512i __A) { return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A, @@ -7464,13 +7074,13 @@ _mm512_maskz_cvtusepi32_epi8 (__mmask16 __M, __m512i __A) __M); } -static __inline__ 
void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A) { __builtin_ia32_pmovusdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_cvtusepi32_epi16 (__m512i __A) { return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A, @@ -7478,7 +7088,7 @@ _mm512_cvtusepi32_epi16 (__m512i __A) (__mmask16) -1); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtusepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A) { return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A, @@ -7486,7 +7096,7 @@ _mm512_mask_cvtusepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A) __M); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtusepi32_epi16 (__mmask16 __M, __m512i __A) { return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A, @@ -7494,13 +7104,13 @@ _mm512_maskz_cvtusepi32_epi16 (__mmask16 __M, __m512i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_mask_cvtusepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A) { __builtin_ia32_pmovusdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS512 _mm512_cvtusepi64_epi8 (__m512i __A) { return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A, @@ -7508,7 +7118,7 @@ _mm512_cvtusepi64_epi8 (__m512i __A) (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A) { return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A, @@ -7516,7 +7126,7 @@ _mm512_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A) __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtusepi64_epi8 (__mmask8 __M, __m512i __A) { return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A, @@ -7524,13 +7134,13 @@ _mm512_maskz_cvtusepi64_epi8 (__mmask8 __M, __m512i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A) { __builtin_ia32_pmovusqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_cvtusepi64_epi32 (__m512i __A) { return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A, @@ -7538,14 +7148,14 @@ _mm512_cvtusepi64_epi32 (__m512i __A) (__mmask8) -1); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtusepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A) { return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A, (__v8si) __O, __M); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtusepi64_epi32 (__mmask8 __M, __m512i __A) { return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A, @@ -7553,13 +7163,13 @@ _mm512_maskz_cvtusepi64_epi32 (__mmask8 __M, __m512i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_mask_cvtusepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A) { 
__builtin_ia32_pmovusqd512mem_mask ((__v8si*) __P, (__v8di) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS512 _mm512_cvtusepi64_epi16 (__m512i __A) { return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A, @@ -7567,14 +7177,14 @@ _mm512_cvtusepi64_epi16 (__m512i __A) (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A) { return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A, (__v8hi) __O, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtusepi64_epi16 (__mmask8 __M, __m512i __A) { return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A, @@ -7582,13 +7192,13 @@ _mm512_maskz_cvtusepi64_epi16 (__mmask8 __M, __m512i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_mask_cvtusepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A) { __builtin_ia32_pmovusqw512mem_mask ((__v8hi*) __P, (__v8di) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS512 _mm512_cvtepi32_epi8 (__m512i __A) { return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A, @@ -7596,14 +7206,14 @@ _mm512_cvtepi32_epi8 (__m512i __A) (__mmask16) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A) { return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A, (__v16qi) __O, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtepi32_epi8 (__mmask16 __M, __m512i __A) { return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A, @@ -7611,13 +7221,13 @@ _mm512_maskz_cvtepi32_epi8 (__mmask16 __M, __m512i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_mask_cvtepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A) { __builtin_ia32_pmovdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_cvtepi32_epi16 (__m512i __A) { return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A, @@ -7625,14 +7235,14 @@ _mm512_cvtepi32_epi16 (__m512i __A) (__mmask16) -1); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A) { return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A, (__v16hi) __O, __M); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtepi32_epi16 (__mmask16 __M, __m512i __A) { return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A, @@ -7640,13 +7250,13 @@ _mm512_maskz_cvtepi32_epi16 (__mmask16 __M, __m512i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_mask_cvtepi32_storeu_epi16 (void * __P, __mmask16 __M, __m512i __A) { __builtin_ia32_pmovdw512mem_mask ((__v16hi *) __P, (__v16si) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS512 _mm512_cvtepi64_epi8 (__m512i __A) { return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A, @@ -7654,14 +7264,14 @@ _mm512_cvtepi64_epi8 (__m512i __A) 
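Illustrative aside, a small sketch of the saturating down-convert wrappers whose attributes change above. The helper name store_saturated_i8 is hypothetical; only intrinsics shown in the hunks are used.

#include <immintrin.h>
#include <stdint.h>

/* Narrow sixteen signed 32-bit lanes to bytes with signed saturation and
 * store only the lanes selected by the mask; masked-off bytes in dst are
 * left untouched. */
static inline void store_saturated_i8(int8_t *dst, __mmask16 keep, __m512i v) {
  _mm512_mask_cvtsepi32_storeu_epi8(dst, keep, v);
}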
(__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A) { return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A, (__v16qi) __O, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtepi64_epi8 (__mmask8 __M, __m512i __A) { return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A, @@ -7669,13 +7279,13 @@ _mm512_maskz_cvtepi64_epi8 (__mmask8 __M, __m512i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A) { __builtin_ia32_pmovqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_cvtepi64_epi32 (__m512i __A) { return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A, @@ -7683,14 +7293,14 @@ _mm512_cvtepi64_epi32 (__m512i __A) (__mmask8) -1); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A) { return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A, (__v8si) __O, __M); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtepi64_epi32 (__mmask8 __M, __m512i __A) { return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A, @@ -7698,13 +7308,13 @@ _mm512_maskz_cvtepi64_epi32 (__mmask8 __M, __m512i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_mask_cvtepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A) { __builtin_ia32_pmovqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS512 _mm512_cvtepi64_epi16 (__m512i __A) { return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A, @@ -7712,14 +7322,14 @@ _mm512_cvtepi64_epi16 (__m512i __A) (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A) { return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A, (__v8hi) __O, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtepi64_epi16 (__mmask8 __M, __m512i __A) { return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A, @@ -7727,246 +7337,192 @@ _mm512_maskz_cvtepi64_epi16 (__mmask8 __M, __m512i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A) { __builtin_ia32_pmovqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M); } -#define _mm512_extracti32x4_epi32(A, imm) __extension__ ({ \ - (__m128i)__builtin_shufflevector((__v16si)(__m512i)(A), \ - (__v16si)_mm512_undefined_epi32(), \ - 0 + ((imm) & 0x3) * 4, \ - 1 + ((imm) & 0x3) * 4, \ - 2 + ((imm) & 0x3) * 4, \ - 3 + ((imm) & 0x3) * 4); }) - -#define _mm512_mask_extracti32x4_epi32(W, U, A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ - (__v4si)_mm512_extracti32x4_epi32((A), (imm)), \ - (__v4si)(W)); }) - -#define _mm512_maskz_extracti32x4_epi32(U, A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ - 
(__v4si)_mm512_extracti32x4_epi32((A), (imm)), \ - (__v4si)_mm_setzero_si128()); }) - -#define _mm512_extracti64x4_epi64(A, imm) __extension__ ({ \ - (__m256i)__builtin_shufflevector((__v8di)(__m512i)(A), \ - (__v8di)_mm512_undefined_epi32(), \ - ((imm) & 1) ? 4 : 0, \ - ((imm) & 1) ? 5 : 1, \ - ((imm) & 1) ? 6 : 2, \ - ((imm) & 1) ? 7 : 3); }) - -#define _mm512_mask_extracti64x4_epi64(W, U, A, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm512_extracti64x4_epi64((A), (imm)), \ - (__v4di)(W)); }) - -#define _mm512_maskz_extracti64x4_epi64(U, A, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm512_extracti64x4_epi64((A), (imm)), \ - (__v4di)_mm256_setzero_si256()); }) - -#define _mm512_insertf64x4(A, B, imm) __extension__ ({ \ - (__m512d)__builtin_shufflevector((__v8df)(__m512d)(A), \ - (__v8df)_mm512_castpd256_pd512((__m256d)(B)), \ - ((imm) & 0x1) ? 0 : 8, \ - ((imm) & 0x1) ? 1 : 9, \ - ((imm) & 0x1) ? 2 : 10, \ - ((imm) & 0x1) ? 3 : 11, \ - ((imm) & 0x1) ? 8 : 4, \ - ((imm) & 0x1) ? 9 : 5, \ - ((imm) & 0x1) ? 10 : 6, \ - ((imm) & 0x1) ? 11 : 7); }) - -#define _mm512_mask_insertf64x4(W, U, A, B, imm) __extension__ ({ \ +#define _mm512_extracti32x4_epi32(A, imm) \ + (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ + (__v4si)_mm_undefined_si128(), \ + (__mmask8)-1) + +#define _mm512_mask_extracti32x4_epi32(W, U, A, imm) \ + (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ + (__v4si)(__m128i)(W), \ + (__mmask8)(U)) + +#define _mm512_maskz_extracti32x4_epi32(U, A, imm) \ + (__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ + (__v4si)_mm_setzero_si128(), \ + (__mmask8)(U)) + +#define _mm512_extracti64x4_epi64(A, imm) \ + (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ + (__v4di)_mm256_undefined_si256(), \ + (__mmask8)-1) + +#define _mm512_mask_extracti64x4_epi64(W, U, A, imm) \ + (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ + (__v4di)(__m256i)(W), \ + (__mmask8)(U)) + +#define _mm512_maskz_extracti64x4_epi64(U, A, imm) \ + (__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ + (__v4di)_mm256_setzero_si256(), \ + (__mmask8)(U)) + +#define _mm512_insertf64x4(A, B, imm) \ + (__m512d)__builtin_ia32_insertf64x4((__v8df)(__m512d)(A), \ + (__v4df)(__m256d)(B), (int)(imm)) + +#define _mm512_mask_insertf64x4(W, U, A, B, imm) \ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_insertf64x4((A), (B), (imm)), \ - (__v8df)(W)); }) + (__v8df)(__m512d)(W)) -#define _mm512_maskz_insertf64x4(U, A, B, imm) __extension__ ({ \ +#define _mm512_maskz_insertf64x4(U, A, B, imm) \ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_insertf64x4((A), (B), (imm)), \ - (__v8df)_mm512_setzero_pd()); }) - -#define _mm512_inserti64x4(A, B, imm) __extension__ ({ \ - (__m512i)__builtin_shufflevector((__v8di)(__m512i)(A), \ - (__v8di)_mm512_castsi256_si512((__m256i)(B)), \ - ((imm) & 0x1) ? 0 : 8, \ - ((imm) & 0x1) ? 1 : 9, \ - ((imm) & 0x1) ? 2 : 10, \ - ((imm) & 0x1) ? 3 : 11, \ - ((imm) & 0x1) ? 8 : 4, \ - ((imm) & 0x1) ? 9 : 5, \ - ((imm) & 0x1) ? 10 : 6, \ - ((imm) & 0x1) ? 
11 : 7); }) - -#define _mm512_mask_inserti64x4(W, U, A, B, imm) __extension__ ({ \ + (__v8df)_mm512_setzero_pd()) + +#define _mm512_inserti64x4(A, B, imm) \ + (__m512i)__builtin_ia32_inserti64x4((__v8di)(__m512i)(A), \ + (__v4di)(__m256i)(B), (int)(imm)) + +#define _mm512_mask_inserti64x4(W, U, A, B, imm) \ (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ (__v8di)_mm512_inserti64x4((A), (B), (imm)), \ - (__v8di)(W)); }) + (__v8di)(__m512i)(W)) -#define _mm512_maskz_inserti64x4(U, A, B, imm) __extension__ ({ \ +#define _mm512_maskz_inserti64x4(U, A, B, imm) \ (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ (__v8di)_mm512_inserti64x4((A), (B), (imm)), \ - (__v8di)_mm512_setzero_si512()); }) - -#define _mm512_insertf32x4(A, B, imm) __extension__ ({ \ - (__m512)__builtin_shufflevector((__v16sf)(__m512)(A), \ - (__v16sf)_mm512_castps128_ps512((__m128)(B)),\ - (((imm) & 0x3) == 0) ? 16 : 0, \ - (((imm) & 0x3) == 0) ? 17 : 1, \ - (((imm) & 0x3) == 0) ? 18 : 2, \ - (((imm) & 0x3) == 0) ? 19 : 3, \ - (((imm) & 0x3) == 1) ? 16 : 4, \ - (((imm) & 0x3) == 1) ? 17 : 5, \ - (((imm) & 0x3) == 1) ? 18 : 6, \ - (((imm) & 0x3) == 1) ? 19 : 7, \ - (((imm) & 0x3) == 2) ? 16 : 8, \ - (((imm) & 0x3) == 2) ? 17 : 9, \ - (((imm) & 0x3) == 2) ? 18 : 10, \ - (((imm) & 0x3) == 2) ? 19 : 11, \ - (((imm) & 0x3) == 3) ? 16 : 12, \ - (((imm) & 0x3) == 3) ? 17 : 13, \ - (((imm) & 0x3) == 3) ? 18 : 14, \ - (((imm) & 0x3) == 3) ? 19 : 15); }) - -#define _mm512_mask_insertf32x4(W, U, A, B, imm) __extension__ ({ \ + (__v8di)_mm512_setzero_si512()) + +#define _mm512_insertf32x4(A, B, imm) \ + (__m512)__builtin_ia32_insertf32x4((__v16sf)(__m512)(A), \ + (__v4sf)(__m128)(B), (int)(imm)) + +#define _mm512_mask_insertf32x4(W, U, A, B, imm) \ (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \ - (__v16sf)(W)); }) + (__v16sf)(__m512)(W)) -#define _mm512_maskz_insertf32x4(U, A, B, imm) __extension__ ({ \ +#define _mm512_maskz_insertf32x4(U, A, B, imm) \ (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \ - (__v16sf)_mm512_setzero_ps()); }) - -#define _mm512_inserti32x4(A, B, imm) __extension__ ({ \ - (__m512i)__builtin_shufflevector((__v16si)(__m512i)(A), \ - (__v16si)_mm512_castsi128_si512((__m128i)(B)),\ - (((imm) & 0x3) == 0) ? 16 : 0, \ - (((imm) & 0x3) == 0) ? 17 : 1, \ - (((imm) & 0x3) == 0) ? 18 : 2, \ - (((imm) & 0x3) == 0) ? 19 : 3, \ - (((imm) & 0x3) == 1) ? 16 : 4, \ - (((imm) & 0x3) == 1) ? 17 : 5, \ - (((imm) & 0x3) == 1) ? 18 : 6, \ - (((imm) & 0x3) == 1) ? 19 : 7, \ - (((imm) & 0x3) == 2) ? 16 : 8, \ - (((imm) & 0x3) == 2) ? 17 : 9, \ - (((imm) & 0x3) == 2) ? 18 : 10, \ - (((imm) & 0x3) == 2) ? 19 : 11, \ - (((imm) & 0x3) == 3) ? 16 : 12, \ - (((imm) & 0x3) == 3) ? 17 : 13, \ - (((imm) & 0x3) == 3) ? 18 : 14, \ - (((imm) & 0x3) == 3) ? 
19 : 15); }) - -#define _mm512_mask_inserti32x4(W, U, A, B, imm) __extension__ ({ \ + (__v16sf)_mm512_setzero_ps()) + +#define _mm512_inserti32x4(A, B, imm) \ + (__m512i)__builtin_ia32_inserti32x4((__v16si)(__m512i)(A), \ + (__v4si)(__m128i)(B), (int)(imm)) + +#define _mm512_mask_inserti32x4(W, U, A, B, imm) \ (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ (__v16si)_mm512_inserti32x4((A), (B), (imm)), \ - (__v16si)(W)); }) + (__v16si)(__m512i)(W)) -#define _mm512_maskz_inserti32x4(U, A, B, imm) __extension__ ({ \ +#define _mm512_maskz_inserti32x4(U, A, B, imm) \ (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ (__v16si)_mm512_inserti32x4((A), (B), (imm)), \ - (__v16si)_mm512_setzero_si512()); }) + (__v16si)_mm512_setzero_si512()) -#define _mm512_getmant_round_pd(A, B, C, R) __extension__ ({ \ +#define _mm512_getmant_round_pd(A, B, C, R) \ (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ (int)(((C)<<2) | (B)), \ (__v8df)_mm512_undefined_pd(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm512_mask_getmant_round_pd(W, U, A, B, C, R) __extension__ ({ \ +#define _mm512_mask_getmant_round_pd(W, U, A, B, C, R) \ (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ (int)(((C)<<2) | (B)), \ (__v8df)(__m512d)(W), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm512_maskz_getmant_round_pd(U, A, B, C, R) __extension__ ({ \ +#define _mm512_maskz_getmant_round_pd(U, A, B, C, R) \ (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ (int)(((C)<<2) | (B)), \ (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm512_getmant_pd(A, B, C) __extension__ ({ \ +#define _mm512_getmant_pd(A, B, C) \ (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ (int)(((C)<<2) | (B)), \ (__v8df)_mm512_setzero_pd(), \ (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm512_mask_getmant_pd(W, U, A, B, C) __extension__ ({ \ +#define _mm512_mask_getmant_pd(W, U, A, B, C) \ (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ (int)(((C)<<2) | (B)), \ (__v8df)(__m512d)(W), \ (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm512_maskz_getmant_pd(U, A, B, C) __extension__ ({ \ +#define _mm512_maskz_getmant_pd(U, A, B, C) \ (__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ (int)(((C)<<2) | (B)), \ (__v8df)_mm512_setzero_pd(), \ (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm512_getmant_round_ps(A, B, C, R) __extension__ ({ \ +#define _mm512_getmant_round_ps(A, B, C, R) \ (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ (int)(((C)<<2) | (B)), \ (__v16sf)_mm512_undefined_ps(), \ - (__mmask16)-1, (int)(R)); }) + (__mmask16)-1, (int)(R)) -#define _mm512_mask_getmant_round_ps(W, U, A, B, C, R) __extension__ ({ \ +#define _mm512_mask_getmant_round_ps(W, U, A, B, C, R) \ (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ (int)(((C)<<2) | (B)), \ (__v16sf)(__m512)(W), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) -#define _mm512_maskz_getmant_round_ps(U, A, B, C, R) __extension__ ({ \ +#define _mm512_maskz_getmant_round_ps(U, A, B, C, R) \ (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ (int)(((C)<<2) | (B)), \ (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) -#define _mm512_getmant_ps(A, B, C) __extension__ ({ \ +#define 
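Illustrative aside, a usage sketch for the extract/insert macros converted above to the __builtin_ia32_* forms. Helper names are hypothetical; the lane index must be an integer constant expression (a runtime variable will not compile), which is why these remain macros.

#include <immintrin.h>

static inline __m256i upper_256(__m512i v) {
  return _mm512_extracti64x4_epi64(v, 1);   /* 256-bit upper half */
}

static inline __m512i replace_lane0(__m512i v, __m128i lane) {
  return _mm512_inserti32x4(v, lane, 0);    /* overwrite 128-bit lane 0 */
}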
_mm512_getmant_ps(A, B, C) \ (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ (int)(((C)<<2)|(B)), \ (__v16sf)_mm512_undefined_ps(), \ (__mmask16)-1, \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm512_mask_getmant_ps(W, U, A, B, C) __extension__ ({ \ +#define _mm512_mask_getmant_ps(W, U, A, B, C) \ (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ (int)(((C)<<2)|(B)), \ (__v16sf)(__m512)(W), \ (__mmask16)(U), \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm512_maskz_getmant_ps(U, A, B, C) __extension__ ({ \ +#define _mm512_maskz_getmant_ps(U, A, B, C) \ (__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ (int)(((C)<<2)|(B)), \ (__v16sf)_mm512_setzero_ps(), \ (__mmask16)(U), \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm512_getexp_round_pd(A, R) __extension__ ({ \ +#define _mm512_getexp_round_pd(A, R) \ (__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ (__v8df)_mm512_undefined_pd(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm512_mask_getexp_round_pd(W, U, A, R) __extension__ ({ \ +#define _mm512_mask_getexp_round_pd(W, U, A, R) \ (__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ (__v8df)(__m512d)(W), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm512_maskz_getexp_round_pd(U, A, R) __extension__ ({ \ +#define _mm512_maskz_getexp_round_pd(U, A, R) \ (__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_getexp_pd (__m512d __A) { return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, @@ -7975,7 +7531,7 @@ _mm512_getexp_pd (__m512d __A) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_getexp_pd (__m512d __W, __mmask8 __U, __m512d __A) { return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, @@ -7984,7 +7540,7 @@ _mm512_mask_getexp_pd (__m512d __W, __mmask8 __U, __m512d __A) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_getexp_pd (__mmask8 __U, __m512d __A) { return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, @@ -7993,22 +7549,22 @@ _mm512_maskz_getexp_pd (__mmask8 __U, __m512d __A) _MM_FROUND_CUR_DIRECTION); } -#define _mm512_getexp_round_ps(A, R) __extension__ ({ \ +#define _mm512_getexp_round_ps(A, R) \ (__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ (__v16sf)_mm512_undefined_ps(), \ - (__mmask16)-1, (int)(R)); }) + (__mmask16)-1, (int)(R)) -#define _mm512_mask_getexp_round_ps(W, U, A, R) __extension__ ({ \ +#define _mm512_mask_getexp_round_ps(W, U, A, R) \ (__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ (__v16sf)(__m512)(W), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) -#define _mm512_maskz_getexp_round_ps(U, A, R) __extension__ ({ \ +#define _mm512_maskz_getexp_round_ps(U, A, R) \ (__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R)); }) + (__mmask16)(U), (int)(R)) -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_getexp_ps (__m512 __A) { return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, @@ -8017,7 +7573,7 
@@ _mm512_getexp_ps (__m512 __A) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_getexp_ps (__m512 __W, __mmask16 __U, __m512 __A) { return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, @@ -8026,7 +7582,7 @@ _mm512_mask_getexp_ps (__m512 __W, __mmask16 __U, __m512 __A) _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A) { return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, @@ -8035,802 +7591,812 @@ _mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A) _MM_FROUND_CUR_DIRECTION); } -#define _mm512_i64gather_ps(index, addr, scale) __extension__ ({ \ +#define _mm512_i64gather_ps(index, addr, scale) \ (__m256)__builtin_ia32_gatherdiv16sf((__v8sf)_mm256_undefined_ps(), \ (float const *)(addr), \ (__v8di)(__m512i)(index), (__mmask8)-1, \ - (int)(scale)); }) + (int)(scale)) -#define _mm512_mask_i64gather_ps(v1_old, mask, index, addr, scale) __extension__({\ +#define _mm512_mask_i64gather_ps(v1_old, mask, index, addr, scale) \ (__m256)__builtin_ia32_gatherdiv16sf((__v8sf)(__m256)(v1_old),\ (float const *)(addr), \ (__v8di)(__m512i)(index), \ - (__mmask8)(mask), (int)(scale)); }) + (__mmask8)(mask), (int)(scale)) -#define _mm512_i64gather_epi32(index, addr, scale) __extension__ ({\ - (__m256i)__builtin_ia32_gatherdiv16si((__v8si)_mm256_undefined_ps(), \ +#define _mm512_i64gather_epi32(index, addr, scale) \ + (__m256i)__builtin_ia32_gatherdiv16si((__v8si)_mm256_undefined_si256(), \ (int const *)(addr), \ (__v8di)(__m512i)(index), \ - (__mmask8)-1, (int)(scale)); }) + (__mmask8)-1, (int)(scale)) -#define _mm512_mask_i64gather_epi32(v1_old, mask, index, addr, scale) __extension__ ({\ +#define _mm512_mask_i64gather_epi32(v1_old, mask, index, addr, scale) \ (__m256i)__builtin_ia32_gatherdiv16si((__v8si)(__m256i)(v1_old), \ (int const *)(addr), \ (__v8di)(__m512i)(index), \ - (__mmask8)(mask), (int)(scale)); }) + (__mmask8)(mask), (int)(scale)) -#define _mm512_i64gather_pd(index, addr, scale) __extension__ ({\ +#define _mm512_i64gather_pd(index, addr, scale) \ (__m512d)__builtin_ia32_gatherdiv8df((__v8df)_mm512_undefined_pd(), \ (double const *)(addr), \ (__v8di)(__m512i)(index), (__mmask8)-1, \ - (int)(scale)); }) + (int)(scale)) -#define _mm512_mask_i64gather_pd(v1_old, mask, index, addr, scale) __extension__ ({\ +#define _mm512_mask_i64gather_pd(v1_old, mask, index, addr, scale) \ (__m512d)__builtin_ia32_gatherdiv8df((__v8df)(__m512d)(v1_old), \ (double const *)(addr), \ (__v8di)(__m512i)(index), \ - (__mmask8)(mask), (int)(scale)); }) + (__mmask8)(mask), (int)(scale)) -#define _mm512_i64gather_epi64(index, addr, scale) __extension__ ({\ - (__m512i)__builtin_ia32_gatherdiv8di((__v8di)_mm512_undefined_pd(), \ +#define _mm512_i64gather_epi64(index, addr, scale) \ + (__m512i)__builtin_ia32_gatherdiv8di((__v8di)_mm512_undefined_epi32(), \ (long long const *)(addr), \ (__v8di)(__m512i)(index), (__mmask8)-1, \ - (int)(scale)); }) + (int)(scale)) -#define _mm512_mask_i64gather_epi64(v1_old, mask, index, addr, scale) __extension__ ({\ +#define _mm512_mask_i64gather_epi64(v1_old, mask, index, addr, scale) \ (__m512i)__builtin_ia32_gatherdiv8di((__v8di)(__m512i)(v1_old), \ (long long const *)(addr), \ (__v8di)(__m512i)(index), \ - (__mmask8)(mask), (int)(scale)); }) + (__mmask8)(mask), (int)(scale)) -#define _mm512_i32gather_ps(index, addr, scale) __extension__ ({\ +#define 
_mm512_i32gather_ps(index, addr, scale) \ (__m512)__builtin_ia32_gathersiv16sf((__v16sf)_mm512_undefined_ps(), \ (float const *)(addr), \ (__v16sf)(__m512)(index), \ - (__mmask16)-1, (int)(scale)); }) + (__mmask16)-1, (int)(scale)) -#define _mm512_mask_i32gather_ps(v1_old, mask, index, addr, scale) __extension__ ({\ +#define _mm512_mask_i32gather_ps(v1_old, mask, index, addr, scale) \ (__m512)__builtin_ia32_gathersiv16sf((__v16sf)(__m512)(v1_old), \ (float const *)(addr), \ (__v16sf)(__m512)(index), \ - (__mmask16)(mask), (int)(scale)); }) + (__mmask16)(mask), (int)(scale)) -#define _mm512_i32gather_epi32(index, addr, scale) __extension__ ({\ +#define _mm512_i32gather_epi32(index, addr, scale) \ (__m512i)__builtin_ia32_gathersiv16si((__v16si)_mm512_undefined_epi32(), \ (int const *)(addr), \ (__v16si)(__m512i)(index), \ - (__mmask16)-1, (int)(scale)); }) + (__mmask16)-1, (int)(scale)) -#define _mm512_mask_i32gather_epi32(v1_old, mask, index, addr, scale) __extension__ ({\ +#define _mm512_mask_i32gather_epi32(v1_old, mask, index, addr, scale) \ (__m512i)__builtin_ia32_gathersiv16si((__v16si)(__m512i)(v1_old), \ (int const *)(addr), \ (__v16si)(__m512i)(index), \ - (__mmask16)(mask), (int)(scale)); }) + (__mmask16)(mask), (int)(scale)) -#define _mm512_i32gather_pd(index, addr, scale) __extension__ ({\ +#define _mm512_i32gather_pd(index, addr, scale) \ (__m512d)__builtin_ia32_gathersiv8df((__v8df)_mm512_undefined_pd(), \ (double const *)(addr), \ (__v8si)(__m256i)(index), (__mmask8)-1, \ - (int)(scale)); }) + (int)(scale)) -#define _mm512_mask_i32gather_pd(v1_old, mask, index, addr, scale) __extension__ ({\ +#define _mm512_mask_i32gather_pd(v1_old, mask, index, addr, scale) \ (__m512d)__builtin_ia32_gathersiv8df((__v8df)(__m512d)(v1_old), \ (double const *)(addr), \ (__v8si)(__m256i)(index), \ - (__mmask8)(mask), (int)(scale)); }) + (__mmask8)(mask), (int)(scale)) -#define _mm512_i32gather_epi64(index, addr, scale) __extension__ ({\ +#define _mm512_i32gather_epi64(index, addr, scale) \ (__m512i)__builtin_ia32_gathersiv8di((__v8di)_mm512_undefined_epi32(), \ (long long const *)(addr), \ (__v8si)(__m256i)(index), (__mmask8)-1, \ - (int)(scale)); }) + (int)(scale)) -#define _mm512_mask_i32gather_epi64(v1_old, mask, index, addr, scale) __extension__ ({\ +#define _mm512_mask_i32gather_epi64(v1_old, mask, index, addr, scale) \ (__m512i)__builtin_ia32_gathersiv8di((__v8di)(__m512i)(v1_old), \ (long long const *)(addr), \ (__v8si)(__m256i)(index), \ - (__mmask8)(mask), (int)(scale)); }) + (__mmask8)(mask), (int)(scale)) -#define _mm512_i64scatter_ps(addr, index, v1, scale) __extension__ ({\ +#define _mm512_i64scatter_ps(addr, index, v1, scale) \ __builtin_ia32_scatterdiv16sf((float *)(addr), (__mmask8)-1, \ (__v8di)(__m512i)(index), \ - (__v8sf)(__m256)(v1), (int)(scale)); }) + (__v8sf)(__m256)(v1), (int)(scale)) -#define _mm512_mask_i64scatter_ps(addr, mask, index, v1, scale) __extension__ ({\ +#define _mm512_mask_i64scatter_ps(addr, mask, index, v1, scale) \ __builtin_ia32_scatterdiv16sf((float *)(addr), (__mmask8)(mask), \ (__v8di)(__m512i)(index), \ - (__v8sf)(__m256)(v1), (int)(scale)); }) + (__v8sf)(__m256)(v1), (int)(scale)) -#define _mm512_i64scatter_epi32(addr, index, v1, scale) __extension__ ({\ +#define _mm512_i64scatter_epi32(addr, index, v1, scale) \ __builtin_ia32_scatterdiv16si((int *)(addr), (__mmask8)-1, \ (__v8di)(__m512i)(index), \ - (__v8si)(__m256i)(v1), (int)(scale)); }) + (__v8si)(__m256i)(v1), (int)(scale)) -#define _mm512_mask_i64scatter_epi32(addr, mask, index, v1, scale) 
__extension__ ({\ +#define _mm512_mask_i64scatter_epi32(addr, mask, index, v1, scale) \ __builtin_ia32_scatterdiv16si((int *)(addr), (__mmask8)(mask), \ (__v8di)(__m512i)(index), \ - (__v8si)(__m256i)(v1), (int)(scale)); }) + (__v8si)(__m256i)(v1), (int)(scale)) -#define _mm512_i64scatter_pd(addr, index, v1, scale) __extension__ ({\ +#define _mm512_i64scatter_pd(addr, index, v1, scale) \ __builtin_ia32_scatterdiv8df((double *)(addr), (__mmask8)-1, \ (__v8di)(__m512i)(index), \ - (__v8df)(__m512d)(v1), (int)(scale)); }) + (__v8df)(__m512d)(v1), (int)(scale)) -#define _mm512_mask_i64scatter_pd(addr, mask, index, v1, scale) __extension__ ({\ +#define _mm512_mask_i64scatter_pd(addr, mask, index, v1, scale) \ __builtin_ia32_scatterdiv8df((double *)(addr), (__mmask8)(mask), \ (__v8di)(__m512i)(index), \ - (__v8df)(__m512d)(v1), (int)(scale)); }) + (__v8df)(__m512d)(v1), (int)(scale)) -#define _mm512_i64scatter_epi64(addr, index, v1, scale) __extension__ ({\ +#define _mm512_i64scatter_epi64(addr, index, v1, scale) \ __builtin_ia32_scatterdiv8di((long long *)(addr), (__mmask8)-1, \ (__v8di)(__m512i)(index), \ - (__v8di)(__m512i)(v1), (int)(scale)); }) + (__v8di)(__m512i)(v1), (int)(scale)) -#define _mm512_mask_i64scatter_epi64(addr, mask, index, v1, scale) __extension__ ({\ +#define _mm512_mask_i64scatter_epi64(addr, mask, index, v1, scale) \ __builtin_ia32_scatterdiv8di((long long *)(addr), (__mmask8)(mask), \ (__v8di)(__m512i)(index), \ - (__v8di)(__m512i)(v1), (int)(scale)); }) + (__v8di)(__m512i)(v1), (int)(scale)) -#define _mm512_i32scatter_ps(addr, index, v1, scale) __extension__ ({\ +#define _mm512_i32scatter_ps(addr, index, v1, scale) \ __builtin_ia32_scattersiv16sf((float *)(addr), (__mmask16)-1, \ (__v16si)(__m512i)(index), \ - (__v16sf)(__m512)(v1), (int)(scale)); }) + (__v16sf)(__m512)(v1), (int)(scale)) -#define _mm512_mask_i32scatter_ps(addr, mask, index, v1, scale) __extension__ ({\ +#define _mm512_mask_i32scatter_ps(addr, mask, index, v1, scale) \ __builtin_ia32_scattersiv16sf((float *)(addr), (__mmask16)(mask), \ (__v16si)(__m512i)(index), \ - (__v16sf)(__m512)(v1), (int)(scale)); }) + (__v16sf)(__m512)(v1), (int)(scale)) -#define _mm512_i32scatter_epi32(addr, index, v1, scale) __extension__ ({\ +#define _mm512_i32scatter_epi32(addr, index, v1, scale) \ __builtin_ia32_scattersiv16si((int *)(addr), (__mmask16)-1, \ (__v16si)(__m512i)(index), \ - (__v16si)(__m512i)(v1), (int)(scale)); }) + (__v16si)(__m512i)(v1), (int)(scale)) -#define _mm512_mask_i32scatter_epi32(addr, mask, index, v1, scale) __extension__ ({\ +#define _mm512_mask_i32scatter_epi32(addr, mask, index, v1, scale) \ __builtin_ia32_scattersiv16si((int *)(addr), (__mmask16)(mask), \ (__v16si)(__m512i)(index), \ - (__v16si)(__m512i)(v1), (int)(scale)); }) + (__v16si)(__m512i)(v1), (int)(scale)) -#define _mm512_i32scatter_pd(addr, index, v1, scale) __extension__ ({\ +#define _mm512_i32scatter_pd(addr, index, v1, scale) \ __builtin_ia32_scattersiv8df((double *)(addr), (__mmask8)-1, \ (__v8si)(__m256i)(index), \ - (__v8df)(__m512d)(v1), (int)(scale)); }) + (__v8df)(__m512d)(v1), (int)(scale)) -#define _mm512_mask_i32scatter_pd(addr, mask, index, v1, scale) __extension__ ({\ +#define _mm512_mask_i32scatter_pd(addr, mask, index, v1, scale) \ __builtin_ia32_scattersiv8df((double *)(addr), (__mmask8)(mask), \ (__v8si)(__m256i)(index), \ - (__v8df)(__m512d)(v1), (int)(scale)); }) + (__v8df)(__m512d)(v1), (int)(scale)) -#define _mm512_i32scatter_epi64(addr, index, v1, scale) __extension__ ({\ +#define _mm512_i32scatter_epi64(addr, 
index, v1, scale) \ __builtin_ia32_scattersiv8di((long long *)(addr), (__mmask8)-1, \ (__v8si)(__m256i)(index), \ - (__v8di)(__m512i)(v1), (int)(scale)); }) + (__v8di)(__m512i)(v1), (int)(scale)) -#define _mm512_mask_i32scatter_epi64(addr, mask, index, v1, scale) __extension__ ({\ +#define _mm512_mask_i32scatter_epi64(addr, mask, index, v1, scale) \ __builtin_ia32_scattersiv8di((long long *)(addr), (__mmask8)(mask), \ (__v8si)(__m256i)(index), \ - (__v8di)(__m512i)(v1), (int)(scale)); }) + (__v8di)(__m512i)(v1), (int)(scale)) -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, - (__v4sf) __A, - (__v4sf) __B, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_vfmaddss3_mask((__v4sf)__W, + (__v4sf)__A, + (__v4sf)__B, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); } -#define _mm_mask_fmadd_round_ss(W, U, A, B, R) __extension__({\ +#define _mm_fmadd_round_ss(A, B, C, R) \ + (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(C), (__mmask8)-1, \ + (int)(R)) + +#define _mm_mask_fmadd_round_ss(W, U, A, B, R) \ (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ (__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), (__mmask8)(U), \ - (int)(R)); }) + (int)(R)) -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_fmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A, + (__v4sf)__B, + (__v4sf)__C, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); } -#define _mm_maskz_fmadd_round_ss(U, A, B, C, R) __extension__ ({\ +#define _mm_maskz_fmadd_round_ss(U, A, B, C, R) \ (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4sf)(__m128)(C), (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION); }) + (int)(R)) -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask3_fmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) { - return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W, - (__v4sf) __X, - (__v4sf) __Y, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_vfmaddss3_mask3((__v4sf)__W, + (__v4sf)__X, + (__v4sf)__Y, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); } -#define _mm_mask3_fmadd_round_ss(W, X, Y, U, R) __extension__ ({\ +#define _mm_mask3_fmadd_round_ss(W, X, Y, U, R) \ (__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \ (__v4sf)(__m128)(X), \ (__v4sf)(__m128)(Y), (__mmask8)(U), \ - (int)(R)); }) + (int)(R)) -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, - (__v4sf) __A, - -(__v4sf) __B, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_vfmaddss3_mask((__v4sf)__W, + (__v4sf)__A, + -(__v4sf)__B, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); } -#define _mm_mask_fmsub_round_ss(W, U, A, B, R) __extension__ ({\ +#define _mm_fmsub_round_ss(A, B, C, R) \ + (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + -(__v4sf)(__m128)(C), (__mmask8)-1, \ + (int)(R)) + +#define 
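Illustrative aside, a sketch of how the gather/scatter macros rewritten above are typically called. Helper names are hypothetical; the scale argument must be a constant 1, 2, 4 or 8.

#include <immintrin.h>

static inline __m512 gather_f32(const float *base, __m512i idx) {
  /* 4 == sizeof(float): each index is scaled to a byte offset */
  return _mm512_i32gather_ps(idx, base, 4);
}

static inline void scatter_f32(float *base, __mmask16 k, __m512i idx, __m512 v) {
  /* only the lanes whose bit is set in k are written back */
  _mm512_mask_i32scatter_ps(base, k, idx, v, 4);
}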
_mm_mask_fmsub_round_ss(W, U, A, B, R) \ (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ (__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), (__mmask8)(U), \ - (int)(R)); }) + -(__v4sf)(__m128)(B), (__mmask8)(U), \ + (int)(R)) -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_fmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __A, - (__v4sf) __B, - -(__v4sf) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A, + (__v4sf)__B, + -(__v4sf)__C, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); } -#define _mm_maskz_fmsub_round_ss(U, A, B, C, R) __extension__ ({\ +#define _mm_maskz_fmsub_round_ss(U, A, B, C, R) \ (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ -(__v4sf)(__m128)(C), (__mmask8)(U), \ - (int)(R)); }) + (int)(R)) -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask3_fmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) { - return (__m128) __builtin_ia32_vfmsubss3_mask3 ((__v4sf) __W, - (__v4sf) __X, - (__v4sf) __Y, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_vfmsubss3_mask3((__v4sf)__W, + (__v4sf)__X, + (__v4sf)__Y, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); } -#define _mm_mask3_fmsub_round_ss(W, X, Y, U, R) __extension__ ({\ +#define _mm_mask3_fmsub_round_ss(W, X, Y, U, R) \ (__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \ (__v4sf)(__m128)(X), \ (__v4sf)(__m128)(Y), (__mmask8)(U), \ - (int)(R)); }) + (int)(R)) -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, - -(__v4sf) __A, - (__v4sf) __B, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_vfmaddss3_mask((__v4sf)__W, + -(__v4sf)__A, + (__v4sf)__B, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); } -#define _mm_mask_fnmadd_round_ss(W, U, A, B, R) __extension__ ({\ +#define _mm_fnmadd_round_ss(A, B, C, R) \ + (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ + -(__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(C), (__mmask8)-1, \ + (int)(R)) + +#define _mm_mask_fnmadd_round_ss(W, U, A, B, R) \ (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ -(__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), (__mmask8)(U), \ - (int)(R)); }) + (int)(R)) -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_fnmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_vfmaddss3_maskz (-(__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A, + -(__v4sf)__B, + (__v4sf)__C, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); } -#define _mm_maskz_fnmadd_round_ss(U, A, B, C, R) __extension__ ({\ - (__m128)__builtin_ia32_vfmaddss3_maskz(-(__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ +#define _mm_maskz_fnmadd_round_ss(U, A, B, C, R) \ + (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \ + -(__v4sf)(__m128)(B), \ (__v4sf)(__m128)(C), (__mmask8)(U), \ - (int)(R)); }) + (int)(R)) -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask3_fnmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) { - return (__m128) 
__builtin_ia32_vfmaddss3_mask3 (-(__v4sf) __W, - (__v4sf) __X, - (__v4sf) __Y, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_vfmaddss3_mask3((__v4sf)__W, + -(__v4sf)__X, + (__v4sf)__Y, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); } -#define _mm_mask3_fnmadd_round_ss(W, X, Y, U, R) __extension__({\ - (__m128)__builtin_ia32_vfmaddss3_mask3(-(__v4sf)(__m128)(W), \ - (__v4sf)(__m128)(X), \ +#define _mm_mask3_fnmadd_round_ss(W, X, Y, U, R) \ + (__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \ + -(__v4sf)(__m128)(X), \ (__v4sf)(__m128)(Y), (__mmask8)(U), \ - (int)(R)); }) + (int)(R)) -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_fnmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, - -(__v4sf) __A, - -(__v4sf) __B, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_vfmaddss3_mask((__v4sf)__W, + -(__v4sf)__A, + -(__v4sf)__B, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); } -#define _mm_mask_fnmsub_round_ss(W, U, A, B, R) __extension__ ({\ +#define _mm_fnmsub_round_ss(A, B, C, R) \ + (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ + -(__v4sf)(__m128)(B), \ + -(__v4sf)(__m128)(C), (__mmask8)-1, \ + (int)(R)) + +#define _mm_mask_fnmsub_round_ss(W, U, A, B, R) \ (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ -(__v4sf)(__m128)(A), \ -(__v4sf)(__m128)(B), (__mmask8)(U), \ - (int)(R)); }) + (int)(R)) -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_fnmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_vfmaddss3_maskz (-(__v4sf) __A, - (__v4sf) __B, - -(__v4sf) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A, + -(__v4sf)__B, + -(__v4sf)__C, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); } -#define _mm_maskz_fnmsub_round_ss(U, A, B, C, R) __extension__ ({\ - (__m128)__builtin_ia32_vfmaddss3_maskz(-(__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ +#define _mm_maskz_fnmsub_round_ss(U, A, B, C, R) \ + (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \ + -(__v4sf)(__m128)(B), \ -(__v4sf)(__m128)(C), (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION); }) + (int)(R)) -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask3_fnmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) { - return (__m128) __builtin_ia32_vfnmsubss3_mask3 ((__v4sf) __W, - (__v4sf) __X, - (__v4sf) __Y, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_vfmsubss3_mask3((__v4sf)__W, + -(__v4sf)__X, + (__v4sf)__Y, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); } -#define _mm_mask3_fnmsub_round_ss(W, X, Y, U, R) __extension__({\ - (__m128)__builtin_ia32_vfnmsubss3_mask3((__v4sf)(__m128)(W), \ - (__v4sf)(__m128)(X), \ +#define _mm_mask3_fnmsub_round_ss(W, X, Y, U, R) \ + (__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \ + -(__v4sf)(__m128)(X), \ (__v4sf)(__m128)(Y), (__mmask8)(U), \ - (int)(R)); }) + (int)(R)) -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W, - (__v2df) __A, - (__v2df) __B, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_vfmaddsd3_mask((__v2df)__W, + (__v2df)__A, + (__v2df)__B, + 
(__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); } -#define _mm_mask_fmadd_round_sd(W, U, A, B, R) __extension__({\ +#define _mm_fmadd_round_sd(A, B, C, R) \ + (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(C), (__mmask8)-1, \ + (int)(R)) + +#define _mm_mask_fmadd_round_sd(W, U, A, B, R) \ (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ (__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), (__mmask8)(U), \ - (int)(R)); }) + (int)(R)) -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_fmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_vfmaddsd3_maskz ( (__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A, + (__v2df)__B, + (__v2df)__C, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); } -#define _mm_maskz_fmadd_round_sd(U, A, B, C, R) __extension__ ({\ +#define _mm_maskz_fmadd_round_sd(U, A, B, C, R) \ (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2df)(__m128d)(C), (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION); }) + (int)(R)) -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask3_fmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) { - return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W, - (__v2df) __X, - (__v2df) __Y, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_vfmaddsd3_mask3((__v2df)__W, + (__v2df)__X, + (__v2df)__Y, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); } -#define _mm_mask3_fmadd_round_sd(W, X, Y, U, R) __extension__ ({\ +#define _mm_mask3_fmadd_round_sd(W, X, Y, U, R) \ (__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \ (__v2df)(__m128d)(X), \ (__v2df)(__m128d)(Y), (__mmask8)(U), \ - (int)(R)); }) + (int)(R)) -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W, - (__v2df) __A, - -(__v2df) __B, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_vfmaddsd3_mask((__v2df)__W, + (__v2df)__A, + -(__v2df)__B, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); } -#define _mm_mask_fmsub_round_sd(W, U, A, B, R) __extension__ ({\ +#define _mm_fmsub_round_sd(A, B, C, R) \ + (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + -(__v2df)(__m128d)(C), (__mmask8)-1, \ + (int)(R)) + +#define _mm_mask_fmsub_round_sd(W, U, A, B, R) \ (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ (__v2df)(__m128d)(A), \ -(__v2df)(__m128d)(B), (__mmask8)(U), \ - (int)(R)); }) + (int)(R)) -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_fmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_vfmaddsd3_maskz ( (__v2df) __A, - (__v2df) __B, - -(__v2df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A, + (__v2df)__B, + -(__v2df)__C, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); } -#define _mm_maskz_fmsub_round_sd(U, A, B, C, R) __extension__ ({\ +#define _mm_maskz_fmsub_round_sd(U, A, B, C, R) \ (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ -(__v2df)(__m128d)(C), \ - (__mmask8)(U), 
(int)(R)); }) + (__mmask8)(U), (int)(R)) -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask3_fmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) { - return (__m128d) __builtin_ia32_vfmsubsd3_mask3 ((__v2df) __W, - (__v2df) __X, - (__v2df) __Y, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_vfmsubsd3_mask3((__v2df)__W, + (__v2df)__X, + (__v2df)__Y, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); } -#define _mm_mask3_fmsub_round_sd(W, X, Y, U, R) __extension__ ({\ +#define _mm_mask3_fmsub_round_sd(W, X, Y, U, R) \ (__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \ (__v2df)(__m128d)(X), \ (__v2df)(__m128d)(Y), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_fnmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) __W, - -(__v2df) __A, - (__v2df) __B, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_vfmaddsd3_mask((__v2df)__W, + -(__v2df)__A, + (__v2df)__B, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); } -#define _mm_mask_fnmadd_round_sd(W, U, A, B, R) __extension__ ({\ +#define _mm_fnmadd_round_sd(A, B, C, R) \ + (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ + -(__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(C), (__mmask8)-1, \ + (int)(R)) + +#define _mm_mask_fnmadd_round_sd(W, U, A, B, R) \ (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ -(__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), (__mmask8)(U), \ - (int)(R)); }) + (int)(R)) -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_fnmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_vfmaddsd3_maskz ( -(__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A, + -(__v2df)__B, + (__v2df)__C, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); } -#define _mm_maskz_fnmadd_round_sd(U, A, B, C, R) __extension__ ({\ - (__m128d)__builtin_ia32_vfmaddsd3_maskz(-(__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ +#define _mm_maskz_fnmadd_round_sd(U, A, B, C, R) \ + (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ + -(__v2df)(__m128d)(B), \ (__v2df)(__m128d)(C), (__mmask8)(U), \ - (int)(R)); }) + (int)(R)) -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask3_fnmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) { - return (__m128d) __builtin_ia32_vfmaddsd3_mask3 (-(__v2df) __W, - (__v2df) __X, - (__v2df) __Y, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_vfmaddsd3_mask3((__v2df)__W, + -(__v2df)__X, + (__v2df)__Y, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); } -#define _mm_mask3_fnmadd_round_sd(W, X, Y, U, R) __extension__({\ - (__m128d)__builtin_ia32_vfmaddsd3_mask3(-(__v2df)(__m128d)(W), \ - (__v2df)(__m128d)(X), \ +#define _mm_mask3_fnmadd_round_sd(W, X, Y, U, R) \ + (__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \ + -(__v2df)(__m128d)(X), \ (__v2df)(__m128d)(Y), (__mmask8)(U), \ - (int)(R)); }) + (int)(R)) -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_fnmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_vfmaddsd3_mask ( (__v2df) 
__W, - -(__v2df) __A, - -(__v2df) __B, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_vfmaddsd3_mask((__v2df)__W, + -(__v2df)__A, + -(__v2df)__B, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); } -#define _mm_mask_fnmsub_round_sd(W, U, A, B, R) __extension__ ({\ +#define _mm_fnmsub_round_sd(A, B, C, R) \ + (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ + -(__v2df)(__m128d)(B), \ + -(__v2df)(__m128d)(C), (__mmask8)-1, \ + (int)(R)) + +#define _mm_mask_fnmsub_round_sd(W, U, A, B, R) \ (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ -(__v2df)(__m128d)(A), \ -(__v2df)(__m128d)(B), (__mmask8)(U), \ - (int)(R)); }) + (int)(R)) -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_fnmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_vfmaddsd3_maskz ( -(__v2df) __A, - (__v2df) __B, - -(__v2df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A, + -(__v2df)__B, + -(__v2df)__C, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); } -#define _mm_maskz_fnmsub_round_sd(U, A, B, C, R) __extension__ ({\ - (__m128d)__builtin_ia32_vfmaddsd3_maskz(-(__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ +#define _mm_maskz_fnmsub_round_sd(U, A, B, C, R) \ + (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ + -(__v2df)(__m128d)(B), \ -(__v2df)(__m128d)(C), \ (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION); }) + (int)(R)) -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask3_fnmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) { - return (__m128d) __builtin_ia32_vfnmsubsd3_mask3 ((__v2df) (__W), - (__v2df) __X, - (__v2df) (__Y), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_vfmsubsd3_mask3((__v2df)__W, + -(__v2df)__X, + (__v2df)__Y, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); } -#define _mm_mask3_fnmsub_round_sd(W, X, Y, U, R) __extension__({\ - (__m128d)__builtin_ia32_vfnmsubsd3_mask3((__v2df)(__m128d)(W), \ - (__v2df)(__m128d)(X), \ +#define _mm_mask3_fnmsub_round_sd(W, X, Y, U, R) \ + (__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \ + -(__v2df)(__m128d)(X), \ (__v2df)(__m128d)(Y), \ - (__mmask8)(U), (int)(R)); }) - -#define _mm512_permutex_pd(X, C) __extension__ ({ \ - (__m512d)__builtin_shufflevector((__v8df)(__m512d)(X), \ - (__v8df)_mm512_undefined_pd(), \ - 0 + (((C) >> 0) & 0x3), \ - 0 + (((C) >> 2) & 0x3), \ - 0 + (((C) >> 4) & 0x3), \ - 0 + (((C) >> 6) & 0x3), \ - 4 + (((C) >> 0) & 0x3), \ - 4 + (((C) >> 2) & 0x3), \ - 4 + (((C) >> 4) & 0x3), \ - 4 + (((C) >> 6) & 0x3)); }) - -#define _mm512_mask_permutex_pd(W, U, X, C) __extension__ ({ \ + (__mmask8)(U), (int)(R)) + +#define _mm512_permutex_pd(X, C) \ + (__m512d)__builtin_ia32_permdf512((__v8df)(__m512d)(X), (int)(C)) + +#define _mm512_mask_permutex_pd(W, U, X, C) \ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_permutex_pd((X), (C)), \ - (__v8df)(__m512d)(W)); }) + (__v8df)(__m512d)(W)) -#define _mm512_maskz_permutex_pd(U, X, C) __extension__ ({ \ +#define _mm512_maskz_permutex_pd(U, X, C) \ (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ (__v8df)_mm512_permutex_pd((X), (C)), \ - (__v8df)_mm512_setzero_pd()); }) - -#define _mm512_permutex_epi64(X, C) __extension__ ({ \ - (__m512i)__builtin_shufflevector((__v8di)(__m512i)(X), \ - (__v8di)_mm512_undefined_epi32(), \ - 0 + (((C) >> 0) & 0x3), \ - 0 + (((C) >> 2) & 0x3), \ - 
0 + (((C) >> 4) & 0x3), \ - 0 + (((C) >> 6) & 0x3), \ - 4 + (((C) >> 0) & 0x3), \ - 4 + (((C) >> 2) & 0x3), \ - 4 + (((C) >> 4) & 0x3), \ - 4 + (((C) >> 6) & 0x3)); }) - -#define _mm512_mask_permutex_epi64(W, U, X, C) __extension__ ({ \ + (__v8df)_mm512_setzero_pd()) + +#define _mm512_permutex_epi64(X, C) \ + (__m512i)__builtin_ia32_permdi512((__v8di)(__m512i)(X), (int)(C)) + +#define _mm512_mask_permutex_epi64(W, U, X, C) \ (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ (__v8di)_mm512_permutex_epi64((X), (C)), \ - (__v8di)(__m512i)(W)); }) + (__v8di)(__m512i)(W)) -#define _mm512_maskz_permutex_epi64(U, X, C) __extension__ ({ \ +#define _mm512_maskz_permutex_epi64(U, X, C) \ (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ (__v8di)_mm512_permutex_epi64((X), (C)), \ - (__v8di)_mm512_setzero_si512()); }) + (__v8di)_mm512_setzero_si512()) -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_permutexvar_pd (__m512i __X, __m512d __Y) { - return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y, - (__v8di) __X, - (__v8df) _mm512_undefined_pd (), - (__mmask8) -1); + return (__m512d)__builtin_ia32_permvardf512((__v8df) __Y, (__v8di) __X); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_permutexvar_pd (__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y) { - return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y, - (__v8di) __X, - (__v8df) __W, - (__mmask8) __U); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_permutexvar_pd(__X, __Y), + (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_permutexvar_pd (__mmask8 __U, __m512i __X, __m512d __Y) { - return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y, - (__v8di) __X, - (__v8df) _mm512_setzero_pd (), - (__mmask8) __U); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_permutexvar_pd(__X, __Y), + (__v8df)_mm512_setzero_pd()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_permutexvar_epi64 (__mmask8 __M, __m512i __X, __m512i __Y) +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_permutexvar_epi64 (__m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y, - (__v8di) __X, - (__v8di) _mm512_setzero_si512 (), - __M); + return (__m512i)__builtin_ia32_permvardi512((__v8di)__Y, (__v8di)__X); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_permutexvar_epi64 (__m512i __X, __m512i __Y) +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_permutexvar_epi64 (__mmask8 __M, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y, - (__v8di) __X, - (__v8di) _mm512_undefined_epi32 (), - (__mmask8) -1); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_permutexvar_epi64(__X, __Y), + (__v8di)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_permutexvar_epi64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y, - (__v8di) __X, - (__v8di) __W, - __M); + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_permutexvar_epi64(__X, __Y), + (__v8di)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_permutexvar_ps (__m512i __X, __m512 
__Y) { - return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y, - (__v16si) __X, - (__v16sf) _mm512_undefined_ps (), - (__mmask16) -1); + return (__m512)__builtin_ia32_permvarsf512((__v16sf)__Y, (__v16si)__X); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_permutexvar_ps (__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y) { - return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y, - (__v16si) __X, - (__v16sf) __W, - (__mmask16) __U); + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_permutexvar_ps(__X, __Y), + (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_permutexvar_ps (__mmask16 __U, __m512i __X, __m512 __Y) { - return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y, - (__v16si) __X, - (__v16sf) _mm512_setzero_ps (), - (__mmask16) __U); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_permutexvar_epi32 (__mmask16 __M, __m512i __X, __m512i __Y) -{ - return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y, - (__v16si) __X, - (__v16si) _mm512_setzero_si512 (), - __M); + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_permutexvar_ps(__X, __Y), + (__v16sf)_mm512_setzero_ps()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_permutexvar_epi32 (__m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y, - (__v16si) __X, - (__v16si) _mm512_undefined_epi32 (), - (__mmask16) -1); + return (__m512i)__builtin_ia32_permvarsi512((__v16si)__Y, (__v16si)__X); } #define _mm512_permutevar_epi32 _mm512_permutexvar_epi32 -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_permutexvar_epi32 (__mmask16 __M, __m512i __X, __m512i __Y) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_permutexvar_epi32(__X, __Y), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y, - (__v16si) __X, - (__v16si) __W, - __M); + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_permutexvar_epi32(__X, __Y), + (__v16si)__W); } #define _mm512_mask_permutevar_epi32 _mm512_mask_permutexvar_epi32 -static __inline__ __mmask16 __DEFAULT_FN_ATTRS +static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 _mm512_kand (__mmask16 __A, __mmask16 __B) { return (__mmask16) __builtin_ia32_kandhi ((__mmask16) __A, (__mmask16) __B); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS +static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 _mm512_kandn (__mmask16 __A, __mmask16 __B) { return (__mmask16) __builtin_ia32_kandnhi ((__mmask16) __A, (__mmask16) __B); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS +static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 _mm512_kor (__mmask16 __A, __mmask16 __B) { return (__mmask16) __builtin_ia32_korhi ((__mmask16) __A, (__mmask16) __B); } -static __inline__ int __DEFAULT_FN_ATTRS +static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_kortestc (__mmask16 __A, __mmask16 __B) { return __builtin_ia32_kortestchi ((__mmask16) __A, (__mmask16) __B); } -static __inline__ int __DEFAULT_FN_ATTRS +static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_kortestz (__mmask16 __A, __mmask16 __B) { return 
__builtin_ia32_kortestzhi ((__mmask16) __A, (__mmask16) __B); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS +static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 _mm512_kunpackb (__mmask16 __A, __mmask16 __B) { - return (__mmask16) (( __A & 0xFF) | ( __B << 8)); + return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS +static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 _mm512_kxnor (__mmask16 __A, __mmask16 __B) { return (__mmask16) __builtin_ia32_kxnorhi ((__mmask16) __A, (__mmask16) __B); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS +static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 _mm512_kxor (__mmask16 __A, __mmask16 __B) { return (__mmask16) __builtin_ia32_kxorhi ((__mmask16) __A, (__mmask16) __B); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_stream_si512 (__m512i * __P, __m512i __A) { typedef __v8di __v8di_aligned __attribute__((aligned(64))); __builtin_nontemporal_store((__v8di_aligned)__A, (__v8di_aligned*)__P); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_stream_load_si512 (void const *__P) { typedef __v8di __v8di_aligned __attribute__((aligned(64))); return (__m512i) __builtin_nontemporal_load((const __v8di_aligned *)__P); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_stream_pd (double *__P, __m512d __A) { typedef __v8df __v8df_aligned __attribute__((aligned(64))); __builtin_nontemporal_store((__v8df_aligned)__A, (__v8df_aligned*)__P); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_stream_ps (float *__P, __m512 __A) { typedef __v16sf __v16sf_aligned __attribute__((aligned(64))); __builtin_nontemporal_store((__v16sf_aligned)__A, (__v16sf_aligned*)__P); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_compress_pd (__m512d __W, __mmask8 __U, __m512d __A) { return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A, @@ -8838,7 +8404,7 @@ _mm512_mask_compress_pd (__m512d __W, __mmask8 __U, __m512d __A) (__mmask8) __U); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_compress_pd (__mmask8 __U, __m512d __A) { return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A, @@ -8847,7 +8413,7 @@ _mm512_maskz_compress_pd (__mmask8 __U, __m512d __A) (__mmask8) __U); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_compress_epi64 (__m512i __W, __mmask8 __U, __m512i __A) { return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A, @@ -8855,7 +8421,7 @@ _mm512_mask_compress_epi64 (__m512i __W, __mmask8 __U, __m512i __A) (__mmask8) __U); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_compress_epi64 (__mmask8 __U, __m512i __A) { return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A, @@ -8864,7 +8430,7 @@ _mm512_maskz_compress_epi64 (__mmask8 __U, __m512i __A) (__mmask8) __U); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_compress_ps (__m512 __W, __mmask16 __U, __m512 __A) { return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A, @@ -8872,7 +8438,7 @@ _mm512_mask_compress_ps (__m512 __W, __mmask16 __U, __m512 __A) (__mmask16) __U); } -static __inline__ __m512 
__DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_compress_ps (__mmask16 __U, __m512 __A) { return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A, @@ -8881,7 +8447,7 @@ _mm512_maskz_compress_ps (__mmask16 __U, __m512 __A) (__mmask16) __U); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_compress_epi32 (__m512i __W, __mmask16 __U, __m512i __A) { return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A, @@ -8889,7 +8455,7 @@ _mm512_mask_compress_epi32 (__m512i __W, __mmask16 __U, __m512i __A) (__mmask16) __U); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_compress_epi32 (__mmask16 __U, __m512i __A) { return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A, @@ -8898,116 +8464,116 @@ _mm512_maskz_compress_epi32 (__mmask16 __U, __m512i __A) (__mmask16) __U); } -#define _mm_cmp_round_ss_mask(X, Y, P, R) __extension__ ({ \ +#define _mm_cmp_round_ss_mask(X, Y, P, R) \ (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ (__v4sf)(__m128)(Y), (int)(P), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm_mask_cmp_round_ss_mask(M, X, Y, P, R) __extension__ ({ \ +#define _mm_mask_cmp_round_ss_mask(M, X, Y, P, R) \ (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ (__v4sf)(__m128)(Y), (int)(P), \ - (__mmask8)(M), (int)(R)); }) + (__mmask8)(M), (int)(R)) -#define _mm_cmp_ss_mask(X, Y, P) __extension__ ({ \ +#define _mm_cmp_ss_mask(X, Y, P) \ (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ (__v4sf)(__m128)(Y), (int)(P), \ (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm_mask_cmp_ss_mask(M, X, Y, P) __extension__ ({ \ +#define _mm_mask_cmp_ss_mask(M, X, Y, P) \ (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ (__v4sf)(__m128)(Y), (int)(P), \ (__mmask8)(M), \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm_cmp_round_sd_mask(X, Y, P, R) __extension__ ({ \ +#define _mm_cmp_round_sd_mask(X, Y, P, R) \ (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ (__v2df)(__m128d)(Y), (int)(P), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm_mask_cmp_round_sd_mask(M, X, Y, P, R) __extension__ ({ \ +#define _mm_mask_cmp_round_sd_mask(M, X, Y, P, R) \ (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ (__v2df)(__m128d)(Y), (int)(P), \ - (__mmask8)(M), (int)(R)); }) + (__mmask8)(M), (int)(R)) -#define _mm_cmp_sd_mask(X, Y, P) __extension__ ({ \ +#define _mm_cmp_sd_mask(X, Y, P) \ (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ (__v2df)(__m128d)(Y), (int)(P), \ (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) -#define _mm_mask_cmp_sd_mask(M, X, Y, P) __extension__ ({ \ +#define _mm_mask_cmp_sd_mask(M, X, Y, P) \ (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ (__v2df)(__m128d)(Y), (int)(P), \ (__mmask8)(M), \ - _MM_FROUND_CUR_DIRECTION); }) + _MM_FROUND_CUR_DIRECTION) /* Bit Test */ -static __inline __mmask16 __DEFAULT_FN_ATTRS +static __inline __mmask16 __DEFAULT_FN_ATTRS512 _mm512_test_epi32_mask (__m512i __A, __m512i __B) { return _mm512_cmpneq_epi32_mask (_mm512_and_epi32(__A, __B), - _mm512_setzero_epi32()); + _mm512_setzero_si512()); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS +static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 _mm512_mask_test_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B) { return 
_mm512_mask_cmpneq_epi32_mask (__U, _mm512_and_epi32 (__A, __B), - _mm512_setzero_epi32()); + _mm512_setzero_si512()); } -static __inline __mmask8 __DEFAULT_FN_ATTRS +static __inline __mmask8 __DEFAULT_FN_ATTRS512 _mm512_test_epi64_mask (__m512i __A, __m512i __B) { return _mm512_cmpneq_epi64_mask (_mm512_and_epi32 (__A, __B), - _mm512_setzero_epi32()); + _mm512_setzero_si512()); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS +static __inline__ __mmask8 __DEFAULT_FN_ATTRS512 _mm512_mask_test_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B) { return _mm512_mask_cmpneq_epi64_mask (__U, _mm512_and_epi32 (__A, __B), - _mm512_setzero_epi32()); + _mm512_setzero_si512()); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS +static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 _mm512_testn_epi32_mask (__m512i __A, __m512i __B) { return _mm512_cmpeq_epi32_mask (_mm512_and_epi32 (__A, __B), - _mm512_setzero_epi32()); + _mm512_setzero_si512()); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS +static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 _mm512_mask_testn_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B) { return _mm512_mask_cmpeq_epi32_mask (__U, _mm512_and_epi32 (__A, __B), - _mm512_setzero_epi32()); + _mm512_setzero_si512()); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS +static __inline__ __mmask8 __DEFAULT_FN_ATTRS512 _mm512_testn_epi64_mask (__m512i __A, __m512i __B) { return _mm512_cmpeq_epi64_mask (_mm512_and_epi32 (__A, __B), - _mm512_setzero_epi32()); + _mm512_setzero_si512()); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS +static __inline__ __mmask8 __DEFAULT_FN_ATTRS512 _mm512_mask_testn_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B) { return _mm512_mask_cmpeq_epi64_mask (__U, _mm512_and_epi32 (__A, __B), - _mm512_setzero_epi32()); + _mm512_setzero_si512()); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_movehdup_ps (__m512 __A) { return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A, 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_movehdup_ps (__m512 __W, __mmask16 __U, __m512 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, @@ -9015,7 +8581,7 @@ _mm512_mask_movehdup_ps (__m512 __W, __mmask16 __U, __m512 __A) (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, @@ -9023,14 +8589,14 @@ _mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A) (__v16sf)_mm512_setzero_ps()); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_moveldup_ps (__m512 __A) { return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A, 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_moveldup_ps (__m512 __W, __mmask16 __U, __m512 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, @@ -9038,7 +8604,7 @@ _mm512_mask_moveldup_ps (__m512 __W, __mmask16 __U, __m512 __A) (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A) { return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, @@ -9046,132 +8612,94 @@ _mm512_maskz_moveldup_ps (__mmask16 __U, __m512 
__A) (__v16sf)_mm512_setzero_ps()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - __m128 res = __A; - res[0] = (__U & 1) ? __B[0] : __W[0]; - return res; + return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B), __W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B) { - __m128 res = __A; - res[0] = (__U & 1) ? __B[0] : 0; - return res; + return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B), + _mm_setzero_ps()); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - __m128d res = __A; - res[0] = (__U & 1) ? __B[0] : __W[0]; - return res; + return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B), __W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B) { - __m128d res = __A; - res[0] = (__U & 1) ? __B[0] : 0; - return res; + return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B), + _mm_setzero_pd()); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_store_ss (float * __W, __mmask8 __U, __m128 __A) { - __builtin_ia32_storess128_mask ((__v16sf *)__W, - (__v16sf) _mm512_castps128_ps512(__A), - (__mmask16) __U & (__mmask16)1); + __builtin_ia32_storess128_mask ((__v4sf *)__W, __A, __U & 1); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_store_sd (double * __W, __mmask8 __U, __m128d __A) { - __builtin_ia32_storesd128_mask ((__v8df *)__W, - (__v8df) _mm512_castpd128_pd512(__A), - (__mmask8) __U & 1); + __builtin_ia32_storesd128_mask ((__v2df *)__W, __A, __U & 1); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_load_ss (__m128 __W, __mmask8 __U, const float* __A) { __m128 src = (__v4sf) __builtin_shufflevector((__v4sf) __W, - (__v4sf) {0.0, 0.0, 0.0, 0.0}, + (__v4sf)_mm_setzero_ps(), 0, 4, 4, 4); - return (__m128) __builtin_shufflevector( - __builtin_ia32_loadss128_mask ((__v16sf *) __A, - (__v16sf) _mm512_castps128_ps512(src), - (__mmask16) __U & 1), - _mm512_undefined_ps(), 0, 1, 2, 3); + return (__m128) __builtin_ia32_loadss128_mask ((__v4sf *) __A, src, __U & 1); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_load_ss (__mmask8 __U, const float* __A) { - return (__m128) __builtin_shufflevector( - __builtin_ia32_loadss128_mask ((__v16sf *) __A, - (__v16sf) _mm512_setzero_ps(), - (__mmask16) __U & 1), - _mm512_undefined_ps(), 0, 1, 2, 3); + return (__m128)__builtin_ia32_loadss128_mask ((__v4sf *) __A, + (__v4sf) _mm_setzero_ps(), + __U & 1); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_load_sd (__m128d __W, __mmask8 __U, const double* __A) { __m128d src = (__v2df) __builtin_shufflevector((__v2df) __W, - (__v2df) {0.0, 0.0}, 0, 2); + (__v2df)_mm_setzero_pd(), + 0, 2); - return (__m128d) __builtin_shufflevector( - __builtin_ia32_loadsd128_mask ((__v8df *) __A, - (__v8df) _mm512_castpd128_pd512(src), - (__mmask8) __U & 1), - _mm512_undefined_pd(), 0, 1); + return (__m128d) __builtin_ia32_loadsd128_mask ((__v2df *) __A, src, __U & 1); } -static __inline__ __m128d 
__DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_load_sd (__mmask8 __U, const double* __A) { - return (__m128d) __builtin_shufflevector( - __builtin_ia32_loadsd128_mask ((__v8df *) __A, - (__v8df) _mm512_setzero_pd(), - (__mmask8) __U & 1), - _mm512_undefined_pd(), 0, 1); -} - -#define _mm512_shuffle_epi32(A, I) __extension__ ({ \ - (__m512i)__builtin_shufflevector((__v16si)(__m512i)(A), \ - (__v16si)_mm512_undefined_epi32(), \ - 0 + (((I) >> 0) & 0x3), \ - 0 + (((I) >> 2) & 0x3), \ - 0 + (((I) >> 4) & 0x3), \ - 0 + (((I) >> 6) & 0x3), \ - 4 + (((I) >> 0) & 0x3), \ - 4 + (((I) >> 2) & 0x3), \ - 4 + (((I) >> 4) & 0x3), \ - 4 + (((I) >> 6) & 0x3), \ - 8 + (((I) >> 0) & 0x3), \ - 8 + (((I) >> 2) & 0x3), \ - 8 + (((I) >> 4) & 0x3), \ - 8 + (((I) >> 6) & 0x3), \ - 12 + (((I) >> 0) & 0x3), \ - 12 + (((I) >> 2) & 0x3), \ - 12 + (((I) >> 4) & 0x3), \ - 12 + (((I) >> 6) & 0x3)); }) - -#define _mm512_mask_shuffle_epi32(W, U, A, I) __extension__ ({ \ + return (__m128d) __builtin_ia32_loadsd128_mask ((__v2df *) __A, + (__v2df) _mm_setzero_pd(), + __U & 1); +} + +#define _mm512_shuffle_epi32(A, I) \ + (__m512i)__builtin_ia32_pshufd512((__v16si)(__m512i)(A), (int)(I)) + +#define _mm512_mask_shuffle_epi32(W, U, A, I) \ (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ (__v16si)_mm512_shuffle_epi32((A), (I)), \ - (__v16si)(__m512i)(W)); }) + (__v16si)(__m512i)(W)) -#define _mm512_maskz_shuffle_epi32(U, A, I) __extension__ ({ \ +#define _mm512_maskz_shuffle_epi32(U, A, I) \ (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ (__v16si)_mm512_shuffle_epi32((A), (I)), \ - (__v16si)_mm512_setzero_si512()); }) + (__v16si)_mm512_setzero_si512()) -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_expand_pd (__m512d __W, __mmask8 __U, __m512d __A) { return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A, @@ -9179,7 +8707,7 @@ _mm512_mask_expand_pd (__m512d __W, __mmask8 __U, __m512d __A) (__mmask8) __U); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_expand_pd (__mmask8 __U, __m512d __A) { return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A, @@ -9187,7 +8715,7 @@ _mm512_maskz_expand_pd (__mmask8 __U, __m512d __A) (__mmask8) __U); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_expand_epi64 (__m512i __W, __mmask8 __U, __m512i __A) { return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A, @@ -9195,15 +8723,15 @@ _mm512_mask_expand_epi64 (__m512i __W, __mmask8 __U, __m512i __A) (__mmask8) __U); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_expand_epi64 ( __mmask8 __U, __m512i __A) { return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A, - (__v8di) _mm512_setzero_pd (), + (__v8di) _mm512_setzero_si512 (), (__mmask8) __U); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_expandloadu_pd(__m512d __W, __mmask8 __U, void const *__P) { return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P, @@ -9211,7 +8739,7 @@ _mm512_mask_expandloadu_pd(__m512d __W, __mmask8 __U, void const *__P) (__mmask8) __U); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_expandloadu_pd(__mmask8 __U, void const *__P) { return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P, @@ -9219,7 
+8747,7 @@ _mm512_maskz_expandloadu_pd(__mmask8 __U, void const *__P) (__mmask8) __U); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_expandloadu_epi64(__m512i __W, __mmask8 __U, void const *__P) { return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P, @@ -9227,15 +8755,15 @@ _mm512_mask_expandloadu_epi64(__m512i __W, __mmask8 __U, void const *__P) (__mmask8) __U); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_expandloadu_epi64(__mmask8 __U, void const *__P) { return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P, - (__v8di) _mm512_setzero_pd(), + (__v8di) _mm512_setzero_si512(), (__mmask8) __U); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_expandloadu_ps(__m512 __W, __mmask16 __U, void const *__P) { return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P, @@ -9243,7 +8771,7 @@ _mm512_mask_expandloadu_ps(__m512 __W, __mmask16 __U, void const *__P) (__mmask16) __U); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_expandloadu_ps(__mmask16 __U, void const *__P) { return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P, @@ -9251,7 +8779,7 @@ _mm512_maskz_expandloadu_ps(__mmask16 __U, void const *__P) (__mmask16) __U); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_expandloadu_epi32(__m512i __W, __mmask16 __U, void const *__P) { return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P, @@ -9259,15 +8787,15 @@ _mm512_mask_expandloadu_epi32(__m512i __W, __mmask16 __U, void const *__P) (__mmask16) __U); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_expandloadu_epi32(__mmask16 __U, void const *__P) { return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P, - (__v16si) _mm512_setzero_ps(), + (__v16si) _mm512_setzero_si512(), (__mmask16) __U); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_expand_ps (__m512 __W, __mmask16 __U, __m512 __A) { return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A, @@ -9275,7 +8803,7 @@ _mm512_mask_expand_ps (__m512 __W, __mmask16 __U, __m512 __A) (__mmask16) __U); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_expand_ps (__mmask16 __U, __m512 __A) { return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A, @@ -9283,7 +8811,7 @@ _mm512_maskz_expand_ps (__mmask16 __U, __m512 __A) (__mmask16) __U); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_expand_epi32 (__m512i __W, __mmask16 __U, __m512i __A) { return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A, @@ -9291,71 +8819,64 @@ _mm512_mask_expand_epi32 (__m512i __W, __mmask16 __U, __m512i __A) (__mmask16) __U); } -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_maskz_expand_epi32 (__mmask16 __U, __m512i __A) { return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A, - (__v16si) _mm512_setzero_ps(), + (__v16si) _mm512_setzero_si512(), (__mmask16) __U); } -#define _mm512_cvt_roundps_pd(A, R) __extension__ ({ \ +#define _mm512_cvt_roundps_pd(A, R) \ 
(__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \ (__v8df)_mm512_undefined_pd(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm512_mask_cvt_roundps_pd(W, U, A, R) __extension__ ({ \ +#define _mm512_mask_cvt_roundps_pd(W, U, A, R) \ (__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \ (__v8df)(__m512d)(W), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm512_maskz_cvt_roundps_pd(U, A, R) __extension__ ({ \ +#define _mm512_maskz_cvt_roundps_pd(U, A, R) \ (__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \ (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_cvtps_pd (__m256 __A) { - return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A, - (__v8df) - _mm512_undefined_pd (), - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); + return (__m512d) __builtin_convertvector((__v8sf)__A, __v8df); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_cvtps_pd (__m512d __W, __mmask8 __U, __m256 __A) { - return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A, - (__v8df) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_cvtps_pd(__A), + (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_cvtps_pd (__mmask8 __U, __m256 __A) { - return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_cvtps_pd(__A), + (__v8df)_mm512_setzero_pd()); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_cvtpslo_pd (__m512 __A) { - return (__m512) _mm512_cvtps_pd(_mm512_castps512_ps256(__A)); + return (__m512d) _mm512_cvtps_pd(_mm512_castps512_ps256(__A)); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_cvtpslo_pd (__m512d __W, __mmask8 __U, __m512 __A) { - return (__m512) _mm512_mask_cvtps_pd(__W, __U, _mm512_castps512_ps256(__A)); + return (__m512d) _mm512_mask_cvtps_pd(__W, __U, _mm512_castps512_ps256(__A)); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_mov_pd (__m512d __W, __mmask8 __U, __m512d __A) { return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U, @@ -9363,7 +8884,7 @@ _mm512_mask_mov_pd (__m512d __W, __mmask8 __U, __m512d __A) (__v8df) __W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_maskz_mov_pd (__mmask8 __U, __m512d __A) { return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U, @@ -9371,7 +8892,7 @@ _mm512_maskz_mov_pd (__mmask8 __U, __m512d __A) (__v8df) _mm512_setzero_pd ()); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_mov_ps (__m512 __W, __mmask16 __U, __m512 __A) { return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U, @@ -9379,7 +8900,7 @@ _mm512_mask_mov_ps (__m512 __W, __mmask16 __U, __m512 __A) (__v16sf) __W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_maskz_mov_ps (__mmask16 __U, __m512 __A) { return (__m512) __builtin_ia32_selectps_512 
((__mmask16) __U, @@ -9387,68 +8908,68 @@ _mm512_maskz_mov_ps (__mmask16 __U, __m512 __A) (__v16sf) _mm512_setzero_ps ()); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m512d __A) { __builtin_ia32_compressstoredf512_mask ((__v8df *) __P, (__v8df) __A, (__mmask8) __U); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m512i __A) { __builtin_ia32_compressstoredi512_mask ((__v8di *) __P, (__v8di) __A, (__mmask8) __U); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_mask_compressstoreu_ps (void *__P, __mmask16 __U, __m512 __A) { __builtin_ia32_compressstoresf512_mask ((__v16sf *) __P, (__v16sf) __A, (__mmask16) __U); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_mask_compressstoreu_epi32 (void *__P, __mmask16 __U, __m512i __A) { __builtin_ia32_compressstoresi512_mask ((__v16si *) __P, (__v16si) __A, (__mmask16) __U); } -#define _mm_cvt_roundsd_ss(A, B, R) __extension__ ({ \ +#define _mm_cvt_roundsd_ss(A, B, R) \ (__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \ (__v2df)(__m128d)(B), \ (__v4sf)_mm_undefined_ps(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm_mask_cvt_roundsd_ss(W, U, A, B, R) __extension__ ({ \ +#define _mm_mask_cvt_roundsd_ss(W, U, A, B, R) \ (__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \ (__v2df)(__m128d)(B), \ (__v4sf)(__m128)(W), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm_maskz_cvt_roundsd_ss(U, A, B, R) __extension__ ({ \ +#define _mm_maskz_cvt_roundsd_ss(U, A, B, R) \ (__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \ (__v2df)(__m128d)(B), \ (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtsd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128d __B) { - return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)(__A), - (__v2df)(__B), - (__v4sf)(__W), - (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)__A, + (__v2df)__B, + (__v4sf)__W, + (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B) { - return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)(__A), - (__v2df)(__B), + return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)__A, + (__v2df)__B, (__v4sf)_mm_setzero_ps(), - (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION); + (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } #define _mm_cvtss_i32 _mm_cvtss_si32 @@ -9463,111 +8984,112 @@ _mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B) #endif #ifdef __x86_64__ -#define _mm_cvt_roundi64_sd(A, B, R) __extension__ ({ \ +#define _mm_cvt_roundi64_sd(A, B, R) \ (__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \ - (int)(R)); }) + (int)(R)) -#define _mm_cvt_roundsi64_sd(A, B, R) __extension__ ({ \ +#define _mm_cvt_roundsi64_sd(A, B, R) \ (__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \ - (int)(R)); }) + (int)(R)) #endif -#define _mm_cvt_roundsi32_ss(A, B, R) __extension__ ({ \ - (__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)); }) +#define 
_mm_cvt_roundsi32_ss(A, B, R) \ + (__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)) -#define _mm_cvt_roundi32_ss(A, B, R) __extension__ ({ \ - (__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)); }) +#define _mm_cvt_roundi32_ss(A, B, R) \ + (__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)) #ifdef __x86_64__ -#define _mm_cvt_roundsi64_ss(A, B, R) __extension__ ({ \ +#define _mm_cvt_roundsi64_ss(A, B, R) \ (__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \ - (int)(R)); }) + (int)(R)) -#define _mm_cvt_roundi64_ss(A, B, R) __extension__ ({ \ +#define _mm_cvt_roundi64_ss(A, B, R) \ (__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \ - (int)(R)); }) + (int)(R)) #endif -#define _mm_cvt_roundss_sd(A, B, R) __extension__ ({ \ +#define _mm_cvt_roundss_sd(A, B, R) \ (__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \ (__v4sf)(__m128)(B), \ (__v2df)_mm_undefined_pd(), \ - (__mmask8)-1, (int)(R)); }) + (__mmask8)-1, (int)(R)) -#define _mm_mask_cvt_roundss_sd(W, U, A, B, R) __extension__ ({ \ +#define _mm_mask_cvt_roundss_sd(W, U, A, B, R) \ (__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \ (__v4sf)(__m128)(B), \ (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -#define _mm_maskz_cvt_roundss_sd(U, A, B, R) __extension__ ({ \ +#define _mm_maskz_cvt_roundss_sd(U, A, B, R) \ (__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \ (__v4sf)(__m128)(B), \ (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R)); }) + (__mmask8)(U), (int)(R)) -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtss_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128 __B) { - return __builtin_ia32_cvtss2sd_round_mask((__v2df)(__A), - (__v4sf)(__B), - (__v2df)(__W), - (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_cvtss2sd_round_mask((__v2df)__A, + (__v4sf)__B, + (__v2df)__W, + (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_cvtss_sd (__mmask8 __U, __m128d __A, __m128 __B) { - return __builtin_ia32_cvtss2sd_round_mask((__v2df)(__A), - (__v4sf)(__B), - (__v2df)_mm_setzero_pd(), - (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION); + return __builtin_ia32_cvtss2sd_round_mask((__v2df)__A, + (__v4sf)__B, + (__v2df)_mm_setzero_pd(), + (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtu32_sd (__m128d __A, unsigned __B) { - return (__m128d) __builtin_ia32_cvtusi2sd32 ((__v2df) __A, __B); + __A[0] = __B; + return __A; } #ifdef __x86_64__ -#define _mm_cvt_roundu64_sd(A, B, R) __extension__ ({ \ +#define _mm_cvt_roundu64_sd(A, B, R) \ (__m128d)__builtin_ia32_cvtusi2sd64((__v2df)(__m128d)(A), \ - (unsigned long long)(B), (int)(R)); }) + (unsigned long long)(B), (int)(R)) -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtu64_sd (__m128d __A, unsigned long long __B) { - return (__m128d) __builtin_ia32_cvtusi2sd64 ((__v2df) __A, __B, - _MM_FROUND_CUR_DIRECTION); + __A[0] = __B; + return __A; } #endif -#define _mm_cvt_roundu32_ss(A, B, R) __extension__ ({ \ +#define _mm_cvt_roundu32_ss(A, B, R) \ (__m128)__builtin_ia32_cvtusi2ss32((__v4sf)(__m128)(A), (unsigned int)(B), \ - (int)(R)); }) + (int)(R)) -static __inline__ __m128 __DEFAULT_FN_ATTRS 
+static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtu32_ss (__m128 __A, unsigned __B) { - return (__m128) __builtin_ia32_cvtusi2ss32 ((__v4sf) __A, __B, - _MM_FROUND_CUR_DIRECTION); + __A[0] = __B; + return __A; } #ifdef __x86_64__ -#define _mm_cvt_roundu64_ss(A, B, R) __extension__ ({ \ +#define _mm_cvt_roundu64_ss(A, B, R) \ (__m128)__builtin_ia32_cvtusi2ss64((__v4sf)(__m128)(A), \ - (unsigned long long)(B), (int)(R)); }) + (unsigned long long)(B), (int)(R)) -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtu64_ss (__m128 __A, unsigned long long __B) { - return (__m128) __builtin_ia32_cvtusi2ss64 ((__v4sf) __A, __B, - _MM_FROUND_CUR_DIRECTION); + __A[0] = __B; + return __A; } #endif -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A) { return (__m512i) __builtin_ia32_selectd_512(__M, @@ -9575,17 +9097,15 @@ _mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A) (__v16si) __O); } -#ifdef __x86_64__ -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A) { return (__m512i) __builtin_ia32_selectq_512(__M, (__v8di) _mm512_set1_epi64(__A), (__v8di) __O); } -#endif -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_set_epi8 (char __e63, char __e62, char __e61, char __e60, char __e59, char __e58, char __e57, char __e56, char __e55, char __e54, char __e53, char __e52, char __e51, char __e50, char __e49, char __e48, char __e47, @@ -9609,7 +9129,7 @@ _mm512_set_epi8 (char __e63, char __e62, char __e61, char __e60, char __e59, __e56, __e57, __e58, __e59, __e60, __e61, __e62, __e63}; } -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_set_epi16(short __e31, short __e30, short __e29, short __e28, short __e27, short __e26, short __e25, short __e24, short __e23, short __e22, short __e21, short __e20, short __e19, short __e18, @@ -9624,7 +9144,7 @@ _mm512_set_epi16(short __e31, short __e30, short __e29, short __e28, __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31 }; } -static __inline __m512i __DEFAULT_FN_ATTRS +static __inline __m512i __DEFAULT_FN_ATTRS512 _mm512_set_epi32 (int __A, int __B, int __C, int __D, int __E, int __F, int __G, int __H, int __I, int __J, int __K, int __L, @@ -9640,7 +9160,7 @@ _mm512_set_epi32 (int __A, int __B, int __C, int __D, _mm512_set_epi32((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6), \ (e5),(e4),(e3),(e2),(e1),(e0)) -static __inline__ __m512i __DEFAULT_FN_ATTRS +static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_set_epi64 (long long __A, long long __B, long long __C, long long __D, long long __E, long long __F, long long __G, long long __H) @@ -9652,7 +9172,7 @@ _mm512_set_epi64 (long long __A, long long __B, long long __C, #define _mm512_setr_epi64(e0,e1,e2,e3,e4,e5,e6,e7) \ _mm512_set_epi64((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0)) -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_set_pd (double __A, double __B, double __C, double __D, double __E, double __F, double __G, double __H) { @@ -9663,7 +9183,7 @@ _mm512_set_pd (double __A, double __B, double __C, double __D, #define _mm512_setr_pd(e0,e1,e2,e3,e4,e5,e6,e7) \ _mm512_set_pd((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0)) -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 
__DEFAULT_FN_ATTRS512 _mm512_set_ps (float __A, float __B, float __C, float __D, float __E, float __F, float __G, float __H, float __I, float __J, float __K, float __L, @@ -9678,556 +9198,401 @@ _mm512_set_ps (float __A, float __B, float __C, float __D, _mm512_set_ps((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6),(e5), \ (e4),(e3),(e2),(e1),(e0)) -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_abs_ps(__m512 __A) { return (__m512)_mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ; } -static __inline__ __m512 __DEFAULT_FN_ATTRS +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_mask_abs_ps(__m512 __W, __mmask16 __K, __m512 __A) { return (__m512)_mm512_mask_and_epi32((__m512i)__W, __K, _mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ; } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_abs_pd(__m512d __A) { return (__m512d)_mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A) ; } -static __inline__ __m512d __DEFAULT_FN_ATTRS +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A) { return (__m512d)_mm512_mask_and_epi64((__v8di)__W, __K, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A); } -// Vector-reduction arithmetic accepts vectors as inputs and produces scalars as -// outputs. This class of vector operation forms the basis of many scientific -// computations. In vector-reduction arithmetic, the evaluation off is -// independent of the order of the input elements of V. - -// Used bisection method. At each step, we partition the vector with previous -// step in half, and the operation is performed on its two halves. -// This takes log2(n) steps where n is the number of elements in the vector. - -// Vec512 - Vector with size 512. -// Operator - Can be one of following: +,*,&,| -// T2 - Can get 'i' for int and 'f' for float. -// T1 - Can get 'i' for int and 'd' for double. 
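The bisection scheme described in the reduction comments in this hunk can be modelled without any AVX-512 intrinsics; the sketch below is only an illustration of the idea (the helper name reduce_add_i64_bisect and the fixed width of 8 lanes are assumptions, not part of the header):

#include <stdint.h>

/* Model of the log2(n) bisection reduction: at each step the vector is
 * split in half and the two halves are combined element-wise, so eight
 * 64-bit lanes need three steps. */
static int64_t reduce_add_i64_bisect(const int64_t v[8]) {
  int64_t t4[4], t2[2];
  for (int i = 0; i < 4; ++i)   /* 8 lanes -> 4 lanes */
    t4[i] = v[i] + v[i + 4];
  for (int i = 0; i < 2; ++i)   /* 4 lanes -> 2 lanes */
    t2[i] = t4[i] + t4[i + 2];
  return t2[0] + t2[1];         /* 2 lanes -> scalar  */
}

The new _mm512_mask_reduce_operator helpers added further down in this hunk perform the same halving steps with _mm512_extracti64x4_epi64, _mm256_extracti128_si256 and a final __builtin_shufflevector swap before reading element 0.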
- -#define _mm512_reduce_operator_64bit(Vec512, Operator, T2, T1) \ - __extension__({ \ - __m256##T1 Vec256 = __builtin_shufflevector( \ - (__v8d##T2)Vec512, \ - (__v8d##T2)Vec512, \ - 0, 1, 2, 3) \ - Operator \ - __builtin_shufflevector( \ - (__v8d##T2)Vec512, \ - (__v8d##T2)Vec512, \ - 4, 5, 6, 7); \ - __m128##T1 Vec128 = __builtin_shufflevector( \ - (__v4d##T2)Vec256, \ - (__v4d##T2)Vec256, \ - 0, 1) \ - Operator \ - __builtin_shufflevector( \ - (__v4d##T2)Vec256, \ - (__v4d##T2)Vec256, \ - 2, 3); \ - Vec128 = __builtin_shufflevector((__v2d##T2)Vec128, \ - (__v2d##T2)Vec128, 0, -1) \ - Operator \ - __builtin_shufflevector((__v2d##T2)Vec128, \ - (__v2d##T2)Vec128, 1, -1); \ - return Vec128[0]; \ - }) - -static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_add_epi64(__m512i __W) { - _mm512_reduce_operator_64bit(__W, +, i, i); -} - -static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_mul_epi64(__m512i __W) { - _mm512_reduce_operator_64bit(__W, *, i, i); -} - -static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_and_epi64(__m512i __W) { - _mm512_reduce_operator_64bit(__W, &, i, i); -} - -static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_or_epi64(__m512i __W) { - _mm512_reduce_operator_64bit(__W, |, i, i); -} - -static __inline__ double __DEFAULT_FN_ATTRS _mm512_reduce_add_pd(__m512d __W) { - _mm512_reduce_operator_64bit(__W, +, f, d); -} - -static __inline__ double __DEFAULT_FN_ATTRS _mm512_reduce_mul_pd(__m512d __W) { - _mm512_reduce_operator_64bit(__W, *, f, d); -} - -// Vec512 - Vector with size 512. -// Vec512Neutral - All vector elements set to the identity element. -// Identity element: {+,0},{*,1},{&,0xFFFFFFFFFFFFFFFF},{|,0} -// Operator - Can be one of following: +,*,&,| -// Mask - Intrinsic Mask -// T2 - Can get 'i' for int and 'f' for float. -// T1 - Can get 'i' for int and 'd' for packed double-precision. -// T3 - Can be Pd for packed double or q for q-word. - -#define _mm512_mask_reduce_operator_64bit(Vec512, Vec512Neutral, Operator, \ - Mask, T2, T1, T3) \ - __extension__({ \ - Vec512 = __builtin_ia32_select##T3##_512( \ - (__mmask8)Mask, \ - (__v8d##T2)Vec512, \ - (__v8d##T2)Vec512Neutral); \ - _mm512_reduce_operator_64bit(Vec512, Operator, T2, T1); \ - }) - -static __inline__ long long __DEFAULT_FN_ATTRS +/* Vector-reduction arithmetic accepts vectors as inputs and produces scalars as + * outputs. This class of vector operation forms the basis of many scientific + * computations. In vector-reduction arithmetic, the evaluation off is + * independent of the order of the input elements of V. + + * Used bisection method. At each step, we partition the vector with previous + * step in half, and the operation is performed on its two halves. + * This takes log2(n) steps where n is the number of elements in the vector. 
+ */ + +#define _mm512_mask_reduce_operator(op) \ + __v4du __t1 = (__v4du)_mm512_extracti64x4_epi64(__W, 0); \ + __v4du __t2 = (__v4du)_mm512_extracti64x4_epi64(__W, 1); \ + __m256i __t3 = (__m256i)(__t1 op __t2); \ + __v2du __t4 = (__v2du)_mm256_extracti128_si256(__t3, 0); \ + __v2du __t5 = (__v2du)_mm256_extracti128_si256(__t3, 1); \ + __v2du __t6 = __t4 op __t5; \ + __v2du __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \ + __v2du __t8 = __t6 op __t7; \ + return __t8[0]; + +static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_add_epi64(__m512i __W) { + _mm512_mask_reduce_operator(+); +} + +static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_epi64(__m512i __W) { + _mm512_mask_reduce_operator(*); +} + +static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_and_epi64(__m512i __W) { + _mm512_mask_reduce_operator(&); +} + +static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_or_epi64(__m512i __W) { + _mm512_mask_reduce_operator(|); +} + +static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W) { - _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0), +, __M, i, i, q); + __W = _mm512_maskz_mov_epi64(__M, __W); + _mm512_mask_reduce_operator(+); } -static __inline__ long long __DEFAULT_FN_ATTRS +static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) { - _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(1), *, __M, i, i, q); + __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(1), __M, __W); + _mm512_mask_reduce_operator(*); } -static __inline__ long long __DEFAULT_FN_ATTRS +static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) { - _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF), - &, __M, i, i, q); + __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0ULL), __M, __W); + _mm512_mask_reduce_operator(&); } -static __inline__ long long __DEFAULT_FN_ATTRS +static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) { - _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0), |, __M, - i, i, q); + __W = _mm512_maskz_mov_epi64(__M, __W); + _mm512_mask_reduce_operator(|); +} +#undef _mm512_mask_reduce_operator + +#define _mm512_mask_reduce_operator(op) \ + __m256d __t1 = _mm512_extractf64x4_pd(__W, 0); \ + __m256d __t2 = _mm512_extractf64x4_pd(__W, 1); \ + __m256d __t3 = __t1 op __t2; \ + __m128d __t4 = _mm256_extractf128_pd(__t3, 0); \ + __m128d __t5 = _mm256_extractf128_pd(__t3, 1); \ + __m128d __t6 = __t4 op __t5; \ + __m128d __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \ + __m128d __t8 = __t6 op __t7; \ + return __t8[0]; + +static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_add_pd(__m512d __W) { + _mm512_mask_reduce_operator(+); +} + +static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_pd(__m512d __W) { + _mm512_mask_reduce_operator(*); } -static __inline__ double __DEFAULT_FN_ATTRS +static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) { - _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_pd(0), +, __M, - f, d, pd); + __W = _mm512_maskz_mov_pd(__M, __W); + _mm512_mask_reduce_operator(+); } -static __inline__ double __DEFAULT_FN_ATTRS +static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) { - _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_pd(1), *, __M, - f, d, pd); -} - -// Vec512 - Vector with 
size 512. -// Operator - Can be one of following: +,*,&,| -// T2 - Can get 'i' for int and ' ' for packed single. -// T1 - Can get 'i' for int and 'f' for float. - -#define _mm512_reduce_operator_32bit(Vec512, Operator, T2, T1) __extension__({ \ - __m256##T1 Vec256 = \ - (__m256##T1)(__builtin_shufflevector( \ - (__v16s##T2)Vec512, \ - (__v16s##T2)Vec512, \ - 0, 1, 2, 3, 4, 5, 6, 7) \ - Operator \ - __builtin_shufflevector( \ - (__v16s##T2)Vec512, \ - (__v16s##T2)Vec512, \ - 8, 9, 10, 11, 12, 13, 14, 15)); \ - __m128##T1 Vec128 = \ - (__m128##T1)(__builtin_shufflevector( \ - (__v8s##T2)Vec256, \ - (__v8s##T2)Vec256, \ - 0, 1, 2, 3) \ - Operator \ - __builtin_shufflevector( \ - (__v8s##T2)Vec256, \ - (__v8s##T2)Vec256, \ - 4, 5, 6, 7)); \ - Vec128 = (__m128##T1)(__builtin_shufflevector( \ - (__v4s##T2)Vec128, \ - (__v4s##T2)Vec128, \ - 0, 1, -1, -1) \ - Operator \ - __builtin_shufflevector( \ - (__v4s##T2)Vec128, \ - (__v4s##T2)Vec128, \ - 2, 3, -1, -1)); \ - Vec128 = (__m128##T1)(__builtin_shufflevector( \ - (__v4s##T2)Vec128, \ - (__v4s##T2)Vec128, \ - 0, -1, -1, -1) \ - Operator \ - __builtin_shufflevector( \ - (__v4s##T2)Vec128, \ - (__v4s##T2)Vec128, \ - 1, -1, -1, -1)); \ - return Vec128[0]; \ - }) - -static __inline__ int __DEFAULT_FN_ATTRS + __W = _mm512_mask_mov_pd(_mm512_set1_pd(1.0), __M, __W); + _mm512_mask_reduce_operator(*); +} +#undef _mm512_mask_reduce_operator + +#define _mm512_mask_reduce_operator(op) \ + __v8su __t1 = (__v8su)_mm512_extracti64x4_epi64(__W, 0); \ + __v8su __t2 = (__v8su)_mm512_extracti64x4_epi64(__W, 1); \ + __m256i __t3 = (__m256i)(__t1 op __t2); \ + __v4su __t4 = (__v4su)_mm256_extracti128_si256(__t3, 0); \ + __v4su __t5 = (__v4su)_mm256_extracti128_si256(__t3, 1); \ + __v4su __t6 = __t4 op __t5; \ + __v4su __t7 = __builtin_shufflevector(__t6, __t6, 2, 3, 0, 1); \ + __v4su __t8 = __t6 op __t7; \ + __v4su __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \ + __v4su __t10 = __t8 op __t9; \ + return __t10[0]; + +static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_reduce_add_epi32(__m512i __W) { - _mm512_reduce_operator_32bit(__W, +, i, i); + _mm512_mask_reduce_operator(+); } -static __inline__ int __DEFAULT_FN_ATTRS +static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_epi32(__m512i __W) { - _mm512_reduce_operator_32bit(__W, *, i, i); + _mm512_mask_reduce_operator(*); } -static __inline__ int __DEFAULT_FN_ATTRS +static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_reduce_and_epi32(__m512i __W) { - _mm512_reduce_operator_32bit(__W, &, i, i); + _mm512_mask_reduce_operator(&); } -static __inline__ int __DEFAULT_FN_ATTRS +static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_reduce_or_epi32(__m512i __W) { - _mm512_reduce_operator_32bit(__W, |, i, i); + _mm512_mask_reduce_operator(|); } -static __inline__ float __DEFAULT_FN_ATTRS -_mm512_reduce_add_ps(__m512 __W) { - _mm512_reduce_operator_32bit(__W, +, f, ); -} - -static __inline__ float __DEFAULT_FN_ATTRS -_mm512_reduce_mul_ps(__m512 __W) { - _mm512_reduce_operator_32bit(__W, *, f, ); -} - -// Vec512 - Vector with size 512. -// Vec512Neutral - All vector elements set to the identity element. -// Identity element: {+,0},{*,1},{&,0xFFFFFFFF},{|,0} -// Operator - Can be one of following: +,*,&,| -// Mask - Intrinsic Mask -// T2 - Can get 'i' for int and 'f' for float. -// T1 - Can get 'i' for int and 'd' for double. -// T3 - Can be Ps for packed single or d for d-word. 
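Editor's aside, not part of the upstream headers: both the removed helpers above and the new _mm512_mask_reduce_operator pattern work the way the surrounding comments describe. Lanes that the mask switches off are first replaced with the operation's identity element ({+,0}, {*,1}, {&,all-ones}, {|,0}) so they cannot affect the result, and the remaining vector is then reduced by halving it log2(n) times. The plain-C sketch below only illustrates that idea; the function name reduce_add_masked is invented for the example.

#include <stdint.h>
#include <stdio.h>

/* Illustration only: an 8-lane masked "+" reduction.
 * Step 1 substitutes the identity element of "+" (0) into masked-off lanes.
 * Step 2 reduces by bisection: log2(8) = 3 halving steps. */
static int64_t reduce_add_masked(const int64_t v[8], uint8_t mask) {
  int64_t t[8];
  for (int i = 0; i < 8; ++i)
    t[i] = (mask & (1u << i)) ? v[i] : 0;   /* identity element for "+" */
  for (int width = 4; width >= 1; width /= 2)  /* halve, halve, halve */
    for (int i = 0; i < width; ++i)
      t[i] += t[i + width];
  return t[0];
}

int main(void) {
  const int64_t v[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  printf("%lld\n", (long long)reduce_add_masked(v, 0x0F)); /* lanes 0-3: prints 10 */
  return 0;
}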
- -#define _mm512_mask_reduce_operator_32bit(Vec512, Vec512Neutral, Operator, \ - Mask, T2, T1, T3) \ - __extension__({ \ - Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512( \ - (__mmask16)Mask, \ - (__v16s##T2)Vec512, \ - (__v16s##T2)Vec512Neutral); \ - _mm512_reduce_operator_32bit(Vec512, Operator, T2, T1); \ - }) - -static __inline__ int __DEFAULT_FN_ATTRS +static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_add_epi32( __mmask16 __M, __m512i __W) { - _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0), +, __M, i, i, d); + __W = _mm512_maskz_mov_epi32(__M, __W); + _mm512_mask_reduce_operator(+); } -static __inline__ int __DEFAULT_FN_ATTRS +static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_mul_epi32( __mmask16 __M, __m512i __W) { - _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(1), *, __M, i, i, d); + __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(1), __M, __W); + _mm512_mask_reduce_operator(*); } -static __inline__ int __DEFAULT_FN_ATTRS +static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_and_epi32( __mmask16 __M, __m512i __W) { - _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0xFFFFFFFF), &, __M, - i, i, d); + __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0U), __M, __W); + _mm512_mask_reduce_operator(&); } -static __inline__ int __DEFAULT_FN_ATTRS +static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) { - _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0), |, __M, i, i, d); + __W = _mm512_maskz_mov_epi32(__M, __W); + _mm512_mask_reduce_operator(|); +} +#undef _mm512_mask_reduce_operator + +#define _mm512_mask_reduce_operator(op) \ + __m256 __t1 = (__m256)_mm512_extractf64x4_pd((__m512d)__W, 0); \ + __m256 __t2 = (__m256)_mm512_extractf64x4_pd((__m512d)__W, 1); \ + __m256 __t3 = __t1 op __t2; \ + __m128 __t4 = _mm256_extractf128_ps(__t3, 0); \ + __m128 __t5 = _mm256_extractf128_ps(__t3, 1); \ + __m128 __t6 = __t4 op __t5; \ + __m128 __t7 = __builtin_shufflevector(__t6, __t6, 2, 3, 0, 1); \ + __m128 __t8 = __t6 op __t7; \ + __m128 __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \ + __m128 __t10 = __t8 op __t9; \ + return __t10[0]; + +static __inline__ float __DEFAULT_FN_ATTRS512 +_mm512_reduce_add_ps(__m512 __W) { + _mm512_mask_reduce_operator(+); +} + +static __inline__ float __DEFAULT_FN_ATTRS512 +_mm512_reduce_mul_ps(__m512 __W) { + _mm512_mask_reduce_operator(*); } -static __inline__ float __DEFAULT_FN_ATTRS +static __inline__ float __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W) { - _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_ps(0), +, __M, f, , ps); + __W = _mm512_maskz_mov_ps(__M, __W); + _mm512_mask_reduce_operator(+); } -static __inline__ float __DEFAULT_FN_ATTRS +static __inline__ float __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) { - _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_ps(1), *, __M, f, , ps); -} - -// Used bisection method. At each step, we partition the vector with previous -// step in half, and the operation is performed on its two halves. -// This takes log2(n) steps where n is the number of elements in the vector. -// This macro uses only intrinsics from the AVX512F feature. - -// Vec512 - Vector with size of 512. -// IntrinName - Can be one of following: {max|min}_{epi64|epu64|pd} for example: -// __mm512_max_epi64 -// T1 - Can get 'i' for int and 'd' for double.[__m512{i|d}] -// T2 - Can get 'i' for int and 'f' for float. 
[__v8d{i|f}] - -#define _mm512_reduce_maxMin_64bit(Vec512, IntrinName, T1, T2) __extension__({ \ - Vec512 = _mm512_##IntrinName( \ - (__m512##T1)__builtin_shufflevector( \ - (__v8d##T2)Vec512, \ - (__v8d##T2)Vec512, \ - 0, 1, 2, 3, -1, -1, -1, -1), \ - (__m512##T1)__builtin_shufflevector( \ - (__v8d##T2)Vec512, \ - (__v8d##T2)Vec512, \ - 4, 5, 6, 7, -1, -1, -1, -1)); \ - Vec512 = _mm512_##IntrinName( \ - (__m512##T1)__builtin_shufflevector( \ - (__v8d##T2)Vec512, \ - (__v8d##T2)Vec512, \ - 0, 1, -1, -1, -1, -1, -1, -1),\ - (__m512##T1)__builtin_shufflevector( \ - (__v8d##T2)Vec512, \ - (__v8d##T2)Vec512, \ - 2, 3, -1, -1, -1, -1, -1, \ - -1)); \ - Vec512 = _mm512_##IntrinName( \ - (__m512##T1)__builtin_shufflevector( \ - (__v8d##T2)Vec512, \ - (__v8d##T2)Vec512, \ - 0, -1, -1, -1, -1, -1, -1, -1),\ - (__m512##T1)__builtin_shufflevector( \ - (__v8d##T2)Vec512, \ - (__v8d##T2)Vec512, \ - 1, -1, -1, -1, -1, -1, -1, -1))\ - ; \ - return Vec512[0]; \ - }) - -static __inline__ long long __DEFAULT_FN_ATTRS -_mm512_reduce_max_epi64(__m512i __V) { - _mm512_reduce_maxMin_64bit(__V, max_epi64, i, i); + __W = _mm512_mask_mov_ps(_mm512_set1_ps(1.0f), __M, __W); + _mm512_mask_reduce_operator(*); } +#undef _mm512_mask_reduce_operator -static __inline__ unsigned long long __DEFAULT_FN_ATTRS -_mm512_reduce_max_epu64(__m512i __V) { - _mm512_reduce_maxMin_64bit(__V, max_epu64, i, i); +#define _mm512_mask_reduce_operator(op) \ + __m512i __t1 = (__m512i)__builtin_shufflevector((__v8di)__V, (__v8di)__V, 4, 5, 6, 7, 0, 1, 2, 3); \ + __m512i __t2 = _mm512_##op(__V, __t1); \ + __m512i __t3 = (__m512i)__builtin_shufflevector((__v8di)__t2, (__v8di)__t2, 2, 3, 0, 1, 6, 7, 4, 5); \ + __m512i __t4 = _mm512_##op(__t2, __t3); \ + __m512i __t5 = (__m512i)__builtin_shufflevector((__v8di)__t4, (__v8di)__t4, 1, 0, 3, 2, 5, 4, 7, 6); \ + __v8di __t6 = (__v8di)_mm512_##op(__t4, __t5); \ + return __t6[0]; + +static __inline__ long long __DEFAULT_FN_ATTRS512 +_mm512_reduce_max_epi64(__m512i __V) { + _mm512_mask_reduce_operator(max_epi64); } -static __inline__ double __DEFAULT_FN_ATTRS -_mm512_reduce_max_pd(__m512d __V) { - _mm512_reduce_maxMin_64bit(__V, max_pd, d, f); +static __inline__ unsigned long long __DEFAULT_FN_ATTRS512 +_mm512_reduce_max_epu64(__m512i __V) { + _mm512_mask_reduce_operator(max_epu64); } -static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_min_epi64 -(__m512i __V) { - _mm512_reduce_maxMin_64bit(__V, min_epi64, i, i); +static __inline__ long long __DEFAULT_FN_ATTRS512 +_mm512_reduce_min_epi64(__m512i __V) { + _mm512_mask_reduce_operator(min_epi64); } -static __inline__ unsigned long long __DEFAULT_FN_ATTRS +static __inline__ unsigned long long __DEFAULT_FN_ATTRS512 _mm512_reduce_min_epu64(__m512i __V) { - _mm512_reduce_maxMin_64bit(__V, min_epu64, i, i); + _mm512_mask_reduce_operator(min_epu64); } -static __inline__ double __DEFAULT_FN_ATTRS -_mm512_reduce_min_pd(__m512d __V) { - _mm512_reduce_maxMin_64bit(__V, min_pd, d, f); -} - -// Vec512 - Vector with size 512. -// Vec512Neutral - A 512 length vector with elements set to the identity element -// Identity element: {max_epi,0x8000000000000000} -// {max_epu,0x0000000000000000} -// {max_pd, 0xFFF0000000000000} -// {min_epi,0x7FFFFFFFFFFFFFFF} -// {min_epu,0xFFFFFFFFFFFFFFFF} -// {min_pd, 0x7FF0000000000000} -// -// IntrinName - Can be one of following: {max|min}_{epi64|epu64|pd} for example: -// __mm512_max_epi64 -// T1 - Can get 'i' for int and 'd' for double.[__m512{i|d}] -// T2 - Can get 'i' for int and 'f' for float. 
[__v8d{i|f}] -// T3 - Can get 'q' q word and 'pd' for packed double. -// [__builtin_ia32_select{q|pd}_512] -// Mask - Intrinsic Mask - -#define _mm512_mask_reduce_maxMin_64bit(Vec512, Vec512Neutral, IntrinName, T1, \ - T2, T3, Mask) \ - __extension__({ \ - Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512( \ - (__mmask8)Mask, \ - (__v8d##T2)Vec512, \ - (__v8d##T2)Vec512Neutral); \ - _mm512_reduce_maxMin_64bit(Vec512, IntrinName, T1, T2); \ - }) - -static __inline__ long long __DEFAULT_FN_ATTRS +static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __V) { - _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x8000000000000000), - max_epi64, i, i, q, __M); + __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(-__LONG_LONG_MAX__ - 1LL), __M, __V); + _mm512_mask_reduce_operator(max_epi64); } -static __inline__ unsigned long long __DEFAULT_FN_ATTRS +static __inline__ unsigned long long __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __V) { - _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x0000000000000000), - max_epu64, i, i, q, __M); -} - -static __inline__ double __DEFAULT_FN_ATTRS -_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) { - _mm512_mask_reduce_maxMin_64bit(__V, -_mm512_set1_pd(__builtin_inf()), - max_pd, d, f, pd, __M); + __V = _mm512_maskz_mov_epi64(__M, __V); + _mm512_mask_reduce_operator(max_epu64); } -static __inline__ long long __DEFAULT_FN_ATTRS +static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __V) { - _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF), - min_epi64, i, i, q, __M); + __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(__LONG_LONG_MAX__), __M, __V); + _mm512_mask_reduce_operator(min_epi64); } -static __inline__ unsigned long long __DEFAULT_FN_ATTRS +static __inline__ unsigned long long __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) { - _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF), - min_epu64, i, i, q, __M); + __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0ULL), __M, __V); + _mm512_mask_reduce_operator(min_epu64); } +#undef _mm512_mask_reduce_operator -static __inline__ double __DEFAULT_FN_ATTRS -_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) { - _mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_pd(__builtin_inf()), - min_pd, d, f, pd, __M); -} - -// Vec512 - Vector with size 512. 
-// IntrinName - Can be one of following: {max|min}_{epi32|epu32|ps} for example: -// __mm512_max_epi32 -// T1 - Can get 'i' for int and ' ' .[__m512{i|}] -// T2 - Can get 'i' for int and 'f' for float.[__v16s{i|f}] - -#define _mm512_reduce_maxMin_32bit(Vec512, IntrinName, T1, T2) __extension__({ \ - Vec512 = _mm512_##IntrinName( \ - (__m512##T1)__builtin_shufflevector( \ - (__v16s##T2)Vec512, \ - (__v16s##T2)Vec512, \ - 0, 1, 2, 3, 4, 5, 6, 7, \ - -1, -1, -1, -1, -1, -1, -1, -1), \ - (__m512##T1)__builtin_shufflevector( \ - (__v16s##T2)Vec512, \ - (__v16s##T2)Vec512, \ - 8, 9, 10, 11, 12, 13, 14, 15, \ - -1, -1, -1, -1, -1, -1, -1, -1)); \ - Vec512 = _mm512_##IntrinName( \ - (__m512##T1)__builtin_shufflevector( \ - (__v16s##T2)Vec512, \ - (__v16s##T2)Vec512, \ - 0, 1, 2, 3, -1, -1, -1, -1, \ - -1, -1, -1, -1, -1, -1, -1, -1), \ - (__m512##T1)__builtin_shufflevector( \ - (__v16s##T2)Vec512, \ - (__v16s##T2)Vec512, \ - 4, 5, 6, 7, -1, -1, -1, -1, \ - -1, -1, -1, -1, -1, -1, -1, -1)); \ - Vec512 = _mm512_##IntrinName( \ - (__m512##T1)__builtin_shufflevector( \ - (__v16s##T2)Vec512, \ - (__v16s##T2)Vec512, \ - 0, 1, -1, -1, -1, -1, -1, -1, \ - -1, -1, -1, -1, -1, -1, -1, -1), \ - (__m512##T1)__builtin_shufflevector( \ - (__v16s##T2)Vec512, \ - (__v16s##T2)Vec512, \ - 2, 3, -1, -1, -1, -1, -1, -1, \ - -1, -1, -1, -1, -1, -1, -1, -1)); \ - Vec512 = _mm512_##IntrinName( \ - (__m512##T1)__builtin_shufflevector( \ - (__v16s##T2)Vec512, \ - (__v16s##T2)Vec512, \ - 0, -1, -1, -1, -1, -1, -1, -1, \ - -1, -1, -1, -1, -1, -1, -1, -1), \ - (__m512##T1)__builtin_shufflevector( \ - (__v16s##T2)Vec512, \ - (__v16s##T2)Vec512, \ - 1, -1, -1, -1, -1, -1, -1, -1, \ - -1, -1, -1, -1, -1, -1, -1, -1)); \ - return Vec512[0]; \ - }) - -static __inline__ int __DEFAULT_FN_ATTRS _mm512_reduce_max_epi32(__m512i a) { - _mm512_reduce_maxMin_32bit(a, max_epi32, i, i); -} - -static __inline__ unsigned int __DEFAULT_FN_ATTRS -_mm512_reduce_max_epu32(__m512i a) { - _mm512_reduce_maxMin_32bit(a, max_epu32, i, i); -} - -static __inline__ float __DEFAULT_FN_ATTRS _mm512_reduce_max_ps(__m512 a) { - _mm512_reduce_maxMin_32bit(a, max_ps, , f); -} - -static __inline__ int __DEFAULT_FN_ATTRS _mm512_reduce_min_epi32(__m512i a) { - _mm512_reduce_maxMin_32bit(a, min_epi32, i, i); -} - -static __inline__ unsigned int __DEFAULT_FN_ATTRS -_mm512_reduce_min_epu32(__m512i a) { - _mm512_reduce_maxMin_32bit(a, min_epu32, i, i); -} - -static __inline__ float __DEFAULT_FN_ATTRS _mm512_reduce_min_ps(__m512 a) { - _mm512_reduce_maxMin_32bit(a, min_ps, , f); -} - -// Vec512 - Vector with size 512. -// Vec512Neutral - A 512 length vector with elements set to the identity element -// Identity element: {max_epi,0x80000000} -// {max_epu,0x00000000} -// {max_ps, 0xFF800000} -// {min_epi,0x7FFFFFFF} -// {min_epu,0xFFFFFFFF} -// {min_ps, 0x7F800000} -// -// IntrinName - Can be one of following: {max|min}_{epi32|epu32|ps} for example: -// __mm512_max_epi32 -// T1 - Can get 'i' for int and ' ' .[__m512{i|}] -// T2 - Can get 'i' for int and 'f' for float.[__v16s{i|f}] -// T3 - Can get 'q' q word and 'pd' for packed double. 
-// [__builtin_ia32_select{q|pd}_512] -// Mask - Intrinsic Mask - -#define _mm512_mask_reduce_maxMin_32bit(Vec512, Vec512Neutral, IntrinName, T1, \ - T2, T3, Mask) \ - __extension__({ \ - Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512( \ - (__mmask16)Mask, \ - (__v16s##T2)Vec512, \ - (__v16s##T2)Vec512Neutral); \ - _mm512_reduce_maxMin_32bit(Vec512, IntrinName, T1, T2); \ - }) - -static __inline__ int __DEFAULT_FN_ATTRS -_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __V) { - _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x80000000), max_epi32, - i, i, d, __M); +#define _mm512_mask_reduce_operator(op) \ + __m256i __t1 = _mm512_extracti64x4_epi64(__V, 0); \ + __m256i __t2 = _mm512_extracti64x4_epi64(__V, 1); \ + __m256i __t3 = _mm256_##op(__t1, __t2); \ + __m128i __t4 = _mm256_extracti128_si256(__t3, 0); \ + __m128i __t5 = _mm256_extracti128_si256(__t3, 1); \ + __m128i __t6 = _mm_##op(__t4, __t5); \ + __m128i __t7 = (__m128i)__builtin_shufflevector((__v4si)__t6, (__v4si)__t6, 2, 3, 0, 1); \ + __m128i __t8 = _mm_##op(__t6, __t7); \ + __m128i __t9 = (__m128i)__builtin_shufflevector((__v4si)__t8, (__v4si)__t8, 1, 0, 3, 2); \ + __v4si __t10 = (__v4si)_mm_##op(__t8, __t9); \ + return __t10[0]; + +static __inline__ int __DEFAULT_FN_ATTRS512 +_mm512_reduce_max_epi32(__m512i __V) { + _mm512_mask_reduce_operator(max_epi32); } -static __inline__ unsigned int __DEFAULT_FN_ATTRS -_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __V) { - _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x00000000), max_epu32, - i, i, d, __M); +static __inline__ unsigned int __DEFAULT_FN_ATTRS512 +_mm512_reduce_max_epu32(__m512i __V) { + _mm512_mask_reduce_operator(max_epu32); } -static __inline__ float __DEFAULT_FN_ATTRS -_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) { - _mm512_mask_reduce_maxMin_32bit(__V,-_mm512_set1_ps(__builtin_inff()), max_ps, , f, - ps, __M); +static __inline__ int __DEFAULT_FN_ATTRS512 +_mm512_reduce_min_epi32(__m512i __V) { + _mm512_mask_reduce_operator(min_epi32); } -static __inline__ int __DEFAULT_FN_ATTRS +static __inline__ unsigned int __DEFAULT_FN_ATTRS512 +_mm512_reduce_min_epu32(__m512i __V) { + _mm512_mask_reduce_operator(min_epu32); +} + +static __inline__ int __DEFAULT_FN_ATTRS512 +_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __V) { + __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(-__INT_MAX__ - 1), __M, __V); + _mm512_mask_reduce_operator(max_epi32); +} + +static __inline__ unsigned int __DEFAULT_FN_ATTRS512 +_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __V) { + __V = _mm512_maskz_mov_epi32(__M, __V); + _mm512_mask_reduce_operator(max_epu32); +} + +static __inline__ int __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) { - _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0x7FFFFFFF), min_epi32, - i, i, d, __M); + __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(__INT_MAX__), __M, __V); + _mm512_mask_reduce_operator(min_epi32); } -static __inline__ unsigned int __DEFAULT_FN_ATTRS +static __inline__ unsigned int __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) { - _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_epi32(0xFFFFFFFF), min_epu32, - i, i, d, __M); + __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0U), __M, __V); + _mm512_mask_reduce_operator(min_epu32); +} +#undef _mm512_mask_reduce_operator + +#define _mm512_mask_reduce_operator(op) \ + __m256d __t1 = _mm512_extractf64x4_pd(__V, 0); \ + __m256d __t2 = _mm512_extractf64x4_pd(__V, 1); \ + __m256d __t3 = 
_mm256_##op(__t1, __t2); \ + __m128d __t4 = _mm256_extractf128_pd(__t3, 0); \ + __m128d __t5 = _mm256_extractf128_pd(__t3, 1); \ + __m128d __t6 = _mm_##op(__t4, __t5); \ + __m128d __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \ + __m128d __t8 = _mm_##op(__t6, __t7); \ + return __t8[0]; + +static __inline__ double __DEFAULT_FN_ATTRS512 +_mm512_reduce_max_pd(__m512d __V) { + _mm512_mask_reduce_operator(max_pd); +} + +static __inline__ double __DEFAULT_FN_ATTRS512 +_mm512_reduce_min_pd(__m512d __V) { + _mm512_mask_reduce_operator(min_pd); +} + +static __inline__ double __DEFAULT_FN_ATTRS512 +_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) { + __V = _mm512_mask_mov_pd(_mm512_set1_pd(-__builtin_inf()), __M, __V); + _mm512_mask_reduce_operator(max_pd); +} + +static __inline__ double __DEFAULT_FN_ATTRS512 +_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) { + __V = _mm512_mask_mov_pd(_mm512_set1_pd(__builtin_inf()), __M, __V); + _mm512_mask_reduce_operator(min_pd); +} +#undef _mm512_mask_reduce_operator + +#define _mm512_mask_reduce_operator(op) \ + __m256 __t1 = (__m256)_mm512_extractf64x4_pd((__m512d)__V, 0); \ + __m256 __t2 = (__m256)_mm512_extractf64x4_pd((__m512d)__V, 1); \ + __m256 __t3 = _mm256_##op(__t1, __t2); \ + __m128 __t4 = _mm256_extractf128_ps(__t3, 0); \ + __m128 __t5 = _mm256_extractf128_ps(__t3, 1); \ + __m128 __t6 = _mm_##op(__t4, __t5); \ + __m128 __t7 = __builtin_shufflevector(__t6, __t6, 2, 3, 0, 1); \ + __m128 __t8 = _mm_##op(__t6, __t7); \ + __m128 __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \ + __m128 __t10 = _mm_##op(__t8, __t9); \ + return __t10[0]; + +static __inline__ float __DEFAULT_FN_ATTRS512 +_mm512_reduce_max_ps(__m512 __V) { + _mm512_mask_reduce_operator(max_ps); +} + +static __inline__ float __DEFAULT_FN_ATTRS512 +_mm512_reduce_min_ps(__m512 __V) { + _mm512_mask_reduce_operator(min_ps); +} + +static __inline__ float __DEFAULT_FN_ATTRS512 +_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) { + __V = _mm512_mask_mov_ps(_mm512_set1_ps(-__builtin_inff()), __M, __V); + _mm512_mask_reduce_operator(max_ps); } -static __inline__ float __DEFAULT_FN_ATTRS +static __inline__ float __DEFAULT_FN_ATTRS512 _mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) { - _mm512_mask_reduce_maxMin_32bit(__V, _mm512_set1_ps(__builtin_inff()), min_ps, , f, - ps, __M); + __V = _mm512_mask_mov_ps(_mm512_set1_ps(__builtin_inff()), __M, __V); + _mm512_mask_reduce_operator(min_ps); } +#undef _mm512_mask_reduce_operator -#undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS512 +#undef __DEFAULT_FN_ATTRS128 -#endif // __AVX512FINTRIN_H +#endif /* __AVX512FINTRIN_H */ diff --git a/c_headers/avx512ifmaintrin.h b/c_headers/avx512ifmaintrin.h index 5defbaea8b..159713049c 100644 --- a/c_headers/avx512ifmaintrin.h +++ b/c_headers/avx512ifmaintrin.h @@ -29,62 +29,52 @@ #define __IFMAINTRIN_H /* Define the default attributes for the functions in this file. 
*/ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"))) +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"), __min_vector_width__(512))) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_madd52hi_epu64 (__m512i __X, __m512i __Y, __m512i __Z) { - return (__m512i) __builtin_ia32_vpmadd52huq512_mask ((__v8di) __X, - (__v8di) __Y, - (__v8di) __Z, - (__mmask8) -1); + return (__m512i)__builtin_ia32_vpmadd52huq512((__v8di) __X, (__v8di) __Y, + (__v8di) __Z); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_madd52hi_epu64 (__m512i __W, __mmask8 __M, __m512i __X, - __m512i __Y) +_mm512_mask_madd52hi_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_vpmadd52huq512_mask ((__v8di) __W, - (__v8di) __X, - (__v8di) __Y, - (__mmask8) __M); + return (__m512i)__builtin_ia32_selectq_512(__M, + (__v8di)_mm512_madd52hi_epu64(__W, __X, __Y), + (__v8di)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_madd52hi_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z) { - return (__m512i) __builtin_ia32_vpmadd52huq512_maskz ((__v8di) __X, - (__v8di) __Y, - (__v8di) __Z, - (__mmask8) __M); + return (__m512i)__builtin_ia32_selectq_512(__M, + (__v8di)_mm512_madd52hi_epu64(__X, __Y, __Z), + (__v8di)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_madd52lo_epu64 (__m512i __X, __m512i __Y, __m512i __Z) { - return (__m512i) __builtin_ia32_vpmadd52luq512_mask ((__v8di) __X, - (__v8di) __Y, - (__v8di) __Z, - (__mmask8) -1); + return (__m512i)__builtin_ia32_vpmadd52luq512((__v8di) __X, (__v8di) __Y, + (__v8di) __Z); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_madd52lo_epu64 (__m512i __W, __mmask8 __M, __m512i __X, - __m512i __Y) +_mm512_mask_madd52lo_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) { - return (__m512i) __builtin_ia32_vpmadd52luq512_mask ((__v8di) __W, - (__v8di) __X, - (__v8di) __Y, - (__mmask8) __M); + return (__m512i)__builtin_ia32_selectq_512(__M, + (__v8di)_mm512_madd52lo_epu64(__W, __X, __Y), + (__v8di)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_madd52lo_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z) { - return (__m512i) __builtin_ia32_vpmadd52luq512_maskz ((__v8di) __X, - (__v8di) __Y, - (__v8di) __Z, - (__mmask8) __M); + return (__m512i)__builtin_ia32_selectq_512(__M, + (__v8di)_mm512_madd52lo_epu64(__X, __Y, __Z), + (__v8di)_mm512_setzero_si512()); } #undef __DEFAULT_FN_ATTRS diff --git a/c_headers/avx512ifmavlintrin.h b/c_headers/avx512ifmavlintrin.h index 131ee5cb4f..afdea888c5 100644 --- a/c_headers/avx512ifmavlintrin.h +++ b/c_headers/avx512ifmavlintrin.h @@ -29,121 +29,105 @@ #define __IFMAVLINTRIN_H /* Define the default attributes for the functions in this file. 
*/ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl"))) +#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl"), __min_vector_width__(256))) -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_madd52hi_epu64 (__m128i __X, __m128i __Y, __m128i __Z) { - return (__m128i) __builtin_ia32_vpmadd52huq128_mask ((__v2di) __X, - (__v2di) __Y, - (__v2di) __Z, - (__mmask8) -1); + return (__m128i)__builtin_ia32_vpmadd52huq128((__v2di) __X, (__v2di) __Y, + (__v2di) __Z); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_madd52hi_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) { - return (__m128i) __builtin_ia32_vpmadd52huq128_mask ((__v2di) __W, - (__v2di) __X, - (__v2di) __Y, - (__mmask8) __M); + return (__m128i)__builtin_ia32_selectq_128(__M, + (__v2di)_mm_madd52hi_epu64(__W, __X, __Y), + (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_madd52hi_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z) { - return (__m128i) __builtin_ia32_vpmadd52huq128_maskz ((__v2di) __X, - (__v2di) __Y, - (__v2di) __Z, - (__mmask8) __M); + return (__m128i)__builtin_ia32_selectq_128(__M, + (__v2di)_mm_madd52hi_epu64(__X, __Y, __Z), + (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_madd52hi_epu64 (__m256i __X, __m256i __Y, __m256i __Z) { - return (__m256i) __builtin_ia32_vpmadd52huq256_mask ((__v4di) __X, - (__v4di) __Y, - (__v4di) __Z, - (__mmask8) -1); + return (__m256i)__builtin_ia32_vpmadd52huq256((__v4di)__X, (__v4di)__Y, + (__v4di)__Z); } -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_madd52hi_epu64 (__m256i __W, __mmask8 __M, __m256i __X, - __m256i __Y) +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_madd52hi_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) { - return (__m256i) __builtin_ia32_vpmadd52huq256_mask ((__v4di) __W, - (__v4di) __X, - (__v4di) __Y, - (__mmask8) __M); + return (__m256i)__builtin_ia32_selectq_256(__M, + (__v4di)_mm256_madd52hi_epu64(__W, __X, __Y), + (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_madd52hi_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z) { - return (__m256i) __builtin_ia32_vpmadd52huq256_maskz ((__v4di) __X, - (__v4di) __Y, - (__v4di) __Z, - (__mmask8) __M); + return (__m256i)__builtin_ia32_selectq_256(__M, + (__v4di)_mm256_madd52hi_epu64(__X, __Y, __Z), + (__v4di)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_madd52lo_epu64 (__m128i __X, __m128i __Y, __m128i __Z) { - return (__m128i) __builtin_ia32_vpmadd52luq128_mask ((__v2di) __X, - (__v2di) __Y, - (__v2di) __Z, - (__mmask8) -1); + return (__m128i)__builtin_ia32_vpmadd52luq128((__v2di)__X, (__v2di)__Y, + (__v2di)__Z); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_madd52lo_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) { - return (__m128i) __builtin_ia32_vpmadd52luq128_mask ((__v2di) __W, - (__v2di) __X, - (__v2di) __Y, - 
(__mmask8) __M); + return (__m128i)__builtin_ia32_selectq_128(__M, + (__v2di)_mm_madd52lo_epu64(__W, __X, __Y), + (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_madd52lo_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z) { - return (__m128i) __builtin_ia32_vpmadd52luq128_maskz ((__v2di) __X, - (__v2di) __Y, - (__v2di) __Z, - (__mmask8) __M); + return (__m128i)__builtin_ia32_selectq_128(__M, + (__v2di)_mm_madd52lo_epu64(__X, __Y, __Z), + (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_madd52lo_epu64 (__m256i __X, __m256i __Y, __m256i __Z) { - return (__m256i) __builtin_ia32_vpmadd52luq256_mask ((__v4di) __X, - (__v4di) __Y, - (__v4di) __Z, - (__mmask8) -1); + return (__m256i)__builtin_ia32_vpmadd52luq256((__v4di)__X, (__v4di)__Y, + (__v4di)__Z); } -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_madd52lo_epu64 (__m256i __W, __mmask8 __M, __m256i __X, - __m256i __Y) +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_madd52lo_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) { - return (__m256i) __builtin_ia32_vpmadd52luq256_mask ((__v4di) __W, - (__v4di) __X, - (__v4di) __Y, - (__mmask8) __M); + return (__m256i)__builtin_ia32_selectq_256(__M, + (__v4di)_mm256_madd52lo_epu64(__W, __X, __Y), + (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_madd52lo_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z) { - return (__m256i) __builtin_ia32_vpmadd52luq256_maskz ((__v4di) __X, - (__v4di) __Y, - (__v4di) __Z, - (__mmask8) __M); + return (__m256i)__builtin_ia32_selectq_256(__M, + (__v4di)_mm256_madd52lo_epu64(__X, __Y, __Z), + (__v4di)_mm256_setzero_si256()); } -#undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 #endif diff --git a/c_headers/avx512pfintrin.h b/c_headers/avx512pfintrin.h index c7fa3cf313..5b8260b77c 100644 --- a/c_headers/avx512pfintrin.h +++ b/c_headers/avx512pfintrin.h @@ -1,4 +1,4 @@ -/*===------------- avx512pfintrin.h - PF intrinsics ------------------=== +/*===------------- avx512pfintrin.h - PF intrinsics ------------------------=== * * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -31,80 +31,80 @@ /* Define the default attributes for the functions in this file. 
*/ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512pf"))) -#define _mm512_mask_prefetch_i32gather_pd(index, mask, addr, scale, hint) __extension__ ({\ +#define _mm512_mask_prefetch_i32gather_pd(index, mask, addr, scale, hint) \ __builtin_ia32_gatherpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \ (long long const *)(addr), (int)(scale), \ - (int)(hint)); }) - -#define _mm512_prefetch_i32gather_pd(index, addr, scale, hint) __extension__ ({\ + (int)(hint)) + +#define _mm512_prefetch_i32gather_pd(index, addr, scale, hint) \ __builtin_ia32_gatherpfdpd((__mmask8) -1, (__v8si)(__m256i)(index), \ (long long const *)(addr), (int)(scale), \ - (int)(hint)); }) + (int)(hint)) -#define _mm512_mask_prefetch_i32gather_ps(index, mask, addr, scale, hint) ({\ +#define _mm512_mask_prefetch_i32gather_ps(index, mask, addr, scale, hint) \ __builtin_ia32_gatherpfdps((__mmask16)(mask), \ (__v16si)(__m512i)(index), (int const *)(addr), \ - (int)(scale), (int)(hint)); }) + (int)(scale), (int)(hint)) -#define _mm512_prefetch_i32gather_ps(index, addr, scale, hint) ({\ +#define _mm512_prefetch_i32gather_ps(index, addr, scale, hint) \ __builtin_ia32_gatherpfdps((__mmask16) -1, \ (__v16si)(__m512i)(index), (int const *)(addr), \ - (int)(scale), (int)(hint)); }) + (int)(scale), (int)(hint)) -#define _mm512_mask_prefetch_i64gather_pd(index, mask, addr, scale, hint) __extension__ ({\ +#define _mm512_mask_prefetch_i64gather_pd(index, mask, addr, scale, hint) \ __builtin_ia32_gatherpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \ (long long const *)(addr), (int)(scale), \ - (int)(hint)); }) + (int)(hint)) -#define _mm512_prefetch_i64gather_pd(index, addr, scale, hint) __extension__ ({\ +#define _mm512_prefetch_i64gather_pd(index, addr, scale, hint) \ __builtin_ia32_gatherpfqpd((__mmask8) -1, (__v8di)(__m512i)(index), \ (long long const *)(addr), (int)(scale), \ - (int)(hint)); }) - -#define _mm512_mask_prefetch_i64gather_ps(index, mask, addr, scale, hint) ({\ + (int)(hint)) + +#define _mm512_mask_prefetch_i64gather_ps(index, mask, addr, scale, hint) \ __builtin_ia32_gatherpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \ - (int const *)(addr), (int)(scale), (int)(hint)); }) + (int const *)(addr), (int)(scale), (int)(hint)) -#define _mm512_prefetch_i64gather_ps(index, addr, scale, hint) ({\ +#define _mm512_prefetch_i64gather_ps(index, addr, scale, hint) \ __builtin_ia32_gatherpfqps((__mmask8) -1, (__v8di)(__m512i)(index), \ - (int const *)(addr), (int)(scale), (int)(hint)); }) + (int const *)(addr), (int)(scale), (int)(hint)) -#define _mm512_prefetch_i32scatter_pd(addr, index, scale, hint) __extension__ ({\ +#define _mm512_prefetch_i32scatter_pd(addr, index, scale, hint) \ __builtin_ia32_scatterpfdpd((__mmask8)-1, (__v8si)(__m256i)(index), \ (long long *)(addr), (int)(scale), \ - (int)(hint)); }) + (int)(hint)) -#define _mm512_mask_prefetch_i32scatter_pd(addr, mask, index, scale, hint) __extension__ ({\ +#define _mm512_mask_prefetch_i32scatter_pd(addr, mask, index, scale, hint) \ __builtin_ia32_scatterpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \ (long long *)(addr), (int)(scale), \ - (int)(hint)); }) + (int)(hint)) -#define _mm512_prefetch_i32scatter_ps(addr, index, scale, hint) __extension__ ({\ +#define _mm512_prefetch_i32scatter_ps(addr, index, scale, hint) \ __builtin_ia32_scatterpfdps((__mmask16)-1, (__v16si)(__m512i)(index), \ - (int *)(addr), (int)(scale), (int)(hint)); }) + (int *)(addr), (int)(scale), (int)(hint)) -#define 
_mm512_mask_prefetch_i32scatter_ps(addr, mask, index, scale, hint) __extension__ ({\ +#define _mm512_mask_prefetch_i32scatter_ps(addr, mask, index, scale, hint) \ __builtin_ia32_scatterpfdps((__mmask16)(mask), \ (__v16si)(__m512i)(index), (int *)(addr), \ - (int)(scale), (int)(hint)); }) + (int)(scale), (int)(hint)) -#define _mm512_prefetch_i64scatter_pd(addr, index, scale, hint) __extension__ ({\ +#define _mm512_prefetch_i64scatter_pd(addr, index, scale, hint) \ __builtin_ia32_scatterpfqpd((__mmask8)-1, (__v8di)(__m512i)(index), \ (long long *)(addr), (int)(scale), \ - (int)(hint)); }) + (int)(hint)) -#define _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, scale, hint) __extension__ ({\ +#define _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, scale, hint) \ __builtin_ia32_scatterpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \ (long long *)(addr), (int)(scale), \ - (int)(hint)); }) + (int)(hint)) -#define _mm512_prefetch_i64scatter_ps(addr, index, scale, hint) __extension__ ({\ +#define _mm512_prefetch_i64scatter_ps(addr, index, scale, hint) \ __builtin_ia32_scatterpfqps((__mmask8)-1, (__v8di)(__m512i)(index), \ - (int *)(addr), (int)(scale), (int)(hint)); }) + (int *)(addr), (int)(scale), (int)(hint)) -#define _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, scale, hint) __extension__ ({\ +#define _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, scale, hint) \ __builtin_ia32_scatterpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \ - (int *)(addr), (int)(scale), (int)(hint)); }) + (int *)(addr), (int)(scale), (int)(hint)) #undef __DEFAULT_FN_ATTRS diff --git a/c_headers/avx512vbmi2intrin.h b/c_headers/avx512vbmi2intrin.h index 43e97b40a0..d2a58094fd 100644 --- a/c_headers/avx512vbmi2intrin.h +++ b/c_headers/avx512vbmi2intrin.h @@ -29,7 +29,7 @@ #define __AVX512VBMI2INTRIN_H /* Define the default attributes for the functions in this file. 
*/ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi2"))) +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi2"), __min_vector_width__(512))) static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -44,7 +44,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_compress_epi16(__mmask32 __U, __m512i __D) { return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D, - (__v32hi) _mm512_setzero_hi(), + (__v32hi) _mm512_setzero_si512(), __U); } @@ -60,7 +60,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_compress_epi8(__mmask64 __U, __m512i __D) { return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D, - (__v64qi) _mm512_setzero_qi(), + (__v64qi) _mm512_setzero_si512(), __U); } @@ -90,7 +90,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_expand_epi16(__mmask32 __U, __m512i __D) { return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D, - (__v32hi) _mm512_setzero_hi(), + (__v32hi) _mm512_setzero_si512(), __U); } @@ -106,7 +106,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_expand_epi8(__mmask64 __U, __m512i __D) { return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D, - (__v64qi) _mm512_setzero_qi(), + (__v64qi) _mm512_setzero_si512(), __U); } @@ -122,7 +122,7 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_expandloadu_epi16(__mmask32 __U, void const *__P) { return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P, - (__v32hi) _mm512_setzero_hi(), + (__v32hi) _mm512_setzero_si512(), __U); } @@ -138,87 +138,93 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P) { return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P, - (__v64qi) _mm512_setzero_qi(), + (__v64qi) _mm512_setzero_si512(), __U); } -#define _mm512_mask_shldi_epi64(S, U, A, B, I) __extension__ ({ \ - (__m512i)__builtin_ia32_vpshldq512_mask((__v8di)(A), \ - (__v8di)(B), \ - (int)(I), \ - (__v8di)(S), \ - (__mmask8)(U)); }) +#define _mm512_shldi_epi64(A, B, I) \ + (__m512i)__builtin_ia32_vpshldq512((__v8di)(__m512i)(A), \ + (__v8di)(__m512i)(B), (int)(I)) + +#define _mm512_mask_shldi_epi64(S, U, A, B, I) \ + (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_shldi_epi64((A), (B), (I)), \ + (__v8di)(__m512i)(S)) #define _mm512_maskz_shldi_epi64(U, A, B, I) \ - _mm512_mask_shldi_epi64(_mm512_setzero_hi(), (U), (A), (B), (I)) + (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_shldi_epi64((A), (B), (I)), \ + (__v8di)_mm512_setzero_si512()) -#define _mm512_shldi_epi64(A, B, I) \ - _mm512_mask_shldi_epi64(_mm512_undefined(), (__mmask8)(-1), (A), (B), (I)) +#define _mm512_shldi_epi32(A, B, I) \ + (__m512i)__builtin_ia32_vpshldd512((__v16si)(__m512i)(A), \ + (__v16si)(__m512i)(B), (int)(I)) -#define _mm512_mask_shldi_epi32(S, U, A, B, I) __extension__ ({ \ - (__m512i)__builtin_ia32_vpshldd512_mask((__v16si)(A), \ - (__v16si)(B), \ - (int)(I), \ - (__v16si)(S), \ - (__mmask16)(U)); }) +#define _mm512_mask_shldi_epi32(S, U, A, B, I) \ + (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_shldi_epi32((A), (B), (I)), \ + (__v16si)(__m512i)(S)) #define _mm512_maskz_shldi_epi32(U, A, B, I) \ - _mm512_mask_shldi_epi32(_mm512_setzero_hi(), (U), (A), (B), (I)) + (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_shldi_epi32((A), (B), (I)), \ + (__v16si)_mm512_setzero_si512()) -#define 
_mm512_shldi_epi32(A, B, I) \ - _mm512_mask_shldi_epi32(_mm512_undefined(), (__mmask16)(-1), (A), (B), (I)) +#define _mm512_shldi_epi16(A, B, I) \ + (__m512i)__builtin_ia32_vpshldw512((__v32hi)(__m512i)(A), \ + (__v32hi)(__m512i)(B), (int)(I)) -#define _mm512_mask_shldi_epi16(S, U, A, B, I) __extension__ ({ \ - (__m512i)__builtin_ia32_vpshldw512_mask((__v32hi)(A), \ - (__v32hi)(B), \ - (int)(I), \ - (__v32hi)(S), \ - (__mmask32)(U)); }) +#define _mm512_mask_shldi_epi16(S, U, A, B, I) \ + (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + (__v32hi)_mm512_shldi_epi16((A), (B), (I)), \ + (__v32hi)(__m512i)(S)) #define _mm512_maskz_shldi_epi16(U, A, B, I) \ - _mm512_mask_shldi_epi16(_mm512_setzero_hi(), (U), (A), (B), (I)) + (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + (__v32hi)_mm512_shldi_epi16((A), (B), (I)), \ + (__v32hi)_mm512_setzero_si512()) -#define _mm512_shldi_epi16(A, B, I) \ - _mm512_mask_shldi_epi16(_mm512_undefined(), (__mmask32)(-1), (A), (B), (I)) +#define _mm512_shrdi_epi64(A, B, I) \ + (__m512i)__builtin_ia32_vpshrdq512((__v8di)(__m512i)(A), \ + (__v8di)(__m512i)(B), (int)(I)) -#define _mm512_mask_shrdi_epi64(S, U, A, B, I) __extension__ ({ \ - (__m512i)__builtin_ia32_vpshrdq512_mask((__v8di)(A), \ - (__v8di)(B), \ - (int)(I), \ - (__v8di)(S), \ - (__mmask8)(U)); }) +#define _mm512_mask_shrdi_epi64(S, U, A, B, I) \ + (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_shrdi_epi64((A), (B), (I)), \ + (__v8di)(__m512i)(S)) #define _mm512_maskz_shrdi_epi64(U, A, B, I) \ - _mm512_mask_shrdi_epi64(_mm512_setzero_hi(), (U), (A), (B), (I)) + (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_shrdi_epi64((A), (B), (I)), \ + (__v8di)_mm512_setzero_si512()) -#define _mm512_shrdi_epi64(A, B, I) \ - _mm512_mask_shrdi_epi64(_mm512_undefined(), (__mmask8)(-1), (A), (B), (I)) +#define _mm512_shrdi_epi32(A, B, I) \ + (__m512i)__builtin_ia32_vpshrdd512((__v16si)(__m512i)(A), \ + (__v16si)(__m512i)(B), (int)(I)) -#define _mm512_mask_shrdi_epi32(S, U, A, B, I) __extension__ ({ \ - (__m512i)__builtin_ia32_vpshrdd512_mask((__v16si)(A), \ - (__v16si)(B), \ - (int)(I), \ - (__v16si)(S), \ - (__mmask16)(U)); }) +#define _mm512_mask_shrdi_epi32(S, U, A, B, I) \ + (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_shrdi_epi32((A), (B), (I)), \ + (__v16si)(__m512i)(S)) #define _mm512_maskz_shrdi_epi32(U, A, B, I) \ - _mm512_mask_shrdi_epi32(_mm512_setzero_hi(), (U), (A), (B), (I)) + (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_shrdi_epi32((A), (B), (I)), \ + (__v16si)_mm512_setzero_si512()) -#define _mm512_shrdi_epi32(A, B, I) \ - _mm512_mask_shrdi_epi32(_mm512_undefined(), (__mmask16)(-1), (A), (B), (I)) +#define _mm512_shrdi_epi16(A, B, I) \ + (__m512i)__builtin_ia32_vpshrdw512((__v32hi)(__m512i)(A), \ + (__v32hi)(__m512i)(B), (int)(I)) -#define _mm512_mask_shrdi_epi16(S, U, A, B, I) __extension__ ({ \ - (__m512i)__builtin_ia32_vpshrdw512_mask((__v32hi)(A), \ - (__v32hi)(B), \ - (int)(I), \ - (__v32hi)(S), \ - (__mmask32)(U)); }) +#define _mm512_mask_shrdi_epi16(S, U, A, B, I) \ + (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + (__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \ + (__v32hi)(__m512i)(S)) #define _mm512_maskz_shrdi_epi16(U, A, B, I) \ - _mm512_mask_shrdi_epi16(_mm512_setzero_hi(), (U), (A), (B), (I)) - -#define _mm512_shrdi_epi16(A, B, I) \ - _mm512_mask_shrdi_epi16(_mm512_undefined(), (__mmask32)(-1), (A), (B), (I)) + (__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + 
(__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \ + (__v32hi)_mm512_setzero_si512()) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_shldv_epi64(__m512i __S, __mmask8 __U, __m512i __A, __m512i __B) diff --git a/c_headers/avx512vbmiintrin.h b/c_headers/avx512vbmiintrin.h index 837238eda9..b6e93c2858 100644 --- a/c_headers/avx512vbmiintrin.h +++ b/c_headers/avx512vbmiintrin.h @@ -29,79 +29,65 @@ #define __VBMIINTRIN_H /* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi"))) +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi"), __min_vector_width__(512))) static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask2_permutex2var_epi8 (__m512i __A, __m512i __I, - __mmask64 __U, __m512i __B) +_mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B) { - return (__m512i) __builtin_ia32_vpermi2varqi512_mask ((__v64qi) __A, - (__v64qi) __I - /* idx */ , - (__v64qi) __B, - (__mmask64) __U); + return (__m512i)__builtin_ia32_vpermi2varqi512((__v64qi)__A, (__v64qi)__I, + (__v64qi) __B); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_permutex2var_epi8 (__m512i __A, __m512i __I, __m512i __B) +_mm512_mask_permutex2var_epi8(__m512i __A, __mmask64 __U, __m512i __I, + __m512i __B) { - return (__m512i) __builtin_ia32_vpermt2varqi512_mask ((__v64qi) __I - /* idx */ , - (__v64qi) __A, - (__v64qi) __B, - (__mmask64) -1); + return (__m512i)__builtin_ia32_selectb_512(__U, + (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B), + (__v64qi)__A); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_permutex2var_epi8 (__m512i __A, __mmask64 __U, - __m512i __I, __m512i __B) +_mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, __mmask64 __U, + __m512i __B) { - return (__m512i) __builtin_ia32_vpermt2varqi512_mask ((__v64qi) __I - /* idx */ , - (__v64qi) __A, - (__v64qi) __B, - (__mmask64) __U); + return (__m512i)__builtin_ia32_selectb_512(__U, + (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B), + (__v64qi)__I); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_permutex2var_epi8 (__mmask64 __U, __m512i __A, - __m512i __I, __m512i __B) +_mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, __m512i __I, + __m512i __B) { - return (__m512i) __builtin_ia32_vpermt2varqi512_maskz ((__v64qi) __I - /* idx */ , - (__v64qi) __A, - (__v64qi) __B, - (__mmask64) __U); + return (__m512i)__builtin_ia32_selectb_512(__U, + (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B), + (__v64qi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_permutexvar_epi8 (__m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B, - (__v64qi) __A, - (__v64qi) _mm512_undefined_epi32 (), - (__mmask64) -1); + return (__m512i)__builtin_ia32_permvarqi512((__v64qi) __B, (__v64qi) __A); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_permutexvar_epi8 (__mmask64 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B, - (__v64qi) __A, - (__v64qi) _mm512_setzero_si512(), - (__mmask64) __M); + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, + (__v64qi)_mm512_permutexvar_epi8(__A, __B), + (__v64qi)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_permutexvar_epi8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B, - (__v64qi) __A, - 
(__v64qi) __W, - (__mmask64) __M); + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, + (__v64qi)_mm512_permutexvar_epi8(__A, __B), + (__v64qi)__W); } static __inline__ __m512i __DEFAULT_FN_ATTRS diff --git a/c_headers/avx512vbmivlintrin.h b/c_headers/avx512vbmivlintrin.h index 105c6d142f..9a0400b2b5 100644 --- a/c_headers/avx512vbmivlintrin.h +++ b/c_headers/avx512vbmivlintrin.h @@ -29,161 +29,127 @@ #define __VBMIVLINTRIN_H /* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi,avx512vl"))) +#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi,avx512vl"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi,avx512vl"), __min_vector_width__(256))) -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask2_permutex2var_epi8 (__m128i __A, __m128i __I, __mmask16 __U, - __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B) { - return (__m128i) __builtin_ia32_vpermi2varqi128_mask ((__v16qi) __A, - (__v16qi) __I - /* idx */ , - (__v16qi) __B, - (__mmask16) - __U); + return (__m128i)__builtin_ia32_vpermi2varqi128((__v16qi)__A, + (__v16qi)__I, + (__v16qi)__B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask2_permutex2var_epi8 (__m256i __A, __m256i __I, - __mmask32 __U, __m256i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_permutex2var_epi8(__m128i __A, __mmask16 __U, __m128i __I, + __m128i __B) { - return (__m256i) __builtin_ia32_vpermi2varqi256_mask ((__v32qi) __A, - (__v32qi) __I - /* idx */ , - (__v32qi) __B, - (__mmask32) - __U); + return (__m128i)__builtin_ia32_selectb_128(__U, + (__v16qi)_mm_permutex2var_epi8(__A, __I, __B), + (__v16qi)__A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_permutex2var_epi8 (__m128i __A, __m128i __I, __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask2_permutex2var_epi8(__m128i __A, __m128i __I, __mmask16 __U, + __m128i __B) { - return (__m128i) __builtin_ia32_vpermt2varqi128_mask ((__v16qi) __I - /* idx */ , - (__v16qi) __A, - (__v16qi) __B, - (__mmask16) - - 1); + return (__m128i)__builtin_ia32_selectb_128(__U, + (__v16qi)_mm_permutex2var_epi8(__A, __I, __B), + (__v16qi)__I); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_permutex2var_epi8 (__m128i __A, __mmask16 __U, __m128i __I, - __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, __m128i __I, + __m128i __B) { - return (__m128i) __builtin_ia32_vpermt2varqi128_mask ((__v16qi) __I - /* idx */ , - (__v16qi) __A, - (__v16qi) __B, - (__mmask16) - __U); + return (__m128i)__builtin_ia32_selectb_128(__U, + (__v16qi)_mm_permutex2var_epi8(__A, __I, __B), + (__v16qi)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_permutex2var_epi8 (__mmask16 __U, __m128i __A, __m128i __I, - __m128i __B) +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B) { - return (__m128i) __builtin_ia32_vpermt2varqi128_maskz ((__v16qi) __I - /* idx */ , - (__v16qi) __A, - (__v16qi) __B, - (__mmask16) - __U); + return (__m256i)__builtin_ia32_vpermi2varqi256((__v32qi)__A, (__v32qi)__I, + (__v32qi)__B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_permutex2var_epi8 (__m256i __A, __m256i __I, __m256i __B) 
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_permutex2var_epi8(__m256i __A, __mmask32 __U, __m256i __I, + __m256i __B) { - return (__m256i) __builtin_ia32_vpermt2varqi256_mask ((__v32qi) __I - /* idx */ , - (__v32qi) __A, - (__v32qi) __B, - (__mmask32) - - 1); + return (__m256i)__builtin_ia32_selectb_256(__U, + (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B), + (__v32qi)__A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_permutex2var_epi8 (__m256i __A, __mmask32 __U, - __m256i __I, __m256i __B) +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask2_permutex2var_epi8(__m256i __A, __m256i __I, __mmask32 __U, + __m256i __B) { - return (__m256i) __builtin_ia32_vpermt2varqi256_mask ((__v32qi) __I - /* idx */ , - (__v32qi) __A, - (__v32qi) __B, - (__mmask32) - __U); + return (__m256i)__builtin_ia32_selectb_256(__U, + (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B), + (__v32qi)__I); } -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_permutex2var_epi8 (__mmask32 __U, __m256i __A, - __m256i __I, __m256i __B) +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, __m256i __I, + __m256i __B) { - return (__m256i) __builtin_ia32_vpermt2varqi256_maskz ((__v32qi) __I - /* idx */ , - (__v32qi) __A, - (__v32qi) __B, - (__mmask32) - __U); + return (__m256i)__builtin_ia32_selectb_256(__U, + (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B), + (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_permutexvar_epi8 (__m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B, - (__v16qi) __A, - (__v16qi) _mm_undefined_si128 (), - (__mmask16) -1); + return (__m128i)__builtin_ia32_permvarqi128((__v16qi)__B, (__v16qi)__A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_permutexvar_epi8 (__mmask16 __M, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B, - (__v16qi) __A, - (__v16qi) _mm_setzero_si128 (), - (__mmask16) __M); + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, + (__v16qi)_mm_permutexvar_epi8(__A, __B), + (__v16qi)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_permutexvar_epi8 (__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B, - (__v16qi) __A, - (__v16qi) __W, - (__mmask16) __M); + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, + (__v16qi)_mm_permutexvar_epi8(__A, __B), + (__v16qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_permutexvar_epi8 (__m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B, - (__v32qi) __A, - (__v32qi) _mm256_undefined_si256 (), - (__mmask32) -1); + return (__m256i)__builtin_ia32_permvarqi256((__v32qi) __B, (__v32qi) __A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_permutexvar_epi8 (__mmask32 __M, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B, - (__v32qi) __A, - (__v32qi) _mm256_setzero_si256 (), - (__mmask32) __M); + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, + (__v32qi)_mm256_permutexvar_epi8(__A, __B), + (__v32qi)_mm256_setzero_si256()); } -static 
__inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_permutexvar_epi8 (__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B, - (__v32qi) __A, - (__v32qi) __W, - (__mmask32) __M); + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, + (__v32qi)_mm256_permutexvar_epi8(__A, __B), + (__v32qi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_multishift_epi64_epi8 (__m128i __W, __mmask16 __M, __m128i __X, __m128i __Y) { return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X, @@ -192,7 +158,7 @@ _mm_mask_multishift_epi64_epi8 (__m128i __W, __mmask16 __M, __m128i __X, __m128i (__mmask16) __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_multishift_epi64_epi8 (__mmask16 __M, __m128i __X, __m128i __Y) { return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X, @@ -202,7 +168,7 @@ _mm_maskz_multishift_epi64_epi8 (__mmask16 __M, __m128i __X, __m128i __Y) (__mmask16) __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_multishift_epi64_epi8 (__m128i __X, __m128i __Y) { return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X, @@ -212,7 +178,7 @@ _mm_multishift_epi64_epi8 (__m128i __X, __m128i __Y) (__mmask16) -1); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_multishift_epi64_epi8 (__m256i __W, __mmask32 __M, __m256i __X, __m256i __Y) { return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X, @@ -221,7 +187,7 @@ _mm256_mask_multishift_epi64_epi8 (__m256i __W, __mmask32 __M, __m256i __X, __m2 (__mmask32) __M); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_multishift_epi64_epi8 (__mmask32 __M, __m256i __X, __m256i __Y) { return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X, @@ -231,7 +197,7 @@ _mm256_maskz_multishift_epi64_epi8 (__mmask32 __M, __m256i __X, __m256i __Y) (__mmask32) __M); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_multishift_epi64_epi8 (__m256i __X, __m256i __Y) { return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X, @@ -242,6 +208,7 @@ _mm256_multishift_epi64_epi8 (__m256i __X, __m256i __Y) } -#undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 #endif diff --git a/c_headers/avx512vlbitalgintrin.h b/c_headers/avx512vlbitalgintrin.h index 76eb87721b..64860b2925 100644 --- a/c_headers/avx512vlbitalgintrin.h +++ b/c_headers/avx512vlbitalgintrin.h @@ -1,4 +1,4 @@ -/*===------------- avx512vlbitalgintrin.h - BITALG intrinsics ------------------=== +/*===---- avx512vlbitalgintrin.h - BITALG intrinsics -----------------------=== * * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -29,15 +29,16 @@ #define __AVX512VLBITALGINTRIN_H /* Define the default attributes for the functions in this file. 
*/ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bitalg"))) +#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bitalg"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bitalg"), __min_vector_width__(256))) -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_popcnt_epi16(__m256i __A) { return (__m256i) __builtin_ia32_vpopcntw_256((__v16hi) __A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_popcnt_epi16(__m256i __A, __mmask16 __U, __m256i __B) { return (__m256i) __builtin_ia32_selectw_256((__mmask16) __U, @@ -45,7 +46,7 @@ _mm256_mask_popcnt_epi16(__m256i __A, __mmask16 __U, __m256i __B) (__v16hi) __A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __B) { return _mm256_mask_popcnt_epi16((__m256i) _mm256_setzero_si256(), @@ -53,35 +54,35 @@ _mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __B) __B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_popcnt_epi16(__m128i __A) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_popcnt_epi16(__m128i __A) { return (__m128i) __builtin_ia32_vpopcntw_128((__v8hi) __A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B) { return (__m128i) __builtin_ia32_selectw_128((__mmask8) __U, - (__v8hi) _mm128_popcnt_epi16(__B), + (__v8hi) _mm_popcnt_epi16(__B), (__v8hi) __A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_maskz_popcnt_epi16(__mmask8 __U, __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_popcnt_epi16(__mmask8 __U, __m128i __B) { - return _mm128_mask_popcnt_epi16((__m128i) _mm_setzero_si128(), + return _mm_mask_popcnt_epi16((__m128i) _mm_setzero_si128(), __U, __B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_popcnt_epi8(__m256i __A) { return (__m256i) __builtin_ia32_vpopcntb_256((__v32qi) __A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_popcnt_epi8(__m256i __A, __mmask32 __U, __m256i __B) { return (__m256i) __builtin_ia32_selectb_256((__mmask32) __U, @@ -89,7 +90,7 @@ _mm256_mask_popcnt_epi8(__m256i __A, __mmask32 __U, __m256i __B) (__v32qi) __A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __B) { return _mm256_mask_popcnt_epi8((__m256i) _mm256_setzero_si256(), @@ -97,61 +98,62 @@ _mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __B) __B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_popcnt_epi8(__m128i __A) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_popcnt_epi8(__m128i __A) { return (__m128i) __builtin_ia32_vpopcntb_128((__v16qi) __A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B) { return (__m128i) __builtin_ia32_selectb_128((__mmask16) __U, - (__v16qi) _mm128_popcnt_epi8(__B), + (__v16qi) _mm_popcnt_epi8(__B), 
(__v16qi) __A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_maskz_popcnt_epi8(__mmask16 __U, __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_popcnt_epi8(__mmask16 __U, __m128i __B) { - return _mm128_mask_popcnt_epi8((__m128i) _mm_setzero_si128(), + return _mm_mask_popcnt_epi8((__m128i) _mm_setzero_si128(), __U, __B); } -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_mask_bitshuffle_epi32_mask(__mmask32 __U, __m256i __A, __m256i __B) +static __inline__ __mmask32 __DEFAULT_FN_ATTRS256 +_mm256_mask_bitshuffle_epi64_mask(__mmask32 __U, __m256i __A, __m256i __B) { return (__mmask32) __builtin_ia32_vpshufbitqmb256_mask((__v32qi) __A, (__v32qi) __B, __U); } -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm256_bitshuffle_epi32_mask(__m256i __A, __m256i __B) +static __inline__ __mmask32 __DEFAULT_FN_ATTRS256 +_mm256_bitshuffle_epi64_mask(__m256i __A, __m256i __B) { - return _mm256_mask_bitshuffle_epi32_mask((__mmask32) -1, + return _mm256_mask_bitshuffle_epi64_mask((__mmask32) -1, __A, __B); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm128_mask_bitshuffle_epi16_mask(__mmask16 __U, __m128i __A, __m128i __B) +static __inline__ __mmask16 __DEFAULT_FN_ATTRS128 +_mm_mask_bitshuffle_epi64_mask(__mmask16 __U, __m128i __A, __m128i __B) { return (__mmask16) __builtin_ia32_vpshufbitqmb128_mask((__v16qi) __A, (__v16qi) __B, __U); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm128_bitshuffle_epi16_mask(__m128i __A, __m128i __B) +static __inline__ __mmask16 __DEFAULT_FN_ATTRS128 +_mm_bitshuffle_epi64_mask(__m128i __A, __m128i __B) { - return _mm128_mask_bitshuffle_epi16_mask((__mmask16) -1, + return _mm_mask_bitshuffle_epi64_mask((__mmask16) -1, __A, __B); } -#undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 #endif diff --git a/c_headers/avx512vlbwintrin.h b/c_headers/avx512vlbwintrin.h index e940e2b685..1b038dd04d 100644 --- a/c_headers/avx512vlbwintrin.h +++ b/c_headers/avx512vlbwintrin.h @@ -29,94 +29,90 @@ #define __AVX512VLBWINTRIN_H /* Define the default attributes for the functions in this file. 
*/ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bw"))) - -static __inline __m128i __DEFAULT_FN_ATTRS -_mm_setzero_hi(void){ - return (__m128i)(__v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 }; -} +#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bw"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bw"), __min_vector_width__(256))) /* Integer compare */ -#define _mm_cmp_epi8_mask(a, b, p) __extension__ ({ \ +#define _mm_cmp_epi8_mask(a, b, p) \ (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \ (__v16qi)(__m128i)(b), (int)(p), \ - (__mmask16)-1); }) + (__mmask16)-1) -#define _mm_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \ +#define _mm_mask_cmp_epi8_mask(m, a, b, p) \ (__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \ (__v16qi)(__m128i)(b), (int)(p), \ - (__mmask16)(m)); }) + (__mmask16)(m)) -#define _mm_cmp_epu8_mask(a, b, p) __extension__ ({ \ +#define _mm_cmp_epu8_mask(a, b, p) \ (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \ (__v16qi)(__m128i)(b), (int)(p), \ - (__mmask16)-1); }) + (__mmask16)-1) -#define _mm_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \ +#define _mm_mask_cmp_epu8_mask(m, a, b, p) \ (__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \ (__v16qi)(__m128i)(b), (int)(p), \ - (__mmask16)(m)); }) + (__mmask16)(m)) -#define _mm256_cmp_epi8_mask(a, b, p) __extension__ ({ \ +#define _mm256_cmp_epi8_mask(a, b, p) \ (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \ (__v32qi)(__m256i)(b), (int)(p), \ - (__mmask32)-1); }) + (__mmask32)-1) -#define _mm256_mask_cmp_epi8_mask(m, a, b, p) __extension__ ({ \ +#define _mm256_mask_cmp_epi8_mask(m, a, b, p) \ (__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \ (__v32qi)(__m256i)(b), (int)(p), \ - (__mmask32)(m)); }) + (__mmask32)(m)) -#define _mm256_cmp_epu8_mask(a, b, p) __extension__ ({ \ +#define _mm256_cmp_epu8_mask(a, b, p) \ (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \ (__v32qi)(__m256i)(b), (int)(p), \ - (__mmask32)-1); }) + (__mmask32)-1) -#define _mm256_mask_cmp_epu8_mask(m, a, b, p) __extension__ ({ \ +#define _mm256_mask_cmp_epu8_mask(m, a, b, p) \ (__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \ (__v32qi)(__m256i)(b), (int)(p), \ - (__mmask32)(m)); }) + (__mmask32)(m)) -#define _mm_cmp_epi16_mask(a, b, p) __extension__ ({ \ +#define _mm_cmp_epi16_mask(a, b, p) \ (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \ (__v8hi)(__m128i)(b), (int)(p), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \ +#define _mm_mask_cmp_epi16_mask(m, a, b, p) \ (__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \ (__v8hi)(__m128i)(b), (int)(p), \ - (__mmask8)(m)); }) + (__mmask8)(m)) -#define _mm_cmp_epu16_mask(a, b, p) __extension__ ({ \ +#define _mm_cmp_epu16_mask(a, b, p) \ (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \ (__v8hi)(__m128i)(b), (int)(p), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \ +#define _mm_mask_cmp_epu16_mask(m, a, b, p) \ (__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \ (__v8hi)(__m128i)(b), (int)(p), \ - (__mmask8)(m)); }) + (__mmask8)(m)) -#define _mm256_cmp_epi16_mask(a, b, p) __extension__ ({ \ +#define _mm256_cmp_epi16_mask(a, b, p) \ 
(__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \ (__v16hi)(__m256i)(b), (int)(p), \ - (__mmask16)-1); }) + (__mmask16)-1) -#define _mm256_mask_cmp_epi16_mask(m, a, b, p) __extension__ ({ \ +#define _mm256_mask_cmp_epi16_mask(m, a, b, p) \ (__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \ (__v16hi)(__m256i)(b), (int)(p), \ - (__mmask16)(m)); }) + (__mmask16)(m)) -#define _mm256_cmp_epu16_mask(a, b, p) __extension__ ({ \ +#define _mm256_cmp_epu16_mask(a, b, p) \ (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \ (__v16hi)(__m256i)(b), (int)(p), \ - (__mmask16)-1); }) + (__mmask16)-1) -#define _mm256_mask_cmp_epu16_mask(m, a, b, p) __extension__ ({ \ +#define _mm256_mask_cmp_epu16_mask(m, a, b, p) \ (__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \ (__v16hi)(__m256i)(b), (int)(p), \ - (__mmask16)(m)); }) + (__mmask16)(m)) #define _mm_cmpeq_epi8_mask(A, B) \ _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ) @@ -318,147 +314,147 @@ _mm_setzero_hi(void){ #define _mm256_mask_cmpneq_epu16_mask(k, A, B) \ _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_NE) -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_add_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B){ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, (__v32qi)_mm256_add_epi8(__A, __B), (__v32qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_add_epi8(__mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, (__v32qi)_mm256_add_epi8(__A, __B), (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_add_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, (__v16hi)_mm256_add_epi16(__A, __B), (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_add_epi16(__mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, (__v16hi)_mm256_add_epi16(__A, __B), (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_sub_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, (__v32qi)_mm256_sub_epi8(__A, __B), (__v32qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_sub_epi8(__mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, (__v32qi)_mm256_sub_epi8(__A, __B), (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_sub_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, (__v16hi)_mm256_sub_epi16(__A, __B), (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_sub_epi16(__mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, (__v16hi)_mm256_sub_epi16(__A, __B), (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_add_epi8(__m128i __W, 
__mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, (__v16qi)_mm_add_epi8(__A, __B), (__v16qi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_add_epi8(__mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, (__v16qi)_mm_add_epi8(__A, __B), (__v16qi)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_add_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_add_epi16(__A, __B), (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_add_epi16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_add_epi16(__A, __B), (__v8hi)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_sub_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, (__v16qi)_mm_sub_epi8(__A, __B), (__v16qi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_sub_epi8(__mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, (__v16qi)_mm_sub_epi8(__A, __B), (__v16qi)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_sub_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_sub_epi16(__A, __B), (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_sub_epi16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_sub_epi16(__A, __B), (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_mullo_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, (__v16hi)_mm256_mullo_epi16(__A, __B), (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_mullo_epi16(__mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, (__v16hi)_mm256_mullo_epi16(__A, __B), (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_mullo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_mullo_epi16(__A, __B), (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_mullo_epi16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_mullo_epi16(__A, __B), (__v8hi)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_blend_epi8 (__mmask16 __U, __m128i __A, __m128i __W) { return (__m128i) __builtin_ia32_selectb_128 ((__mmask16) __U, @@ -466,7 +462,7 @@ _mm_mask_blend_epi8 (__mmask16 __U, __m128i __A, __m128i 
__W) (__v16qi) __A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_blend_epi8 (__mmask32 __U, __m256i __A, __m256i __W) { return (__m256i) __builtin_ia32_selectb_256 ((__mmask32) __U, @@ -474,7 +470,7 @@ _mm256_mask_blend_epi8 (__mmask32 __U, __m256i __A, __m256i __W) (__v32qi) __A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_blend_epi16 (__mmask8 __U, __m128i __A, __m128i __W) { return (__m128i) __builtin_ia32_selectw_128 ((__mmask8) __U, @@ -482,7 +478,7 @@ _mm_mask_blend_epi16 (__mmask8 __U, __m128i __A, __m128i __W) (__v8hi) __A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_blend_epi16 (__mmask16 __U, __m256i __A, __m256i __W) { return (__m256i) __builtin_ia32_selectw_256 ((__mmask16) __U, @@ -490,7 +486,7 @@ _mm256_mask_blend_epi16 (__mmask16 __U, __m256i __A, __m256i __W) (__v16hi) __A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_abs_epi8(__m128i __W, __mmask16 __U, __m128i __A) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, @@ -498,7 +494,7 @@ _mm_mask_abs_epi8(__m128i __W, __mmask16 __U, __m128i __A) (__v16qi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_abs_epi8(__mmask16 __U, __m128i __A) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, @@ -506,7 +502,7 @@ _mm_maskz_abs_epi8(__mmask16 __U, __m128i __A) (__v16qi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_abs_epi8(__m256i __W, __mmask32 __U, __m256i __A) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, @@ -514,7 +510,7 @@ _mm256_mask_abs_epi8(__m256i __W, __mmask32 __U, __m256i __A) (__v32qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_abs_epi8 (__mmask32 __U, __m256i __A) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, @@ -522,7 +518,7 @@ _mm256_maskz_abs_epi8 (__mmask32 __U, __m256i __A) (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_abs_epi16(__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -530,7 +526,7 @@ _mm_mask_abs_epi16(__m128i __W, __mmask8 __U, __m128i __A) (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_abs_epi16(__mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -538,7 +534,7 @@ _mm_maskz_abs_epi16(__mmask8 __U, __m128i __A) (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_abs_epi16(__m256i __W, __mmask16 __U, __m256i __A) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -546,7 +542,7 @@ _mm256_mask_abs_epi16(__m256i __W, __mmask16 __U, __m256i __A) (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_abs_epi16(__mmask16 __U, __m256i __A) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -554,22 +550,22 @@ _mm256_maskz_abs_epi16(__mmask16 __U, __m256i __A) (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ 
__m128i __DEFAULT_FN_ATTRS128 _mm_maskz_packs_epi32(__mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, (__v8hi)_mm_packs_epi32(__A, __B), (__v8hi)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_packs_epi32(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_packs_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, (__v8hi)_mm_packs_epi32(__A, __B), (__v8hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_packs_epi32(__mmask16 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, @@ -577,7 +573,7 @@ _mm256_maskz_packs_epi32(__mmask16 __M, __m256i __A, __m256i __B) (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_packs_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, @@ -585,7 +581,7 @@ _mm256_mask_packs_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) (__v16hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_packs_epi16(__mmask16 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, @@ -593,7 +589,7 @@ _mm_maskz_packs_epi16(__mmask16 __M, __m128i __A, __m128i __B) (__v16qi)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_packs_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, @@ -601,7 +597,7 @@ _mm_mask_packs_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) (__v16qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_packs_epi16(__mmask32 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, @@ -609,7 +605,7 @@ _mm256_maskz_packs_epi16(__mmask32 __M, __m256i __A, __m256i __B) (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_packs_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, @@ -617,7 +613,7 @@ _mm256_mask_packs_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) (__v32qi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_packus_epi32(__mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, @@ -625,15 +621,15 @@ _mm_maskz_packus_epi32(__mmask8 __M, __m128i __A, __m128i __B) (__v8hi)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_packus_epi32(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_packus_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, (__v8hi)_mm_packus_epi32(__A, __B), (__v8hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, __m256i __B) { return 
(__m256i)__builtin_ia32_selectw_256((__mmask16)__M, @@ -641,7 +637,7 @@ _mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, __m256i __B) (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, @@ -649,7 +645,7 @@ _mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) (__v16hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, @@ -657,7 +653,7 @@ _mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, __m128i __B) (__v16qi)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, @@ -665,7 +661,7 @@ _mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) (__v16qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, @@ -673,7 +669,7 @@ _mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, __m256i __B) (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, @@ -681,7 +677,7 @@ _mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) (__v32qi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, @@ -689,7 +685,7 @@ _mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) (__v16qi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_adds_epi8(__mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, @@ -697,7 +693,7 @@ _mm_maskz_adds_epi8(__mmask16 __U, __m128i __A, __m128i __B) (__v16qi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_adds_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, @@ -705,7 +701,7 @@ _mm256_mask_adds_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) (__v32qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_adds_epi8(__mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, @@ -713,7 +709,7 @@ _mm256_maskz_adds_epi8(__mmask32 __U, __m256i __A, __m256i __B) (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_adds_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -721,7 +717,7 @@ _mm_mask_adds_epi16(__m128i __W, __mmask8 __U, 
__m128i __A, __m128i __B) (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_adds_epi16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -729,7 +725,7 @@ _mm_maskz_adds_epi16(__mmask8 __U, __m128i __A, __m128i __B) (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_adds_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -737,7 +733,7 @@ _mm256_mask_adds_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_adds_epi16(__mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -745,7 +741,7 @@ _mm256_maskz_adds_epi16(__mmask16 __U, __m256i __A, __m256i __B) (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, @@ -753,7 +749,7 @@ _mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) (__v16qi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, @@ -761,7 +757,7 @@ _mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B) (__v16qi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, @@ -769,7 +765,7 @@ _mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) (__v32qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, @@ -777,7 +773,7 @@ _mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, __m256i __B) (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -785,7 +781,7 @@ _mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_adds_epu16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -793,7 +789,7 @@ _mm_maskz_adds_epu16(__mmask8 __U, __m128i __A, __m128i __B) (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -801,7 +797,7 @@ _mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 
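/*
 * The changes in this stretch drop the GNU statement-expression wrappers from
 * the _mm*_cmp_ep*_mask macros and route the masked add/adds helpers through
 * __builtin_ia32_select*; callers keep the same API. A short sketch that
 * combines a compare-to-mask with a merge-masked saturating add, assuming
 * AVX512VL+AVX512BW (e.g. -mavx512vl -mavx512bw); the threshold and step
 * values are made up for illustration.
 */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
  unsigned char src[32], out[32];
  for (int i = 0; i < 32; ++i)
    src[i] = (unsigned char)(i * 8);          /* 0, 8, ..., 248 */

  __m256i v     = _mm256_loadu_si256((const __m256i *)src);
  __m256i limit = _mm256_set1_epi8((char)200);
  __m256i step  = _mm256_set1_epi8(100);

  /* k has a bit set for every unsigned byte strictly below 200. */
  __mmask32 k = _mm256_cmplt_epu8_mask(v, limit);

  /* Saturating add only in the selected lanes; the rest pass through from v. */
  __m256i r = _mm256_mask_adds_epu8(v, k, v, step);
  _mm256_storeu_si256((__m256i *)out, r);

  for (int i = 0; i < 32; ++i)
    printf("%d%c", out[i], i == 31 ? '\n' : ' ');
  return 0;
}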
_mm256_maskz_adds_epu16(__mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -809,7 +805,7 @@ _mm256_maskz_adds_epu16(__mmask16 __U, __m256i __A, __m256i __B) (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_avg_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, @@ -817,7 +813,7 @@ _mm_mask_avg_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) (__v16qi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_avg_epu8(__mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, @@ -825,7 +821,7 @@ _mm_maskz_avg_epu8(__mmask16 __U, __m128i __A, __m128i __B) (__v16qi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_avg_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, @@ -833,7 +829,7 @@ _mm256_mask_avg_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) (__v32qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_avg_epu8(__mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, @@ -841,7 +837,7 @@ _mm256_maskz_avg_epu8(__mmask32 __U, __m256i __A, __m256i __B) (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_avg_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -849,7 +845,7 @@ _mm_mask_avg_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_avg_epu16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -857,7 +853,7 @@ _mm_maskz_avg_epu16(__mmask8 __U, __m128i __A, __m128i __B) (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_avg_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -865,7 +861,7 @@ _mm256_mask_avg_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_avg_epu16(__mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -873,7 +869,7 @@ _mm256_maskz_avg_epu16(__mmask16 __U, __m256i __A, __m256i __B) (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_max_epi8(__mmask16 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, @@ -881,7 +877,7 @@ _mm_maskz_max_epi8(__mmask16 __M, __m128i __A, __m128i __B) (__v16qi)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_max_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, @@ -889,7 +885,7 @@ _mm_mask_max_epi8(__m128i __W, 
__mmask16 __M, __m128i __A, __m128i __B) (__v16qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_max_epi8(__mmask32 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, @@ -897,7 +893,7 @@ _mm256_maskz_max_epi8(__mmask32 __M, __m256i __A, __m256i __B) (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_max_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, @@ -905,7 +901,7 @@ _mm256_mask_max_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) (__v32qi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_max_epi16(__mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, @@ -913,7 +909,7 @@ _mm_maskz_max_epi16(__mmask8 __M, __m128i __A, __m128i __B) (__v8hi)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_max_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, @@ -921,7 +917,7 @@ _mm_mask_max_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) (__v8hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_max_epi16(__mmask16 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, @@ -929,7 +925,7 @@ _mm256_maskz_max_epi16(__mmask16 __M, __m256i __A, __m256i __B) (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_max_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, @@ -937,7 +933,7 @@ _mm256_mask_max_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) (__v16hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_max_epu8(__mmask16 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, @@ -945,7 +941,7 @@ _mm_maskz_max_epu8(__mmask16 __M, __m128i __A, __m128i __B) (__v16qi)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_max_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, @@ -953,7 +949,7 @@ _mm_mask_max_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) (__v16qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_max_epu8 (__mmask32 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, @@ -961,7 +957,7 @@ _mm256_maskz_max_epu8 (__mmask32 __M, __m256i __A, __m256i __B) (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_max_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, @@ -969,7 +965,7 @@ _mm256_mask_max_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) (__v32qi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 
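/*
 * The only change through the min/max helpers here is retagging them with
 * __DEFAULT_FN_ATTRS128/__DEFAULT_FN_ATTRS256 so each carries a matching
 * __min_vector_width__; the masking semantics are untouched. A short sketch
 * contrasting zero-masking (maskz_*) with merge-masking (mask_*), assuming
 * AVX512VL+AVX512BW; the vectors are arbitrary sample data.
 */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m128i a = _mm_setr_epi16(1, -2, 3, -4, 5, -6, 7, -8);
  __m128i b = _mm_setzero_si128();
  __mmask8 k = 0x0F;                       /* operate on the low four lanes only */

  /* Zero-masking: unselected lanes become 0. */
  __m128i z = _mm_maskz_max_epi16(k, a, b);
  /* Merge-masking: unselected lanes keep the passthrough operand (here, a). */
  __m128i m = _mm_mask_max_epi16(a, k, a, b);

  short zs[8], ms[8];
  _mm_storeu_si128((__m128i *)zs, z);
  _mm_storeu_si128((__m128i *)ms, m);
  for (int i = 0; i < 8; ++i)
    printf("%d/%d ", zs[i], ms[i]);
  printf("\n");
  return 0;
}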
_mm_maskz_max_epu16(__mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, @@ -977,7 +973,7 @@ _mm_maskz_max_epu16(__mmask8 __M, __m128i __A, __m128i __B) (__v8hi)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_max_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, @@ -985,7 +981,7 @@ _mm_mask_max_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) (__v8hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_max_epu16(__mmask16 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, @@ -993,7 +989,7 @@ _mm256_maskz_max_epu16(__mmask16 __M, __m256i __A, __m256i __B) (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_max_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, @@ -1001,7 +997,7 @@ _mm256_mask_max_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) (__v16hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_min_epi8(__mmask16 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, @@ -1009,7 +1005,7 @@ _mm_maskz_min_epi8(__mmask16 __M, __m128i __A, __m128i __B) (__v16qi)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_min_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, @@ -1017,7 +1013,7 @@ _mm_mask_min_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) (__v16qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_min_epi8(__mmask32 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, @@ -1025,7 +1021,7 @@ _mm256_maskz_min_epi8(__mmask32 __M, __m256i __A, __m256i __B) (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_min_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, @@ -1033,7 +1029,7 @@ _mm256_mask_min_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) (__v32qi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_min_epi16(__mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, @@ -1041,7 +1037,7 @@ _mm_maskz_min_epi16(__mmask8 __M, __m128i __A, __m128i __B) (__v8hi)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_min_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, @@ -1049,7 +1045,7 @@ _mm_mask_min_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) (__v8hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_min_epi16(__mmask16 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, @@ -1057,7 +1053,7 @@ _mm256_maskz_min_epi16(__mmask16 __M, 
__m256i __A, __m256i __B) (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_min_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, @@ -1065,7 +1061,7 @@ _mm256_mask_min_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) (__v16hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_min_epu8(__mmask16 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, @@ -1073,7 +1069,7 @@ _mm_maskz_min_epu8(__mmask16 __M, __m128i __A, __m128i __B) (__v16qi)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_min_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, @@ -1081,7 +1077,7 @@ _mm_mask_min_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) (__v16qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_min_epu8 (__mmask32 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, @@ -1089,7 +1085,7 @@ _mm256_maskz_min_epu8 (__mmask32 __M, __m256i __A, __m256i __B) (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_min_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, @@ -1097,7 +1093,7 @@ _mm256_mask_min_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) (__v32qi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_min_epu16(__mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, @@ -1105,7 +1101,7 @@ _mm_maskz_min_epu16(__mmask8 __M, __m128i __A, __m128i __B) (__v8hi)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_min_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, @@ -1113,7 +1109,7 @@ _mm_mask_min_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) (__v8hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_min_epu16(__mmask16 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, @@ -1121,7 +1117,7 @@ _mm256_maskz_min_epu16(__mmask16 __M, __m256i __A, __m256i __B) (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_min_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, @@ -1129,7 +1125,7 @@ _mm256_mask_min_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) (__v16hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, @@ -1137,7 +1133,7 @@ _mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) (__v16qi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i 
__DEFAULT_FN_ATTRS128 _mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, @@ -1145,7 +1141,7 @@ _mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, __m128i __B) (__v16qi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, @@ -1153,7 +1149,7 @@ _mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) (__v32qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, @@ -1161,7 +1157,7 @@ _mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B) (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, @@ -1169,7 +1165,7 @@ _mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) (__v16qi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_subs_epi8(__mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, @@ -1177,7 +1173,7 @@ _mm_maskz_subs_epi8(__mmask16 __U, __m128i __A, __m128i __B) (__v16qi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_subs_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, @@ -1185,7 +1181,7 @@ _mm256_mask_subs_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) (__v32qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_subs_epi8(__mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, @@ -1193,7 +1189,7 @@ _mm256_maskz_subs_epi8(__mmask32 __U, __m256i __A, __m256i __B) (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_subs_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -1201,7 +1197,7 @@ _mm_mask_subs_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_subs_epi16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -1209,7 +1205,7 @@ _mm_maskz_subs_epi16(__mmask8 __U, __m128i __A, __m128i __B) (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_subs_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -1217,7 +1213,7 @@ _mm256_mask_subs_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_subs_epi16(__mmask16 __U, __m256i __A, __m256i __B) { return 
(__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -1225,7 +1221,7 @@ _mm256_maskz_subs_epi16(__mmask16 __U, __m256i __A, __m256i __B) (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, @@ -1233,7 +1229,7 @@ _mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) (__v16qi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, @@ -1241,7 +1237,7 @@ _mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B) (__v16qi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, @@ -1249,7 +1245,7 @@ _mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) (__v32qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, @@ -1257,7 +1253,7 @@ _mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, __m256i __B) (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -1265,7 +1261,7 @@ _mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -1273,7 +1269,7 @@ _mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B) (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -1281,7 +1277,7 @@ _mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A, (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -1289,99 +1285,89 @@ _mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B) (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask2_permutex2var_epi16 (__m128i __A, __m128i __I, __mmask8 __U, - __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_permutex2var_epi16(__m128i __A, __m128i __I, __m128i __B) { - return (__m128i) __builtin_ia32_vpermi2varhi128_mask ((__v8hi) __A, - (__v8hi) __I /* idx */ , - (__v8hi) __B, - (__mmask8) __U); + return (__m128i)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I, + (__v8hi) __B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask2_permutex2var_epi16 (__m256i __A, __m256i __I, - __mmask16 __U, __m256i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 
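/*
 * The hunk that begins here (and continues below) gives avx512vlbwintrin.h an
 * unmasked _mm_permutex2var_epi16 built on __builtin_ia32_vpermi2varhi128,
 * with the mask/mask2/maskz forms layered on top via selectw, replacing the
 * old vpermt2var/vpermi2var mask builtins. A sketch of the basic two-table
 * word lookup, assuming AVX512VL+AVX512BW; the interleave pattern is just an
 * example.
 */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
  __m128i lo  = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
  __m128i hi  = _mm_setr_epi16(100, 101, 102, 103, 104, 105, 106, 107);
  /* For 8-element tables, index bits [2:0] pick the element and bit 3 picks
     the table, so 8..15 read from hi. */
  __m128i idx = _mm_setr_epi16(0, 8, 1, 9, 2, 10, 3, 11);

  __m128i r = _mm_permutex2var_epi16(lo, idx, hi);   /* interleaves lo and hi */

  short out[8];
  _mm_storeu_si128((__m128i *)out, r);
  for (int i = 0; i < 8; ++i)
    printf("%d ", out[i]);
  printf("\n");                                      /* 0 100 1 101 2 102 3 103 */
  return 0;
}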
+_mm_mask_permutex2var_epi16(__m128i __A, __mmask8 __U, __m128i __I, + __m128i __B) { - return (__m256i) __builtin_ia32_vpermi2varhi256_mask ((__v16hi) __A, - (__v16hi) __I /* idx */ , - (__v16hi) __B, - (__mmask16) __U); + return (__m128i)__builtin_ia32_selectw_128(__U, + (__v8hi)_mm_permutex2var_epi16(__A, __I, __B), + (__v8hi)__A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_permutex2var_epi16 (__m128i __A, __m128i __I, __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask2_permutex2var_epi16(__m128i __A, __m128i __I, __mmask8 __U, + __m128i __B) { - return (__m128i) __builtin_ia32_vpermt2varhi128_mask ((__v8hi) __I/* idx */, - (__v8hi) __A, - (__v8hi) __B, - (__mmask8) -1); + return (__m128i)__builtin_ia32_selectw_128(__U, + (__v8hi)_mm_permutex2var_epi16(__A, __I, __B), + (__v8hi)__I); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_permutex2var_epi16 (__m128i __A, __mmask8 __U, __m128i __I, - __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_permutex2var_epi16 (__mmask8 __U, __m128i __A, __m128i __I, + __m128i __B) { - return (__m128i) __builtin_ia32_vpermt2varhi128_mask ((__v8hi) __I/* idx */, - (__v8hi) __A, - (__v8hi) __B, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectw_128(__U, + (__v8hi)_mm_permutex2var_epi16(__A, __I, __B), + (__v8hi)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_permutex2var_epi16 (__mmask8 __U, __m128i __A, __m128i __I, - __m128i __B) +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_permutex2var_epi16(__m256i __A, __m256i __I, __m256i __B) { - return (__m128i) __builtin_ia32_vpermt2varhi128_maskz ((__v8hi) __I/* idx */, - (__v8hi) __A, - (__v8hi) __B, - (__mmask8) __U); + return (__m256i)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I, + (__v16hi)__B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_permutex2var_epi16 (__m256i __A, __m256i __I, __m256i __B) +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_permutex2var_epi16(__m256i __A, __mmask16 __U, __m256i __I, + __m256i __B) { - return (__m256i) __builtin_ia32_vpermt2varhi256_mask ((__v16hi) __I/* idx */, - (__v16hi) __A, - (__v16hi) __B, - (__mmask16) -1); + return (__m256i)__builtin_ia32_selectw_256(__U, + (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B), + (__v16hi)__A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_permutex2var_epi16 (__m256i __A, __mmask16 __U, - __m256i __I, __m256i __B) +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask2_permutex2var_epi16(__m256i __A, __m256i __I, __mmask16 __U, + __m256i __B) { - return (__m256i) __builtin_ia32_vpermt2varhi256_mask ((__v16hi) __I/* idx */, - (__v16hi) __A, - (__v16hi) __B, - (__mmask16) __U); + return (__m256i)__builtin_ia32_selectw_256(__U, + (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B), + (__v16hi)__I); } -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_permutex2var_epi16 (__mmask16 __U, __m256i __A, - __m256i __I, __m256i __B) +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_permutex2var_epi16 (__mmask16 __U, __m256i __A, __m256i __I, + __m256i __B) { - return (__m256i) __builtin_ia32_vpermt2varhi256_maskz ((__v16hi) __I/* idx */, - (__v16hi) __A, - (__v16hi) __B, - (__mmask16) __U); + return (__m256i)__builtin_ia32_selectw_256(__U, + (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B), + (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_maddubs_epi16(__m128i 
__W, __mmask8 __U, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_maddubs_epi16(__X, __Y), (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_maddubs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_maddubs_epi16(__X, __Y), (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_maddubs_epi16(__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -1389,402 +1375,400 @@ _mm256_mask_maddubs_epi16(__m256i __W, __mmask16 __U, __m256i __X, (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_maddubs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, (__v16hi)_mm256_maddubs_epi16(__X, __Y), (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_madd_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, (__v4si)_mm_madd_epi16(__A, __B), (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_madd_epi16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, (__v4si)_mm_madd_epi16(__A, __B), (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_madd_epi16(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, (__v8si)_mm256_madd_epi16(__A, __B), (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_madd_epi16(__mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, (__v8si)_mm256_madd_epi16(__A, __B), (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsepi16_epi8 (__m128i __A) { return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A, (__v16qi) _mm_setzero_si128(), (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtsepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A, (__v16qi) __O, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtsepi16_epi8 (__mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A, (__v16qi) _mm_setzero_si128(), __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_cvtsepi16_epi8 (__m256i __A) { return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A, (__v16qi) _mm_setzero_si128(), (__mmask16) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtsepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) { return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A, (__v16qi) __O, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i 
__DEFAULT_FN_ATTRS256 _mm256_maskz_cvtsepi16_epi8 (__mmask16 __M, __m256i __A) { return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A, (__v16qi) _mm_setzero_si128(), __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtusepi16_epi8 (__m128i __A) { return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A, (__v16qi) _mm_setzero_si128(), (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtusepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A, (__v16qi) __O, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtusepi16_epi8 (__mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A, (__v16qi) _mm_setzero_si128(), __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_cvtusepi16_epi8 (__m256i __A) { return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A, (__v16qi) _mm_setzero_si128(), (__mmask16) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtusepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) { return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A, (__v16qi) __O, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtusepi16_epi8 (__mmask16 __M, __m256i __A) { return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A, (__v16qi) _mm_setzero_si128(), __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtepi16_epi8 (__m128i __A) { - - return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A, - (__v16qi) _mm_setzero_si128(), - (__mmask8) -1); + return (__m128i)__builtin_shufflevector( + __builtin_convertvector((__v8hi)__A, __v8qi), + (__v8qi){0, 0, 0, 0, 0, 0, 0, 0}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A, (__v16qi) __O, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtepi16_epi8 (__mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A, (__v16qi) _mm_setzero_si128(), __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_cvtepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) { __builtin_ia32_pmovwb128mem_mask ((__v16qi *) __P, (__v8hi) __A, __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) { __builtin_ia32_pmovswb128mem_mask ((__v16qi *) __P, (__v8hi) __A, __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) { __builtin_ia32_pmovuswb128mem_mask ((__v16qi *) __P, (__v8hi) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_cvtepi16_epi8 (__m256i __A) { - return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A, - (__v16qi) 
_mm_setzero_si128(), - (__mmask16) -1); + return (__m128i)__builtin_convertvector((__v16hi) __A, __v16qi); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) { - return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A, - (__v16qi) __O, - __M); + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, + (__v16qi)_mm256_cvtepi16_epi8(__A), + (__v16qi)__O); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepi16_epi8 (__mmask16 __M, __m256i __A) { - return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A, - (__v16qi) _mm_setzero_si128(), - __M); + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, + (__v16qi)_mm256_cvtepi16_epi8(__A), + (__v16qi)_mm_setzero_si128()); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A) { __builtin_ia32_pmovwb256mem_mask ((__v16qi *) __P, (__v16hi) __A, __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A) { __builtin_ia32_pmovswb256mem_mask ((__v16qi *) __P, (__v16hi) __A, __M); } -static __inline__ void __DEFAULT_FN_ATTRS -_mm256_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A) { __builtin_ia32_pmovuswb256mem_mask ((__v16qi*) __P, (__v16hi) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_mulhrs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_mulhrs_epi16(__X, __Y), (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_mulhrs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_mulhrs_epi16(__X, __Y), (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_mulhrs_epi16(__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, (__v16hi)_mm256_mulhrs_epi16(__X, __Y), (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_mulhrs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, (__v16hi)_mm256_mulhrs_epi16(__X, __Y), (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_mulhi_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_mulhi_epu16(__A, __B), (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_mulhi_epu16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_mulhi_epu16(__A, __B), (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_mulhi_epu16(__m256i __W, __mmask16 __U, __m256i 
__A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, (__v16hi)_mm256_mulhi_epu16(__A, __B), (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_mulhi_epu16(__mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, (__v16hi)_mm256_mulhi_epu16(__A, __B), (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_mulhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_mulhi_epi16(__A, __B), (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_mulhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_mulhi_epi16(__A, __B), (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_mulhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, (__v16hi)_mm256_mulhi_epi16(__A, __B), (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_mulhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, (__v16hi)_mm256_mulhi_epi16(__A, __B), (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_unpackhi_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, (__v16qi)_mm_unpackhi_epi8(__A, __B), (__v16qi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_unpackhi_epi8(__mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, (__v16qi)_mm_unpackhi_epi8(__A, __B), (__v16qi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_unpackhi_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, (__v32qi)_mm256_unpackhi_epi8(__A, __B), (__v32qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_unpackhi_epi8(__mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, (__v32qi)_mm256_unpackhi_epi8(__A, __B), (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_unpackhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_unpackhi_epi16(__A, __B), (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_unpackhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_unpackhi_epi16(__A, __B), (__v8hi) _mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_unpackhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { return 
(__m256i)__builtin_ia32_selectw_256((__mmask16)__U, (__v16hi)_mm256_unpackhi_epi16(__A, __B), (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_unpackhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, (__v16hi)_mm256_unpackhi_epi16(__A, __B), (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_unpacklo_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, (__v16qi)_mm_unpacklo_epi8(__A, __B), (__v16qi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_unpacklo_epi8(__mmask16 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, (__v16qi)_mm_unpacklo_epi8(__A, __B), (__v16qi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_unpacklo_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, (__v32qi)_mm256_unpacklo_epi8(__A, __B), (__v32qi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_unpacklo_epi8(__mmask32 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, (__v32qi)_mm256_unpacklo_epi8(__A, __B), (__v32qi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_unpacklo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_unpacklo_epi16(__A, __B), (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_unpacklo_epi16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, (__v8hi)_mm_unpacklo_epi16(__A, __B), (__v8hi) _mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_unpacklo_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, (__v16hi)_mm256_unpacklo_epi16(__A, __B), (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_unpacklo_epi16(__mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, (__v16hi)_mm256_unpacklo_epi16(__A, __B), (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -1792,7 +1776,7 @@ _mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A) (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -1800,7 +1784,7 @@ _mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A) (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, __m128i 
__A) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -1808,7 +1792,7 @@ _mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, __m128i __A) (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -1817,7 +1801,7 @@ _mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A) } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -1825,7 +1809,7 @@ _mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A) (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -1833,7 +1817,7 @@ _mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A) (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, __m128i __A) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -1841,7 +1825,7 @@ _mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, __m128i __A) (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepu8_epi16 (__mmask16 __U, __m128i __A) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -1850,55 +1834,55 @@ _mm256_maskz_cvtepu8_epi16 (__mmask16 __U, __m128i __A) } -#define _mm_mask_shufflehi_epi16(W, U, A, imm) __extension__ ({ \ +#define _mm_mask_shufflehi_epi16(W, U, A, imm) \ (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ (__v8hi)_mm_shufflehi_epi16((A), (imm)), \ - (__v8hi)(__m128i)(W)); }) + (__v8hi)(__m128i)(W)) -#define _mm_maskz_shufflehi_epi16(U, A, imm) __extension__ ({ \ +#define _mm_maskz_shufflehi_epi16(U, A, imm) \ (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ (__v8hi)_mm_shufflehi_epi16((A), (imm)), \ - (__v8hi)_mm_setzero_hi()); }) + (__v8hi)_mm_setzero_si128()) -#define _mm256_mask_shufflehi_epi16(W, U, A, imm) __extension__ ({ \ +#define _mm256_mask_shufflehi_epi16(W, U, A, imm) \ (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ (__v16hi)_mm256_shufflehi_epi16((A), (imm)), \ - (__v16hi)(__m256i)(W)); }) + (__v16hi)(__m256i)(W)) -#define _mm256_maskz_shufflehi_epi16(U, A, imm) __extension__ ({ \ +#define _mm256_maskz_shufflehi_epi16(U, A, imm) \ (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ (__v16hi)_mm256_shufflehi_epi16((A), (imm)), \ - (__v16hi)_mm256_setzero_si256()); }) + (__v16hi)_mm256_setzero_si256()) -#define _mm_mask_shufflelo_epi16(W, U, A, imm) __extension__ ({ \ +#define _mm_mask_shufflelo_epi16(W, U, A, imm) \ (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ (__v8hi)_mm_shufflelo_epi16((A), (imm)), \ - (__v8hi)(__m128i)(W)); }) + (__v8hi)(__m128i)(W)) -#define _mm_maskz_shufflelo_epi16(U, A, imm) __extension__ ({ \ +#define _mm_maskz_shufflelo_epi16(U, A, imm) \ (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ (__v8hi)_mm_shufflelo_epi16((A), (imm)), \ - (__v8hi)_mm_setzero_hi()); }) + (__v8hi)_mm_setzero_si128()) -#define _mm256_mask_shufflelo_epi16(W, U, A, imm) __extension__ ({ \ +#define _mm256_mask_shufflelo_epi16(W, U, A, imm) \ (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ 
(__v16hi)_mm256_shufflelo_epi16((A), \ (imm)), \ - (__v16hi)(__m256i)(W)); }) + (__v16hi)(__m256i)(W)) -#define _mm256_maskz_shufflelo_epi16(U, A, imm) __extension__ ({ \ +#define _mm256_maskz_shufflelo_epi16(U, A, imm) \ (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ (__v16hi)_mm256_shufflelo_epi16((A), \ (imm)), \ - (__v16hi)_mm256_setzero_si256()); }) + (__v16hi)_mm256_setzero_si256()) -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sllv_epi16(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_psllv16hi((__v16hi)__A, (__v16hi)__B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_sllv_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -1906,7 +1890,7 @@ _mm256_mask_sllv_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_sllv_epi16(__mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -1914,13 +1898,13 @@ _mm256_maskz_sllv_epi16(__mmask16 __U, __m256i __A, __m256i __B) (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_sllv_epi16(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_psllv8hi((__v8hi)__A, (__v8hi)__B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_sllv_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -1928,7 +1912,7 @@ _mm_mask_sllv_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_sllv_epi16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -1936,7 +1920,7 @@ _mm_maskz_sllv_epi16(__mmask8 __U, __m128i __A, __m128i __B) (__v8hi)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_sll_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -1944,7 +1928,7 @@ _mm_mask_sll_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_sll_epi16 (__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -1952,7 +1936,7 @@ _mm_maskz_sll_epi16 (__mmask8 __U, __m128i __A, __m128i __B) (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_sll_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -1960,7 +1944,7 @@ _mm256_mask_sll_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B) (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_sll_epi16(__mmask16 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -1968,7 +1952,7 @@ _mm256_maskz_sll_epi16(__mmask16 __U, __m256i __A, __m128i __B) (__v16hi)_mm256_setzero_si256()); } -static 
__inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -1976,7 +1960,7 @@ _mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B) (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, int __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -1984,7 +1968,7 @@ _mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, int __B) (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_slli_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -1992,7 +1976,7 @@ _mm256_mask_slli_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B) (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_slli_epi16(__mmask16 __U, __m256i __A, int __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -2000,13 +1984,13 @@ _mm256_maskz_slli_epi16(__mmask16 __U, __m256i __A, int __B) (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srlv_epi16(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_psrlv16hi((__v16hi)__A, (__v16hi)__B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_srlv_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -2014,7 +1998,7 @@ _mm256_mask_srlv_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_srlv_epi16(__mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -2022,13 +2006,13 @@ _mm256_maskz_srlv_epi16(__mmask16 __U, __m256i __A, __m256i __B) (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_srlv_epi16(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_psrlv8hi((__v8hi)__A, (__v8hi)__B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_srlv_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -2036,7 +2020,7 @@ _mm_mask_srlv_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_srlv_epi16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -2044,13 +2028,13 @@ _mm_maskz_srlv_epi16(__mmask8 __U, __m128i __A, __m128i __B) (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srav_epi16(__m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_psrav16hi((__v16hi)__A, (__v16hi)__B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_srav_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { return 
(__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -2058,7 +2042,7 @@ _mm256_mask_srav_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_srav_epi16(__mmask16 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -2066,13 +2050,13 @@ _mm256_maskz_srav_epi16(__mmask16 __U, __m256i __A, __m256i __B) (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_srav_epi16(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_psrav8hi((__v8hi)__A, (__v8hi)__B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_srav_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -2080,7 +2064,7 @@ _mm_mask_srav_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_srav_epi16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -2088,7 +2072,7 @@ _mm_maskz_srav_epi16(__mmask8 __U, __m128i __A, __m128i __B) (__v8hi)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_sra_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -2096,7 +2080,7 @@ _mm_mask_sra_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_sra_epi16(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -2104,7 +2088,7 @@ _mm_maskz_sra_epi16(__mmask8 __U, __m128i __A, __m128i __B) (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_sra_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -2112,7 +2096,7 @@ _mm256_mask_sra_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B) (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_sra_epi16(__mmask16 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -2120,7 +2104,7 @@ _mm256_maskz_sra_epi16(__mmask16 __U, __m256i __A, __m128i __B) (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_srai_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -2128,7 +2112,7 @@ _mm_mask_srai_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B) (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_srai_epi16(__mmask8 __U, __m128i __A, int __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -2136,7 +2120,7 @@ _mm_maskz_srai_epi16(__mmask8 __U, __m128i __A, int __B) (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_srai_epi16(__m256i __W, __mmask16 __U, __m256i 
__A, int __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -2144,7 +2128,7 @@ _mm256_mask_srai_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B) (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_srai_epi16(__mmask16 __U, __m256i __A, int __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -2152,7 +2136,7 @@ _mm256_maskz_srai_epi16(__mmask16 __U, __m256i __A, int __B) (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_srl_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -2160,7 +2144,7 @@ _mm_mask_srl_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_srl_epi16 (__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -2168,7 +2152,7 @@ _mm_maskz_srl_epi16 (__mmask8 __U, __m128i __A, __m128i __B) (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_srl_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -2176,7 +2160,7 @@ _mm256_mask_srl_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B) (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_srl_epi16(__mmask16 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -2184,7 +2168,7 @@ _mm256_maskz_srl_epi16(__mmask16 __U, __m256i __A, __m128i __B) (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_srli_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -2192,7 +2176,7 @@ _mm_mask_srli_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B) (__v8hi)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_srli_epi16 (__mmask8 __U, __m128i __A, int __B) { return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, @@ -2200,7 +2184,7 @@ _mm_maskz_srli_epi16 (__mmask8 __U, __m128i __A, int __B) (__v8hi)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_srli_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -2208,7 +2192,7 @@ _mm256_mask_srli_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B) (__v16hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_srli_epi16(__mmask16 __U, __m256i __A, int __B) { return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, @@ -2216,7 +2200,7 @@ _mm256_maskz_srli_epi16(__mmask16 __U, __m256i __A, int __B) (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_mov_epi16 (__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i) __builtin_ia32_selectw_128 ((__mmask8) __U, @@ -2224,15 +2208,15 @@ _mm_mask_mov_epi16 (__m128i __W, __mmask8 __U, __m128i __A) (__v8hi) __W); } -static __inline__ 
__m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_mov_epi16 (__mmask8 __U, __m128i __A) { return (__m128i) __builtin_ia32_selectw_128 ((__mmask8) __U, (__v8hi) __A, - (__v8hi) _mm_setzero_hi ()); + (__v8hi) _mm_setzero_si128 ()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_mov_epi16 (__m256i __W, __mmask16 __U, __m256i __A) { return (__m256i) __builtin_ia32_selectw_256 ((__mmask16) __U, @@ -2240,7 +2224,7 @@ _mm256_mask_mov_epi16 (__m256i __W, __mmask16 __U, __m256i __A) (__v16hi) __W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_mov_epi16 (__mmask16 __U, __m256i __A) { return (__m256i) __builtin_ia32_selectw_256 ((__mmask16) __U, @@ -2248,7 +2232,7 @@ _mm256_maskz_mov_epi16 (__mmask16 __U, __m256i __A) (__v16hi) _mm256_setzero_si256 ()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_mov_epi8 (__m128i __W, __mmask16 __U, __m128i __A) { return (__m128i) __builtin_ia32_selectb_128 ((__mmask16) __U, @@ -2256,15 +2240,15 @@ _mm_mask_mov_epi8 (__m128i __W, __mmask16 __U, __m128i __A) (__v16qi) __W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_mov_epi8 (__mmask16 __U, __m128i __A) { return (__m128i) __builtin_ia32_selectb_128 ((__mmask16) __U, (__v16qi) __A, - (__v16qi) _mm_setzero_hi ()); + (__v16qi) _mm_setzero_si128 ()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_mov_epi8 (__m256i __W, __mmask32 __U, __m256i __A) { return (__m256i) __builtin_ia32_selectb_256 ((__mmask32) __U, @@ -2272,7 +2256,7 @@ _mm256_mask_mov_epi8 (__m256i __W, __mmask32 __U, __m256i __A) (__v32qi) __W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_mov_epi8 (__mmask32 __U, __m256i __A) { return (__m256i) __builtin_ia32_selectb_256 ((__mmask32) __U, @@ -2281,7 +2265,7 @@ _mm256_maskz_mov_epi8 (__mmask32 __U, __m256i __A) } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A) { return (__m128i) __builtin_ia32_selectb_128(__M, @@ -2289,7 +2273,7 @@ _mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A) (__v16qi) __O); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_set1_epi8 (__mmask16 __M, char __A) { return (__m128i) __builtin_ia32_selectb_128(__M, @@ -2297,7 +2281,7 @@ _mm_maskz_set1_epi8 (__mmask16 __M, char __A) (__v16qi) _mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_set1_epi8 (__m256i __O, __mmask32 __M, char __A) { return (__m256i) __builtin_ia32_selectb_256(__M, @@ -2305,7 +2289,7 @@ _mm256_mask_set1_epi8 (__m256i __O, __mmask32 __M, char __A) (__v32qi) __O); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_set1_epi8 (__mmask32 __M, char __A) { return (__m256i) __builtin_ia32_selectb_256(__M, @@ -2313,7 +2297,7 @@ _mm256_maskz_set1_epi8 (__mmask32 __M, char __A) (__v32qi) _mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_loadu_epi16 (__m128i __W, __mmask8 __U, void const *__P) { return (__m128i) 
__builtin_ia32_loaddquhi128_mask ((__v8hi *) __P, @@ -2321,16 +2305,16 @@ _mm_mask_loadu_epi16 (__m128i __W, __mmask8 __U, void const *__P) (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_loadu_epi16 (__mmask8 __U, void const *__P) { return (__m128i) __builtin_ia32_loaddquhi128_mask ((__v8hi *) __P, (__v8hi) - _mm_setzero_hi (), + _mm_setzero_si128 (), (__mmask8) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_loadu_epi16 (__m256i __W, __mmask16 __U, void const *__P) { return (__m256i) __builtin_ia32_loaddquhi256_mask ((__v16hi *) __P, @@ -2338,7 +2322,7 @@ _mm256_mask_loadu_epi16 (__m256i __W, __mmask16 __U, void const *__P) (__mmask16) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_loadu_epi16 (__mmask16 __U, void const *__P) { return (__m256i) __builtin_ia32_loaddquhi256_mask ((__v16hi *) __P, @@ -2347,7 +2331,7 @@ _mm256_maskz_loadu_epi16 (__mmask16 __U, void const *__P) (__mmask16) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_loadu_epi8 (__m128i __W, __mmask16 __U, void const *__P) { return (__m128i) __builtin_ia32_loaddquqi128_mask ((__v16qi *) __P, @@ -2355,7 +2339,7 @@ _mm_mask_loadu_epi8 (__m128i __W, __mmask16 __U, void const *__P) (__mmask16) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_loadu_epi8 (__mmask16 __U, void const *__P) { return (__m128i) __builtin_ia32_loaddquqi128_mask ((__v16qi *) __P, @@ -2364,7 +2348,7 @@ _mm_maskz_loadu_epi8 (__mmask16 __U, void const *__P) (__mmask16) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_loadu_epi8 (__m256i __W, __mmask32 __U, void const *__P) { return (__m256i) __builtin_ia32_loaddquqi256_mask ((__v32qi *) __P, @@ -2372,7 +2356,7 @@ _mm256_mask_loadu_epi8 (__m256i __W, __mmask32 __U, void const *__P) (__mmask32) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_loadu_epi8 (__mmask32 __U, void const *__P) { return (__m256i) __builtin_ia32_loaddquqi256_mask ((__v32qi *) __P, @@ -2380,7 +2364,7 @@ _mm256_maskz_loadu_epi8 (__mmask32 __U, void const *__P) _mm256_setzero_si256 (), (__mmask32) __U); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS256 _mm_mask_storeu_epi16 (void *__P, __mmask8 __U, __m128i __A) { __builtin_ia32_storedquhi128_mask ((__v8hi *) __P, @@ -2388,7 +2372,7 @@ _mm_mask_storeu_epi16 (void *__P, __mmask8 __U, __m128i __A) (__mmask8) __U); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_storeu_epi16 (void *__P, __mmask16 __U, __m256i __A) { __builtin_ia32_storedquhi256_mask ((__v16hi *) __P, @@ -2396,7 +2380,7 @@ _mm256_mask_storeu_epi16 (void *__P, __mmask16 __U, __m256i __A) (__mmask16) __U); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_storeu_epi8 (void *__P, __mmask16 __U, __m128i __A) { __builtin_ia32_storedquqi128_mask ((__v16qi *) __P, @@ -2404,7 +2388,7 @@ _mm_mask_storeu_epi8 (void *__P, __mmask16 __U, __m128i __A) (__mmask16) __U); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_storeu_epi8 (void *__P, __mmask32 __U, __m256i __A) { 
__builtin_ia32_storedquqi256_mask ((__v32qi *) __P, @@ -2412,162 +2396,162 @@ _mm256_mask_storeu_epi8 (void *__P, __mmask32 __U, __m256i __A) (__mmask32) __U); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS +static __inline__ __mmask16 __DEFAULT_FN_ATTRS128 _mm_test_epi8_mask (__m128i __A, __m128i __B) { - return _mm_cmpneq_epi8_mask (_mm_and_si128(__A, __B), _mm_setzero_hi()); + return _mm_cmpneq_epi8_mask (_mm_and_si128(__A, __B), _mm_setzero_si128()); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS +static __inline__ __mmask16 __DEFAULT_FN_ATTRS128 _mm_mask_test_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B) { return _mm_mask_cmpneq_epi8_mask (__U, _mm_and_si128 (__A, __B), - _mm_setzero_hi()); + _mm_setzero_si128()); } -static __inline__ __mmask32 __DEFAULT_FN_ATTRS +static __inline__ __mmask32 __DEFAULT_FN_ATTRS256 _mm256_test_epi8_mask (__m256i __A, __m256i __B) { return _mm256_cmpneq_epi8_mask (_mm256_and_si256(__A, __B), _mm256_setzero_si256()); } -static __inline__ __mmask32 __DEFAULT_FN_ATTRS +static __inline__ __mmask32 __DEFAULT_FN_ATTRS256 _mm256_mask_test_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B) { return _mm256_mask_cmpneq_epi8_mask (__U, _mm256_and_si256(__A, __B), _mm256_setzero_si256()); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS +static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 _mm_test_epi16_mask (__m128i __A, __m128i __B) { - return _mm_cmpneq_epi16_mask (_mm_and_si128 (__A, __B), _mm_setzero_hi()); + return _mm_cmpneq_epi16_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128()); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS +static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 _mm_mask_test_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B) { return _mm_mask_cmpneq_epi16_mask (__U, _mm_and_si128 (__A, __B), - _mm_setzero_hi()); + _mm_setzero_si128()); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS +static __inline__ __mmask16 __DEFAULT_FN_ATTRS256 _mm256_test_epi16_mask (__m256i __A, __m256i __B) { return _mm256_cmpneq_epi16_mask (_mm256_and_si256 (__A, __B), _mm256_setzero_si256 ()); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS +static __inline__ __mmask16 __DEFAULT_FN_ATTRS256 _mm256_mask_test_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B) { return _mm256_mask_cmpneq_epi16_mask (__U, _mm256_and_si256(__A, __B), _mm256_setzero_si256()); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS +static __inline__ __mmask16 __DEFAULT_FN_ATTRS128 _mm_testn_epi8_mask (__m128i __A, __m128i __B) { - return _mm_cmpeq_epi8_mask (_mm_and_si128 (__A, __B), _mm_setzero_hi()); + return _mm_cmpeq_epi8_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128()); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS +static __inline__ __mmask16 __DEFAULT_FN_ATTRS128 _mm_mask_testn_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B) { return _mm_mask_cmpeq_epi8_mask (__U, _mm_and_si128 (__A, __B), - _mm_setzero_hi()); + _mm_setzero_si128()); } -static __inline__ __mmask32 __DEFAULT_FN_ATTRS +static __inline__ __mmask32 __DEFAULT_FN_ATTRS256 _mm256_testn_epi8_mask (__m256i __A, __m256i __B) { return _mm256_cmpeq_epi8_mask (_mm256_and_si256 (__A, __B), _mm256_setzero_si256()); } -static __inline__ __mmask32 __DEFAULT_FN_ATTRS +static __inline__ __mmask32 __DEFAULT_FN_ATTRS256 _mm256_mask_testn_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B) { return _mm256_mask_cmpeq_epi8_mask (__U, _mm256_and_si256 (__A, __B), _mm256_setzero_si256()); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS +static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 _mm_testn_epi16_mask (__m128i 
__A, __m128i __B) { - return _mm_cmpeq_epi16_mask (_mm_and_si128 (__A, __B), _mm_setzero_hi()); + return _mm_cmpeq_epi16_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128()); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS +static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 _mm_mask_testn_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B) { - return _mm_mask_cmpeq_epi16_mask (__U, _mm_and_si128(__A, __B), _mm_setzero_hi()); + return _mm_mask_cmpeq_epi16_mask (__U, _mm_and_si128(__A, __B), _mm_setzero_si128()); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS +static __inline__ __mmask16 __DEFAULT_FN_ATTRS256 _mm256_testn_epi16_mask (__m256i __A, __m256i __B) { return _mm256_cmpeq_epi16_mask (_mm256_and_si256(__A, __B), _mm256_setzero_si256()); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS +static __inline__ __mmask16 __DEFAULT_FN_ATTRS256 _mm256_mask_testn_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B) { return _mm256_mask_cmpeq_epi16_mask (__U, _mm256_and_si256 (__A, __B), _mm256_setzero_si256()); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS +static __inline__ __mmask16 __DEFAULT_FN_ATTRS128 _mm_movepi8_mask (__m128i __A) { return (__mmask16) __builtin_ia32_cvtb2mask128 ((__v16qi) __A); } -static __inline__ __mmask32 __DEFAULT_FN_ATTRS +static __inline__ __mmask32 __DEFAULT_FN_ATTRS256 _mm256_movepi8_mask (__m256i __A) { return (__mmask32) __builtin_ia32_cvtb2mask256 ((__v32qi) __A); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS +static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 _mm_movepi16_mask (__m128i __A) { return (__mmask8) __builtin_ia32_cvtw2mask128 ((__v8hi) __A); } -static __inline__ __mmask16 __DEFAULT_FN_ATTRS +static __inline__ __mmask16 __DEFAULT_FN_ATTRS256 _mm256_movepi16_mask (__m256i __A) { return (__mmask16) __builtin_ia32_cvtw2mask256 ((__v16hi) __A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_movm_epi8 (__mmask16 __A) { return (__m128i) __builtin_ia32_cvtmask2b128 (__A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_movm_epi8 (__mmask32 __A) { return (__m256i) __builtin_ia32_cvtmask2b256 (__A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_movm_epi16 (__mmask8 __A) { return (__m128i) __builtin_ia32_cvtmask2w128 (__A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_movm_epi16 (__mmask16 __A) { return (__m256i) __builtin_ia32_cvtmask2w256 (__A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_broadcastb_epi8 (__m128i __O, __mmask16 __M, __m128i __A) { return (__m128i)__builtin_ia32_selectb_128(__M, @@ -2575,7 +2559,7 @@ _mm_mask_broadcastb_epi8 (__m128i __O, __mmask16 __M, __m128i __A) (__v16qi) __O); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_broadcastb_epi8 (__mmask16 __M, __m128i __A) { return (__m128i)__builtin_ia32_selectb_128(__M, @@ -2583,7 +2567,7 @@ _mm_maskz_broadcastb_epi8 (__mmask16 __M, __m128i __A) (__v16qi) _mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_broadcastb_epi8 (__m256i __O, __mmask32 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectb_256(__M, @@ -2591,7 +2575,7 @@ _mm256_mask_broadcastb_epi8 (__m256i __O, __mmask32 __M, __m128i __A) (__v32qi) __O); } -static __inline__ __m256i __DEFAULT_FN_ATTRS 
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_broadcastb_epi8 (__mmask32 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectb_256(__M, @@ -2599,7 +2583,7 @@ _mm256_maskz_broadcastb_epi8 (__mmask32 __M, __m128i __A) (__v32qi) _mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_broadcastw_epi16 (__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_selectw_128(__M, @@ -2607,7 +2591,7 @@ _mm_mask_broadcastw_epi16 (__m128i __O, __mmask8 __M, __m128i __A) (__v8hi) __O); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_broadcastw_epi16 (__mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_selectw_128(__M, @@ -2615,7 +2599,7 @@ _mm_maskz_broadcastw_epi16 (__mmask8 __M, __m128i __A) (__v8hi) _mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_broadcastw_epi16 (__m256i __O, __mmask16 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectw_256(__M, @@ -2623,7 +2607,7 @@ _mm256_mask_broadcastw_epi16 (__m256i __O, __mmask16 __M, __m128i __A) (__v16hi) __O); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_broadcastw_epi16 (__mmask16 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectw_256(__M, @@ -2631,7 +2615,7 @@ _mm256_maskz_broadcastw_epi16 (__mmask16 __M, __m128i __A) (__v16hi) _mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_set1_epi16 (__m256i __O, __mmask16 __M, short __A) { return (__m256i) __builtin_ia32_selectw_256 (__M, @@ -2639,7 +2623,7 @@ _mm256_mask_set1_epi16 (__m256i __O, __mmask16 __M, short __A) (__v16hi) __O); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_set1_epi16 (__mmask16 __M, short __A) { return (__m256i) __builtin_ia32_selectw_256(__M, @@ -2647,7 +2631,7 @@ _mm256_maskz_set1_epi16 (__mmask16 __M, short __A) (__v16hi) _mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_set1_epi16 (__m128i __O, __mmask8 __M, short __A) { return (__m128i) __builtin_ia32_selectw_128(__M, @@ -2655,7 +2639,7 @@ _mm_mask_set1_epi16 (__m128i __O, __mmask8 __M, short __A) (__v8hi) __O); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_set1_epi16 (__mmask8 __M, short __A) { return (__m128i) __builtin_ia32_selectw_128(__M, @@ -2663,119 +2647,102 @@ _mm_maskz_set1_epi16 (__mmask8 __M, short __A) (__v8hi) _mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_permutexvar_epi16 (__m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_permvarhi128_mask ((__v8hi) __B, - (__v8hi) __A, - (__v8hi) _mm_undefined_si128 (), - (__mmask8) -1); + return (__m128i)__builtin_ia32_permvarhi128((__v8hi) __B, (__v8hi) __A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_permutexvar_epi16 (__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_permvarhi128_mask ((__v8hi) __B, - (__v8hi) __A, - (__v8hi) _mm_setzero_si128 (), - (__mmask8) __M); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, + (__v8hi)_mm_permutexvar_epi16(__A, __B), + (__v8hi) 
_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_permutexvar_epi16 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_permvarhi128_mask ((__v8hi) __B, - (__v8hi) __A, - (__v8hi) __W, - (__mmask8) __M); + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, + (__v8hi)_mm_permutexvar_epi16(__A, __B), + (__v8hi)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_permutexvar_epi16 (__m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_permvarhi256_mask ((__v16hi) __B, - (__v16hi) __A, - (__v16hi) _mm256_undefined_si256 (), - (__mmask16) -1); + return (__m256i)__builtin_ia32_permvarhi256((__v16hi) __B, (__v16hi) __A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_permutexvar_epi16 (__mmask16 __M, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_permvarhi256_mask ((__v16hi) __B, - (__v16hi) __A, - (__v16hi) _mm256_setzero_si256 (), - (__mmask16) __M); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, + (__v16hi)_mm256_permutexvar_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_permutexvar_epi16 (__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_permvarhi256_mask ((__v16hi) __B, - (__v16hi) __A, - (__v16hi) __W, - (__mmask16) __M); + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, + (__v16hi)_mm256_permutexvar_epi16(__A, __B), + (__v16hi)__W); } -#define _mm_mask_alignr_epi8(W, U, A, B, N) __extension__ ({ \ +#define _mm_mask_alignr_epi8(W, U, A, B, N) \ (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ (__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \ - (__v16qi)(__m128i)(W)); }) + (__v16qi)(__m128i)(W)) -#define _mm_maskz_alignr_epi8(U, A, B, N) __extension__ ({ \ +#define _mm_maskz_alignr_epi8(U, A, B, N) \ (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ (__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \ - (__v16qi)_mm_setzero_si128()); }) + (__v16qi)_mm_setzero_si128()) -#define _mm256_mask_alignr_epi8(W, U, A, B, N) __extension__ ({ \ +#define _mm256_mask_alignr_epi8(W, U, A, B, N) \ (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ (__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \ - (__v32qi)(__m256i)(W)); }) + (__v32qi)(__m256i)(W)) -#define _mm256_maskz_alignr_epi8(U, A, B, N) __extension__ ({ \ +#define _mm256_maskz_alignr_epi8(U, A, B, N) \ (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ (__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \ - (__v32qi)_mm256_setzero_si256()); }) - -#define _mm_dbsad_epu8(A, B, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_dbpsadbw128_mask((__v16qi)(__m128i)(A), \ - (__v16qi)(__m128i)(B), (int)(imm), \ - (__v8hi)_mm_setzero_hi(), \ - (__mmask8)-1); }) - -#define _mm_mask_dbsad_epu8(W, U, A, B, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_dbpsadbw128_mask((__v16qi)(__m128i)(A), \ - (__v16qi)(__m128i)(B), (int)(imm), \ - (__v8hi)(__m128i)(W), \ - (__mmask8)(U)); }) - -#define _mm_maskz_dbsad_epu8(U, A, B, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_dbpsadbw128_mask((__v16qi)(__m128i)(A), \ - (__v16qi)(__m128i)(B), (int)(imm), \ - (__v8hi)_mm_setzero_si128(), \ - (__mmask8)(U)); }) - -#define _mm256_dbsad_epu8(A, B, imm) __extension__ ({ \ - 
(__m256i)__builtin_ia32_dbpsadbw256_mask((__v32qi)(__m256i)(A), \ - (__v32qi)(__m256i)(B), (int)(imm), \ - (__v16hi)_mm256_setzero_si256(), \ - (__mmask16)-1); }) - -#define _mm256_mask_dbsad_epu8(W, U, A, B, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_dbpsadbw256_mask((__v32qi)(__m256i)(A), \ - (__v32qi)(__m256i)(B), (int)(imm), \ - (__v16hi)(__m256i)(W), \ - (__mmask16)(U)); }) - -#define _mm256_maskz_dbsad_epu8(U, A, B, imm) __extension__ ({ \ - (__m256i)__builtin_ia32_dbpsadbw256_mask((__v32qi)(__m256i)(A), \ - (__v32qi)(__m256i)(B), (int)(imm), \ - (__v16hi)_mm256_setzero_si256(), \ - (__mmask16)(U)); }) - -#undef __DEFAULT_FN_ATTRS + (__v32qi)_mm256_setzero_si256()) + +#define _mm_dbsad_epu8(A, B, imm) \ + (__m128i)__builtin_ia32_dbpsadbw128((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), (int)(imm)) + +#define _mm_mask_dbsad_epu8(W, U, A, B, imm) \ + (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + (__v8hi)_mm_dbsad_epu8((A), (B), (imm)), \ + (__v8hi)(__m128i)(W)) + +#define _mm_maskz_dbsad_epu8(U, A, B, imm) \ + (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + (__v8hi)_mm_dbsad_epu8((A), (B), (imm)), \ + (__v8hi)_mm_setzero_si128()) + +#define _mm256_dbsad_epu8(A, B, imm) \ + (__m256i)__builtin_ia32_dbpsadbw256((__v32qi)(__m256i)(A), \ + (__v32qi)(__m256i)(B), (int)(imm)) + +#define _mm256_mask_dbsad_epu8(W, U, A, B, imm) \ + (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_dbsad_epu8((A), (B), (imm)), \ + (__v16hi)(__m256i)(W)) + +#define _mm256_maskz_dbsad_epu8(U, A, B, imm) \ + (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_dbsad_epu8((A), (B), (imm)), \ + (__v16hi)_mm256_setzero_si256()) + +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 #endif /* __AVX512VLBWINTRIN_H */ diff --git a/c_headers/avx512vlcdintrin.h b/c_headers/avx512vlcdintrin.h index 8f1cd25f0b..903a7c2549 100644 --- a/c_headers/avx512vlcdintrin.h +++ b/c_headers/avx512vlcdintrin.h @@ -1,4 +1,4 @@ -/*===---- avx512vlcdintrin.h - AVX512VL and AVX512CD intrinsics ---------------------------=== +/*===---- avx512vlcdintrin.h - AVX512VL and AVX512CD intrinsics ------------=== * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -28,35 +28,36 @@ #define __AVX512VLCDINTRIN_H /* Define the default attributes for the functions in this file. 
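/* Illustrative sketch, assuming a compiler with AVX512VL and AVX512BW
 * enabled (e.g. -mavx512vl -mavx512bw); not part of the header diff itself.
 * It shows the pattern the masked BW intrinsics above now expand to: the
 * unmasked operation followed by a per-lane __builtin_ia32_selectw_* blend,
 * so _mm_maskz_permutexvar_epi16(M, idx, a) yields the permuted lane where
 * the corresponding bit of M is set and zero elsewhere. The helper name
 * below is only for the example. */
#include <immintrin.h>

static inline __m128i reverse_low_four_words(__m128i v) {
  /* Index lanes, low to high: 3,2,1,0,4,5,6,7 -- the low four words reversed. */
  const __m128i idx = _mm_set_epi16(7, 6, 5, 4, 0, 1, 2, 3);
  /* Mask 0x0F writes only the four low lanes; the four high lanes become zero. */
  return _mm_maskz_permutexvar_epi16((__mmask8)0x0F, idx, v);
}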
*/ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512cd"))) +#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512cd"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512cd"), __min_vector_width__(256))) -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_broadcastmb_epi64 (__mmask8 __A) -{ +{ return (__m128i) _mm_set1_epi64x((long long) __A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_broadcastmb_epi64 (__mmask8 __A) { return (__m256i) _mm256_set1_epi64x((long long)__A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_broadcastmw_epi32 (__mmask16 __A) { return (__m128i) _mm_set1_epi32((int)__A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_broadcastmw_epi32 (__mmask16 __A) { return (__m256i) _mm256_set1_epi32((int)__A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_conflict_epi64 (__m128i __A) { return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A, @@ -64,7 +65,7 @@ _mm_conflict_epi64 (__m128i __A) (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A, @@ -72,16 +73,16 @@ _mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A) (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A) { return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A, (__v2di) - _mm_setzero_di (), + _mm_setzero_si128 (), (__mmask8) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_conflict_epi64 (__m256i __A) { return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A, @@ -89,7 +90,7 @@ _mm256_conflict_epi64 (__m256i __A) (__mmask8) -1); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A) { return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A, @@ -97,7 +98,7 @@ _mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A) (__mmask8) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A) { return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A, @@ -105,7 +106,7 @@ _mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A) (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_conflict_epi32 (__m128i __A) { return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A, @@ -113,7 +114,7 @@ _mm_conflict_epi32 (__m128i __A) (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A, @@ -121,7 +122,7 @@ _mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A) 
(__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A) { return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A, @@ -129,7 +130,7 @@ _mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A) (__mmask8) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_conflict_epi32 (__m256i __A) { return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A, @@ -137,7 +138,7 @@ _mm256_conflict_epi32 (__m256i __A) (__mmask8) -1); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A) { return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A, @@ -145,7 +146,7 @@ _mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A) (__mmask8) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A) { return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A, @@ -154,110 +155,95 @@ _mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A) (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_lzcnt_epi32 (__m128i __A) { - return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) -1); + return (__m128i) __builtin_ia32_vplzcntd_128 ((__v4si) __A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_lzcnt_epi32 (__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A, - (__v4si) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_lzcnt_epi32(__A), + (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_lzcnt_epi32 (__mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_lzcnt_epi32(__A), + (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_lzcnt_epi32 (__m256i __A) { - return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) -1); + return (__m256i) __builtin_ia32_vplzcntd_256 ((__v8si) __A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_lzcnt_epi32 (__m256i __W, __mmask8 __U, __m256i __A) { - return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A, - (__v8si) __W, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_lzcnt_epi32(__A), + (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_lzcnt_epi32 (__mmask8 __U, __m256i __A) { - return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_lzcnt_epi32(__A), + (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 
_mm_lzcnt_epi64 (__m128i __A) { - return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A, - (__v2di) - _mm_setzero_di (), - (__mmask8) -1); + return (__m128i) __builtin_ia32_vplzcntq_128 ((__v2di) __A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_lzcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A, - (__v2di) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_lzcnt_epi64(__A), + (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_lzcnt_epi64 (__mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A, - (__v2di) - _mm_setzero_di (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_lzcnt_epi64(__A), + (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_lzcnt_epi64 (__m256i __A) { - return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) -1); + return (__m256i) __builtin_ia32_vplzcntq_256 ((__v4di) __A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_lzcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A) { - return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A, - (__v4di) __W, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_lzcnt_epi64(__A), + (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_lzcnt_epi64 (__mmask8 __U, __m256i __A) { - return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_lzcnt_epi64(__A), + (__v4di)_mm256_setzero_si256()); } -#undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 #endif /* __AVX512VLCDINTRIN_H */ diff --git a/c_headers/avx512vldqintrin.h b/c_headers/avx512vldqintrin.h index d80df9eaff..9d13846e89 100644 --- a/c_headers/avx512vldqintrin.h +++ b/c_headers/avx512vldqintrin.h @@ -29,961 +29,953 @@ #define __AVX512VLDQINTRIN_H /* Define the default attributes for the functions in this file. 
*/ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512dq"))) +#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512dq"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512dq"), __min_vector_width__(256))) -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mullo_epi64 (__m256i __A, __m256i __B) { return (__m256i) ((__v4du) __A * (__v4du) __B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_mullo_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, (__v4di)_mm256_mullo_epi64(__A, __B), (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_mullo_epi64(__mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, (__v4di)_mm256_mullo_epi64(__A, __B), (__v4di)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mullo_epi64 (__m128i __A, __m128i __B) { return (__m128i) ((__v2du) __A * (__v2du) __B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_mullo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, (__v2di)_mm_mullo_epi64(__A, __B), (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_mullo_epi64(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, (__v2di)_mm_mullo_epi64(__A, __B), (__v2di)_mm_setzero_si128()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_andnot_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_andnot_pd(__A, __B), (__v4df)__W); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_andnot_pd(__mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_andnot_pd(__A, __B), (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_andnot_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_andnot_pd(__A, __B), (__v2df)__W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_andnot_pd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_andnot_pd(__A, __B), (__v2df)_mm_setzero_pd()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_andnot_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_andnot_ps(__A, __B), (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_andnot_ps(__mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, 
(__v8sf)_mm256_andnot_ps(__A, __B), (__v8sf)_mm256_setzero_ps()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_andnot_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_andnot_ps(__A, __B), (__v4sf)__W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_andnot_ps(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_andnot_ps(__A, __B), (__v4sf)_mm_setzero_ps()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_and_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_and_pd(__A, __B), (__v4df)__W); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_and_pd(__mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_and_pd(__A, __B), (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_and_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_and_pd(__A, __B), (__v2df)__W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_and_pd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_and_pd(__A, __B), (__v2df)_mm_setzero_pd()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_and_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_and_ps(__A, __B), (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_and_ps(__mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_and_ps(__A, __B), (__v8sf)_mm256_setzero_ps()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_and_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_and_ps(__A, __B), (__v4sf)__W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_and_ps(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_and_ps(__A, __B), (__v4sf)_mm_setzero_ps()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_xor_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_xor_pd(__A, __B), (__v4df)__W); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_xor_pd(__mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_xor_pd(__A, __B), (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_xor_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { return 
(__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_xor_pd(__A, __B), (__v2df)__W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_xor_pd (__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_xor_pd(__A, __B), (__v2df)_mm_setzero_pd()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_xor_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_xor_ps(__A, __B), (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_xor_ps(__mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_xor_ps(__A, __B), (__v8sf)_mm256_setzero_ps()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_xor_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_xor_ps(__A, __B), (__v4sf)__W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_xor_ps(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_xor_ps(__A, __B), (__v4sf)_mm_setzero_ps()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_or_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_or_pd(__A, __B), (__v4df)__W); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_or_pd(__mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_or_pd(__A, __B), (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_or_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_or_pd(__A, __B), (__v2df)__W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_or_pd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_or_pd(__A, __B), (__v2df)_mm_setzero_pd()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_or_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_or_ps(__A, __B), (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_or_ps(__mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_or_ps(__A, __B), (__v8sf)_mm256_setzero_ps()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_or_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_or_ps(__A, __B), (__v4sf)__W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_or_ps(__mmask8 __U, __m128 __A, __m128 __B) { return 
(__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_or_ps(__A, __B), (__v4sf)_mm_setzero_ps()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtpd_epi64 (__m128d __A) { return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A, (__v2di) _mm_setzero_si128(), (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtpd_epi64 (__m128i __W, __mmask8 __U, __m128d __A) { return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A, (__v2di) __W, (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtpd_epi64 (__mmask8 __U, __m128d __A) { return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A, (__v2di) _mm_setzero_si128(), (__mmask8) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtpd_epi64 (__m256d __A) { return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A, (__v4di) _mm256_setzero_si256(), (__mmask8) -1); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A) { return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A, (__v4di) __W, (__mmask8) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtpd_epi64 (__mmask8 __U, __m256d __A) { return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A, (__v4di) _mm256_setzero_si256(), (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtpd_epu64 (__m128d __A) { return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A, (__v2di) _mm_setzero_si128(), (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtpd_epu64 (__m128i __W, __mmask8 __U, __m128d __A) { return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A, (__v2di) __W, (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtpd_epu64 (__mmask8 __U, __m128d __A) { return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A, (__v2di) _mm_setzero_si128(), (__mmask8) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtpd_epu64 (__m256d __A) { return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A, (__v4di) _mm256_setzero_si256(), (__mmask8) -1); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtpd_epu64 (__m256i __W, __mmask8 __U, __m256d __A) { return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A, (__v4di) __W, (__mmask8) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtpd_epu64 (__mmask8 __U, __m256d __A) { return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A, (__v4di) _mm256_setzero_si256(), (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtps_epi64 (__m128 __A) { return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A, (__v2di) _mm_setzero_si128(), (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtps_epi64 (__m128i __W, __mmask8 __U, __m128 
__A) { return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A, (__v2di) __W, (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A) { return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A, (__v2di) _mm_setzero_si128(), (__mmask8) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtps_epi64 (__m128 __A) { return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A, (__v4di) _mm256_setzero_si256(), (__mmask8) -1); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtps_epi64 (__m256i __W, __mmask8 __U, __m128 __A) { return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A, (__v4di) __W, (__mmask8) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A) { return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A, (__v4di) _mm256_setzero_si256(), (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtps_epu64 (__m128 __A) { return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A, (__v2di) _mm_setzero_si128(), (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtps_epu64 (__m128i __W, __mmask8 __U, __m128 __A) { return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A, (__v2di) __W, (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A) { return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A, (__v2di) _mm_setzero_si128(), (__mmask8) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtps_epu64 (__m128 __A) { return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A, (__v4di) _mm256_setzero_si256(), (__mmask8) -1); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtps_epu64 (__m256i __W, __mmask8 __U, __m128 __A) { return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A, (__v4di) __W, (__mmask8) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A) { return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A, (__v4di) _mm256_setzero_si256(), (__mmask8) __U); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtepi64_pd (__m128i __A) { - return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A, - (__v2df) _mm_setzero_pd(), - (__mmask8) -1); + return (__m128d)__builtin_convertvector((__v2di)__A, __v2df); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtepi64_pd (__m128d __W, __mmask8 __U, __m128i __A) { - return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A, - (__v2df) __W, - (__mmask8) __U); + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_cvtepi64_pd(__A), + (__v2df)__W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_cvtepi64_pd (__mmask8 __U, __m128i __A) { - return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A, - 
(__v2df) _mm_setzero_pd(), - (__mmask8) __U); + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_cvtepi64_pd(__A), + (__v2df)_mm_setzero_pd()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_cvtepi64_pd (__m256i __A) { - return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A, - (__v4df) _mm256_setzero_pd(), - (__mmask8) -1); + return (__m256d)__builtin_convertvector((__v4di)__A, __v4df); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepi64_pd (__m256d __W, __mmask8 __U, __m256i __A) { - return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A, - (__v4df) __W, - (__mmask8) __U); + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_cvtepi64_pd(__A), + (__v4df)__W); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepi64_pd (__mmask8 __U, __m256i __A) { - return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A, - (__v4df) _mm256_setzero_pd(), - (__mmask8) __U); + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_cvtepi64_pd(__A), + (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtepi64_ps (__m128i __A) { return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A, (__v4sf) _mm_setzero_ps(), (__mmask8) -1); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m128i __A) { return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A, (__v4sf) __W, (__mmask8) __U); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_cvtepi64_ps (__mmask8 __U, __m128i __A) { return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A, (__v4sf) _mm_setzero_ps(), (__mmask8) __U); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS256 _mm256_cvtepi64_ps (__m256i __A) { return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A, (__v4sf) _mm_setzero_ps(), (__mmask8) -1); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m256i __A) { return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A, (__v4sf) __W, (__mmask8) __U); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepi64_ps (__mmask8 __U, __m256i __A) { return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A, (__v4sf) _mm_setzero_ps(), (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttpd_epi64 (__m128d __A) { return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A, (__v2di) _mm_setzero_si128(), (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvttpd_epi64 (__m128i __W, __mmask8 __U, __m128d __A) { return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A, (__v2di) __W, (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvttpd_epi64 (__mmask8 __U, __m128d __A) { return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A, (__v2di) _mm_setzero_si128(), (__mmask8) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS 
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvttpd_epi64 (__m256d __A) { return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A, (__v4di) _mm256_setzero_si256(), (__mmask8) -1); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvttpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A) { return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A, (__v4di) __W, (__mmask8) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvttpd_epi64 (__mmask8 __U, __m256d __A) { return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A, (__v4di) _mm256_setzero_si256(), (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttpd_epu64 (__m128d __A) { return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A, (__v2di) _mm_setzero_si128(), (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvttpd_epu64 (__m128i __W, __mmask8 __U, __m128d __A) { return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A, (__v2di) __W, (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvttpd_epu64 (__mmask8 __U, __m128d __A) { return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A, (__v2di) _mm_setzero_si128(), (__mmask8) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvttpd_epu64 (__m256d __A) { return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A, (__v4di) _mm256_setzero_si256(), (__mmask8) -1); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvttpd_epu64 (__m256i __W, __mmask8 __U, __m256d __A) { return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A, (__v4di) __W, (__mmask8) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvttpd_epu64 (__mmask8 __U, __m256d __A) { return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A, (__v4di) _mm256_setzero_si256(), (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttps_epi64 (__m128 __A) { return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A, (__v2di) _mm_setzero_si128(), (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvttps_epi64 (__m128i __W, __mmask8 __U, __m128 __A) { return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A, (__v2di) __W, (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A) { return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A, (__v2di) _mm_setzero_si128(), (__mmask8) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvttps_epi64 (__m128 __A) { return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A, (__v4di) _mm256_setzero_si256(), (__mmask8) -1); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvttps_epi64 (__m256i __W, __mmask8 __U, __m128 __A) { return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A, (__v4di) __W, (__mmask8) __U); } -static 
__inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A) { return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A, (__v4di) _mm256_setzero_si256(), (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttps_epu64 (__m128 __A) { return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A, (__v2di) _mm_setzero_si128(), (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvttps_epu64 (__m128i __W, __mmask8 __U, __m128 __A) { return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A, (__v2di) __W, (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A) { return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A, (__v2di) _mm_setzero_si128(), (__mmask8) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvttps_epu64 (__m128 __A) { return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A, (__v4di) _mm256_setzero_si256(), (__mmask8) -1); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvttps_epu64 (__m256i __W, __mmask8 __U, __m128 __A) { return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A, (__v4di) __W, (__mmask8) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A) { return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A, (__v4di) _mm256_setzero_si256(), (__mmask8) __U); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtepu64_pd (__m128i __A) { - return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A, - (__v2df) _mm_setzero_pd(), - (__mmask8) -1); + return (__m128d)__builtin_convertvector((__v2du)__A, __v2df); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtepu64_pd (__m128d __W, __mmask8 __U, __m128i __A) { - return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A, - (__v2df) __W, - (__mmask8) __U); + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_cvtepu64_pd(__A), + (__v2df)__W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_cvtepu64_pd (__mmask8 __U, __m128i __A) { - return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A, - (__v2df) _mm_setzero_pd(), - (__mmask8) __U); + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_cvtepu64_pd(__A), + (__v2df)_mm_setzero_pd()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_cvtepu64_pd (__m256i __A) { - return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A, - (__v4df) _mm256_setzero_pd(), - (__mmask8) -1); + return (__m256d)__builtin_convertvector((__v4du)__A, __v4df); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepu64_pd (__m256d __W, __mmask8 __U, __m256i __A) { - return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A, - (__v4df) __W, - (__mmask8) __U); + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_cvtepu64_pd(__A), + (__v4df)__W); } 
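/*
 * The cvtepi64/cvtepu64 conversions above all follow one pattern: the
 * unmasked result is produced first (here with __builtin_convertvector,
 * a lane-wise conversion), and the writemask is then applied with a
 * generic per-element select builtin instead of a dedicated masked
 * builtin. A minimal sketch of that pattern, assuming the AVX-512VL/DQ
 * vector types from <immintrin.h>; the __sketch_* name is hypothetical
 * and not part of this header:
 */
static __inline__ __m128d
    __attribute__((__always_inline__, __nodebug__,
                   __target__("avx512vl,avx512dq"), __min_vector_width__(128)))
__sketch_mask_cvtepi64_pd(__m128d __W, __mmask8 __U, __m128i __A)
{
  /* Unmasked convert: each signed 64-bit lane becomes a double. */
  __v2df __r = __builtin_convertvector((__v2di)__A, __v2df);
  /* Writemask merge: take __r where the mask bit is set, __W elsewhere. */
  return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, __r, (__v2df)__W);
}
/*
 * _mm_mask_cvtepi64_pd above has exactly this shape; the 256-bit and
 * unsigned variants only swap the vector types and the select builtin.
 */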
-static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepu64_pd (__mmask8 __U, __m256i __A) { - return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A, - (__v4df) _mm256_setzero_pd(), - (__mmask8) __U); + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_cvtepu64_pd(__A), + (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtepu64_ps (__m128i __A) { return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A, (__v4sf) _mm_setzero_ps(), (__mmask8) -1); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m128i __A) { return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A, (__v4sf) __W, (__mmask8) __U); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_cvtepu64_ps (__mmask8 __U, __m128i __A) { return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A, (__v4sf) _mm_setzero_ps(), (__mmask8) __U); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS256 _mm256_cvtepu64_ps (__m256i __A) { return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A, (__v4sf) _mm_setzero_ps(), (__mmask8) -1); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m256i __A) { return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A, (__v4sf) __W, (__mmask8) __U); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepu64_ps (__mmask8 __U, __m256i __A) { return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A, (__v4sf) _mm_setzero_ps(), (__mmask8) __U); } -#define _mm_range_pd(A, B, C) __extension__ ({ \ +#define _mm_range_pd(A, B, C) \ (__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), (int)(C), \ (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm_mask_range_pd(W, U, A, B, C) __extension__ ({ \ +#define _mm_mask_range_pd(W, U, A, B, C) \ (__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), (int)(C), \ (__v2df)(__m128d)(W), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm_maskz_range_pd(U, A, B, C) __extension__ ({ \ +#define _mm_maskz_range_pd(U, A, B, C) \ (__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), (int)(C), \ (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm256_range_pd(A, B, C) __extension__ ({ \ +#define _mm256_range_pd(A, B, C) \ (__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \ (__v4df)(__m256d)(B), (int)(C), \ (__v4df)_mm256_setzero_pd(), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm256_mask_range_pd(W, U, A, B, C) __extension__ ({ \ +#define _mm256_mask_range_pd(W, U, A, B, C) \ (__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \ (__v4df)(__m256d)(B), (int)(C), \ (__v4df)(__m256d)(W), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm256_maskz_range_pd(U, A, B, C) __extension__ ({ \ +#define _mm256_maskz_range_pd(U, A, B, C) \ (__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \ (__v4df)(__m256d)(B), (int)(C), \ (__v4df)_mm256_setzero_pd(), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm_range_ps(A, B, C) __extension__ ({ \ +#define _mm_range_ps(A, B, C) \ 
(__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), (int)(C), \ (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm_mask_range_ps(W, U, A, B, C) __extension__ ({ \ +#define _mm_mask_range_ps(W, U, A, B, C) \ (__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), (int)(C), \ - (__v4sf)(__m128)(W), (__mmask8)(U)); }) + (__v4sf)(__m128)(W), (__mmask8)(U)) -#define _mm_maskz_range_ps(U, A, B, C) __extension__ ({ \ +#define _mm_maskz_range_ps(U, A, B, C) \ (__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), (int)(C), \ (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm256_range_ps(A, B, C) __extension__ ({ \ +#define _mm256_range_ps(A, B, C) \ (__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \ (__v8sf)(__m256)(B), (int)(C), \ (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm256_mask_range_ps(W, U, A, B, C) __extension__ ({ \ +#define _mm256_mask_range_ps(W, U, A, B, C) \ (__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \ (__v8sf)(__m256)(B), (int)(C), \ - (__v8sf)(__m256)(W), (__mmask8)(U)); }) + (__v8sf)(__m256)(W), (__mmask8)(U)) -#define _mm256_maskz_range_ps(U, A, B, C) __extension__ ({ \ +#define _mm256_maskz_range_ps(U, A, B, C) \ (__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \ (__v8sf)(__m256)(B), (int)(C), \ (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm_reduce_pd(A, B) __extension__ ({ \ +#define _mm_reduce_pd(A, B) \ (__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \ (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm_mask_reduce_pd(W, U, A, B) __extension__ ({ \ +#define _mm_mask_reduce_pd(W, U, A, B) \ (__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \ (__v2df)(__m128d)(W), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm_maskz_reduce_pd(U, A, B) __extension__ ({ \ +#define _mm_maskz_reduce_pd(U, A, B) \ (__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \ (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm256_reduce_pd(A, B) __extension__ ({ \ +#define _mm256_reduce_pd(A, B) \ (__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \ (__v4df)_mm256_setzero_pd(), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm256_mask_reduce_pd(W, U, A, B) __extension__ ({ \ +#define _mm256_mask_reduce_pd(W, U, A, B) \ (__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \ (__v4df)(__m256d)(W), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm256_maskz_reduce_pd(U, A, B) __extension__ ({ \ +#define _mm256_maskz_reduce_pd(U, A, B) \ (__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \ (__v4df)_mm256_setzero_pd(), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm_reduce_ps(A, B) __extension__ ({ \ +#define _mm_reduce_ps(A, B) \ (__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \ (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm_mask_reduce_ps(W, U, A, B) __extension__ ({ \ +#define _mm_mask_reduce_ps(W, U, A, B) \ (__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \ (__v4sf)(__m128)(W), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm_maskz_reduce_ps(U, A, B) __extension__ ({ \ +#define _mm_maskz_reduce_ps(U, A, B) \ (__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \ 
(__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm256_reduce_ps(A, B) __extension__ ({ \ +#define _mm256_reduce_ps(A, B) \ (__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \ (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm256_mask_reduce_ps(W, U, A, B) __extension__ ({ \ +#define _mm256_mask_reduce_ps(W, U, A, B) \ (__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \ (__v8sf)(__m256)(W), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm256_maskz_reduce_ps(U, A, B) __extension__ ({ \ +#define _mm256_maskz_reduce_ps(U, A, B) \ (__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \ (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -static __inline__ __mmask8 __DEFAULT_FN_ATTRS +static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 _mm_movepi32_mask (__m128i __A) { return (__mmask8) __builtin_ia32_cvtd2mask128 ((__v4si) __A); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS +static __inline__ __mmask8 __DEFAULT_FN_ATTRS256 _mm256_movepi32_mask (__m256i __A) { return (__mmask8) __builtin_ia32_cvtd2mask256 ((__v8si) __A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_movm_epi32 (__mmask8 __A) { return (__m128i) __builtin_ia32_cvtmask2d128 (__A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_movm_epi32 (__mmask8 __A) { return (__m256i) __builtin_ia32_cvtmask2d256 (__A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_movm_epi64 (__mmask8 __A) { return (__m128i) __builtin_ia32_cvtmask2q128 (__A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_movm_epi64 (__mmask8 __A) { return (__m256i) __builtin_ia32_cvtmask2q256 (__A); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS +static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 _mm_movepi64_mask (__m128i __A) { return (__mmask8) __builtin_ia32_cvtq2mask128 ((__v2di) __A); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS +static __inline__ __mmask8 __DEFAULT_FN_ATTRS256 _mm256_movepi64_mask (__m256i __A) { return (__mmask8) __builtin_ia32_cvtq2mask256 ((__v4di) __A); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_broadcast_f32x2 (__m128 __A) { - return (__m256)__builtin_shufflevector((__v4sf)__A, - (__v4sf)_mm_undefined_ps(), + return (__m256)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A, 0, 1, 0, 1, 0, 1, 0, 1); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_broadcast_f32x2 (__m256 __O, __mmask8 __M, __m128 __A) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__M, @@ -991,7 +983,7 @@ _mm256_mask_broadcast_f32x2 (__m256 __O, __mmask8 __M, __m128 __A) (__v8sf)__O); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_broadcast_f32x2 (__mmask8 __M, __m128 __A) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__M, @@ -999,14 +991,14 @@ _mm256_maskz_broadcast_f32x2 (__mmask8 __M, __m128 __A) (__v8sf)_mm256_setzero_ps()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_broadcast_f64x2(__m128d __A) { return (__m256d)__builtin_shufflevector((__v2df)__A, (__v2df)__A, 0, 1, 0, 1); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 
_mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, __m128d __A) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M, @@ -1014,7 +1006,7 @@ _mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, __m128d __A) (__v4df)__O); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M, @@ -1022,15 +1014,14 @@ _mm256_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A) (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_broadcast_i32x2 (__m128i __A) { - return (__m128i)__builtin_shufflevector((__v4si)__A, - (__v4si)_mm_undefined_si128(), + return (__m128i)__builtin_shufflevector((__v4si)__A, (__v4si)__A, 0, 1, 0, 1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_broadcast_i32x2 (__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, @@ -1038,7 +1029,7 @@ _mm_mask_broadcast_i32x2 (__m128i __O, __mmask8 __M, __m128i __A) (__v4si)__O); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, @@ -1046,15 +1037,14 @@ _mm_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A) (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_broadcast_i32x2 (__m128i __A) { - return (__m256i)__builtin_shufflevector((__v4si)__A, - (__v4si)_mm_undefined_si128(), + return (__m256i)__builtin_shufflevector((__v4si)__A, (__v4si)__A, 0, 1, 0, 1, 0, 1, 0, 1); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_broadcast_i32x2 (__m256i __O, __mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, @@ -1062,7 +1052,7 @@ _mm256_mask_broadcast_i32x2 (__m256i __O, __mmask8 __M, __m128i __A) (__v8si)__O); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, @@ -1070,14 +1060,14 @@ _mm256_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A) (__v8si)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_broadcast_i64x2(__m128i __A) { return (__m256i)__builtin_shufflevector((__v2di)__A, (__v2di)__A, 0, 1, 0, 1); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_broadcast_i64x2(__m256i __O, __mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, @@ -1085,7 +1075,7 @@ _mm256_mask_broadcast_i64x2(__m256i __O, __mmask8 __M, __m128i __A) (__v4di)__O); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, @@ -1093,106 +1083,103 @@ _mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A) (__v4di)_mm256_setzero_si256()); } -#define _mm256_extractf64x2_pd(A, imm) __extension__ ({ \ - (__m128d)__builtin_shufflevector((__v4df)(__m256d)(A), \ - (__v4df)_mm256_undefined_pd(), \ - ((imm) & 1) ? 2 : 0, \ - ((imm) & 1) ? 
3 : 1); }) - -#define _mm256_mask_extractf64x2_pd(W, U, A, imm) __extension__ ({ \ - (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ - (__v2df)_mm256_extractf64x2_pd((A), (imm)), \ - (__v2df)(W)); }) - -#define _mm256_maskz_extractf64x2_pd(U, A, imm) __extension__ ({ \ - (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ - (__v2df)_mm256_extractf64x2_pd((A), (imm)), \ - (__v2df)_mm_setzero_pd()); }) - -#define _mm256_extracti64x2_epi64(A, imm) __extension__ ({ \ - (__m128i)__builtin_shufflevector((__v4di)(__m256i)(A), \ - (__v4di)_mm256_undefined_si256(), \ - ((imm) & 1) ? 2 : 0, \ - ((imm) & 1) ? 3 : 1); }) - -#define _mm256_mask_extracti64x2_epi64(W, U, A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ - (__v2di)_mm256_extracti64x2_epi64((A), (imm)), \ - (__v2di)(W)); }) - -#define _mm256_maskz_extracti64x2_epi64(U, A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ - (__v2di)_mm256_extracti64x2_epi64((A), (imm)), \ - (__v2di)_mm_setzero_di()); }) - -#define _mm256_insertf64x2(A, B, imm) __extension__ ({ \ - (__m256d)__builtin_shufflevector((__v4df)(A), \ - (__v4df)_mm256_castpd128_pd256((__m128d)(B)), \ - ((imm) & 0x1) ? 0 : 4, \ - ((imm) & 0x1) ? 1 : 5, \ - ((imm) & 0x1) ? 4 : 2, \ - ((imm) & 0x1) ? 5 : 3); }) - -#define _mm256_mask_insertf64x2(W, U, A, B, imm) __extension__ ({ \ +#define _mm256_extractf64x2_pd(A, imm) \ + (__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \ + (int)(imm), \ + (__v2df)_mm_undefined_pd(), \ + (__mmask8)-1) + +#define _mm256_mask_extractf64x2_pd(W, U, A, imm) \ + (__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \ + (int)(imm), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U)) + +#define _mm256_maskz_extractf64x2_pd(U, A, imm) \ + (__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \ + (int)(imm), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U)) + +#define _mm256_extracti64x2_epi64(A, imm) \ + (__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \ + (int)(imm), \ + (__v2di)_mm_undefined_si128(), \ + (__mmask8)-1) + +#define _mm256_mask_extracti64x2_epi64(W, U, A, imm) \ + (__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \ + (int)(imm), \ + (__v2di)(__m128i)(W), \ + (__mmask8)(U)) + +#define _mm256_maskz_extracti64x2_epi64(U, A, imm) \ + (__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \ + (int)(imm), \ + (__v2di)_mm_setzero_si128(), \ + (__mmask8)(U)) + +#define _mm256_insertf64x2(A, B, imm) \ + (__m256d)__builtin_ia32_insertf64x2_256((__v4df)(__m256d)(A), \ + (__v2df)(__m128d)(B), (int)(imm)) + +#define _mm256_mask_insertf64x2(W, U, A, B, imm) \ (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ (__v4df)_mm256_insertf64x2((A), (B), (imm)), \ - (__v4df)(W)); }) + (__v4df)(__m256d)(W)) -#define _mm256_maskz_insertf64x2(U, A, B, imm) __extension__ ({ \ +#define _mm256_maskz_insertf64x2(U, A, B, imm) \ (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ (__v4df)_mm256_insertf64x2((A), (B), (imm)), \ - (__v4df)_mm256_setzero_pd()); }) + (__v4df)_mm256_setzero_pd()) -#define _mm256_inserti64x2(A, B, imm) __extension__ ({ \ - (__m256i)__builtin_shufflevector((__v4di)(A), \ - (__v4di)_mm256_castsi128_si256((__m128i)(B)), \ - ((imm) & 0x1) ? 0 : 4, \ - ((imm) & 0x1) ? 1 : 5, \ - ((imm) & 0x1) ? 4 : 2, \ - ((imm) & 0x1) ? 
5 : 3); }) +#define _mm256_inserti64x2(A, B, imm) \ + (__m256i)__builtin_ia32_inserti64x2_256((__v4di)(__m256i)(A), \ + (__v2di)(__m128i)(B), (int)(imm)) -#define _mm256_mask_inserti64x2(W, U, A, B, imm) __extension__ ({ \ +#define _mm256_mask_inserti64x2(W, U, A, B, imm) \ (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ (__v4di)_mm256_inserti64x2((A), (B), (imm)), \ - (__v4di)(W)); }) + (__v4di)(__m256i)(W)) -#define _mm256_maskz_inserti64x2(U, A, B, imm) __extension__ ({ \ +#define _mm256_maskz_inserti64x2(U, A, B, imm) \ (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ (__v4di)_mm256_inserti64x2((A), (B), (imm)), \ - (__v4di)_mm256_setzero_si256()); }) + (__v4di)_mm256_setzero_si256()) -#define _mm_mask_fpclass_pd_mask(U, A, imm) __extension__ ({ \ +#define _mm_mask_fpclass_pd_mask(U, A, imm) \ (__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm_fpclass_pd_mask(A, imm) __extension__ ({ \ +#define _mm_fpclass_pd_mask(A, imm) \ (__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm256_mask_fpclass_pd_mask(U, A, imm) __extension__ ({ \ +#define _mm256_mask_fpclass_pd_mask(U, A, imm) \ (__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm256_fpclass_pd_mask(A, imm) __extension__ ({ \ +#define _mm256_fpclass_pd_mask(A, imm) \ (__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm_mask_fpclass_ps_mask(U, A, imm) __extension__ ({ \ +#define _mm_mask_fpclass_ps_mask(U, A, imm) \ (__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm_fpclass_ps_mask(A, imm) __extension__ ({ \ +#define _mm_fpclass_ps_mask(A, imm) \ (__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm256_mask_fpclass_ps_mask(U, A, imm) __extension__ ({ \ +#define _mm256_mask_fpclass_ps_mask(U, A, imm) \ (__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm256_fpclass_ps_mask(A, imm) __extension__ ({ \ +#define _mm256_fpclass_ps_mask(A, imm) \ (__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \ - (__mmask8)-1); }) + (__mmask8)-1) -#undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 #endif diff --git a/c_headers/avx512vlintrin.h b/c_headers/avx512vlintrin.h index fb8056e3f8..0ee1d00ef4 100644 --- a/c_headers/avx512vlintrin.h +++ b/c_headers/avx512vlintrin.h @@ -28,13 +28,12 @@ #ifndef __AVX512VLINTRIN_H #define __AVX512VLINTRIN_H -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl"))) +#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl"), __min_vector_width__(256))) -/* Doesn't require avx512vl, used in avx512dqintrin.h */ -static __inline __m128i __attribute__((__always_inline__, __nodebug__, __target__("avx512f"))) -_mm_setzero_di(void) { - return (__m128i)(__v2di){ 0LL, 0LL}; -} +typedef short __v2hi __attribute__((__vector_size__(4))); +typedef char __v4qi __attribute__((__vector_size__(4))); +typedef char __v2qi 
__attribute__((__vector_size__(2))); /* Integer compare */ @@ -238,7 +237,7 @@ _mm_setzero_di(void) { #define _mm256_mask_cmpneq_epu64_mask(k, A, B) \ _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE) -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_add_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -246,7 +245,7 @@ _mm256_mask_add_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_add_epi32(__mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -254,7 +253,7 @@ _mm256_maskz_add_epi32(__mmask8 __U, __m256i __A, __m256i __B) (__v8si)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_add_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, @@ -262,7 +261,7 @@ _mm256_mask_add_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_add_epi64(__mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, @@ -270,7 +269,7 @@ _mm256_maskz_add_epi64(__mmask8 __U, __m256i __A, __m256i __B) (__v4di)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_sub_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -278,7 +277,7 @@ _mm256_mask_sub_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_sub_epi32(__mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -286,7 +285,7 @@ _mm256_maskz_sub_epi32(__mmask8 __U, __m256i __A, __m256i __B) (__v8si)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_sub_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, @@ -294,7 +293,7 @@ _mm256_mask_sub_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_sub_epi64(__mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, @@ -302,7 +301,7 @@ _mm256_maskz_sub_epi64(__mmask8 __U, __m256i __A, __m256i __B) (__v4di)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_add_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -310,7 +309,7 @@ _mm_mask_add_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_add_epi32(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -318,7 +317,7 @@ _mm_maskz_add_epi32(__mmask8 __U, __m128i __A, __m128i __B) 
(__v4si)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_add_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, @@ -326,7 +325,7 @@ _mm_mask_add_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_add_epi64(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, @@ -334,7 +333,7 @@ _mm_maskz_add_epi64(__mmask8 __U, __m128i __A, __m128i __B) (__v2di)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_sub_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -342,7 +341,7 @@ _mm_mask_sub_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_sub_epi32(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -350,7 +349,7 @@ _mm_maskz_sub_epi32(__mmask8 __U, __m128i __A, __m128i __B) (__v4si)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_sub_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, @@ -358,7 +357,7 @@ _mm_mask_sub_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_sub_epi64(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, @@ -366,7 +365,7 @@ _mm_maskz_sub_epi64(__mmask8 __U, __m128i __A, __m128i __B) (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_mul_epi32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, @@ -374,7 +373,7 @@ _mm256_mask_mul_epi32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_mul_epi32(__mmask8 __M, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, @@ -382,7 +381,7 @@ _mm256_maskz_mul_epi32(__mmask8 __M, __m256i __X, __m256i __Y) (__v4di)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_mul_epi32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, @@ -390,7 +389,7 @@ _mm_mask_mul_epi32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_mul_epi32(__mmask8 __M, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, @@ -398,7 +397,7 @@ _mm_maskz_mul_epi32(__mmask8 __M, __m128i __X, __m128i __Y) (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_mul_epu32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) { return 
(__m256i)__builtin_ia32_selectq_256((__mmask8)__M, @@ -406,7 +405,7 @@ _mm256_mask_mul_epu32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_mul_epu32(__mmask8 __M, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, @@ -414,7 +413,7 @@ _mm256_maskz_mul_epu32(__mmask8 __M, __m256i __X, __m256i __Y) (__v4di)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_mul_epu32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, @@ -422,7 +421,7 @@ _mm_mask_mul_epu32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_mul_epu32(__mmask8 __M, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, @@ -430,7 +429,7 @@ _mm_maskz_mul_epu32(__mmask8 __M, __m128i __X, __m128i __Y) (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_mullo_epi32(__mmask8 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, @@ -438,7 +437,7 @@ _mm256_maskz_mullo_epi32(__mmask8 __M, __m256i __A, __m256i __B) (__v8si)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_mullo_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, @@ -446,7 +445,7 @@ _mm256_mask_mullo_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) (__v8si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_mullo_epi32(__mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, @@ -454,7 +453,7 @@ _mm_maskz_mullo_epi32(__mmask8 __M, __m128i __A, __m128i __B) (__v4si)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_mullo_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, @@ -462,7 +461,7 @@ _mm_mask_mullo_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) (__v4si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_and_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -470,13 +469,13 @@ _mm256_mask_and_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_and_epi32(__mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)_mm256_mask_and_epi32(_mm256_setzero_si256(), __U, __A, __B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_and_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -484,13 +483,13 @@ _mm_mask_and_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 
_mm_maskz_and_epi32(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)_mm_mask_and_epi32(_mm_setzero_si128(), __U, __A, __B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_andnot_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -498,14 +497,14 @@ _mm256_mask_andnot_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_andnot_epi32(__mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)_mm256_mask_andnot_epi32(_mm256_setzero_si256(), __U, __A, __B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_andnot_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -513,13 +512,13 @@ _mm_mask_andnot_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_andnot_epi32 (__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)_mm_mask_andnot_epi32(_mm_setzero_si128(), __U, __A, __B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_or_epi32 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -527,13 +526,13 @@ _mm256_mask_or_epi32 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_or_epi32(__mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)_mm256_mask_or_epi32(_mm256_setzero_si256(), __U, __A, __B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_or_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -541,13 +540,13 @@ _mm_mask_or_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_or_epi32(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)_mm_mask_or_epi32(_mm_setzero_si128(), __U, __A, __B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_xor_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -555,13 +554,13 @@ _mm256_mask_xor_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_xor_epi32(__mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)_mm256_mask_xor_epi32(_mm256_setzero_si256(), __U, __A, __B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_xor_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { @@ -570,13 +569,13 @@ _mm_mask_xor_epi32(__m128i __W, __mmask8 __U, __m128i __A, (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_xor_epi32(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)_mm_mask_xor_epi32(_mm_setzero_si128(), __U, __A, __B); } -static 
__inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_and_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, @@ -584,13 +583,13 @@ _mm256_mask_and_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_and_epi64(__mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)_mm256_mask_and_epi64(_mm256_setzero_si256(), __U, __A, __B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_and_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, @@ -598,13 +597,13 @@ _mm_mask_and_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_and_epi64(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)_mm_mask_and_epi64(_mm_setzero_si128(), __U, __A, __B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_andnot_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, @@ -612,14 +611,14 @@ _mm256_mask_andnot_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_andnot_epi64(__mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)_mm256_mask_andnot_epi64(_mm256_setzero_si256(), __U, __A, __B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_andnot_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, @@ -627,13 +626,13 @@ _mm_mask_andnot_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_andnot_epi64(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)_mm_mask_andnot_epi64(_mm_setzero_si128(), __U, __A, __B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_or_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, @@ -641,13 +640,13 @@ _mm256_mask_or_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_or_epi64(__mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)_mm256_mask_or_epi64(_mm256_setzero_si256(), __U, __A, __B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_or_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, @@ -655,13 +654,13 @@ _mm_mask_or_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_or_epi64(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)_mm_mask_or_epi64(_mm_setzero_si128(), __U, __A, __B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i 
__DEFAULT_FN_ATTRS256 _mm256_mask_xor_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, @@ -669,13 +668,13 @@ _mm256_mask_xor_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_xor_epi64(__mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)_mm256_mask_xor_epi64(_mm256_setzero_si256(), __U, __A, __B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_xor_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { @@ -684,909 +683,973 @@ _mm_mask_xor_epi64(__m128i __W, __mmask8 __U, __m128i __A, (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_xor_epi64(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)_mm_mask_xor_epi64(_mm_setzero_si128(), __U, __A, __B); } -#define _mm_cmp_epi32_mask(a, b, p) __extension__ ({ \ +#define _mm_cmp_epi32_mask(a, b, p) \ (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a), \ (__v4si)(__m128i)(b), (int)(p), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm_mask_cmp_epi32_mask(m, a, b, p) __extension__ ({ \ +#define _mm_mask_cmp_epi32_mask(m, a, b, p) \ (__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a), \ (__v4si)(__m128i)(b), (int)(p), \ - (__mmask8)(m)); }) + (__mmask8)(m)) -#define _mm_cmp_epu32_mask(a, b, p) __extension__ ({ \ +#define _mm_cmp_epu32_mask(a, b, p) \ (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a), \ (__v4si)(__m128i)(b), (int)(p), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm_mask_cmp_epu32_mask(m, a, b, p) __extension__ ({ \ +#define _mm_mask_cmp_epu32_mask(m, a, b, p) \ (__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a), \ (__v4si)(__m128i)(b), (int)(p), \ - (__mmask8)(m)); }) + (__mmask8)(m)) -#define _mm256_cmp_epi32_mask(a, b, p) __extension__ ({ \ +#define _mm256_cmp_epi32_mask(a, b, p) \ (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a), \ (__v8si)(__m256i)(b), (int)(p), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm256_mask_cmp_epi32_mask(m, a, b, p) __extension__ ({ \ +#define _mm256_mask_cmp_epi32_mask(m, a, b, p) \ (__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a), \ (__v8si)(__m256i)(b), (int)(p), \ - (__mmask8)(m)); }) + (__mmask8)(m)) -#define _mm256_cmp_epu32_mask(a, b, p) __extension__ ({ \ +#define _mm256_cmp_epu32_mask(a, b, p) \ (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a), \ (__v8si)(__m256i)(b), (int)(p), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm256_mask_cmp_epu32_mask(m, a, b, p) __extension__ ({ \ +#define _mm256_mask_cmp_epu32_mask(m, a, b, p) \ (__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a), \ (__v8si)(__m256i)(b), (int)(p), \ - (__mmask8)(m)); }) + (__mmask8)(m)) -#define _mm_cmp_epi64_mask(a, b, p) __extension__ ({ \ +#define _mm_cmp_epi64_mask(a, b, p) \ (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a), \ (__v2di)(__m128i)(b), (int)(p), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm_mask_cmp_epi64_mask(m, a, b, p) __extension__ ({ \ +#define _mm_mask_cmp_epi64_mask(m, a, b, p) \ (__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a), \ (__v2di)(__m128i)(b), (int)(p), \ - (__mmask8)(m)); }) + (__mmask8)(m)) -#define _mm_cmp_epu64_mask(a, b, p) __extension__ ({ \ +#define _mm_cmp_epu64_mask(a, b, p) \ (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a), \ 
(__v2di)(__m128i)(b), (int)(p), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm_mask_cmp_epu64_mask(m, a, b, p) __extension__ ({ \ +#define _mm_mask_cmp_epu64_mask(m, a, b, p) \ (__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a), \ (__v2di)(__m128i)(b), (int)(p), \ - (__mmask8)(m)); }) + (__mmask8)(m)) -#define _mm256_cmp_epi64_mask(a, b, p) __extension__ ({ \ +#define _mm256_cmp_epi64_mask(a, b, p) \ (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a), \ (__v4di)(__m256i)(b), (int)(p), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm256_mask_cmp_epi64_mask(m, a, b, p) __extension__ ({ \ +#define _mm256_mask_cmp_epi64_mask(m, a, b, p) \ (__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a), \ (__v4di)(__m256i)(b), (int)(p), \ - (__mmask8)(m)); }) + (__mmask8)(m)) -#define _mm256_cmp_epu64_mask(a, b, p) __extension__ ({ \ +#define _mm256_cmp_epu64_mask(a, b, p) \ (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a), \ (__v4di)(__m256i)(b), (int)(p), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm256_mask_cmp_epu64_mask(m, a, b, p) __extension__ ({ \ +#define _mm256_mask_cmp_epu64_mask(m, a, b, p) \ (__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a), \ (__v4di)(__m256i)(b), (int)(p), \ - (__mmask8)(m)); }) + (__mmask8)(m)) -#define _mm256_cmp_ps_mask(a, b, p) __extension__ ({ \ +#define _mm256_cmp_ps_mask(a, b, p) \ (__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)(__m256)(a), \ (__v8sf)(__m256)(b), (int)(p), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm256_mask_cmp_ps_mask(m, a, b, p) __extension__ ({ \ +#define _mm256_mask_cmp_ps_mask(m, a, b, p) \ (__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)(__m256)(a), \ (__v8sf)(__m256)(b), (int)(p), \ - (__mmask8)(m)); }) + (__mmask8)(m)) -#define _mm256_cmp_pd_mask(a, b, p) __extension__ ({ \ +#define _mm256_cmp_pd_mask(a, b, p) \ (__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256d)(a), \ (__v4df)(__m256d)(b), (int)(p), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm256_mask_cmp_pd_mask(m, a, b, p) __extension__ ({ \ +#define _mm256_mask_cmp_pd_mask(m, a, b, p) \ (__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256d)(a), \ (__v4df)(__m256d)(b), (int)(p), \ - (__mmask8)(m)); }) + (__mmask8)(m)) -#define _mm_cmp_ps_mask(a, b, p) __extension__ ({ \ +#define _mm_cmp_ps_mask(a, b, p) \ (__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)(__m128)(a), \ (__v4sf)(__m128)(b), (int)(p), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm_mask_cmp_ps_mask(m, a, b, p) __extension__ ({ \ +#define _mm_mask_cmp_ps_mask(m, a, b, p) \ (__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)(__m128)(a), \ (__v4sf)(__m128)(b), (int)(p), \ - (__mmask8)(m)); }) + (__mmask8)(m)) -#define _mm_cmp_pd_mask(a, b, p) __extension__ ({ \ +#define _mm_cmp_pd_mask(a, b, p) \ (__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128d)(a), \ (__v2df)(__m128d)(b), (int)(p), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm_mask_cmp_pd_mask(m, a, b, p) __extension__ ({ \ +#define _mm_mask_cmp_pd_mask(m, a, b, p) \ (__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128d)(a), \ (__v2df)(__m128d)(b), (int)(p), \ - (__mmask8)(m)); }) + (__mmask8)(m)) -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_vfmaddpd128_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) __U); + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddpd ((__v2df) __A, + (__v2df) __B, 
+ (__v2df) __C), + (__v2df) __A); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask3_fmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) { - return (__m128d) __builtin_ia32_vfmaddpd128_mask3 ((__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) __U); + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddpd ((__v2df) __A, + (__v2df) __B, + (__v2df) __C), + (__v2df) __C); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_fmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_vfmaddpd128_maskz ((__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) __U); + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddpd ((__v2df) __A, + (__v2df) __B, + (__v2df) __C), + (__v2df)_mm_setzero_pd()); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_vfmaddpd128_mask ((__v2df) __A, - (__v2df) __B, - -(__v2df) __C, - (__mmask8) __U); + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddpd ((__v2df) __A, + (__v2df) __B, + -(__v2df) __C), + (__v2df) __A); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_fmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_vfmaddpd128_maskz ((__v2df) __A, - (__v2df) __B, - -(__v2df) __C, - (__mmask8) __U); + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddpd ((__v2df) __A, + (__v2df) __B, + -(__v2df) __C), + (__v2df)_mm_setzero_pd()); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask3_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) { - return (__m128d) __builtin_ia32_vfmaddpd128_mask3 (-(__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) __U); + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddpd (-(__v2df) __A, + (__v2df) __B, + (__v2df) __C), + (__v2df) __C); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_fnmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_vfmaddpd128_maskz (-(__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) __U); + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddpd (-(__v2df) __A, + (__v2df) __B, + (__v2df) __C), + (__v2df)_mm_setzero_pd()); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_fnmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_vfmaddpd128_maskz (-(__v2df) __A, - (__v2df) __B, - -(__v2df) __C, - (__mmask8) __U); + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddpd (-(__v2df) __A, + (__v2df) __B, + -(__v2df) __C), + (__v2df)_mm_setzero_pd()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_fmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) { - return (__m256d) __builtin_ia32_vfmaddpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) __U); + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + 
__builtin_ia32_vfmaddpd256 ((__v4df) __A, + (__v4df) __B, + (__v4df) __C), + (__v4df) __A); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask3_fmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) { - return (__m256d) __builtin_ia32_vfmaddpd256_mask3 ((__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) __U); + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddpd256 ((__v4df) __A, + (__v4df) __B, + (__v4df) __C), + (__v4df) __C); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_fmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) { - return (__m256d) __builtin_ia32_vfmaddpd256_maskz ((__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) __U); + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddpd256 ((__v4df) __A, + (__v4df) __B, + (__v4df) __C), + (__v4df)_mm256_setzero_pd()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_fmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) { - return (__m256d) __builtin_ia32_vfmaddpd256_mask ((__v4df) __A, - (__v4df) __B, - -(__v4df) __C, - (__mmask8) __U); + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddpd256 ((__v4df) __A, + (__v4df) __B, + -(__v4df) __C), + (__v4df) __A); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_fmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) { - return (__m256d) __builtin_ia32_vfmaddpd256_maskz ((__v4df) __A, - (__v4df) __B, - -(__v4df) __C, - (__mmask8) __U); + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddpd256 ((__v4df) __A, + (__v4df) __B, + -(__v4df) __C), + (__v4df)_mm256_setzero_pd()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask3_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) { - return (__m256d) __builtin_ia32_vfmaddpd256_mask3 (-(__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) __U); + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddpd256 (-(__v4df) __A, + (__v4df) __B, + (__v4df) __C), + (__v4df) __C); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_fnmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) { - return (__m256d) __builtin_ia32_vfmaddpd256_maskz (-(__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) __U); + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddpd256 (-(__v4df) __A, + (__v4df) __B, + (__v4df) __C), + (__v4df)_mm256_setzero_pd()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_fnmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) { - return (__m256d) __builtin_ia32_vfmaddpd256_maskz (-(__v4df) __A, - (__v4df) __B, - -(__v4df) __C, - (__mmask8) __U); + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddpd256 (-(__v4df) __A, + (__v4df) __B, + -(__v4df) __C), + (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_vfmaddps128_mask ((__v4sf) __A, - (__v4sf) __B, 
- (__v4sf) __C, - (__mmask8) __U); + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddps ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C), + (__v4sf) __A); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask3_fmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) { - return (__m128) __builtin_ia32_vfmaddps128_mask3 ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U); + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddps ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C), + (__v4sf) __C); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_fmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_vfmaddps128_maskz ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U); + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddps ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C), + (__v4sf)_mm_setzero_ps()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_vfmaddps128_mask ((__v4sf) __A, - (__v4sf) __B, - -(__v4sf) __C, - (__mmask8) __U); + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddps ((__v4sf) __A, + (__v4sf) __B, + -(__v4sf) __C), + (__v4sf) __A); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_fmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_vfmaddps128_maskz ((__v4sf) __A, - (__v4sf) __B, - -(__v4sf) __C, - (__mmask8) __U); + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddps ((__v4sf) __A, + (__v4sf) __B, + -(__v4sf) __C), + (__v4sf)_mm_setzero_ps()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask3_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) { - return (__m128) __builtin_ia32_vfmaddps128_mask3 (-(__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U); + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddps (-(__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C), + (__v4sf) __C); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_fnmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_vfmaddps128_maskz (-(__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U); + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddps (-(__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C), + (__v4sf)_mm_setzero_ps()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_fnmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_vfmaddps128_maskz (-(__v4sf) __A, - (__v4sf) __B, - -(__v4sf) __C, - (__mmask8) __U); + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddps (-(__v4sf) __A, + (__v4sf) __B, + -(__v4sf) __C), + (__v4sf)_mm_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_fmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) { - return (__m256) __builtin_ia32_vfmaddps256_mask ((__v8sf) __A, - (__v8sf) __B, - 
(__v8sf) __C, - (__mmask8) __U); + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddps256 ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C), + (__v8sf) __A); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask3_fmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) { - return (__m256) __builtin_ia32_vfmaddps256_mask3 ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C, - (__mmask8) __U); + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddps256 ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C), + (__v8sf) __C); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_fmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) { - return (__m256) __builtin_ia32_vfmaddps256_maskz ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C, - (__mmask8) __U); + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddps256 ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C), + (__v8sf)_mm256_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_fmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) { - return (__m256) __builtin_ia32_vfmaddps256_mask ((__v8sf) __A, - (__v8sf) __B, - -(__v8sf) __C, - (__mmask8) __U); + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddps256 ((__v8sf) __A, + (__v8sf) __B, + -(__v8sf) __C), + (__v8sf) __A); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_fmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) { - return (__m256) __builtin_ia32_vfmaddps256_maskz ((__v8sf) __A, - (__v8sf) __B, - -(__v8sf) __C, - (__mmask8) __U); + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddps256 ((__v8sf) __A, + (__v8sf) __B, + -(__v8sf) __C), + (__v8sf)_mm256_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask3_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) { - return (__m256) __builtin_ia32_vfmaddps256_mask3 (-(__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C, - (__mmask8) __U); + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddps256 (-(__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C), + (__v8sf) __C); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_fnmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) { - return (__m256) __builtin_ia32_vfmaddps256_maskz (-(__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C, - (__mmask8) __U); + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddps256 (-(__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C), + (__v8sf)_mm256_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_fnmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) { - return (__m256) __builtin_ia32_vfmaddps256_maskz (-(__v8sf) __A, - (__v8sf) __B, - -(__v8sf) __C, - (__mmask8) __U); + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddps256 (-(__v8sf) __A, + (__v8sf) __B, + -(__v8sf) __C), + (__v8sf)_mm256_setzero_ps()); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_fmaddsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) { - return (__m128d) 
__builtin_ia32_vfmaddsubpd128_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) __U); + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddsubpd ((__v2df) __A, + (__v2df) __B, + (__v2df) __C), + (__v2df) __A); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask3_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) { - return (__m128d) __builtin_ia32_vfmaddsubpd128_mask3 ((__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) - __U); + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddsubpd ((__v2df) __A, + (__v2df) __B, + (__v2df) __C), + (__v2df) __C); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_fmaddsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_vfmaddsubpd128_maskz ((__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) - __U); + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddsubpd ((__v2df) __A, + (__v2df) __B, + (__v2df) __C), + (__v2df)_mm_setzero_pd()); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_fmsubadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_vfmaddsubpd128_mask ((__v2df) __A, - (__v2df) __B, - -(__v2df) __C, - (__mmask8) __U); + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddsubpd ((__v2df) __A, + (__v2df) __B, + -(__v2df) __C), + (__v2df) __A); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_fmsubadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_vfmaddsubpd128_maskz ((__v2df) __A, - (__v2df) __B, - -(__v2df) __C, - (__mmask8) - __U); + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddsubpd ((__v2df) __A, + (__v2df) __B, + -(__v2df) __C), + (__v2df)_mm_setzero_pd()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_fmaddsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) { - return (__m256d) __builtin_ia32_vfmaddsubpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) __U); + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddsubpd256 ((__v4df) __A, + (__v4df) __B, + (__v4df) __C), + (__v4df) __A); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask3_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) { - return (__m256d) __builtin_ia32_vfmaddsubpd256_mask3 ((__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) - __U); + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddsubpd256 ((__v4df) __A, + (__v4df) __B, + (__v4df) __C), + (__v4df) __C); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_fmaddsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) { - return (__m256d) __builtin_ia32_vfmaddsubpd256_maskz ((__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) - __U); + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddsubpd256 ((__v4df) __A, + (__v4df) __B, + (__v4df) __C), + (__v4df)_mm256_setzero_pd()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static 
__inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_fmsubadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) { - return (__m256d) __builtin_ia32_vfmaddsubpd256_mask ((__v4df) __A, - (__v4df) __B, - -(__v4df) __C, - (__mmask8) __U); + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddsubpd256 ((__v4df) __A, + (__v4df) __B, + -(__v4df) __C), + (__v4df) __A); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_fmsubadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) { - return (__m256d) __builtin_ia32_vfmaddsubpd256_maskz ((__v4df) __A, - (__v4df) __B, - -(__v4df) __C, - (__mmask8) - __U); + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddsubpd256 ((__v4df) __A, + (__v4df) __B, + -(__v4df) __C), + (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_fmaddsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_vfmaddsubps128_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U); + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddsubps ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C), + (__v4sf) __A); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask3_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) { - return (__m128) __builtin_ia32_vfmaddsubps128_mask3 ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U); + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddsubps ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C), + (__v4sf) __C); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_fmaddsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_vfmaddsubps128_maskz ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U); + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddsubps ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C), + (__v4sf)_mm_setzero_ps()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_fmsubadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_vfmaddsubps128_mask ((__v4sf) __A, - (__v4sf) __B, - -(__v4sf) __C, - (__mmask8) __U); + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddsubps ((__v4sf) __A, + (__v4sf) __B, + -(__v4sf) __C), + (__v4sf) __A); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_fmsubadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_vfmaddsubps128_maskz ((__v4sf) __A, - (__v4sf) __B, - -(__v4sf) __C, - (__mmask8) __U); + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddsubps ((__v4sf) __A, + (__v4sf) __B, + -(__v4sf) __C), + (__v4sf)_mm_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_fmaddsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) { - return (__m256) __builtin_ia32_vfmaddsubps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C, - (__mmask8) __U); + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddsubps256 ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) 
__C), + (__v8sf) __A); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask3_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) { - return (__m256) __builtin_ia32_vfmaddsubps256_mask3 ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C, - (__mmask8) __U); + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddsubps256 ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C), + (__v8sf) __C); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_fmaddsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) { - return (__m256) __builtin_ia32_vfmaddsubps256_maskz ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C, - (__mmask8) __U); + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddsubps256 ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C), + (__v8sf)_mm256_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_fmsubadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) { - return (__m256) __builtin_ia32_vfmaddsubps256_mask ((__v8sf) __A, - (__v8sf) __B, - -(__v8sf) __C, - (__mmask8) __U); + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddsubps256 ((__v8sf) __A, + (__v8sf) __B, + -(__v8sf) __C), + (__v8sf) __A); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_fmsubadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) { - return (__m256) __builtin_ia32_vfmaddsubps256_maskz ((__v8sf) __A, - (__v8sf) __B, - -(__v8sf) __C, - (__mmask8) __U); + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddsubps256 ((__v8sf) __A, + (__v8sf) __B, + -(__v8sf) __C), + (__v8sf)_mm256_setzero_ps()); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask3_fmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) { - return (__m128d) __builtin_ia32_vfmsubpd128_mask3 ((__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) __U); + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddpd ((__v2df) __A, + (__v2df) __B, + -(__v2df) __C), + (__v2df) __C); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask3_fmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) { - return (__m256d) __builtin_ia32_vfmsubpd256_mask3 ((__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) __U); + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddpd256 ((__v4df) __A, + (__v4df) __B, + -(__v4df) __C), + (__v4df) __C); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask3_fmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) { - return (__m128) __builtin_ia32_vfmsubps128_mask3 ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U); + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddps ((__v4sf) __A, + (__v4sf) __B, + -(__v4sf) __C), + (__v4sf) __C); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask3_fmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) { - return (__m256) __builtin_ia32_vfmsubps256_mask3 ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C, - (__mmask8) __U); + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + 
__builtin_ia32_vfmaddps256 ((__v8sf) __A, + (__v8sf) __B, + -(__v8sf) __C), + (__v8sf) __C); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask3_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) { - return (__m128d) __builtin_ia32_vfmsubaddpd128_mask3 ((__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) - __U); + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddsubpd ((__v2df) __A, + (__v2df) __B, + -(__v2df) __C), + (__v2df) __C); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask3_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) { - return (__m256d) __builtin_ia32_vfmsubaddpd256_mask3 ((__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) - __U); + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddsubpd256 ((__v4df) __A, + (__v4df) __B, + -(__v4df) __C), + (__v4df) __C); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask3_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) { - return (__m128) __builtin_ia32_vfmsubaddps128_mask3 ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U); + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddsubps ((__v4sf) __A, + (__v4sf) __B, + -(__v4sf) __C), + (__v4sf) __C); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask3_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) { - return (__m256) __builtin_ia32_vfmsubaddps256_mask3 ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C, - (__mmask8) __U); + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddsubps256 ((__v8sf) __A, + (__v8sf) __B, + -(__v8sf) __C), + (__v8sf) __C); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_fnmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_vfnmaddpd128_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) __U); + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddpd ((__v2df) __A, + -(__v2df) __B, + (__v2df) __C), + (__v2df) __A); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_fnmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) { - return (__m256d) __builtin_ia32_vfnmaddpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) __U); + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddpd256 ((__v4df) __A, + -(__v4df) __B, + (__v4df) __C), + (__v4df) __A); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_fnmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_vfnmaddps128_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U); + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddps ((__v4sf) __A, + -(__v4sf) __B, + (__v4sf) __C), + (__v4sf) __A); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_fnmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) { - return (__m256) __builtin_ia32_vfnmaddps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C, - (__mmask8) __U); + return (__m256) 
__builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddps256 ((__v8sf) __A, + -(__v8sf) __B, + (__v8sf) __C), + (__v8sf) __A); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_fnmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) { - return (__m128d) __builtin_ia32_vfnmsubpd128_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) __U); + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddpd ((__v2df) __A, + -(__v2df) __B, + -(__v2df) __C), + (__v2df) __A); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask3_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) { - return (__m128d) __builtin_ia32_vfnmsubpd128_mask3 ((__v2df) __A, - (__v2df) __B, - (__v2df) __C, - (__mmask8) __U); + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddpd ((__v2df) __A, + -(__v2df) __B, + -(__v2df) __C), + (__v2df) __C); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_fnmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) { - return (__m256d) __builtin_ia32_vfnmsubpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) __U); + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddpd256 ((__v4df) __A, + -(__v4df) __B, + -(__v4df) __C), + (__v4df) __A); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask3_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) { - return (__m256d) __builtin_ia32_vfnmsubpd256_mask3 ((__v4df) __A, - (__v4df) __B, - (__v4df) __C, - (__mmask8) __U); + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddpd256 ((__v4df) __A, + -(__v4df) __B, + -(__v4df) __C), + (__v4df) __C); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_fnmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) { - return (__m128) __builtin_ia32_vfnmsubps128_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U); + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddps ((__v4sf) __A, + -(__v4sf) __B, + -(__v4sf) __C), + (__v4sf) __A); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask3_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) { - return (__m128) __builtin_ia32_vfnmsubps128_mask3 ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C, - (__mmask8) __U); + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddps ((__v4sf) __A, + -(__v4sf) __B, + -(__v4sf) __C), + (__v4sf) __C); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_fnmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) { - return (__m256) __builtin_ia32_vfnmsubps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C, - (__mmask8) __U); + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddps256 ((__v8sf) __A, + -(__v8sf) __B, + -(__v8sf) __C), + (__v8sf) __A); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask3_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) { - return (__m256) __builtin_ia32_vfnmsubps256_mask3 ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C, - (__mmask8) __U); + return 
(__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddps256 ((__v8sf) __A, + -(__v8sf) __B, + -(__v8sf) __C), + (__v8sf) __C); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_add_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_add_pd(__A, __B), (__v2df)__W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_add_pd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_add_pd(__A, __B), (__v2df)_mm_setzero_pd()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_add_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_add_pd(__A, __B), (__v4df)__W); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_add_pd(__mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_add_pd(__A, __B), (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_add_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_add_ps(__A, __B), (__v4sf)__W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_add_ps(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_add_ps(__A, __B), (__v4sf)_mm_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_add_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_add_ps(__A, __B), (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_add_ps(__mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_add_ps(__A, __B), (__v8sf)_mm256_setzero_ps()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_blend_epi32 (__mmask8 __U, __m128i __A, __m128i __W) { return (__m128i) __builtin_ia32_selectd_128 ((__mmask8) __U, (__v4si) __W, (__v4si) __A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_blend_epi32 (__mmask8 __U, __m256i __A, __m256i __W) { return (__m256i) __builtin_ia32_selectd_256 ((__mmask8) __U, (__v8si) __W, (__v8si) __A); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_blend_pd (__mmask8 __U, __m128d __A, __m128d __W) { return (__m128d) __builtin_ia32_selectpd_128 ((__mmask8) __U, (__v2df) __W, (__v2df) __A); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_blend_pd (__mmask8 __U, __m256d __A, __m256d __W) { return (__m256d) __builtin_ia32_selectpd_256 ((__mmask8) __U, (__v4df) __W, (__v4df) __A); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_blend_ps (__mmask8 __U, __m128 __A, __m128 __W) { return (__m128) __builtin_ia32_selectps_128 ((__mmask8) __U, 
(__v4sf) __W, (__v4sf) __A); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_blend_ps (__mmask8 __U, __m256 __A, __m256 __W) { return (__m256) __builtin_ia32_selectps_256 ((__mmask8) __U, (__v8sf) __W, (__v8sf) __A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_blend_epi64 (__mmask8 __U, __m128i __A, __m128i __W) { return (__m128i) __builtin_ia32_selectq_128 ((__mmask8) __U, (__v2di) __W, (__v2di) __A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_blend_epi64 (__mmask8 __U, __m256i __A, __m256i __W) { return (__m256i) __builtin_ia32_selectq_256 ((__mmask8) __U, (__v4di) __W, (__v4di) __A); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_compress_pd (__m128d __W, __mmask8 __U, __m128d __A) { return (__m128d) __builtin_ia32_compressdf128_mask ((__v2df) __A, (__v2df) __W, (__mmask8) __U); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_compress_pd (__mmask8 __U, __m128d __A) { return (__m128d) __builtin_ia32_compressdf128_mask ((__v2df) __A, (__v2df) @@ -1594,14 +1657,14 @@ _mm_maskz_compress_pd (__mmask8 __U, __m128d __A) { (__mmask8) __U); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_compress_pd (__m256d __W, __mmask8 __U, __m256d __A) { return (__m256d) __builtin_ia32_compressdf256_mask ((__v4df) __A, (__v4df) __W, (__mmask8) __U); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_compress_pd (__mmask8 __U, __m256d __A) { return (__m256d) __builtin_ia32_compressdf256_mask ((__v4df) __A, (__v4df) @@ -1609,14 +1672,14 @@ _mm256_maskz_compress_pd (__mmask8 __U, __m256d __A) { (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_compress_epi64 (__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i) __builtin_ia32_compressdi128_mask ((__v2di) __A, (__v2di) __W, (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_compress_epi64 (__mmask8 __U, __m128i __A) { return (__m128i) __builtin_ia32_compressdi128_mask ((__v2di) __A, (__v2di) @@ -1624,14 +1687,14 @@ _mm_maskz_compress_epi64 (__mmask8 __U, __m128i __A) { (__mmask8) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_compress_epi64 (__m256i __W, __mmask8 __U, __m256i __A) { return (__m256i) __builtin_ia32_compressdi256_mask ((__v4di) __A, (__v4di) __W, (__mmask8) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_compress_epi64 (__mmask8 __U, __m256i __A) { return (__m256i) __builtin_ia32_compressdi256_mask ((__v4di) __A, (__v4di) @@ -1639,14 +1702,14 @@ _mm256_maskz_compress_epi64 (__mmask8 __U, __m256i __A) { (__mmask8) __U); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_compress_ps (__m128 __W, __mmask8 __U, __m128 __A) { return (__m128) __builtin_ia32_compresssf128_mask ((__v4sf) __A, (__v4sf) __W, (__mmask8) __U); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_compress_ps (__mmask8 __U, __m128 __A) { return (__m128) 
__builtin_ia32_compresssf128_mask ((__v4sf) __A, (__v4sf) @@ -1654,14 +1717,14 @@ _mm_maskz_compress_ps (__mmask8 __U, __m128 __A) { (__mmask8) __U); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_compress_ps (__m256 __W, __mmask8 __U, __m256 __A) { return (__m256) __builtin_ia32_compresssf256_mask ((__v8sf) __A, (__v8sf) __W, (__mmask8) __U); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_compress_ps (__mmask8 __U, __m256 __A) { return (__m256) __builtin_ia32_compresssf256_mask ((__v8sf) __A, (__v8sf) @@ -1669,14 +1732,14 @@ _mm256_maskz_compress_ps (__mmask8 __U, __m256 __A) { (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_compress_epi32 (__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i) __builtin_ia32_compresssi128_mask ((__v4si) __A, (__v4si) __W, (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_compress_epi32 (__mmask8 __U, __m128i __A) { return (__m128i) __builtin_ia32_compresssi128_mask ((__v4si) __A, (__v4si) @@ -1684,14 +1747,14 @@ _mm_maskz_compress_epi32 (__mmask8 __U, __m128i __A) { (__mmask8) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_compress_epi32 (__m256i __W, __mmask8 __U, __m256i __A) { return (__m256i) __builtin_ia32_compresssi256_mask ((__v8si) __A, (__v8si) __W, (__mmask8) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_compress_epi32 (__mmask8 __U, __m256i __A) { return (__m256i) __builtin_ia32_compresssi256_mask ((__v8si) __A, (__v8si) @@ -1699,128 +1762,126 @@ _mm256_maskz_compress_epi32 (__mmask8 __U, __m256i __A) { (__mmask8) __U); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m128d __A) { __builtin_ia32_compressstoredf128_mask ((__v2df *) __P, (__v2df) __A, (__mmask8) __U); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m256d __A) { __builtin_ia32_compressstoredf256_mask ((__v4df *) __P, (__v4df) __A, (__mmask8) __U); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m128i __A) { __builtin_ia32_compressstoredi128_mask ((__v2di *) __P, (__v2di) __A, (__mmask8) __U); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m256i __A) { __builtin_ia32_compressstoredi256_mask ((__v4di *) __P, (__v4di) __A, (__mmask8) __U); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_compressstoreu_ps (void *__P, __mmask8 __U, __m128 __A) { __builtin_ia32_compressstoresf128_mask ((__v4sf *) __P, (__v4sf) __A, (__mmask8) __U); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_compressstoreu_ps (void *__P, __mmask8 __U, __m256 __A) { __builtin_ia32_compressstoresf256_mask ((__v8sf *) __P, (__v8sf) __A, (__mmask8) __U); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m128i __A) { 
__builtin_ia32_compressstoresi128_mask ((__v4si *) __P, (__v4si) __A, (__mmask8) __U); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m256i __A) { __builtin_ia32_compressstoresi256_mask ((__v8si *) __P, (__v8si) __A, (__mmask8) __U); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtepi32_pd (__m128d __W, __mmask8 __U, __m128i __A) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U, (__v2df)_mm_cvtepi32_pd(__A), (__v2df)__W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U, (__v2df)_mm_cvtepi32_pd(__A), (__v2df)_mm_setzero_pd()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepi32_pd (__m256d __W, __mmask8 __U, __m128i __A) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U, (__v4df)_mm256_cvtepi32_pd(__A), (__v4df)__W); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U, (__v4df)_mm256_cvtepi32_pd(__A), (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtepi32_ps (__m128 __W, __mmask8 __U, __m128i __A) { - return (__m128) __builtin_ia32_cvtdq2ps128_mask ((__v4si) __A, - (__v4sf) __W, - (__mmask8) __U); + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_cvtepi32_ps(__A), + (__v4sf)__W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_maskz_cvtepi32_ps (__mmask16 __U, __m128i __A) { - return (__m128) __builtin_ia32_cvtdq2ps128_mask ((__v4si) __A, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtepi32_ps (__mmask8 __U, __m128i __A) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_cvtepi32_ps(__A), + (__v4sf)_mm_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepi32_ps (__m256 __W, __mmask8 __U, __m256i __A) { - return (__m256) __builtin_ia32_cvtdq2ps256_mask ((__v8si) __A, - (__v8sf) __W, - (__mmask8) __U); + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_cvtepi32_ps(__A), + (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_maskz_cvtepi32_ps (__mmask16 __U, __m256i __A) { - return (__m256) __builtin_ia32_cvtdq2ps256_mask ((__v8si) __A, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtepi32_ps (__mmask8 __U, __m256i __A) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_cvtepi32_ps(__A), + (__v8sf)_mm256_setzero_ps()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtpd_epi32 (__m128i __W, __mmask8 __U, __m128d __A) { return (__m128i) __builtin_ia32_cvtpd2dq128_mask ((__v2df) __A, (__v4si) __W, (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtpd_epi32 (__mmask8 __U, __m128d __A) { return (__m128i) __builtin_ia32_cvtpd2dq128_mask ((__v2df) __A, (__v4si) @@ -1828,29 +1889,28 @@ 
_mm_maskz_cvtpd_epi32 (__mmask8 __U, __m128d __A) { (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A) { - return (__m128i) __builtin_ia32_cvtpd2dq256_mask ((__v4df) __A, - (__v4si) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm256_cvtpd_epi32(__A), + (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtpd_epi32 (__mmask8 __U, __m256d __A) { - return (__m128i) __builtin_ia32_cvtpd2dq256_mask ((__v4df) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm256_cvtpd_epi32(__A), + (__v4si)_mm_setzero_si128()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m128d __A) { return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A, (__v4sf) __W, (__mmask8) __U); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_cvtpd_ps (__mmask8 __U, __m128d __A) { return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A, (__v4sf) @@ -1858,22 +1918,21 @@ _mm_maskz_cvtpd_ps (__mmask8 __U, __m128d __A) { (__mmask8) __U); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS256 _mm256_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m256d __A) { - return (__m128) __builtin_ia32_cvtpd2ps256_mask ((__v4df) __A, - (__v4sf) __W, - (__mmask8) __U); + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm256_cvtpd_ps(__A), + (__v4sf)__W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtpd_ps (__mmask8 __U, __m256d __A) { - return (__m128) __builtin_ia32_cvtpd2ps256_mask ((__v4df) __A, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm256_cvtpd_ps(__A), + (__v4sf)_mm_setzero_ps()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtpd_epu32 (__m128d __A) { return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A, (__v4si) @@ -1881,14 +1940,14 @@ _mm_cvtpd_epu32 (__m128d __A) { (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtpd_epu32 (__m128i __W, __mmask8 __U, __m128d __A) { return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A, (__v4si) __W, (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtpd_epu32 (__mmask8 __U, __m128d __A) { return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A, (__v4si) @@ -1896,7 +1955,7 @@ _mm_maskz_cvtpd_epu32 (__mmask8 __U, __m128d __A) { (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_cvtpd_epu32 (__m256d __A) { return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A, (__v4si) @@ -1904,14 +1963,14 @@ _mm256_cvtpd_epu32 (__m256d __A) { (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtpd_epu32 (__m128i __W, __mmask8 __U, __m256d __A) { return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A, (__v4si) __W, (__mmask8) __U); } -static __inline__ 
__m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtpd_epu32 (__mmask8 __U, __m256d __A) { return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A, (__v4si) @@ -1919,67 +1978,63 @@ _mm256_maskz_cvtpd_epu32 (__mmask8 __U, __m256d __A) { (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtps_epi32 (__m128i __W, __mmask8 __U, __m128 __A) { - return (__m128i) __builtin_ia32_cvtps2dq128_mask ((__v4sf) __A, - (__v4si) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_cvtps_epi32(__A), + (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtps_epi32 (__mmask8 __U, __m128 __A) { - return (__m128i) __builtin_ia32_cvtps2dq128_mask ((__v4sf) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_cvtps_epi32(__A), + (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtps_epi32 (__m256i __W, __mmask8 __U, __m256 __A) { - return (__m256i) __builtin_ia32_cvtps2dq256_mask ((__v8sf) __A, - (__v8si) __W, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_cvtps_epi32(__A), + (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtps_epi32 (__mmask8 __U, __m256 __A) { - return (__m256i) __builtin_ia32_cvtps2dq256_mask ((__v8sf) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_cvtps_epi32(__A), + (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtps_pd (__m128d __W, __mmask8 __U, __m128 __A) { - return (__m128d) __builtin_ia32_cvtps2pd128_mask ((__v4sf) __A, - (__v2df) __W, - (__mmask8) __U); + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_cvtps_pd(__A), + (__v2df)__W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_cvtps_pd (__mmask8 __U, __m128 __A) { - return (__m128d) __builtin_ia32_cvtps2pd128_mask ((__v4sf) __A, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_cvtps_pd(__A), + (__v2df)_mm_setzero_pd()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_cvtps_pd (__m256d __W, __mmask8 __U, __m128 __A) { - return (__m256d) __builtin_ia32_cvtps2pd256_mask ((__v4sf) __A, - (__v4df) __W, - (__mmask8) __U); + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_cvtps_pd(__A), + (__v4df)__W); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtps_pd (__mmask8 __U, __m128 __A) { - return (__m256d) __builtin_ia32_cvtps2pd256_mask ((__v4sf) __A, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_cvtps_pd(__A), + (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtps_epu32 (__m128 __A) { return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A, (__v4si) @@ 
-1987,14 +2042,14 @@ _mm_cvtps_epu32 (__m128 __A) { (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtps_epu32 (__m128i __W, __mmask8 __U, __m128 __A) { return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A, (__v4si) __W, (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtps_epu32 (__mmask8 __U, __m128 __A) { return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A, (__v4si) @@ -2002,7 +2057,7 @@ _mm_maskz_cvtps_epu32 (__mmask8 __U, __m128 __A) { (__mmask8) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvtps_epu32 (__m256 __A) { return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A, (__v8si) @@ -2010,14 +2065,14 @@ _mm256_cvtps_epu32 (__m256 __A) { (__mmask8) -1); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtps_epu32 (__m256i __W, __mmask8 __U, __m256 __A) { return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A, (__v8si) __W, (__mmask8) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtps_epu32 (__mmask8 __U, __m256 __A) { return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A, (__v8si) @@ -2025,14 +2080,14 @@ _mm256_maskz_cvtps_epu32 (__mmask8 __U, __m256 __A) { (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvttpd_epi32 (__m128i __W, __mmask8 __U, __m128d __A) { return (__m128i) __builtin_ia32_cvttpd2dq128_mask ((__v2df) __A, (__v4si) __W, (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvttpd_epi32 (__mmask8 __U, __m128d __A) { return (__m128i) __builtin_ia32_cvttpd2dq128_mask ((__v2df) __A, (__v4si) @@ -2040,22 +2095,21 @@ _mm_maskz_cvttpd_epi32 (__mmask8 __U, __m128d __A) { (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvttpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A) { - return (__m128i) __builtin_ia32_cvttpd2dq256_mask ((__v4df) __A, - (__v4si) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm256_cvttpd_epi32(__A), + (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvttpd_epi32 (__mmask8 __U, __m256d __A) { - return (__m128i) __builtin_ia32_cvttpd2dq256_mask ((__v4df) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm256_cvttpd_epi32(__A), + (__v4si)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttpd_epu32 (__m128d __A) { return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A, (__v4si) @@ -2063,14 +2117,14 @@ _mm_cvttpd_epu32 (__m128d __A) { (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvttpd_epu32 (__m128i __W, __mmask8 __U, __m128d __A) { return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A, (__v4si) __W, (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvttpd_epu32 (__mmask8 __U, __m128d __A) { return (__m128i) 
__builtin_ia32_cvttpd2udq128_mask ((__v2df) __A, (__v4si) @@ -2078,7 +2132,7 @@ _mm_maskz_cvttpd_epu32 (__mmask8 __U, __m128d __A) { (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_cvttpd_epu32 (__m256d __A) { return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A, (__v4si) @@ -2086,14 +2140,14 @@ _mm256_cvttpd_epu32 (__m256d __A) { (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvttpd_epu32 (__m128i __W, __mmask8 __U, __m256d __A) { return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A, (__v4si) __W, (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvttpd_epu32 (__mmask8 __U, __m256d __A) { return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A, (__v4si) @@ -2101,37 +2155,35 @@ _mm256_maskz_cvttpd_epu32 (__mmask8 __U, __m256d __A) { (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvttps_epi32 (__m128i __W, __mmask8 __U, __m128 __A) { - return (__m128i) __builtin_ia32_cvttps2dq128_mask ((__v4sf) __A, - (__v4si) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_cvttps_epi32(__A), + (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvttps_epi32 (__mmask8 __U, __m128 __A) { - return (__m128i) __builtin_ia32_cvttps2dq128_mask ((__v4sf) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_cvttps_epi32(__A), + (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvttps_epi32 (__m256i __W, __mmask8 __U, __m256 __A) { - return (__m256i) __builtin_ia32_cvttps2dq256_mask ((__v8sf) __A, - (__v8si) __W, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_cvttps_epi32(__A), + (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvttps_epi32 (__mmask8 __U, __m256 __A) { - return (__m256i) __builtin_ia32_cvttps2dq256_mask ((__v8sf) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_cvttps_epi32(__A), + (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttps_epu32 (__m128 __A) { return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A, (__v4si) @@ -2139,14 +2191,14 @@ _mm_cvttps_epu32 (__m128 __A) { (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvttps_epu32 (__m128i __W, __mmask8 __U, __m128 __A) { return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A, (__v4si) __W, (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvttps_epu32 (__mmask8 __U, __m128 __A) { return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A, (__v4si) @@ -2154,7 +2206,7 @@ _mm_maskz_cvttps_epu32 (__mmask8 __U, __m128 __A) { (__mmask8) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cvttps_epu32 (__m256 __A) { 
return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A, (__v8si) @@ -2162,14 +2214,14 @@ _mm256_cvttps_epu32 (__m256 __A) { (__mmask8) -1); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvttps_epu32 (__m256i __W, __mmask8 __U, __m256 __A) { return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A, (__v8si) __W, (__mmask8) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvttps_epu32 (__mmask8 __U, __m256 __A) { return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A, (__v8si) @@ -2177,155 +2229,147 @@ _mm256_maskz_cvttps_epu32 (__mmask8 __U, __m256 __A) { (__mmask8) __U); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtepu32_pd (__m128i __A) { return (__m128d) __builtin_convertvector( __builtin_shufflevector((__v4su)__A, (__v4su)__A, 0, 1), __v2df); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtepu32_pd (__m128d __W, __mmask8 __U, __m128i __A) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U, (__v2df)_mm_cvtepu32_pd(__A), (__v2df)__W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U, (__v2df)_mm_cvtepu32_pd(__A), (__v2df)_mm_setzero_pd()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_cvtepu32_pd (__m128i __A) { return (__m256d)__builtin_convertvector((__v4su)__A, __v4df); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepu32_pd (__m256d __W, __mmask8 __U, __m128i __A) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U, (__v4df)_mm256_cvtepu32_pd(__A), (__v4df)__W); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U, (__v4df)_mm256_cvtepu32_pd(__A), (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtepu32_ps (__m128i __A) { - return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) -1); + return (__m128)__builtin_convertvector((__v4su)__A, __v4sf); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtepu32_ps (__m128 __W, __mmask8 __U, __m128i __A) { - return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A, - (__v4sf) __W, - (__mmask8) __U); + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_cvtepu32_ps(__A), + (__v4sf)__W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_cvtepu32_ps (__mmask8 __U, __m128i __A) { - return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_cvtepu32_ps(__A), + (__v4sf)_mm_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtepu32_ps (__m256i __A) { - return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) -1); + 
return (__m256)__builtin_convertvector((__v8su)__A, __v8sf); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepu32_ps (__m256 __W, __mmask8 __U, __m256i __A) { - return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A, - (__v8sf) __W, - (__mmask8) __U); + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_cvtepu32_ps(__A), + (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepu32_ps (__mmask8 __U, __m256i __A) { - return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_cvtepu32_ps(__A), + (__v8sf)_mm256_setzero_ps()); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_div_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_div_pd(__A, __B), (__v2df)__W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_div_pd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_div_pd(__A, __B), (__v2df)_mm_setzero_pd()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_div_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_div_pd(__A, __B), (__v4df)__W); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_div_pd(__mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_div_pd(__A, __B), (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_div_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_div_ps(__A, __B), (__v4sf)__W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_div_ps(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_div_ps(__A, __B), (__v4sf)_mm_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_div_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_div_ps(__A, __B), (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_div_ps(__mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_div_ps(__A, __B), (__v8sf)_mm256_setzero_ps()); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_expand_pd (__m128d __W, __mmask8 __U, __m128d __A) { return (__m128d) __builtin_ia32_expanddf128_mask ((__v2df) __A, (__v2df) __W, (__mmask8) __U); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_expand_pd (__mmask8 __U, __m128d __A) { return (__m128d) __builtin_ia32_expanddf128_mask ((__v2df) __A, (__v2df) @@ -2333,14 +2377,14 @@ _mm_maskz_expand_pd (__mmask8 __U, __m128d 
__A) { (__mmask8) __U); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_expand_pd (__m256d __W, __mmask8 __U, __m256d __A) { return (__m256d) __builtin_ia32_expanddf256_mask ((__v4df) __A, (__v4df) __W, (__mmask8) __U); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_expand_pd (__mmask8 __U, __m256d __A) { return (__m256d) __builtin_ia32_expanddf256_mask ((__v4df) __A, (__v4df) @@ -2348,14 +2392,14 @@ _mm256_maskz_expand_pd (__mmask8 __U, __m256d __A) { (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_expand_epi64 (__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i) __builtin_ia32_expanddi128_mask ((__v2di) __A, (__v2di) __W, (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_expand_epi64 (__mmask8 __U, __m128i __A) { return (__m128i) __builtin_ia32_expanddi128_mask ((__v2di) __A, (__v2di) @@ -2363,14 +2407,14 @@ _mm_maskz_expand_epi64 (__mmask8 __U, __m128i __A) { (__mmask8) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_expand_epi64 (__m256i __W, __mmask8 __U, __m256i __A) { return (__m256i) __builtin_ia32_expanddi256_mask ((__v4di) __A, (__v4di) __W, (__mmask8) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_expand_epi64 (__mmask8 __U, __m256i __A) { return (__m256i) __builtin_ia32_expanddi256_mask ((__v4di) __A, (__v4di) @@ -2378,7 +2422,7 @@ _mm256_maskz_expand_epi64 (__mmask8 __U, __m256i __A) { (__mmask8) __U); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_expandloadu_pd (__m128d __W, __mmask8 __U, void const *__P) { return (__m128d) __builtin_ia32_expandloaddf128_mask ((__v2df *) __P, (__v2df) __W, @@ -2386,7 +2430,7 @@ _mm_mask_expandloadu_pd (__m128d __W, __mmask8 __U, void const *__P) { __U); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_expandloadu_pd (__mmask8 __U, void const *__P) { return (__m128d) __builtin_ia32_expandloaddf128_mask ((__v2df *) __P, (__v2df) @@ -2395,7 +2439,7 @@ _mm_maskz_expandloadu_pd (__mmask8 __U, void const *__P) { __U); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_expandloadu_pd (__m256d __W, __mmask8 __U, void const *__P) { return (__m256d) __builtin_ia32_expandloaddf256_mask ((__v4df *) __P, (__v4df) __W, @@ -2403,7 +2447,7 @@ _mm256_mask_expandloadu_pd (__m256d __W, __mmask8 __U, void const *__P) { __U); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_expandloadu_pd (__mmask8 __U, void const *__P) { return (__m256d) __builtin_ia32_expandloaddf256_mask ((__v4df *) __P, (__v4df) @@ -2412,7 +2456,7 @@ _mm256_maskz_expandloadu_pd (__mmask8 __U, void const *__P) { __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_expandloadu_epi64 (__m128i __W, __mmask8 __U, void const *__P) { return (__m128i) __builtin_ia32_expandloaddi128_mask ((__v2di *) __P, (__v2di) __W, @@ -2420,7 +2464,7 @@ _mm_mask_expandloadu_epi64 (__m128i __W, __mmask8 __U, void const *__P) { __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i 
__DEFAULT_FN_ATTRS128 _mm_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) { return (__m128i) __builtin_ia32_expandloaddi128_mask ((__v2di *) __P, (__v2di) @@ -2429,7 +2473,7 @@ _mm_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) { __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_expandloadu_epi64 (__m256i __W, __mmask8 __U, void const *__P) { return (__m256i) __builtin_ia32_expandloaddi256_mask ((__v4di *) __P, @@ -2438,7 +2482,7 @@ _mm256_mask_expandloadu_epi64 (__m256i __W, __mmask8 __U, __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) { return (__m256i) __builtin_ia32_expandloaddi256_mask ((__v4di *) __P, (__v4di) @@ -2447,14 +2491,14 @@ _mm256_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) { __U); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_expandloadu_ps (__m128 __W, __mmask8 __U, void const *__P) { return (__m128) __builtin_ia32_expandloadsf128_mask ((__v4sf *) __P, (__v4sf) __W, (__mmask8) __U); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_expandloadu_ps (__mmask8 __U, void const *__P) { return (__m128) __builtin_ia32_expandloadsf128_mask ((__v4sf *) __P, (__v4sf) @@ -2463,14 +2507,14 @@ _mm_maskz_expandloadu_ps (__mmask8 __U, void const *__P) { __U); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_expandloadu_ps (__m256 __W, __mmask8 __U, void const *__P) { return (__m256) __builtin_ia32_expandloadsf256_mask ((__v8sf *) __P, (__v8sf) __W, (__mmask8) __U); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_expandloadu_ps (__mmask8 __U, void const *__P) { return (__m256) __builtin_ia32_expandloadsf256_mask ((__v8sf *) __P, (__v8sf) @@ -2479,7 +2523,7 @@ _mm256_maskz_expandloadu_ps (__mmask8 __U, void const *__P) { __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_expandloadu_epi32 (__m128i __W, __mmask8 __U, void const *__P) { return (__m128i) __builtin_ia32_expandloadsi128_mask ((__v4si *) __P, (__v4si) __W, @@ -2487,7 +2531,7 @@ _mm_mask_expandloadu_epi32 (__m128i __W, __mmask8 __U, void const *__P) { __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P) { return (__m128i) __builtin_ia32_expandloadsi128_mask ((__v4si *) __P, (__v4si) @@ -2495,7 +2539,7 @@ _mm_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P) { (__mmask8) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_expandloadu_epi32 (__m256i __W, __mmask8 __U, void const *__P) { return (__m256i) __builtin_ia32_expandloadsi256_mask ((__v8si *) __P, @@ -2504,7 +2548,7 @@ _mm256_mask_expandloadu_epi32 (__m256i __W, __mmask8 __U, __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P) { return (__m256i) __builtin_ia32_expandloadsi256_mask ((__v8si *) __P, (__v8si) @@ -2513,14 +2557,14 @@ _mm256_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P) { __U); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 
_mm_mask_expand_ps (__m128 __W, __mmask8 __U, __m128 __A) { return (__m128) __builtin_ia32_expandsf128_mask ((__v4sf) __A, (__v4sf) __W, (__mmask8) __U); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_expand_ps (__mmask8 __U, __m128 __A) { return (__m128) __builtin_ia32_expandsf128_mask ((__v4sf) __A, (__v4sf) @@ -2528,14 +2572,14 @@ _mm_maskz_expand_ps (__mmask8 __U, __m128 __A) { (__mmask8) __U); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_expand_ps (__m256 __W, __mmask8 __U, __m256 __A) { return (__m256) __builtin_ia32_expandsf256_mask ((__v8sf) __A, (__v8sf) __W, (__mmask8) __U); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_expand_ps (__mmask8 __U, __m256 __A) { return (__m256) __builtin_ia32_expandsf256_mask ((__v8sf) __A, (__v8sf) @@ -2543,14 +2587,14 @@ _mm256_maskz_expand_ps (__mmask8 __U, __m256 __A) { (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_expand_epi32 (__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i) __builtin_ia32_expandsi128_mask ((__v4si) __A, (__v4si) __W, (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_expand_epi32 (__mmask8 __U, __m128i __A) { return (__m128i) __builtin_ia32_expandsi128_mask ((__v4si) __A, (__v4si) @@ -2558,14 +2602,14 @@ _mm_maskz_expand_epi32 (__mmask8 __U, __m128i __A) { (__mmask8) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_expand_epi32 (__m256i __W, __mmask8 __U, __m256i __A) { return (__m256i) __builtin_ia32_expandsi256_mask ((__v8si) __A, (__v8si) __W, (__mmask8) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_expand_epi32 (__mmask8 __U, __m256i __A) { return (__m256i) __builtin_ia32_expandsi256_mask ((__v8si) __A, (__v8si) @@ -2573,7 +2617,7 @@ _mm256_maskz_expand_epi32 (__mmask8 __U, __m256i __A) { (__mmask8) __U); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_getexp_pd (__m128d __A) { return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A, (__v2df) @@ -2581,14 +2625,14 @@ _mm_getexp_pd (__m128d __A) { (__mmask8) -1); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_getexp_pd (__m128d __W, __mmask8 __U, __m128d __A) { return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A, (__v2df) __W, (__mmask8) __U); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_getexp_pd (__mmask8 __U, __m128d __A) { return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A, (__v2df) @@ -2596,7 +2640,7 @@ _mm_maskz_getexp_pd (__mmask8 __U, __m128d __A) { (__mmask8) __U); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_getexp_pd (__m256d __A) { return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A, (__v4df) @@ -2604,14 +2648,14 @@ _mm256_getexp_pd (__m256d __A) { (__mmask8) -1); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_getexp_pd (__m256d __W, __mmask8 __U, __m256d __A) { return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A, (__v4df) __W, (__mmask8) __U); } -static 
__inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_getexp_pd (__mmask8 __U, __m256d __A) { return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A, (__v4df) @@ -2619,7 +2663,7 @@ _mm256_maskz_getexp_pd (__mmask8 __U, __m256d __A) { (__mmask8) __U); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_getexp_ps (__m128 __A) { return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A, (__v4sf) @@ -2627,14 +2671,14 @@ _mm_getexp_ps (__m128 __A) { (__mmask8) -1); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_getexp_ps (__m128 __W, __mmask8 __U, __m128 __A) { return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A, (__v4sf) __W, (__mmask8) __U); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_getexp_ps (__mmask8 __U, __m128 __A) { return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A, (__v4sf) @@ -2642,7 +2686,7 @@ _mm_maskz_getexp_ps (__mmask8 __U, __m128 __A) { (__mmask8) __U); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_getexp_ps (__m256 __A) { return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A, (__v8sf) @@ -2650,14 +2694,14 @@ _mm256_getexp_ps (__m256 __A) { (__mmask8) -1); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_getexp_ps (__m256 __W, __mmask8 __U, __m256 __A) { return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A, (__v8sf) __W, (__mmask8) __U); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_getexp_ps (__mmask8 __U, __m256 __A) { return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A, (__v8sf) @@ -2665,643 +2709,579 @@ _mm256_maskz_getexp_ps (__mmask8 __U, __m256 __A) { (__mmask8) __U); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_max_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_max_pd(__A, __B), (__v2df)__W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_max_pd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_max_pd(__A, __B), (__v2df)_mm_setzero_pd()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_max_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_max_pd(__A, __B), (__v4df)__W); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_max_pd(__mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_max_pd(__A, __B), (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_max_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_max_ps(__A, __B), (__v4sf)__W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_max_ps(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, 
(__v4sf)_mm_max_ps(__A, __B), (__v4sf)_mm_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_max_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_max_ps(__A, __B), (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_max_ps(__mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_max_ps(__A, __B), (__v8sf)_mm256_setzero_ps()); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_min_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_min_pd(__A, __B), (__v2df)__W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_min_pd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_min_pd(__A, __B), (__v2df)_mm_setzero_pd()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_min_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_min_pd(__A, __B), (__v4df)__W); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_min_pd(__mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_min_pd(__A, __B), (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_min_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_min_ps(__A, __B), (__v4sf)__W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_min_ps(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_min_ps(__A, __B), (__v4sf)_mm_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_min_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_min_ps(__A, __B), (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_min_ps(__mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_min_ps(__A, __B), (__v8sf)_mm256_setzero_ps()); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_mul_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_mul_pd(__A, __B), (__v2df)__W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_mul_pd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)_mm_mul_pd(__A, __B), (__v2df)_mm_setzero_pd()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_mul_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { return 
(__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_mul_pd(__A, __B), (__v4df)__W); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_mul_pd(__mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)_mm256_mul_pd(__A, __B), (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_mul_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_mul_ps(__A, __B), (__v4sf)__W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_mul_ps(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)_mm_mul_ps(__A, __B), (__v4sf)_mm_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_mul_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_mul_ps(__A, __B), (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_mul_ps(__mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)_mm256_mul_ps(__A, __B), (__v8sf)_mm256_setzero_ps()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_abs_epi32(__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, (__v4si)_mm_abs_epi32(__A), (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_abs_epi32(__mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, (__v4si)_mm_abs_epi32(__A), (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_abs_epi32(__m256i __W, __mmask8 __U, __m256i __A) { - return (__m256i)__builtin_ia32_selectd_256((__mmask16)__U, + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, (__v8si)_mm256_abs_epi32(__A), (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_abs_epi32(__mmask8 __U, __m256i __A) { - return (__m256i)__builtin_ia32_selectd_256((__mmask16)__U, + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, (__v8si)_mm256_abs_epi32(__A), (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_abs_epi64 (__m128i __A) { - return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) -1); + return (__m128i)__builtin_ia32_pabsq128((__v2di)__A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_abs_epi64 (__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A, - (__v2di) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_abs_epi64(__A), + (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_abs_epi64 (__mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A, - (__v2di) - _mm_setzero_si128 (), - 
(__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_abs_epi64(__A), + (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_abs_epi64 (__m256i __A) { - return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) -1); + return (__m256i)__builtin_ia32_pabsq256 ((__v4di)__A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_abs_epi64 (__m256i __W, __mmask8 __U, __m256i __A) { - return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A, - (__v4di) __W, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_abs_epi64(__A), + (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_abs_epi64 (__mmask8 __U, __m256i __A) { - return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_abs_epi64(__A), + (__v4di)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_max_epi32(__mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, (__v4si)_mm_max_epi32(__A, __B), (__v4si)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_max_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, (__v4si)_mm_max_epi32(__A, __B), (__v4si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_max_epi32(__mmask8 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, (__v8si)_mm256_max_epi32(__A, __B), (__v8si)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_max_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, (__v8si)_mm256_max_epi32(__A, __B), (__v8si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_max_epi64 (__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pmaxsq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_si128 (), - __M); +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_max_epi64 (__m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_pmaxsq128((__v2di)__A, (__v2di)__B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_max_epi64 (__m128i __W, __mmask8 __M, __m128i __A, - __m128i __B) { - return (__m128i) __builtin_ia32_pmaxsq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) __W, __M); +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_max_epi64 (__mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, + (__v2di)_mm_max_epi64(__A, __B), + (__v2di)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_max_epi64 (__m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pmaxsq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) -1); +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_max_epi64 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { 
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, + (__v2di)_mm_max_epi64(__A, __B), + (__v2di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_max_epi64 (__mmask8 __M, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) - _mm256_setzero_si256 (), - __M); +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_max_epi64 (__m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_pmaxsq256((__v4di)__A, (__v4di)__B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_max_epi64 (__m256i __W, __mmask8 __M, __m256i __A, - __m256i __B) { - return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) __W, __M); +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_max_epi64 (__mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, + (__v4di)_mm256_max_epi64(__A, __B), + (__v4di)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_max_epi64 (__m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) -1); +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_max_epi64 (__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, + (__v4di)_mm256_max_epi64(__A, __B), + (__v4di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_max_epu32(__mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, (__v4si)_mm_max_epu32(__A, __B), (__v4si)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_max_epu32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, (__v4si)_mm_max_epu32(__A, __B), (__v4si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_max_epu32(__mmask8 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, (__v8si)_mm256_max_epu32(__A, __B), (__v8si)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_max_epu32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, (__v8si)_mm256_max_epu32(__A, __B), (__v8si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_max_epu64 (__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_si128 (), - __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_max_epu64 (__m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) -1); + return (__m128i)__builtin_ia32_pmaxuq128((__v2di)__A, (__v2di)__B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_max_epu64 (__m128i __W, __mmask8 __M, __m128i __A, - __m128i __B) { - return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) __W, __M); +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_max_epu64 (__mmask8 __M, __m128i __A, __m128i __B) { + return 
(__m128i)__builtin_ia32_selectq_128((__mmask8)__M, + (__v2di)_mm_max_epu64(__A, __B), + (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_max_epu64 (__mmask8 __M, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) - _mm256_setzero_si256 (), - __M); +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_max_epu64 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, + (__v2di)_mm_max_epu64(__A, __B), + (__v2di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epu64 (__m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) -1); + return (__m256i)__builtin_ia32_pmaxuq256((__v4di)__A, (__v4di)__B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_max_epu64 (__m256i __W, __mmask8 __M, __m256i __A, - __m256i __B) { - return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) __W, __M); +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_max_epu64 (__mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, + (__v4di)_mm256_max_epu64(__A, __B), + (__v4di)_mm256_setzero_si256()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_max_epu64 (__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, + (__v4di)_mm256_max_epu64(__A, __B), + (__v4di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_min_epi32(__mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, (__v4si)_mm_min_epi32(__A, __B), (__v4si)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_min_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, (__v4si)_mm_min_epi32(__A, __B), (__v4si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_min_epi32(__mmask8 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, (__v8si)_mm256_min_epi32(__A, __B), (__v8si)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_min_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, (__v8si)_mm256_min_epi32(__A, __B), (__v8si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_min_epi64 (__m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) -1); + return (__m128i)__builtin_ia32_pminsq128((__v2di)__A, (__v2di)__B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_min_epi64 (__m128i __W, __mmask8 __M, __m128i __A, - __m128i __B) { - return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) __W, __M); +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_min_epi64 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, 
+ (__v2di)_mm_min_epi64(__A, __B), + (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_min_epi64 (__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_si128 (), - __M); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, + (__v2di)_mm_min_epi64(__A, __B), + (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epi64 (__m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) -1); + return (__m256i)__builtin_ia32_pminsq256((__v4di)__A, (__v4di)__B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_min_epi64 (__m256i __W, __mmask8 __M, __m256i __A, - __m256i __B) { - return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) __W, __M); +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_min_epi64 (__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, + (__v4di)_mm256_min_epi64(__A, __B), + (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_min_epi64 (__mmask8 __M, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) - _mm256_setzero_si256 (), - __M); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, + (__v4di)_mm256_min_epi64(__A, __B), + (__v4di)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_min_epu32(__mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, (__v4si)_mm_min_epu32(__A, __B), (__v4si)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_min_epu32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, (__v4si)_mm_min_epu32(__A, __B), (__v4si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_min_epu32(__mmask8 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, (__v8si)_mm256_min_epu32(__A, __B), (__v8si)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_min_epu32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, (__v8si)_mm256_min_epu32(__A, __B), (__v8si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_min_epu64 (__m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) -1); + return (__m128i)__builtin_ia32_pminuq128((__v2di)__A, (__v2di)__B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_min_epu64 (__m128i __W, __mmask8 __M, __m128i __A, - __m128i __B) { - return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) __W, __M); +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_min_epu64 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { + return 
(__m128i)__builtin_ia32_selectq_128((__mmask8)__M, + (__v2di)_mm_min_epu64(__A, __B), + (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_min_epu64 (__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_si128 (), - __M); + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, + (__v2di)_mm_min_epu64(__A, __B), + (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epu64 (__m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) -1); + return (__m256i)__builtin_ia32_pminuq256((__v4di)__A, (__v4di)__B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_min_epu64 (__m256i __W, __mmask8 __M, __m256i __A, - __m256i __B) { - return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) __W, __M); +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_min_epu64 (__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, + (__v4di)_mm256_min_epu64(__A, __B), + (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_min_epu64 (__mmask8 __M, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) - _mm256_setzero_si256 (), - __M); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, + (__v4di)_mm256_min_epu64(__A, __B), + (__v4di)_mm256_setzero_si256()); } -#define _mm_roundscale_pd(A, imm) __extension__ ({ \ +#define _mm_roundscale_pd(A, imm) \ (__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \ (int)(imm), \ (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm_mask_roundscale_pd(W, U, A, imm) __extension__ ({ \ +#define _mm_mask_roundscale_pd(W, U, A, imm) \ (__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \ (int)(imm), \ (__v2df)(__m128d)(W), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm_maskz_roundscale_pd(U, A, imm) __extension__ ({ \ +#define _mm_maskz_roundscale_pd(U, A, imm) \ (__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \ (int)(imm), \ (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm256_roundscale_pd(A, imm) __extension__ ({ \ +#define _mm256_roundscale_pd(A, imm) \ (__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \ (int)(imm), \ (__v4df)_mm256_setzero_pd(), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm256_mask_roundscale_pd(W, U, A, imm) __extension__ ({ \ +#define _mm256_mask_roundscale_pd(W, U, A, imm) \ (__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \ (int)(imm), \ (__v4df)(__m256d)(W), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm256_maskz_roundscale_pd(U, A, imm) __extension__ ({ \ +#define _mm256_maskz_roundscale_pd(U, A, imm) \ (__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \ (int)(imm), \ (__v4df)_mm256_setzero_pd(), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm_roundscale_ps(A, imm) __extension__ ({ \ +#define _mm_roundscale_ps(A, imm) \ (__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \ (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm_mask_roundscale_ps(W, U, 
A, imm) __extension__ ({ \ +#define _mm_mask_roundscale_ps(W, U, A, imm) \ (__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \ (__v4sf)(__m128)(W), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm_maskz_roundscale_ps(U, A, imm) __extension__ ({ \ +#define _mm_maskz_roundscale_ps(U, A, imm) \ (__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \ (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm256_roundscale_ps(A, imm) __extension__ ({ \ +#define _mm256_roundscale_ps(A, imm) \ (__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \ (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm256_mask_roundscale_ps(W, U, A, imm) __extension__ ({ \ +#define _mm256_mask_roundscale_ps(W, U, A, imm) \ (__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \ (__v8sf)(__m256)(W), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm256_maskz_roundscale_ps(U, A, imm) __extension__ ({ \ +#define _mm256_maskz_roundscale_ps(U, A, imm) \ (__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \ (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_scalef_pd (__m128d __A, __m128d __B) { return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A, (__v2df) __B, @@ -3310,7 +3290,7 @@ _mm_scalef_pd (__m128d __A, __m128d __B) { (__mmask8) -1); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_scalef_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A, @@ -3319,7 +3299,7 @@ _mm_mask_scalef_pd (__m128d __W, __mmask8 __U, __m128d __A, (__mmask8) __U); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_scalef_pd (__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A, (__v2df) __B, @@ -3328,7 +3308,7 @@ _mm_maskz_scalef_pd (__mmask8 __U, __m128d __A, __m128d __B) { (__mmask8) __U); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_scalef_pd (__m256d __A, __m256d __B) { return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A, (__v4df) __B, @@ -3337,7 +3317,7 @@ _mm256_scalef_pd (__m256d __A, __m256d __B) { (__mmask8) -1); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_scalef_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A, @@ -3346,7 +3326,7 @@ _mm256_mask_scalef_pd (__m256d __W, __mmask8 __U, __m256d __A, (__mmask8) __U); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_scalef_pd (__mmask8 __U, __m256d __A, __m256d __B) { return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A, (__v4df) __B, @@ -3355,7 +3335,7 @@ _mm256_maskz_scalef_pd (__mmask8 __U, __m256d __A, __m256d __B) { (__mmask8) __U); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_scalef_ps (__m128 __A, __m128 __B) { return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A, (__v4sf) __B, @@ -3364,7 +3344,7 @@ _mm_scalef_ps (__m128 __A, __m128 __B) { (__mmask8) -1); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ 
__m128 __DEFAULT_FN_ATTRS128 _mm_mask_scalef_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A, (__v4sf) __B, @@ -3372,7 +3352,7 @@ _mm_mask_scalef_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { (__mmask8) __U); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_scalef_ps (__mmask8 __U, __m128 __A, __m128 __B) { return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A, (__v4sf) __B, @@ -3381,7 +3361,7 @@ _mm_maskz_scalef_ps (__mmask8 __U, __m128 __A, __m128 __B) { (__mmask8) __U); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_scalef_ps (__m256 __A, __m256 __B) { return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A, (__v8sf) __B, @@ -3390,7 +3370,7 @@ _mm256_scalef_ps (__m256 __A, __m256 __B) { (__mmask8) -1); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_scalef_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A, @@ -3399,7 +3379,7 @@ _mm256_mask_scalef_ps (__m256 __W, __mmask8 __U, __m256 __A, (__mmask8) __U); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A, (__v8sf) __B, @@ -3408,1160 +3388,1027 @@ _mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { (__mmask8) __U); } -#define _mm_i64scatter_pd(addr, index, v1, scale) __extension__ ({ \ +#define _mm_i64scatter_pd(addr, index, v1, scale) \ __builtin_ia32_scatterdiv2df((double *)(addr), (__mmask8)-1, \ (__v2di)(__m128i)(index), \ - (__v2df)(__m128d)(v1), (int)(scale)); }) + (__v2df)(__m128d)(v1), (int)(scale)) -#define _mm_mask_i64scatter_pd(addr, mask, index, v1, scale) __extension__ ({ \ +#define _mm_mask_i64scatter_pd(addr, mask, index, v1, scale) \ __builtin_ia32_scatterdiv2df((double *)(addr), (__mmask8)(mask), \ (__v2di)(__m128i)(index), \ - (__v2df)(__m128d)(v1), (int)(scale)); }) + (__v2df)(__m128d)(v1), (int)(scale)) -#define _mm_i64scatter_epi64(addr, index, v1, scale) __extension__ ({ \ +#define _mm_i64scatter_epi64(addr, index, v1, scale) \ __builtin_ia32_scatterdiv2di((long long *)(addr), (__mmask8)-1, \ (__v2di)(__m128i)(index), \ - (__v2di)(__m128i)(v1), (int)(scale)); }) + (__v2di)(__m128i)(v1), (int)(scale)) -#define _mm_mask_i64scatter_epi64(addr, mask, index, v1, scale) __extension__ ({ \ +#define _mm_mask_i64scatter_epi64(addr, mask, index, v1, scale) \ __builtin_ia32_scatterdiv2di((long long *)(addr), (__mmask8)(mask), \ (__v2di)(__m128i)(index), \ - (__v2di)(__m128i)(v1), (int)(scale)); }) + (__v2di)(__m128i)(v1), (int)(scale)) -#define _mm256_i64scatter_pd(addr, index, v1, scale) __extension__ ({ \ +#define _mm256_i64scatter_pd(addr, index, v1, scale) \ __builtin_ia32_scatterdiv4df((double *)(addr), (__mmask8)-1, \ (__v4di)(__m256i)(index), \ - (__v4df)(__m256d)(v1), (int)(scale)); }) + (__v4df)(__m256d)(v1), (int)(scale)) -#define _mm256_mask_i64scatter_pd(addr, mask, index, v1, scale) __extension__ ({ \ +#define _mm256_mask_i64scatter_pd(addr, mask, index, v1, scale) \ __builtin_ia32_scatterdiv4df((double *)(addr), (__mmask8)(mask), \ (__v4di)(__m256i)(index), \ - (__v4df)(__m256d)(v1), (int)(scale)); }) + (__v4df)(__m256d)(v1), (int)(scale)) -#define _mm256_i64scatter_epi64(addr, index, v1, scale) __extension__ 
({ \ +#define _mm256_i64scatter_epi64(addr, index, v1, scale) \ __builtin_ia32_scatterdiv4di((long long *)(addr), (__mmask8)-1, \ (__v4di)(__m256i)(index), \ - (__v4di)(__m256i)(v1), (int)(scale)); }) + (__v4di)(__m256i)(v1), (int)(scale)) -#define _mm256_mask_i64scatter_epi64(addr, mask, index, v1, scale) __extension__ ({ \ +#define _mm256_mask_i64scatter_epi64(addr, mask, index, v1, scale) \ __builtin_ia32_scatterdiv4di((long long *)(addr), (__mmask8)(mask), \ (__v4di)(__m256i)(index), \ - (__v4di)(__m256i)(v1), (int)(scale)); }) + (__v4di)(__m256i)(v1), (int)(scale)) -#define _mm_i64scatter_ps(addr, index, v1, scale) __extension__ ({ \ +#define _mm_i64scatter_ps(addr, index, v1, scale) \ __builtin_ia32_scatterdiv4sf((float *)(addr), (__mmask8)-1, \ (__v2di)(__m128i)(index), (__v4sf)(__m128)(v1), \ - (int)(scale)); }) + (int)(scale)) -#define _mm_mask_i64scatter_ps(addr, mask, index, v1, scale) __extension__ ({ \ +#define _mm_mask_i64scatter_ps(addr, mask, index, v1, scale) \ __builtin_ia32_scatterdiv4sf((float *)(addr), (__mmask8)(mask), \ (__v2di)(__m128i)(index), (__v4sf)(__m128)(v1), \ - (int)(scale)); }) + (int)(scale)) -#define _mm_i64scatter_epi32(addr, index, v1, scale) __extension__ ({ \ +#define _mm_i64scatter_epi32(addr, index, v1, scale) \ __builtin_ia32_scatterdiv4si((int *)(addr), (__mmask8)-1, \ (__v2di)(__m128i)(index), \ - (__v4si)(__m128i)(v1), (int)(scale)); }) + (__v4si)(__m128i)(v1), (int)(scale)) -#define _mm_mask_i64scatter_epi32(addr, mask, index, v1, scale) __extension__ ({ \ +#define _mm_mask_i64scatter_epi32(addr, mask, index, v1, scale) \ __builtin_ia32_scatterdiv4si((int *)(addr), (__mmask8)(mask), \ (__v2di)(__m128i)(index), \ - (__v4si)(__m128i)(v1), (int)(scale)); }) + (__v4si)(__m128i)(v1), (int)(scale)) -#define _mm256_i64scatter_ps(addr, index, v1, scale) __extension__ ({ \ +#define _mm256_i64scatter_ps(addr, index, v1, scale) \ __builtin_ia32_scatterdiv8sf((float *)(addr), (__mmask8)-1, \ (__v4di)(__m256i)(index), (__v4sf)(__m128)(v1), \ - (int)(scale)); }) + (int)(scale)) -#define _mm256_mask_i64scatter_ps(addr, mask, index, v1, scale) __extension__ ({ \ +#define _mm256_mask_i64scatter_ps(addr, mask, index, v1, scale) \ __builtin_ia32_scatterdiv8sf((float *)(addr), (__mmask8)(mask), \ (__v4di)(__m256i)(index), (__v4sf)(__m128)(v1), \ - (int)(scale)); }) + (int)(scale)) -#define _mm256_i64scatter_epi32(addr, index, v1, scale) __extension__ ({ \ +#define _mm256_i64scatter_epi32(addr, index, v1, scale) \ __builtin_ia32_scatterdiv8si((int *)(addr), (__mmask8)-1, \ (__v4di)(__m256i)(index), \ - (__v4si)(__m128i)(v1), (int)(scale)); }) + (__v4si)(__m128i)(v1), (int)(scale)) -#define _mm256_mask_i64scatter_epi32(addr, mask, index, v1, scale) __extension__ ({ \ +#define _mm256_mask_i64scatter_epi32(addr, mask, index, v1, scale) \ __builtin_ia32_scatterdiv8si((int *)(addr), (__mmask8)(mask), \ (__v4di)(__m256i)(index), \ - (__v4si)(__m128i)(v1), (int)(scale)); }) + (__v4si)(__m128i)(v1), (int)(scale)) -#define _mm_i32scatter_pd(addr, index, v1, scale) __extension__ ({ \ +#define _mm_i32scatter_pd(addr, index, v1, scale) \ __builtin_ia32_scattersiv2df((double *)(addr), (__mmask8)-1, \ (__v4si)(__m128i)(index), \ - (__v2df)(__m128d)(v1), (int)(scale)); }) - -#define _mm_mask_i32scatter_pd(addr, mask, index, v1, scale) __extension__ ({ \ - __builtin_ia32_scattersiv2df((double *)(addr), (__mmask8)(mask), \ - (__v4si)(__m128i)(index), \ - (__v2df)(__m128d)(v1), (int)(scale)); }) - -#define _mm_i32scatter_epi64(addr, index, v1, scale) __extension__ ({ \ - 
__builtin_ia32_scattersiv2di((long long *)(addr), (__mmask8)-1, \ - (__v4si)(__m128i)(index), \ - (__v2di)(__m128i)(v1), (int)(scale)); }) - -#define _mm_mask_i32scatter_epi64(addr, mask, index, v1, scale) __extension__ ({ \ - __builtin_ia32_scattersiv2di((long long *)(addr), (__mmask8)(mask), \ - (__v4si)(__m128i)(index), \ - (__v2di)(__m128i)(v1), (int)(scale)); }) - -#define _mm256_i32scatter_pd(addr, index, v1, scale) __extension__ ({ \ - __builtin_ia32_scattersiv4df((double *)(addr), (__mmask8)-1, \ - (__v4si)(__m128i)(index), \ - (__v4df)(__m256d)(v1), (int)(scale)); }) - -#define _mm256_mask_i32scatter_pd(addr, mask, index, v1, scale) __extension__ ({ \ - __builtin_ia32_scattersiv4df((double *)(addr), (__mmask8)(mask), \ - (__v4si)(__m128i)(index), \ - (__v4df)(__m256d)(v1), (int)(scale)); }) - -#define _mm256_i32scatter_epi64(addr, index, v1, scale) __extension__ ({ \ - __builtin_ia32_scattersiv4di((long long *)(addr), (__mmask8)-1, \ - (__v4si)(__m128i)(index), \ - (__v4di)(__m256i)(v1), (int)(scale)); }) - -#define _mm256_mask_i32scatter_epi64(addr, mask, index, v1, scale) __extension__ ({ \ - __builtin_ia32_scattersiv4di((long long *)(addr), (__mmask8)(mask), \ - (__v4si)(__m128i)(index), \ - (__v4di)(__m256i)(v1), (int)(scale)); }) - -#define _mm_i32scatter_ps(addr, index, v1, scale) __extension__ ({ \ - __builtin_ia32_scattersiv4sf((float *)(addr), (__mmask8)-1, \ - (__v4si)(__m128i)(index), (__v4sf)(__m128)(v1), \ - (int)(scale)); }) - -#define _mm_mask_i32scatter_ps(addr, mask, index, v1, scale) __extension__ ({ \ - __builtin_ia32_scattersiv4sf((float *)(addr), (__mmask8)(mask), \ - (__v4si)(__m128i)(index), (__v4sf)(__m128)(v1), \ - (int)(scale)); }) - -#define _mm_i32scatter_epi32(addr, index, v1, scale) __extension__ ({ \ - __builtin_ia32_scattersiv4si((int *)(addr), (__mmask8)-1, \ - (__v4si)(__m128i)(index), \ - (__v4si)(__m128i)(v1), (int)(scale)); }) - -#define _mm_mask_i32scatter_epi32(addr, mask, index, v1, scale) __extension__ ({ \ - __builtin_ia32_scattersiv4si((int *)(addr), (__mmask8)(mask), \ - (__v4si)(__m128i)(index), \ - (__v4si)(__m128i)(v1), (int)(scale)); }) - -#define _mm256_i32scatter_ps(addr, index, v1, scale) __extension__ ({ \ - __builtin_ia32_scattersiv8sf((float *)(addr), (__mmask8)-1, \ - (__v8si)(__m256i)(index), (__v8sf)(__m256)(v1), \ - (int)(scale)); }) - -#define _mm256_mask_i32scatter_ps(addr, mask, index, v1, scale) __extension__ ({ \ - __builtin_ia32_scattersiv8sf((float *)(addr), (__mmask8)(mask), \ - (__v8si)(__m256i)(index), (__v8sf)(__m256)(v1), \ - (int)(scale)); }) - -#define _mm256_i32scatter_epi32(addr, index, v1, scale) __extension__ ({ \ - __builtin_ia32_scattersiv8si((int *)(addr), (__mmask8)-1, \ - (__v8si)(__m256i)(index), \ - (__v8si)(__m256i)(v1), (int)(scale)); }) - -#define _mm256_mask_i32scatter_epi32(addr, mask, index, v1, scale) __extension__ ({ \ - __builtin_ia32_scattersiv8si((int *)(addr), (__mmask8)(mask), \ - (__v8si)(__m256i)(index), \ - (__v8si)(__m256i)(v1), (int)(scale)); }) - -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_mask_sqrt_pd(__m128d __W, __mmask8 __U, __m128d __A) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_sqrt_pd(__A), - (__v2df)__W); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_maskz_sqrt_pd(__mmask8 __U, __m128d __A) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_sqrt_pd(__A), - (__v2df)_mm_setzero_pd()); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS -_mm256_mask_sqrt_pd(__m256d __W, __mmask8 __U, __m256d __A) { - 
return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_sqrt_pd(__A), - (__v4df)__W); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS -_mm256_maskz_sqrt_pd(__mmask8 __U, __m256d __A) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_sqrt_pd(__A), - (__v4df)_mm256_setzero_pd()); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_mask_sqrt_ps(__m128 __W, __mmask8 __U, __m128 __A) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_sqrt_ps(__A), - (__v4sf)__W); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_maskz_sqrt_ps(__mmask8 __U, __m128 __A) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_sqrt_ps(__A), - (__v4sf)_mm_setzero_pd()); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_mask_sqrt_ps(__m256 __W, __mmask8 __U, __m256 __A) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_sqrt_ps(__A), - (__v8sf)__W); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_maskz_sqrt_ps(__mmask8 __U, __m256 __A) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_sqrt_ps(__A), - (__v8sf)_mm256_setzero_ps()); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_mask_sub_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_sub_pd(__A, __B), - (__v2df)__W); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_maskz_sub_pd(__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_sub_pd(__A, __B), - (__v2df)_mm_setzero_pd()); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS -_mm256_mask_sub_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_sub_pd(__A, __B), - (__v4df)__W); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS -_mm256_maskz_sub_pd(__mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_sub_pd(__A, __B), - (__v4df)_mm256_setzero_pd()); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_mask_sub_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_sub_ps(__A, __B), - (__v4sf)__W); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_maskz_sub_ps(__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_sub_ps(__A, __B), - (__v4sf)_mm_setzero_ps()); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_mask_sub_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_sub_ps(__A, __B), - (__v8sf)__W); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_maskz_sub_ps(__mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_sub_ps(__A, __B), - (__v8sf)_mm256_setzero_ps()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask2_permutex2var_epi32 (__m128i __A, __m128i __I, __mmask8 __U, - __m128i __B) { - return (__m128i) __builtin_ia32_vpermi2vard128_mask ((__v4si) __A, - (__v4si) __I - /* idx */ , - (__v4si) __B, - (__mmask8) __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask2_permutex2var_epi32 (__m256i __A, __m256i __I, - __mmask8 __U, __m256i __B) { - return (__m256i) __builtin_ia32_vpermi2vard256_mask ((__v8si) __A, 
- (__v8si) __I - /* idx */ , - (__v8si) __B, - (__mmask8) __U); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_mask2_permutex2var_pd (__m128d __A, __m128i __I, __mmask8 __U, - __m128d __B) { - return (__m128d) __builtin_ia32_vpermi2varpd128_mask ((__v2df) __A, - (__v2di) __I - /* idx */ , - (__v2df) __B, - (__mmask8) - __U); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS -_mm256_mask2_permutex2var_pd (__m256d __A, __m256i __I, __mmask8 __U, - __m256d __B) { - return (__m256d) __builtin_ia32_vpermi2varpd256_mask ((__v4df) __A, - (__v4di) __I - /* idx */ , - (__v4df) __B, - (__mmask8) - __U); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_mask2_permutex2var_ps (__m128 __A, __m128i __I, __mmask8 __U, - __m128 __B) { - return (__m128) __builtin_ia32_vpermi2varps128_mask ((__v4sf) __A, - (__v4si) __I - /* idx */ , - (__v4sf) __B, - (__mmask8) __U); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_mask2_permutex2var_ps (__m256 __A, __m256i __I, __mmask8 __U, - __m256 __B) { - return (__m256) __builtin_ia32_vpermi2varps256_mask ((__v8sf) __A, - (__v8si) __I - /* idx */ , - (__v8sf) __B, - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask2_permutex2var_epi64 (__m128i __A, __m128i __I, __mmask8 __U, - __m128i __B) { - return (__m128i) __builtin_ia32_vpermi2varq128_mask ((__v2di) __A, - (__v2di) __I - /* idx */ , - (__v2di) __B, - (__mmask8) __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask2_permutex2var_epi64 (__m256i __A, __m256i __I, - __mmask8 __U, __m256i __B) { - return (__m256i) __builtin_ia32_vpermi2varq256_mask ((__v4di) __A, - (__v4di) __I - /* idx */ , - (__v4di) __B, - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_permutex2var_epi32 (__m128i __A, __m128i __I, __m128i __B) { - return (__m128i) __builtin_ia32_vpermt2vard128_mask ((__v4si) __I - /* idx */ , - (__v4si) __A, - (__v4si) __B, - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_permutex2var_epi32 (__m128i __A, __mmask8 __U, __m128i __I, - __m128i __B) { - return (__m128i) __builtin_ia32_vpermt2vard128_mask ((__v4si) __I - /* idx */ , - (__v4si) __A, - (__v4si) __B, - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_permutex2var_epi32 (__mmask8 __U, __m128i __A, __m128i __I, - __m128i __B) { - return (__m128i) __builtin_ia32_vpermt2vard128_maskz ((__v4si) __I - /* idx */ , - (__v4si) __A, - (__v4si) __B, - (__mmask8) - __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_permutex2var_epi32 (__m256i __A, __m256i __I, __m256i __B) { - return (__m256i) __builtin_ia32_vpermt2vard256_mask ((__v8si) __I - /* idx */ , - (__v8si) __A, - (__v8si) __B, - (__mmask8) -1); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_permutex2var_epi32 (__m256i __A, __mmask8 __U, __m256i __I, - __m256i __B) { - return (__m256i) __builtin_ia32_vpermt2vard256_mask ((__v8si) __I - /* idx */ , - (__v8si) __A, - (__v8si) __B, - (__mmask8) __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_permutex2var_epi32 (__mmask8 __U, __m256i __A, - __m256i __I, __m256i __B) { - return (__m256i) __builtin_ia32_vpermt2vard256_maskz ((__v8si) __I - /* idx */ , - (__v8si) __A, - (__v8si) __B, - (__mmask8) - __U); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_permutex2var_pd (__m128d __A, __m128i __I, __m128d __B) { - return (__m128d) __builtin_ia32_vpermt2varpd128_mask ((__v2di) __I - /* idx */ , - (__v2df) __A, - (__v2df) __B, - (__mmask8) - - 1); -} - -static 
__inline__ __m128d __DEFAULT_FN_ATTRS -_mm_mask_permutex2var_pd (__m128d __A, __mmask8 __U, __m128i __I, - __m128d __B) { - return (__m128d) __builtin_ia32_vpermt2varpd128_mask ((__v2di) __I - /* idx */ , - (__v2df) __A, - (__v2df) __B, - (__mmask8) - __U); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_maskz_permutex2var_pd (__mmask8 __U, __m128d __A, __m128i __I, - __m128d __B) { - return (__m128d) __builtin_ia32_vpermt2varpd128_maskz ((__v2di) __I - /* idx */ , - (__v2df) __A, - (__v2df) __B, - (__mmask8) - __U); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS -_mm256_permutex2var_pd (__m256d __A, __m256i __I, __m256d __B) { - return (__m256d) __builtin_ia32_vpermt2varpd256_mask ((__v4di) __I - /* idx */ , - (__v4df) __A, - (__v4df) __B, - (__mmask8) - - 1); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS -_mm256_mask_permutex2var_pd (__m256d __A, __mmask8 __U, __m256i __I, - __m256d __B) { - return (__m256d) __builtin_ia32_vpermt2varpd256_mask ((__v4di) __I - /* idx */ , - (__v4df) __A, - (__v4df) __B, - (__mmask8) - __U); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS -_mm256_maskz_permutex2var_pd (__mmask8 __U, __m256d __A, __m256i __I, - __m256d __B) { - return (__m256d) __builtin_ia32_vpermt2varpd256_maskz ((__v4di) __I - /* idx */ , - (__v4df) __A, - (__v4df) __B, - (__mmask8) - __U); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_permutex2var_ps (__m128 __A, __m128i __I, __m128 __B) { - return (__m128) __builtin_ia32_vpermt2varps128_mask ((__v4si) __I - /* idx */ , - (__v4sf) __A, - (__v4sf) __B, - (__mmask8) -1); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_mask_permutex2var_ps (__m128 __A, __mmask8 __U, __m128i __I, - __m128 __B) { - return (__m128) __builtin_ia32_vpermt2varps128_mask ((__v4si) __I - /* idx */ , - (__v4sf) __A, - (__v4sf) __B, - (__mmask8) __U); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_maskz_permutex2var_ps (__mmask8 __U, __m128 __A, __m128i __I, - __m128 __B) { - return (__m128) __builtin_ia32_vpermt2varps128_maskz ((__v4si) __I - /* idx */ , - (__v4sf) __A, - (__v4sf) __B, - (__mmask8) - __U); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_permutex2var_ps (__m256 __A, __m256i __I, __m256 __B) { - return (__m256) __builtin_ia32_vpermt2varps256_mask ((__v8si) __I - /* idx */ , - (__v8sf) __A, - (__v8sf) __B, - (__mmask8) -1); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_mask_permutex2var_ps (__m256 __A, __mmask8 __U, __m256i __I, - __m256 __B) { - return (__m256) __builtin_ia32_vpermt2varps256_mask ((__v8si) __I - /* idx */ , - (__v8sf) __A, - (__v8sf) __B, - (__mmask8) __U); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_maskz_permutex2var_ps (__mmask8 __U, __m256 __A, __m256i __I, - __m256 __B) { - return (__m256) __builtin_ia32_vpermt2varps256_maskz ((__v8si) __I - /* idx */ , - (__v8sf) __A, - (__v8sf) __B, - (__mmask8) - __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_permutex2var_epi64 (__m128i __A, __m128i __I, __m128i __B) { - return (__m128i) __builtin_ia32_vpermt2varq128_mask ((__v2di) __I - /* idx */ , - (__v2di) __A, - (__v2di) __B, - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_permutex2var_epi64 (__m128i __A, __mmask8 __U, __m128i __I, - __m128i __B) { - return (__m128i) __builtin_ia32_vpermt2varq128_mask ((__v2di) __I - /* idx */ , - (__v2di) __A, - (__v2di) __B, - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_permutex2var_epi64 (__mmask8 __U, __m128i __A, __m128i __I, - __m128i __B) 
{ - return (__m128i) __builtin_ia32_vpermt2varq128_maskz ((__v2di) __I - /* idx */ , - (__v2di) __A, - (__v2di) __B, - (__mmask8) - __U); -} - - -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_permutex2var_epi64 (__m256i __A, __m256i __I, __m256i __B) { - return (__m256i) __builtin_ia32_vpermt2varq256_mask ((__v4di) __I - /* idx */ , - (__v4di) __A, - (__v4di) __B, - (__mmask8) -1); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_permutex2var_epi64 (__m256i __A, __mmask8 __U, __m256i __I, - __m256i __B) { - return (__m256i) __builtin_ia32_vpermt2varq256_mask ((__v4di) __I - /* idx */ , - (__v4di) __A, - (__v4di) __B, - (__mmask8) __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_permutex2var_epi64 (__mmask8 __U, __m256i __A, - __m256i __I, __m256i __B) { - return (__m256i) __builtin_ia32_vpermt2varq256_maskz ((__v4di) __I - /* idx */ , - (__v4di) __A, - (__v4di) __B, - (__mmask8) - __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_cvtepi8_epi32(__m128i __W, __mmask8 __U, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_cvtepi8_epi32(__A), - (__v4si)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_cvtepi8_epi32(__mmask8 __U, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_cvtepi8_epi32(__A), - (__v4si)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_cvtepi8_epi32 (__m256i __W, __mmask8 __U, __m128i __A) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_cvtepi8_epi32(__A), - (__v8si)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_cvtepi8_epi32 (__mmask8 __U, __m128i __A) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_cvtepi8_epi32(__A), - (__v8si)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_cvtepi8_epi64(__m128i __W, __mmask8 __U, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_cvtepi8_epi64(__A), - (__v2di)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_cvtepi8_epi64(__A), - (__v2di)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_cvtepi8_epi64(__m256i __W, __mmask8 __U, __m128i __A) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_cvtepi8_epi64(__A), - (__v4di)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_cvtepi8_epi64(__A), - (__v4di)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_cvtepi32_epi64(__m128i __W, __mmask8 __U, __m128i __X) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_cvtepi32_epi64(__X), - (__v2di)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_cvtepi32_epi64(__X), - (__v2di)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_cvtepi32_epi64(__m256i __W, __mmask8 __U, __m128i __X) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_cvtepi32_epi64(__X), - (__v4di)__W); -} - -static __inline__ __m256i 
__DEFAULT_FN_ATTRS -_mm256_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_cvtepi32_epi64(__X), - (__v4di)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_cvtepi16_epi32(__m128i __W, __mmask8 __U, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_cvtepi16_epi32(__A), - (__v4si)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_cvtepi16_epi32(__mmask8 __U, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_cvtepi16_epi32(__A), - (__v4si)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_cvtepi16_epi32(__m256i __W, __mmask8 __U, __m128i __A) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_cvtepi16_epi32(__A), - (__v8si)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_cvtepi16_epi32 (__mmask8 __U, __m128i __A) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_cvtepi16_epi32(__A), - (__v8si)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_cvtepi16_epi64(__m128i __W, __mmask8 __U, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_cvtepi16_epi64(__A), - (__v2di)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_cvtepi16_epi64(__A), - (__v2di)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_cvtepi16_epi64(__m256i __W, __mmask8 __U, __m128i __A) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_cvtepi16_epi64(__A), - (__v4di)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_cvtepi16_epi64(__A), - (__v4di)_mm256_setzero_si256()); -} - - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_cvtepu8_epi32(__m128i __W, __mmask8 __U, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_cvtepu8_epi32(__A), - (__v4si)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_cvtepu8_epi32(__A), - (__v4si)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_cvtepu8_epi32(__m256i __W, __mmask8 __U, __m128i __A) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_cvtepu8_epi32(__A), - (__v8si)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_cvtepu8_epi32(__A), - (__v8si)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_cvtepu8_epi64(__m128i __W, __mmask8 __U, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_cvtepu8_epi64(__A), - (__v2di)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_cvtepu8_epi64(__A), - (__v2di)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS 
-_mm256_mask_cvtepu8_epi64(__m256i __W, __mmask8 __U, __m128i __A) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_cvtepu8_epi64(__A), - (__v4di)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_cvtepu8_epi64(__A), - (__v4di)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_cvtepu32_epi64(__m128i __W, __mmask8 __U, __m128i __X) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_cvtepu32_epi64(__X), - (__v2di)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_cvtepu32_epi64(__X), - (__v2di)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_cvtepu32_epi64(__m256i __W, __mmask8 __U, __m128i __X) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_cvtepu32_epi64(__X), - (__v4di)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_cvtepu32_epi64(__X), - (__v4di)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_cvtepu16_epi32(__m128i __W, __mmask8 __U, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_cvtepu16_epi32(__A), - (__v4si)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_cvtepu16_epi32(__A), - (__v4si)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_cvtepu16_epi32(__m256i __W, __mmask8 __U, __m128i __A) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_cvtepu16_epi32(__A), - (__v8si)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_cvtepu16_epi32(__A), - (__v8si)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_cvtepu16_epi64(__m128i __W, __mmask8 __U, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_cvtepu16_epi64(__A), - (__v2di)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_cvtepu16_epi64(__A), - (__v2di)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_cvtepu16_epi64(__m256i __W, __mmask8 __U, __m128i __A) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_cvtepu16_epi64(__A), - (__v4di)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_cvtepu16_epi64(__A), - (__v4di)_mm256_setzero_si256()); -} - - -#define _mm_rol_epi32(a, b) __extension__ ({\ - (__m128i)__builtin_ia32_prold128_mask((__v4si)(__m128i)(a), (int)(b), \ - (__v4si)_mm_setzero_si128(), \ - (__mmask8)-1); }) - -#define _mm_mask_rol_epi32(w, u, a, b) __extension__ ({\ - 
(__m128i)__builtin_ia32_prold128_mask((__v4si)(__m128i)(a), (int)(b), \ - (__v4si)(__m128i)(w), (__mmask8)(u)); }) - -#define _mm_maskz_rol_epi32(u, a, b) __extension__ ({\ - (__m128i)__builtin_ia32_prold128_mask((__v4si)(__m128i)(a), (int)(b), \ - (__v4si)_mm_setzero_si128(), \ - (__mmask8)(u)); }) - -#define _mm256_rol_epi32(a, b) __extension__ ({\ - (__m256i)__builtin_ia32_prold256_mask((__v8si)(__m256i)(a), (int)(b), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)-1); }) - -#define _mm256_mask_rol_epi32(w, u, a, b) __extension__ ({\ - (__m256i)__builtin_ia32_prold256_mask((__v8si)(__m256i)(a), (int)(b), \ - (__v8si)(__m256i)(w), (__mmask8)(u)); }) - -#define _mm256_maskz_rol_epi32(u, a, b) __extension__ ({\ - (__m256i)__builtin_ia32_prold256_mask((__v8si)(__m256i)(a), (int)(b), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)(u)); }) - -#define _mm_rol_epi64(a, b) __extension__ ({\ - (__m128i)__builtin_ia32_prolq128_mask((__v2di)(__m128i)(a), (int)(b), \ - (__v2di)_mm_setzero_di(), \ - (__mmask8)-1); }) - -#define _mm_mask_rol_epi64(w, u, a, b) __extension__ ({\ - (__m128i)__builtin_ia32_prolq128_mask((__v2di)(__m128i)(a), (int)(b), \ - (__v2di)(__m128i)(w), (__mmask8)(u)); }) - -#define _mm_maskz_rol_epi64(u, a, b) __extension__ ({\ - (__m128i)__builtin_ia32_prolq128_mask((__v2di)(__m128i)(a), (int)(b), \ - (__v2di)_mm_setzero_di(), \ - (__mmask8)(u)); }) - -#define _mm256_rol_epi64(a, b) __extension__ ({\ - (__m256i)__builtin_ia32_prolq256_mask((__v4di)(__m256i)(a), (int)(b), \ - (__v4di)_mm256_setzero_si256(), \ - (__mmask8)-1); }) - -#define _mm256_mask_rol_epi64(w, u, a, b) __extension__ ({\ - (__m256i)__builtin_ia32_prolq256_mask((__v4di)(__m256i)(a), (int)(b), \ - (__v4di)(__m256i)(w), (__mmask8)(u)); }) - -#define _mm256_maskz_rol_epi64(u, a, b) __extension__ ({\ - (__m256i)__builtin_ia32_prolq256_mask((__v4di)(__m256i)(a), (int)(b), \ - (__v4di)_mm256_setzero_si256(), \ - (__mmask8)(u)); }) - -static __inline__ __m128i __DEFAULT_FN_ATTRS + (__v2df)(__m128d)(v1), (int)(scale)) + +#define _mm_mask_i32scatter_pd(addr, mask, index, v1, scale) \ + __builtin_ia32_scattersiv2df((double *)(addr), (__mmask8)(mask), \ + (__v4si)(__m128i)(index), \ + (__v2df)(__m128d)(v1), (int)(scale)) + +#define _mm_i32scatter_epi64(addr, index, v1, scale) \ + __builtin_ia32_scattersiv2di((long long *)(addr), (__mmask8)-1, \ + (__v4si)(__m128i)(index), \ + (__v2di)(__m128i)(v1), (int)(scale)) + +#define _mm_mask_i32scatter_epi64(addr, mask, index, v1, scale) \ + __builtin_ia32_scattersiv2di((long long *)(addr), (__mmask8)(mask), \ + (__v4si)(__m128i)(index), \ + (__v2di)(__m128i)(v1), (int)(scale)) + +#define _mm256_i32scatter_pd(addr, index, v1, scale) \ + __builtin_ia32_scattersiv4df((double *)(addr), (__mmask8)-1, \ + (__v4si)(__m128i)(index), \ + (__v4df)(__m256d)(v1), (int)(scale)) + +#define _mm256_mask_i32scatter_pd(addr, mask, index, v1, scale) \ + __builtin_ia32_scattersiv4df((double *)(addr), (__mmask8)(mask), \ + (__v4si)(__m128i)(index), \ + (__v4df)(__m256d)(v1), (int)(scale)) + +#define _mm256_i32scatter_epi64(addr, index, v1, scale) \ + __builtin_ia32_scattersiv4di((long long *)(addr), (__mmask8)-1, \ + (__v4si)(__m128i)(index), \ + (__v4di)(__m256i)(v1), (int)(scale)) + +#define _mm256_mask_i32scatter_epi64(addr, mask, index, v1, scale) \ + __builtin_ia32_scattersiv4di((long long *)(addr), (__mmask8)(mask), \ + (__v4si)(__m128i)(index), \ + (__v4di)(__m256i)(v1), (int)(scale)) + +#define _mm_i32scatter_ps(addr, index, v1, scale) \ + __builtin_ia32_scattersiv4sf((float *)(addr), 
(__mmask8)-1, \ + (__v4si)(__m128i)(index), (__v4sf)(__m128)(v1), \ + (int)(scale)) + +#define _mm_mask_i32scatter_ps(addr, mask, index, v1, scale) \ + __builtin_ia32_scattersiv4sf((float *)(addr), (__mmask8)(mask), \ + (__v4si)(__m128i)(index), (__v4sf)(__m128)(v1), \ + (int)(scale)) + +#define _mm_i32scatter_epi32(addr, index, v1, scale) \ + __builtin_ia32_scattersiv4si((int *)(addr), (__mmask8)-1, \ + (__v4si)(__m128i)(index), \ + (__v4si)(__m128i)(v1), (int)(scale)) + +#define _mm_mask_i32scatter_epi32(addr, mask, index, v1, scale) \ + __builtin_ia32_scattersiv4si((int *)(addr), (__mmask8)(mask), \ + (__v4si)(__m128i)(index), \ + (__v4si)(__m128i)(v1), (int)(scale)) + +#define _mm256_i32scatter_ps(addr, index, v1, scale) \ + __builtin_ia32_scattersiv8sf((float *)(addr), (__mmask8)-1, \ + (__v8si)(__m256i)(index), (__v8sf)(__m256)(v1), \ + (int)(scale)) + +#define _mm256_mask_i32scatter_ps(addr, mask, index, v1, scale) \ + __builtin_ia32_scattersiv8sf((float *)(addr), (__mmask8)(mask), \ + (__v8si)(__m256i)(index), (__v8sf)(__m256)(v1), \ + (int)(scale)) + +#define _mm256_i32scatter_epi32(addr, index, v1, scale) \ + __builtin_ia32_scattersiv8si((int *)(addr), (__mmask8)-1, \ + (__v8si)(__m256i)(index), \ + (__v8si)(__m256i)(v1), (int)(scale)) + +#define _mm256_mask_i32scatter_epi32(addr, mask, index, v1, scale) \ + __builtin_ia32_scattersiv8si((int *)(addr), (__mmask8)(mask), \ + (__v8si)(__m256i)(index), \ + (__v8si)(__m256i)(v1), (int)(scale)) + + static __inline__ __m128d __DEFAULT_FN_ATTRS128 + _mm_mask_sqrt_pd(__m128d __W, __mmask8 __U, __m128d __A) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_sqrt_pd(__A), + (__v2df)__W); + } + + static __inline__ __m128d __DEFAULT_FN_ATTRS128 + _mm_maskz_sqrt_pd(__mmask8 __U, __m128d __A) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_sqrt_pd(__A), + (__v2df)_mm_setzero_pd()); + } + + static __inline__ __m256d __DEFAULT_FN_ATTRS256 + _mm256_mask_sqrt_pd(__m256d __W, __mmask8 __U, __m256d __A) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_sqrt_pd(__A), + (__v4df)__W); + } + + static __inline__ __m256d __DEFAULT_FN_ATTRS256 + _mm256_maskz_sqrt_pd(__mmask8 __U, __m256d __A) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_sqrt_pd(__A), + (__v4df)_mm256_setzero_pd()); + } + + static __inline__ __m128 __DEFAULT_FN_ATTRS128 + _mm_mask_sqrt_ps(__m128 __W, __mmask8 __U, __m128 __A) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_sqrt_ps(__A), + (__v4sf)__W); + } + + static __inline__ __m128 __DEFAULT_FN_ATTRS128 + _mm_maskz_sqrt_ps(__mmask8 __U, __m128 __A) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_sqrt_ps(__A), + (__v4sf)_mm_setzero_ps()); + } + + static __inline__ __m256 __DEFAULT_FN_ATTRS256 + _mm256_mask_sqrt_ps(__m256 __W, __mmask8 __U, __m256 __A) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_sqrt_ps(__A), + (__v8sf)__W); + } + + static __inline__ __m256 __DEFAULT_FN_ATTRS256 + _mm256_maskz_sqrt_ps(__mmask8 __U, __m256 __A) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_sqrt_ps(__A), + (__v8sf)_mm256_setzero_ps()); + } + + static __inline__ __m128d __DEFAULT_FN_ATTRS128 + _mm_mask_sub_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_sub_pd(__A, __B), + (__v2df)__W); + } + + static __inline__ __m128d 
__DEFAULT_FN_ATTRS128 + _mm_maskz_sub_pd(__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_sub_pd(__A, __B), + (__v2df)_mm_setzero_pd()); + } + + static __inline__ __m256d __DEFAULT_FN_ATTRS256 + _mm256_mask_sub_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_sub_pd(__A, __B), + (__v4df)__W); + } + + static __inline__ __m256d __DEFAULT_FN_ATTRS256 + _mm256_maskz_sub_pd(__mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_sub_pd(__A, __B), + (__v4df)_mm256_setzero_pd()); + } + + static __inline__ __m128 __DEFAULT_FN_ATTRS128 + _mm_mask_sub_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_sub_ps(__A, __B), + (__v4sf)__W); + } + + static __inline__ __m128 __DEFAULT_FN_ATTRS128 + _mm_maskz_sub_ps(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_sub_ps(__A, __B), + (__v4sf)_mm_setzero_ps()); + } + + static __inline__ __m256 __DEFAULT_FN_ATTRS256 + _mm256_mask_sub_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_sub_ps(__A, __B), + (__v8sf)__W); + } + + static __inline__ __m256 __DEFAULT_FN_ATTRS256 + _mm256_maskz_sub_ps(__mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_sub_ps(__A, __B), + (__v8sf)_mm256_setzero_ps()); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_permutex2var_epi32(__m128i __A, __m128i __I, __m128i __B) { + return (__m128i)__builtin_ia32_vpermi2vard128((__v4si) __A, (__v4si)__I, + (__v4si)__B); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_mask_permutex2var_epi32(__m128i __A, __mmask8 __U, __m128i __I, + __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128(__U, + (__v4si)_mm_permutex2var_epi32(__A, __I, __B), + (__v4si)__A); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_mask2_permutex2var_epi32(__m128i __A, __m128i __I, __mmask8 __U, + __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128(__U, + (__v4si)_mm_permutex2var_epi32(__A, __I, __B), + (__v4si)__I); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_maskz_permutex2var_epi32(__mmask8 __U, __m128i __A, __m128i __I, + __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128(__U, + (__v4si)_mm_permutex2var_epi32(__A, __I, __B), + (__v4si)_mm_setzero_si128()); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_permutex2var_epi32(__m256i __A, __m256i __I, __m256i __B) { + return (__m256i)__builtin_ia32_vpermi2vard256((__v8si)__A, (__v8si) __I, + (__v8si) __B); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_mask_permutex2var_epi32(__m256i __A, __mmask8 __U, __m256i __I, + __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256(__U, + (__v8si)_mm256_permutex2var_epi32(__A, __I, __B), + (__v8si)__A); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_mask2_permutex2var_epi32(__m256i __A, __m256i __I, __mmask8 __U, + __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256(__U, + (__v8si)_mm256_permutex2var_epi32(__A, __I, __B), + (__v8si)__I); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_maskz_permutex2var_epi32(__mmask8 __U, __m256i __A, __m256i __I, + __m256i __B) { + return 
(__m256i)__builtin_ia32_selectd_256(__U, + (__v8si)_mm256_permutex2var_epi32(__A, __I, __B), + (__v8si)_mm256_setzero_si256()); + } + + static __inline__ __m128d __DEFAULT_FN_ATTRS128 + _mm_permutex2var_pd(__m128d __A, __m128i __I, __m128d __B) { + return (__m128d)__builtin_ia32_vpermi2varpd128((__v2df)__A, (__v2di)__I, + (__v2df)__B); + } + + static __inline__ __m128d __DEFAULT_FN_ATTRS128 + _mm_mask_permutex2var_pd(__m128d __A, __mmask8 __U, __m128i __I, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128(__U, + (__v2df)_mm_permutex2var_pd(__A, __I, __B), + (__v2df)__A); + } + + static __inline__ __m128d __DEFAULT_FN_ATTRS128 + _mm_mask2_permutex2var_pd(__m128d __A, __m128i __I, __mmask8 __U, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128(__U, + (__v2df)_mm_permutex2var_pd(__A, __I, __B), + (__v2df)(__m128d)__I); + } + + static __inline__ __m128d __DEFAULT_FN_ATTRS128 + _mm_maskz_permutex2var_pd(__mmask8 __U, __m128d __A, __m128i __I, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128(__U, + (__v2df)_mm_permutex2var_pd(__A, __I, __B), + (__v2df)_mm_setzero_pd()); + } + + static __inline__ __m256d __DEFAULT_FN_ATTRS256 + _mm256_permutex2var_pd(__m256d __A, __m256i __I, __m256d __B) { + return (__m256d)__builtin_ia32_vpermi2varpd256((__v4df)__A, (__v4di)__I, + (__v4df)__B); + } + + static __inline__ __m256d __DEFAULT_FN_ATTRS256 + _mm256_mask_permutex2var_pd(__m256d __A, __mmask8 __U, __m256i __I, + __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256(__U, + (__v4df)_mm256_permutex2var_pd(__A, __I, __B), + (__v4df)__A); + } + + static __inline__ __m256d __DEFAULT_FN_ATTRS256 + _mm256_mask2_permutex2var_pd(__m256d __A, __m256i __I, __mmask8 __U, + __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256(__U, + (__v4df)_mm256_permutex2var_pd(__A, __I, __B), + (__v4df)(__m256d)__I); + } + + static __inline__ __m256d __DEFAULT_FN_ATTRS256 + _mm256_maskz_permutex2var_pd(__mmask8 __U, __m256d __A, __m256i __I, + __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256(__U, + (__v4df)_mm256_permutex2var_pd(__A, __I, __B), + (__v4df)_mm256_setzero_pd()); + } + + static __inline__ __m128 __DEFAULT_FN_ATTRS128 + _mm_permutex2var_ps(__m128 __A, __m128i __I, __m128 __B) { + return (__m128)__builtin_ia32_vpermi2varps128((__v4sf)__A, (__v4si)__I, + (__v4sf)__B); + } + + static __inline__ __m128 __DEFAULT_FN_ATTRS128 + _mm_mask_permutex2var_ps(__m128 __A, __mmask8 __U, __m128i __I, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128(__U, + (__v4sf)_mm_permutex2var_ps(__A, __I, __B), + (__v4sf)__A); + } + + static __inline__ __m128 __DEFAULT_FN_ATTRS128 + _mm_mask2_permutex2var_ps(__m128 __A, __m128i __I, __mmask8 __U, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128(__U, + (__v4sf)_mm_permutex2var_ps(__A, __I, __B), + (__v4sf)(__m128)__I); + } + + static __inline__ __m128 __DEFAULT_FN_ATTRS128 + _mm_maskz_permutex2var_ps(__mmask8 __U, __m128 __A, __m128i __I, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128(__U, + (__v4sf)_mm_permutex2var_ps(__A, __I, __B), + (__v4sf)_mm_setzero_ps()); + } + + static __inline__ __m256 __DEFAULT_FN_ATTRS256 + _mm256_permutex2var_ps(__m256 __A, __m256i __I, __m256 __B) { + return (__m256)__builtin_ia32_vpermi2varps256((__v8sf)__A, (__v8si)__I, + (__v8sf) __B); + } + + static __inline__ __m256 __DEFAULT_FN_ATTRS256 + _mm256_mask_permutex2var_ps(__m256 __A, __mmask8 __U, __m256i __I, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256(__U, + (__v8sf)_mm256_permutex2var_ps(__A, __I, __B), + 
(__v8sf)__A); + } + + static __inline__ __m256 __DEFAULT_FN_ATTRS256 + _mm256_mask2_permutex2var_ps(__m256 __A, __m256i __I, __mmask8 __U, + __m256 __B) { + return (__m256)__builtin_ia32_selectps_256(__U, + (__v8sf)_mm256_permutex2var_ps(__A, __I, __B), + (__v8sf)(__m256)__I); + } + + static __inline__ __m256 __DEFAULT_FN_ATTRS256 + _mm256_maskz_permutex2var_ps(__mmask8 __U, __m256 __A, __m256i __I, + __m256 __B) { + return (__m256)__builtin_ia32_selectps_256(__U, + (__v8sf)_mm256_permutex2var_ps(__A, __I, __B), + (__v8sf)_mm256_setzero_ps()); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_permutex2var_epi64(__m128i __A, __m128i __I, __m128i __B) { + return (__m128i)__builtin_ia32_vpermi2varq128((__v2di)__A, (__v2di)__I, + (__v2di)__B); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_mask_permutex2var_epi64(__m128i __A, __mmask8 __U, __m128i __I, + __m128i __B) { + return (__m128i)__builtin_ia32_selectq_128(__U, + (__v2di)_mm_permutex2var_epi64(__A, __I, __B), + (__v2di)__A); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_mask2_permutex2var_epi64(__m128i __A, __m128i __I, __mmask8 __U, + __m128i __B) { + return (__m128i)__builtin_ia32_selectq_128(__U, + (__v2di)_mm_permutex2var_epi64(__A, __I, __B), + (__v2di)__I); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_maskz_permutex2var_epi64(__mmask8 __U, __m128i __A, __m128i __I, + __m128i __B) { + return (__m128i)__builtin_ia32_selectq_128(__U, + (__v2di)_mm_permutex2var_epi64(__A, __I, __B), + (__v2di)_mm_setzero_si128()); + } + + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_permutex2var_epi64(__m256i __A, __m256i __I, __m256i __B) { + return (__m256i)__builtin_ia32_vpermi2varq256((__v4di)__A, (__v4di) __I, + (__v4di) __B); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_mask_permutex2var_epi64(__m256i __A, __mmask8 __U, __m256i __I, + __m256i __B) { + return (__m256i)__builtin_ia32_selectq_256(__U, + (__v4di)_mm256_permutex2var_epi64(__A, __I, __B), + (__v4di)__A); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_mask2_permutex2var_epi64(__m256i __A, __m256i __I, __mmask8 __U, + __m256i __B) { + return (__m256i)__builtin_ia32_selectq_256(__U, + (__v4di)_mm256_permutex2var_epi64(__A, __I, __B), + (__v4di)__I); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_maskz_permutex2var_epi64(__mmask8 __U, __m256i __A, __m256i __I, + __m256i __B) { + return (__m256i)__builtin_ia32_selectq_256(__U, + (__v4di)_mm256_permutex2var_epi64(__A, __I, __B), + (__v4di)_mm256_setzero_si256()); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_mask_cvtepi8_epi32(__m128i __W, __mmask8 __U, __m128i __A) + { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_cvtepi8_epi32(__A), + (__v4si)__W); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_maskz_cvtepi8_epi32(__mmask8 __U, __m128i __A) + { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_cvtepi8_epi32(__A), + (__v4si)_mm_setzero_si128()); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_mask_cvtepi8_epi32 (__m256i __W, __mmask8 __U, __m128i __A) + { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_cvtepi8_epi32(__A), + (__v8si)__W); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_maskz_cvtepi8_epi32 (__mmask8 __U, __m128i __A) + { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_cvtepi8_epi32(__A), + (__v8si)_mm256_setzero_si256()); + } 
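/* Editorial aside (not part of the diff): a minimal usage sketch of the
 * merge-masking (_mm_mask_*) vs. zero-masking (_mm_maskz_*) convention that
 * the select-builtin pattern above implements. Names and values here are
 * hypothetical; assumes a compiler with AVX-512F/VL enabled.
 *
 *   #include <immintrin.h>
 *   __m128i demo(void) {
 *     __m128i bytes  = _mm_set_epi8(0,0,0,0, 0,0,0,0, 0,0,0,0, 4,3,2,1);
 *     __mmask8 k     = 0x5;                                 // keep lanes 0 and 2
 *     __m128i zeroed = _mm_maskz_cvtepi8_epi32(k, bytes);   // {1, 0, 3, 0}
 *     (void)zeroed;
 *     return _mm_mask_cvtepi8_epi32(_mm_set1_epi32(-1), k, bytes); // {1, -1, 3, -1}
 *   }
 */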
+ + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_mask_cvtepi8_epi64(__m128i __W, __mmask8 __U, __m128i __A) + { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_cvtepi8_epi64(__A), + (__v2di)__W); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) + { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_cvtepi8_epi64(__A), + (__v2di)_mm_setzero_si128()); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_mask_cvtepi8_epi64(__m256i __W, __mmask8 __U, __m128i __A) + { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_cvtepi8_epi64(__A), + (__v4di)__W); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) + { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_cvtepi8_epi64(__A), + (__v4di)_mm256_setzero_si256()); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_mask_cvtepi32_epi64(__m128i __W, __mmask8 __U, __m128i __X) + { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_cvtepi32_epi64(__X), + (__v2di)__W); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X) + { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_cvtepi32_epi64(__X), + (__v2di)_mm_setzero_si128()); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_mask_cvtepi32_epi64(__m256i __W, __mmask8 __U, __m128i __X) + { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_cvtepi32_epi64(__X), + (__v4di)__W); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X) + { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_cvtepi32_epi64(__X), + (__v4di)_mm256_setzero_si256()); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_mask_cvtepi16_epi32(__m128i __W, __mmask8 __U, __m128i __A) + { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_cvtepi16_epi32(__A), + (__v4si)__W); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_maskz_cvtepi16_epi32(__mmask8 __U, __m128i __A) + { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_cvtepi16_epi32(__A), + (__v4si)_mm_setzero_si128()); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_mask_cvtepi16_epi32(__m256i __W, __mmask8 __U, __m128i __A) + { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_cvtepi16_epi32(__A), + (__v8si)__W); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_maskz_cvtepi16_epi32 (__mmask8 __U, __m128i __A) + { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_cvtepi16_epi32(__A), + (__v8si)_mm256_setzero_si256()); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_mask_cvtepi16_epi64(__m128i __W, __mmask8 __U, __m128i __A) + { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_cvtepi16_epi64(__A), + (__v2di)__W); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) + { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_cvtepi16_epi64(__A), + (__v2di)_mm_setzero_si128()); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_mask_cvtepi16_epi64(__m256i __W, __mmask8 __U, __m128i __A) + { + return 
(__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_cvtepi16_epi64(__A), + (__v4di)__W); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) + { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_cvtepi16_epi64(__A), + (__v4di)_mm256_setzero_si256()); + } + + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_mask_cvtepu8_epi32(__m128i __W, __mmask8 __U, __m128i __A) + { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_cvtepu8_epi32(__A), + (__v4si)__W); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A) + { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_cvtepu8_epi32(__A), + (__v4si)_mm_setzero_si128()); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_mask_cvtepu8_epi32(__m256i __W, __mmask8 __U, __m128i __A) + { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_cvtepu8_epi32(__A), + (__v8si)__W); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A) + { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_cvtepu8_epi32(__A), + (__v8si)_mm256_setzero_si256()); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_mask_cvtepu8_epi64(__m128i __W, __mmask8 __U, __m128i __A) + { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_cvtepu8_epi64(__A), + (__v2di)__W); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) + { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_cvtepu8_epi64(__A), + (__v2di)_mm_setzero_si128()); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_mask_cvtepu8_epi64(__m256i __W, __mmask8 __U, __m128i __A) + { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_cvtepu8_epi64(__A), + (__v4di)__W); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A) + { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_cvtepu8_epi64(__A), + (__v4di)_mm256_setzero_si256()); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_mask_cvtepu32_epi64(__m128i __W, __mmask8 __U, __m128i __X) + { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_cvtepu32_epi64(__X), + (__v2di)__W); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X) + { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_cvtepu32_epi64(__X), + (__v2di)_mm_setzero_si128()); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_mask_cvtepu32_epi64(__m256i __W, __mmask8 __U, __m128i __X) + { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_cvtepu32_epi64(__X), + (__v4di)__W); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X) + { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_cvtepu32_epi64(__X), + (__v4di)_mm256_setzero_si256()); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_mask_cvtepu16_epi32(__m128i __W, __mmask8 __U, __m128i __A) + { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_cvtepu16_epi32(__A), + (__v4si)__W); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + 
_mm_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A) + { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_cvtepu16_epi32(__A), + (__v4si)_mm_setzero_si128()); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_mask_cvtepu16_epi32(__m256i __W, __mmask8 __U, __m128i __A) + { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_cvtepu16_epi32(__A), + (__v8si)__W); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A) + { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_cvtepu16_epi32(__A), + (__v8si)_mm256_setzero_si256()); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_mask_cvtepu16_epi64(__m128i __W, __mmask8 __U, __m128i __A) + { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_cvtepu16_epi64(__A), + (__v2di)__W); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) + { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_cvtepu16_epi64(__A), + (__v2di)_mm_setzero_si128()); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_mask_cvtepu16_epi64(__m256i __W, __mmask8 __U, __m128i __A) + { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_cvtepu16_epi64(__A), + (__v4di)__W); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) + { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_cvtepu16_epi64(__A), + (__v4di)_mm256_setzero_si256()); + } + + +#define _mm_rol_epi32(a, b) \ + (__m128i)__builtin_ia32_prold128((__v4si)(__m128i)(a), (int)(b)) + +#define _mm_mask_rol_epi32(w, u, a, b) \ + (__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \ + (__v4si)_mm_rol_epi32((a), (b)), \ + (__v4si)(__m128i)(w)) + +#define _mm_maskz_rol_epi32(u, a, b) \ + (__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \ + (__v4si)_mm_rol_epi32((a), (b)), \ + (__v4si)_mm_setzero_si128()) + +#define _mm256_rol_epi32(a, b) \ + (__m256i)__builtin_ia32_prold256((__v8si)(__m256i)(a), (int)(b)) + +#define _mm256_mask_rol_epi32(w, u, a, b) \ + (__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \ + (__v8si)_mm256_rol_epi32((a), (b)), \ + (__v8si)(__m256i)(w)) + +#define _mm256_maskz_rol_epi32(u, a, b) \ + (__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \ + (__v8si)_mm256_rol_epi32((a), (b)), \ + (__v8si)_mm256_setzero_si256()) + +#define _mm_rol_epi64(a, b) \ + (__m128i)__builtin_ia32_prolq128((__v2di)(__m128i)(a), (int)(b)) + +#define _mm_mask_rol_epi64(w, u, a, b) \ + (__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \ + (__v2di)_mm_rol_epi64((a), (b)), \ + (__v2di)(__m128i)(w)) + +#define _mm_maskz_rol_epi64(u, a, b) \ + (__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \ + (__v2di)_mm_rol_epi64((a), (b)), \ + (__v2di)_mm_setzero_si128()) + +#define _mm256_rol_epi64(a, b) \ + (__m256i)__builtin_ia32_prolq256((__v4di)(__m256i)(a), (int)(b)) + +#define _mm256_mask_rol_epi64(w, u, a, b) \ + (__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \ + (__v4di)_mm256_rol_epi64((a), (b)), \ + (__v4di)(__m256i)(w)) + +#define _mm256_maskz_rol_epi64(u, a, b) \ + (__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \ + (__v4di)_mm256_rol_epi64((a), (b)), \ + (__v4di)_mm256_setzero_si256()) + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_rolv_epi32 (__m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_prolvd128_mask ((__v4si) __A, - 
(__v4si) __B, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) -1); + return (__m128i)__builtin_ia32_prolvd128((__v4si)__A, (__v4si)__B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_rolv_epi32 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_rolv_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_prolvd128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128(__U, + (__v4si)_mm_rolv_epi32(__A, __B), + (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_rolv_epi32 (__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_prolvd128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128(__U, + (__v4si)_mm_rolv_epi32(__A, __B), + (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_rolv_epi32 (__m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_prolvd256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) -1); + return (__m256i)__builtin_ia32_prolvd256((__v8si)__A, (__v8si)__B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_rolv_epi32 (__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_rolv_epi32 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_prolvd256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) __W, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256(__U, + (__v8si)_mm256_rolv_epi32(__A, __B), + (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_rolv_epi32 (__mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_prolvd256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256(__U, + (__v8si)_mm256_rolv_epi32(__A, __B), + (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_rolv_epi64 (__m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_prolvq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_di (), - (__mmask8) -1); + return (__m128i)__builtin_ia32_prolvq128((__v2di)__A, (__v2di)__B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_rolv_epi64 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_rolv_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_prolvq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128(__U, + (__v2di)_mm_rolv_epi64(__A, __B), + (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_rolv_epi64 (__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_prolvq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_di (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128(__U, + (__v2di)_mm_rolv_epi64(__A, __B), + (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static 
__inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_rolv_epi64 (__m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_prolvq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) -1); + return (__m256i)__builtin_ia32_prolvq256((__v4di)__A, (__v4di)__B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_rolv_epi64 (__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_rolv_epi64 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_prolvq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) __W, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256(__U, + (__v4di)_mm256_rolv_epi64(__A, __B), + (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_rolv_epi64 (__mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_prolvq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256(__U, + (__v4di)_mm256_rolv_epi64(__A, __B), + (__v4di)_mm256_setzero_si256()); } -#define _mm_ror_epi32(A, B) __extension__ ({ \ - (__m128i)__builtin_ia32_prord128_mask((__v4si)(__m128i)(A), (int)(B), \ - (__v4si)_mm_setzero_si128(), \ - (__mmask8)-1); }) +#define _mm_ror_epi32(a, b) \ + (__m128i)__builtin_ia32_prord128((__v4si)(__m128i)(a), (int)(b)) -#define _mm_mask_ror_epi32(W, U, A, B) __extension__ ({ \ - (__m128i)__builtin_ia32_prord128_mask((__v4si)(__m128i)(A), (int)(B), \ - (__v4si)(__m128i)(W), (__mmask8)(U)); }) +#define _mm_mask_ror_epi32(w, u, a, b) \ + (__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \ + (__v4si)_mm_ror_epi32((a), (b)), \ + (__v4si)(__m128i)(w)) -#define _mm_maskz_ror_epi32(U, A, B) __extension__ ({ \ - (__m128i)__builtin_ia32_prord128_mask((__v4si)(__m128i)(A), (int)(B), \ - (__v4si)_mm_setzero_si128(), \ - (__mmask8)(U)); }) +#define _mm_maskz_ror_epi32(u, a, b) \ + (__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \ + (__v4si)_mm_ror_epi32((a), (b)), \ + (__v4si)_mm_setzero_si128()) -#define _mm256_ror_epi32(A, B) __extension__ ({ \ - (__m256i)__builtin_ia32_prord256_mask((__v8si)(__m256i)(A), (int)(B), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)-1); }) +#define _mm256_ror_epi32(a, b) \ + (__m256i)__builtin_ia32_prord256((__v8si)(__m256i)(a), (int)(b)) -#define _mm256_mask_ror_epi32(W, U, A, B) __extension__ ({ \ - (__m256i)__builtin_ia32_prord256_mask((__v8si)(__m256i)(A), (int)(B), \ - (__v8si)(__m256i)(W), (__mmask8)(U)); }) +#define _mm256_mask_ror_epi32(w, u, a, b) \ + (__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \ + (__v8si)_mm256_ror_epi32((a), (b)), \ + (__v8si)(__m256i)(w)) -#define _mm256_maskz_ror_epi32(U, A, B) __extension__ ({ \ - (__m256i)__builtin_ia32_prord256_mask((__v8si)(__m256i)(A), (int)(B), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)(U)); }) +#define _mm256_maskz_ror_epi32(u, a, b) \ + (__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \ + (__v8si)_mm256_ror_epi32((a), (b)), \ + (__v8si)_mm256_setzero_si256()) -#define _mm_ror_epi64(A, B) __extension__ ({ \ - (__m128i)__builtin_ia32_prorq128_mask((__v2di)(__m128i)(A), (int)(B), \ - (__v2di)_mm_setzero_di(), \ - (__mmask8)-1); }) +#define _mm_ror_epi64(a, b) \ + (__m128i)__builtin_ia32_prorq128((__v2di)(__m128i)(a), (int)(b)) -#define _mm_mask_ror_epi64(W, U, A, B) __extension__ ({ \ - (__m128i)__builtin_ia32_prorq128_mask((__v2di)(__m128i)(A), (int)(B), \ - 
(__v2di)(__m128i)(W), (__mmask8)(U)); }) +#define _mm_mask_ror_epi64(w, u, a, b) \ + (__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \ + (__v2di)_mm_ror_epi64((a), (b)), \ + (__v2di)(__m128i)(w)) -#define _mm_maskz_ror_epi64(U, A, B) __extension__ ({ \ - (__m128i)__builtin_ia32_prorq128_mask((__v2di)(__m128i)(A), (int)(B), \ - (__v2di)_mm_setzero_di(), \ - (__mmask8)(U)); }) +#define _mm_maskz_ror_epi64(u, a, b) \ + (__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \ + (__v2di)_mm_ror_epi64((a), (b)), \ + (__v2di)_mm_setzero_si128()) -#define _mm256_ror_epi64(A, B) __extension__ ({ \ - (__m256i)__builtin_ia32_prorq256_mask((__v4di)(__m256i)(A), (int)(B), \ - (__v4di)_mm256_setzero_si256(), \ - (__mmask8)-1); }) +#define _mm256_ror_epi64(a, b) \ + (__m256i)__builtin_ia32_prorq256((__v4di)(__m256i)(a), (int)(b)) -#define _mm256_mask_ror_epi64(W, U, A, B) __extension__ ({ \ - (__m256i)__builtin_ia32_prorq256_mask((__v4di)(__m256i)(A), (int)(B), \ - (__v4di)(__m256i)(W), (__mmask8)(U)); }) +#define _mm256_mask_ror_epi64(w, u, a, b) \ + (__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \ + (__v4di)_mm256_ror_epi64((a), (b)), \ + (__v4di)(__m256i)(w)) -#define _mm256_maskz_ror_epi64(U, A, B) __extension__ ({ \ - (__m256i)__builtin_ia32_prorq256_mask((__v4di)(__m256i)(A), (int)(B), \ - (__v4di)_mm256_setzero_si256(), \ - (__mmask8)(U)); }) +#define _mm256_maskz_ror_epi64(u, a, b) \ + (__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \ + (__v4di)_mm256_ror_epi64((a), (b)), \ + (__v4di)_mm256_setzero_si256()) -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_sll_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -4569,7 +4416,7 @@ _mm_mask_sll_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_sll_epi32(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -4577,7 +4424,7 @@ _mm_maskz_sll_epi32(__mmask8 __U, __m128i __A, __m128i __B) (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_sll_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -4585,7 +4432,7 @@ _mm256_mask_sll_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_sll_epi32(__mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -4593,7 +4440,7 @@ _mm256_maskz_sll_epi32(__mmask8 __U, __m256i __A, __m128i __B) (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_slli_epi32(__m128i __W, __mmask8 __U, __m128i __A, int __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -4601,7 +4448,7 @@ _mm_mask_slli_epi32(__m128i __W, __mmask8 __U, __m128i __A, int __B) (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_slli_epi32(__mmask8 __U, __m128i __A, int __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -4609,7 +4456,7 @@ _mm_maskz_slli_epi32(__mmask8 __U, __m128i __A, int __B) (__v4si)_mm_setzero_si128()); } -static __inline__ 
__m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_slli_epi32(__m256i __W, __mmask8 __U, __m256i __A, int __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -4617,7 +4464,7 @@ _mm256_mask_slli_epi32(__m256i __W, __mmask8 __U, __m256i __A, int __B) (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_slli_epi32(__mmask8 __U, __m256i __A, int __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -4625,7 +4472,7 @@ _mm256_maskz_slli_epi32(__mmask8 __U, __m256i __A, int __B) (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_sll_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, @@ -4633,15 +4480,15 @@ _mm_mask_sll_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_sll_epi64(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, (__v2di)_mm_sll_epi64(__A, __B), - (__v2di)_mm_setzero_di()); + (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_sll_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, @@ -4649,7 +4496,7 @@ _mm256_mask_sll_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_sll_epi64(__mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, @@ -4657,7 +4504,7 @@ _mm256_maskz_sll_epi64(__mmask8 __U, __m256i __A, __m128i __B) (__v4di)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_slli_epi64(__m128i __W, __mmask8 __U, __m128i __A, int __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, @@ -4665,15 +4512,15 @@ _mm_mask_slli_epi64(__m128i __W, __mmask8 __U, __m128i __A, int __B) (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_slli_epi64(__mmask8 __U, __m128i __A, int __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, (__v2di)_mm_slli_epi64(__A, __B), - (__v2di)_mm_setzero_di()); + (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_slli_epi64(__m256i __W, __mmask8 __U, __m256i __A, int __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, @@ -4681,7 +4528,7 @@ _mm256_mask_slli_epi64(__m256i __W, __mmask8 __U, __m256i __A, int __B) (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_slli_epi64(__mmask8 __U, __m256i __A, int __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, @@ -4689,127 +4536,95 @@ _mm256_maskz_slli_epi64(__mmask8 __U, __m256i __A, int __B) (__v4di)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_rorv_epi32 (__m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_prorvd128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) - _mm_setzero_si128 (), - 
(__mmask8) -1); + return (__m128i)__builtin_ia32_prorvd128((__v4si)__A, (__v4si)__B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_rorv_epi32 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_rorv_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_prorvd128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128(__U, + (__v4si)_mm_rorv_epi32(__A, __B), + (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_rorv_epi32 (__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_prorvd128_mask ((__v4si) __A, - (__v4si) __B, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128(__U, + (__v4si)_mm_rorv_epi32(__A, __B), + (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_rorv_epi32 (__m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_prorvd256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) -1); + return (__m256i)__builtin_ia32_prorvd256((__v8si)__A, (__v8si)__B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_rorv_epi32 (__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_rorv_epi32 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_prorvd256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) __W, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256(__U, + (__v8si)_mm256_rorv_epi32(__A, __B), + (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_rorv_epi32 (__mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_prorvd256_mask ((__v8si) __A, - (__v8si) __B, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256(__U, + (__v8si)_mm256_rorv_epi32(__A, __B), + (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_rorv_epi64 (__m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_prorvq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_di (), - (__mmask8) -1); + return (__m128i)__builtin_ia32_prorvq128((__v2di)__A, (__v2di)__B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mask_rorv_epi64 (__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_rorv_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_prorvq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) __W, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128(__U, + (__v2di)_mm_rorv_epi64(__A, __B), + (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_rorv_epi64 (__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_prorvq128_mask ((__v2di) __A, - (__v2di) __B, - (__v2di) - _mm_setzero_di (), - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectq_128(__U, + (__v2di)_mm_rorv_epi64(__A, __B), + (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_rorv_epi64 
(__m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_prorvq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) -1); + return (__m256i)__builtin_ia32_prorvq256((__v4di)__A, (__v4di)__B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_rorv_epi64 (__m256i __W, __mmask8 __U, __m256i __A, - __m256i __B) +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_rorv_epi64 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_prorvq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) __W, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256(__U, + (__v4di)_mm256_rorv_epi64(__A, __B), + (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_rorv_epi64 (__mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_prorvq256_mask ((__v4di) __A, - (__v4di) __B, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectq_256(__U, + (__v4di)_mm256_rorv_epi64(__A, __B), + (__v4di)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_sllv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, @@ -4817,15 +4632,15 @@ _mm_mask_sllv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_sllv_epi64(__mmask8 __U, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, (__v2di)_mm_sllv_epi64(__X, __Y), - (__v2di)_mm_setzero_di()); + (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_sllv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, @@ -4833,7 +4648,7 @@ _mm256_mask_sllv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_sllv_epi64(__mmask8 __U, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, @@ -4841,7 +4656,7 @@ _mm256_maskz_sllv_epi64(__mmask8 __U, __m256i __X, __m256i __Y) (__v4di)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_sllv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -4849,7 +4664,7 @@ _mm_mask_sllv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_sllv_epi32(__mmask8 __U, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -4857,7 +4672,7 @@ _mm_maskz_sllv_epi32(__mmask8 __U, __m128i __X, __m128i __Y) (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_sllv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -4865,7 +4680,7 @@ _mm256_mask_sllv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static 
__inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_sllv_epi32(__mmask8 __U, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -4873,7 +4688,7 @@ _mm256_maskz_sllv_epi32(__mmask8 __U, __m256i __X, __m256i __Y) (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_srlv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, @@ -4881,15 +4696,15 @@ _mm_mask_srlv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_srlv_epi64(__mmask8 __U, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, (__v2di)_mm_srlv_epi64(__X, __Y), - (__v2di)_mm_setzero_di()); + (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_srlv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, @@ -4897,7 +4712,7 @@ _mm256_mask_srlv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_srlv_epi64(__mmask8 __U, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, @@ -4905,7 +4720,7 @@ _mm256_maskz_srlv_epi64(__mmask8 __U, __m256i __X, __m256i __Y) (__v4di)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_srlv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -4913,7 +4728,7 @@ _mm_mask_srlv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_srlv_epi32(__mmask8 __U, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -4921,7 +4736,7 @@ _mm_maskz_srlv_epi32(__mmask8 __U, __m128i __X, __m128i __Y) (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_srlv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -4929,7 +4744,7 @@ _mm256_mask_srlv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_srlv_epi32(__mmask8 __U, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -4937,7 +4752,7 @@ _mm256_maskz_srlv_epi32(__mmask8 __U, __m256i __X, __m256i __Y) (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_srl_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -4945,7 +4760,7 @@ _mm_mask_srl_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_srl_epi32(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -4953,7 +4768,7 @@ 
_mm_maskz_srl_epi32(__mmask8 __U, __m128i __A, __m128i __B) (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_srl_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -4961,7 +4776,7 @@ _mm256_mask_srl_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_srl_epi32(__mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -4969,7 +4784,7 @@ _mm256_maskz_srl_epi32(__mmask8 __U, __m256i __A, __m128i __B) (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_srli_epi32(__m128i __W, __mmask8 __U, __m128i __A, int __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -4977,7 +4792,7 @@ _mm_mask_srli_epi32(__m128i __W, __mmask8 __U, __m128i __A, int __B) (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_srli_epi32(__mmask8 __U, __m128i __A, int __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -4985,7 +4800,7 @@ _mm_maskz_srli_epi32(__mmask8 __U, __m128i __A, int __B) (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_srli_epi32(__m256i __W, __mmask8 __U, __m256i __A, int __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -4993,7 +4808,7 @@ _mm256_mask_srli_epi32(__m256i __W, __mmask8 __U, __m256i __A, int __B) (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_srli_epi32(__mmask8 __U, __m256i __A, int __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -5001,7 +4816,7 @@ _mm256_maskz_srli_epi32(__mmask8 __U, __m256i __A, int __B) (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_srl_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, @@ -5009,15 +4824,15 @@ _mm_mask_srl_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_srl_epi64(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, (__v2di)_mm_srl_epi64(__A, __B), - (__v2di)_mm_setzero_di()); + (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_srl_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, @@ -5025,7 +4840,7 @@ _mm256_mask_srl_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_srl_epi64(__mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, @@ -5033,7 +4848,7 @@ _mm256_maskz_srl_epi64(__mmask8 __U, __m256i __A, __m128i __B) (__v4di)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_srli_epi64(__m128i __W, 
__mmask8 __U, __m128i __A, int __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, @@ -5041,15 +4856,15 @@ _mm_mask_srli_epi64(__m128i __W, __mmask8 __U, __m128i __A, int __B) (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_srli_epi64(__mmask8 __U, __m128i __A, int __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, (__v2di)_mm_srli_epi64(__A, __B), - (__v2di)_mm_setzero_di()); + (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_srli_epi64(__m256i __W, __mmask8 __U, __m256i __A, int __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, @@ -5057,7 +4872,7 @@ _mm256_mask_srli_epi64(__m256i __W, __mmask8 __U, __m256i __A, int __B) (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_srli_epi64(__mmask8 __U, __m256i __A, int __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, @@ -5065,7 +4880,7 @@ _mm256_maskz_srli_epi64(__mmask8 __U, __m256i __A, int __B) (__v4di)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_srav_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -5073,7 +4888,7 @@ _mm_mask_srav_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_srav_epi32(__mmask8 __U, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -5081,7 +4896,7 @@ _mm_maskz_srav_epi32(__mmask8 __U, __m128i __X, __m128i __Y) (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_srav_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -5089,7 +4904,7 @@ _mm256_mask_srav_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_srav_epi32(__mmask8 __U, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -5097,13 +4912,13 @@ _mm256_maskz_srav_epi32(__mmask8 __U, __m256i __X, __m256i __Y) (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_srav_epi64(__m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_psravq128((__v2di)__X, (__v2di)__Y); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_srav_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, @@ -5111,21 +4926,21 @@ _mm_mask_srav_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_srav_epi64(__mmask8 __U, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, (__v2di)_mm_srav_epi64(__X, __Y), - (__v2di)_mm_setzero_di()); + (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srav_epi64(__m256i __X, __m256i __Y) { return 
(__m256i)__builtin_ia32_psravq256((__v4di)__X, (__v4di) __Y); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_srav_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, @@ -5133,7 +4948,7 @@ _mm256_mask_srav_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_srav_epi64 (__mmask8 __U, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, @@ -5141,7 +4956,7 @@ _mm256_maskz_srav_epi64 (__mmask8 __U, __m256i __X, __m256i __Y) (__v4di)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_mov_epi32 (__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i) __builtin_ia32_selectd_128 ((__mmask8) __U, @@ -5149,7 +4964,7 @@ _mm_mask_mov_epi32 (__m128i __W, __mmask8 __U, __m128i __A) (__v4si) __W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_mov_epi32 (__mmask8 __U, __m128i __A) { return (__m128i) __builtin_ia32_selectd_128 ((__mmask8) __U, @@ -5158,7 +4973,7 @@ _mm_maskz_mov_epi32 (__mmask8 __U, __m128i __A) } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_mov_epi32 (__m256i __W, __mmask8 __U, __m256i __A) { return (__m256i) __builtin_ia32_selectd_256 ((__mmask8) __U, @@ -5166,7 +4981,7 @@ _mm256_mask_mov_epi32 (__m256i __W, __mmask8 __U, __m256i __A) (__v8si) __W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_mov_epi32 (__mmask8 __U, __m256i __A) { return (__m256i) __builtin_ia32_selectd_256 ((__mmask8) __U, @@ -5174,7 +4989,7 @@ _mm256_maskz_mov_epi32 (__mmask8 __U, __m256i __A) (__v8si) _mm256_setzero_si256 ()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_load_epi32 (__m128i __W, __mmask8 __U, void const *__P) { return (__m128i) __builtin_ia32_movdqa32load128_mask ((__v4si *) __P, @@ -5183,7 +4998,7 @@ _mm_mask_load_epi32 (__m128i __W, __mmask8 __U, void const *__P) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_load_epi32 (__mmask8 __U, void const *__P) { return (__m128i) __builtin_ia32_movdqa32load128_mask ((__v4si *) __P, @@ -5193,7 +5008,7 @@ _mm_maskz_load_epi32 (__mmask8 __U, void const *__P) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_load_epi32 (__m256i __W, __mmask8 __U, void const *__P) { return (__m256i) __builtin_ia32_movdqa32load256_mask ((__v8si *) __P, @@ -5202,7 +5017,7 @@ _mm256_mask_load_epi32 (__m256i __W, __mmask8 __U, void const *__P) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_load_epi32 (__mmask8 __U, void const *__P) { return (__m256i) __builtin_ia32_movdqa32load256_mask ((__v8si *) __P, @@ -5212,7 +5027,7 @@ _mm256_maskz_load_epi32 (__mmask8 __U, void const *__P) __U); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_store_epi32 (void *__P, __mmask8 __U, __m128i __A) { __builtin_ia32_movdqa32store128_mask ((__v4si *) __P, @@ -5220,7 +5035,7 @@ _mm_mask_store_epi32 (void *__P, __mmask8 __U, __m128i __A) 
(__mmask8) __U); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_store_epi32 (void *__P, __mmask8 __U, __m256i __A) { __builtin_ia32_movdqa32store256_mask ((__v8si *) __P, @@ -5228,7 +5043,7 @@ _mm256_mask_store_epi32 (void *__P, __mmask8 __U, __m256i __A) (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_mov_epi64 (__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i) __builtin_ia32_selectq_128 ((__mmask8) __U, @@ -5236,15 +5051,15 @@ _mm_mask_mov_epi64 (__m128i __W, __mmask8 __U, __m128i __A) (__v2di) __W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_mov_epi64 (__mmask8 __U, __m128i __A) { return (__m128i) __builtin_ia32_selectq_128 ((__mmask8) __U, (__v2di) __A, - (__v2di) _mm_setzero_di ()); + (__v2di) _mm_setzero_si128 ()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_mov_epi64 (__m256i __W, __mmask8 __U, __m256i __A) { return (__m256i) __builtin_ia32_selectq_256 ((__mmask8) __U, @@ -5252,7 +5067,7 @@ _mm256_mask_mov_epi64 (__m256i __W, __mmask8 __U, __m256i __A) (__v4di) __W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_mov_epi64 (__mmask8 __U, __m256i __A) { return (__m256i) __builtin_ia32_selectq_256 ((__mmask8) __U, @@ -5260,7 +5075,7 @@ _mm256_maskz_mov_epi64 (__mmask8 __U, __m256i __A) (__v4di) _mm256_setzero_si256 ()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_load_epi64 (__m128i __W, __mmask8 __U, void const *__P) { return (__m128i) __builtin_ia32_movdqa64load128_mask ((__v2di *) __P, @@ -5269,17 +5084,17 @@ _mm_mask_load_epi64 (__m128i __W, __mmask8 __U, void const *__P) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_load_epi64 (__mmask8 __U, void const *__P) { return (__m128i) __builtin_ia32_movdqa64load128_mask ((__v2di *) __P, (__v2di) - _mm_setzero_di (), + _mm_setzero_si128 (), (__mmask8) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_load_epi64 (__m256i __W, __mmask8 __U, void const *__P) { return (__m256i) __builtin_ia32_movdqa64load256_mask ((__v4di *) __P, @@ -5288,7 +5103,7 @@ _mm256_mask_load_epi64 (__m256i __W, __mmask8 __U, void const *__P) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_load_epi64 (__mmask8 __U, void const *__P) { return (__m256i) __builtin_ia32_movdqa64load256_mask ((__v4di *) __P, @@ -5298,7 +5113,7 @@ _mm256_maskz_load_epi64 (__mmask8 __U, void const *__P) __U); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_store_epi64 (void *__P, __mmask8 __U, __m128i __A) { __builtin_ia32_movdqa64store128_mask ((__v2di *) __P, @@ -5306,7 +5121,7 @@ _mm_mask_store_epi64 (void *__P, __mmask8 __U, __m128i __A) (__mmask8) __U); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_store_epi64 (void *__P, __mmask8 __U, __m256i __A) { __builtin_ia32_movdqa64store256_mask ((__v4di *) __P, @@ -5314,7 +5129,7 @@ _mm256_mask_store_epi64 (void *__P, __mmask8 __U, __m256i __A) (__mmask8) __U); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d 
__DEFAULT_FN_ATTRS128 _mm_mask_movedup_pd (__m128d __W, __mmask8 __U, __m128d __A) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, @@ -5322,7 +5137,7 @@ _mm_mask_movedup_pd (__m128d __W, __mmask8 __U, __m128d __A) (__v2df)__W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_movedup_pd (__mmask8 __U, __m128d __A) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, @@ -5330,7 +5145,7 @@ _mm_maskz_movedup_pd (__mmask8 __U, __m128d __A) (__v2df)_mm_setzero_pd()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_movedup_pd (__m256d __W, __mmask8 __U, __m256d __A) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, @@ -5338,7 +5153,7 @@ _mm256_mask_movedup_pd (__m256d __W, __mmask8 __U, __m256d __A) (__v4df)__W); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_movedup_pd (__mmask8 __U, __m256d __A) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, @@ -5346,7 +5161,7 @@ _mm256_maskz_movedup_pd (__mmask8 __U, __m256d __A) (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_set1_epi32(__m128i __O, __mmask8 __M, int __A) { return (__m128i)__builtin_ia32_selectd_128(__M, @@ -5354,7 +5169,7 @@ _mm_mask_set1_epi32(__m128i __O, __mmask8 __M, int __A) (__v4si)__O); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_set1_epi32( __mmask8 __M, int __A) { return (__m128i)__builtin_ia32_selectd_128(__M, @@ -5362,7 +5177,7 @@ _mm_maskz_set1_epi32( __mmask8 __M, int __A) (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_set1_epi32(__m256i __O, __mmask8 __M, int __A) { return (__m256i)__builtin_ia32_selectd_256(__M, @@ -5370,7 +5185,7 @@ _mm256_mask_set1_epi32(__m256i __O, __mmask8 __M, int __A) (__v8si)__O); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_set1_epi32( __mmask8 __M, int __A) { return (__m256i)__builtin_ia32_selectd_256(__M, @@ -5379,8 +5194,7 @@ _mm256_maskz_set1_epi32( __mmask8 __M, int __A) } -#ifdef __x86_64__ -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A) { return (__m128i) __builtin_ia32_selectq_128(__M, @@ -5388,7 +5202,7 @@ _mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A) (__v2di) __O); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_set1_epi64 (__mmask8 __M, long long __A) { return (__m128i) __builtin_ia32_selectq_128(__M, @@ -5396,7 +5210,7 @@ _mm_maskz_set1_epi64 (__mmask8 __M, long long __A) (__v2di) _mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_set1_epi64 (__m256i __O, __mmask8 __M, long long __A) { return (__m256i) __builtin_ia32_selectq_256(__M, @@ -5404,89 +5218,87 @@ _mm256_mask_set1_epi64 (__m256i __O, __mmask8 __M, long long __A) (__v4di) __O) ; } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_set1_epi64 (__mmask8 __M, long long __A) { return (__m256i) __builtin_ia32_selectq_256(__M, (__v4di) _mm256_set1_epi64x(__A), (__v4di) _mm256_setzero_si256()); 
} - -#endif -#define _mm_fixupimm_pd(A, B, C, imm) __extension__ ({ \ +#define _mm_fixupimm_pd(A, B, C, imm) \ (__m128d)__builtin_ia32_fixupimmpd128_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm_mask_fixupimm_pd(A, U, B, C, imm) __extension__ ({ \ +#define _mm_mask_fixupimm_pd(A, U, B, C, imm) \ (__m128d)__builtin_ia32_fixupimmpd128_mask((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm_maskz_fixupimm_pd(U, A, B, C, imm) __extension__ ({ \ +#define _mm_maskz_fixupimm_pd(U, A, B, C, imm) \ (__m128d)__builtin_ia32_fixupimmpd128_maskz((__v2df)(__m128d)(A), \ (__v2df)(__m128d)(B), \ (__v2di)(__m128i)(C), \ - (int)(imm), (__mmask8)(U)); }) + (int)(imm), (__mmask8)(U)) -#define _mm256_fixupimm_pd(A, B, C, imm) __extension__ ({ \ +#define _mm256_fixupimm_pd(A, B, C, imm) \ (__m256d)__builtin_ia32_fixupimmpd256_mask((__v4df)(__m256d)(A), \ (__v4df)(__m256d)(B), \ (__v4di)(__m256i)(C), (int)(imm), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm256_mask_fixupimm_pd(A, U, B, C, imm) __extension__ ({ \ +#define _mm256_mask_fixupimm_pd(A, U, B, C, imm) \ (__m256d)__builtin_ia32_fixupimmpd256_mask((__v4df)(__m256d)(A), \ (__v4df)(__m256d)(B), \ (__v4di)(__m256i)(C), (int)(imm), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm256_maskz_fixupimm_pd(U, A, B, C, imm) __extension__ ({ \ +#define _mm256_maskz_fixupimm_pd(U, A, B, C, imm) \ (__m256d)__builtin_ia32_fixupimmpd256_maskz((__v4df)(__m256d)(A), \ (__v4df)(__m256d)(B), \ (__v4di)(__m256i)(C), \ - (int)(imm), (__mmask8)(U)); }) + (int)(imm), (__mmask8)(U)) -#define _mm_fixupimm_ps(A, B, C, imm) __extension__ ({ \ +#define _mm_fixupimm_ps(A, B, C, imm) \ (__m128)__builtin_ia32_fixupimmps128_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm_mask_fixupimm_ps(A, U, B, C, imm) __extension__ ({ \ +#define _mm_mask_fixupimm_ps(A, U, B, C, imm) \ (__m128)__builtin_ia32_fixupimmps128_mask((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm_maskz_fixupimm_ps(U, A, B, C, imm) __extension__ ({ \ +#define _mm_maskz_fixupimm_ps(U, A, B, C, imm) \ (__m128)__builtin_ia32_fixupimmps128_maskz((__v4sf)(__m128)(A), \ (__v4sf)(__m128)(B), \ (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm256_fixupimm_ps(A, B, C, imm) __extension__ ({ \ +#define _mm256_fixupimm_ps(A, B, C, imm) \ (__m256)__builtin_ia32_fixupimmps256_mask((__v8sf)(__m256)(A), \ (__v8sf)(__m256)(B), \ (__v8si)(__m256i)(C), (int)(imm), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm256_mask_fixupimm_ps(A, U, B, C, imm) __extension__ ({ \ +#define _mm256_mask_fixupimm_ps(A, U, B, C, imm) \ (__m256)__builtin_ia32_fixupimmps256_mask((__v8sf)(__m256)(A), \ (__v8sf)(__m256)(B), \ (__v8si)(__m256i)(C), (int)(imm), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm256_maskz_fixupimm_ps(U, A, B, C, imm) __extension__ ({ \ +#define _mm256_maskz_fixupimm_ps(U, A, B, C, imm) \ (__m256)__builtin_ia32_fixupimmps256_maskz((__v8sf)(__m256)(A), \ (__v8sf)(__m256)(B), \ (__v8si)(__m256i)(C), (int)(imm), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_load_pd (__m128d __W, __mmask8 __U, void const *__P) { return (__m128d) 
__builtin_ia32_loadapd128_mask ((__v2df *) __P, @@ -5494,7 +5306,7 @@ _mm_mask_load_pd (__m128d __W, __mmask8 __U, void const *__P) (__mmask8) __U); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_load_pd (__mmask8 __U, void const *__P) { return (__m128d) __builtin_ia32_loadapd128_mask ((__v2df *) __P, @@ -5503,7 +5315,7 @@ _mm_maskz_load_pd (__mmask8 __U, void const *__P) (__mmask8) __U); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_load_pd (__m256d __W, __mmask8 __U, void const *__P) { return (__m256d) __builtin_ia32_loadapd256_mask ((__v4df *) __P, @@ -5511,7 +5323,7 @@ _mm256_mask_load_pd (__m256d __W, __mmask8 __U, void const *__P) (__mmask8) __U); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_load_pd (__mmask8 __U, void const *__P) { return (__m256d) __builtin_ia32_loadapd256_mask ((__v4df *) __P, @@ -5520,7 +5332,7 @@ _mm256_maskz_load_pd (__mmask8 __U, void const *__P) (__mmask8) __U); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_load_ps (__m128 __W, __mmask8 __U, void const *__P) { return (__m128) __builtin_ia32_loadaps128_mask ((__v4sf *) __P, @@ -5528,7 +5340,7 @@ _mm_mask_load_ps (__m128 __W, __mmask8 __U, void const *__P) (__mmask8) __U); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_load_ps (__mmask8 __U, void const *__P) { return (__m128) __builtin_ia32_loadaps128_mask ((__v4sf *) __P, @@ -5537,7 +5349,7 @@ _mm_maskz_load_ps (__mmask8 __U, void const *__P) (__mmask8) __U); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_load_ps (__m256 __W, __mmask8 __U, void const *__P) { return (__m256) __builtin_ia32_loadaps256_mask ((__v8sf *) __P, @@ -5545,7 +5357,7 @@ _mm256_mask_load_ps (__m256 __W, __mmask8 __U, void const *__P) (__mmask8) __U); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_load_ps (__mmask8 __U, void const *__P) { return (__m256) __builtin_ia32_loadaps256_mask ((__v8sf *) __P, @@ -5554,7 +5366,7 @@ _mm256_maskz_load_ps (__mmask8 __U, void const *__P) (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_loadu_epi64 (__m128i __W, __mmask8 __U, void const *__P) { return (__m128i) __builtin_ia32_loaddqudi128_mask ((__v2di *) __P, @@ -5562,7 +5374,7 @@ _mm_mask_loadu_epi64 (__m128i __W, __mmask8 __U, void const *__P) (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_loadu_epi64 (__mmask8 __U, void const *__P) { return (__m128i) __builtin_ia32_loaddqudi128_mask ((__v2di *) __P, @@ -5571,7 +5383,7 @@ _mm_maskz_loadu_epi64 (__mmask8 __U, void const *__P) (__mmask8) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_loadu_epi64 (__m256i __W, __mmask8 __U, void const *__P) { return (__m256i) __builtin_ia32_loaddqudi256_mask ((__v4di *) __P, @@ -5579,7 +5391,7 @@ _mm256_mask_loadu_epi64 (__m256i __W, __mmask8 __U, void const *__P) (__mmask8) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_loadu_epi64 (__mmask8 __U, void const *__P) { return (__m256i) __builtin_ia32_loaddqudi256_mask 
((__v4di *) __P, @@ -5588,7 +5400,7 @@ _mm256_maskz_loadu_epi64 (__mmask8 __U, void const *__P) (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_loadu_epi32 (__m128i __W, __mmask8 __U, void const *__P) { return (__m128i) __builtin_ia32_loaddqusi128_mask ((__v4si *) __P, @@ -5596,7 +5408,7 @@ _mm_mask_loadu_epi32 (__m128i __W, __mmask8 __U, void const *__P) (__mmask8) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_loadu_epi32 (__mmask8 __U, void const *__P) { return (__m128i) __builtin_ia32_loaddqusi128_mask ((__v4si *) __P, @@ -5605,7 +5417,7 @@ _mm_maskz_loadu_epi32 (__mmask8 __U, void const *__P) (__mmask8) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_loadu_epi32 (__m256i __W, __mmask8 __U, void const *__P) { return (__m256i) __builtin_ia32_loaddqusi256_mask ((__v8si *) __P, @@ -5613,7 +5425,7 @@ _mm256_mask_loadu_epi32 (__m256i __W, __mmask8 __U, void const *__P) (__mmask8) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_loadu_epi32 (__mmask8 __U, void const *__P) { return (__m256i) __builtin_ia32_loaddqusi256_mask ((__v8si *) __P, @@ -5622,7 +5434,7 @@ _mm256_maskz_loadu_epi32 (__mmask8 __U, void const *__P) (__mmask8) __U); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_loadu_pd (__m128d __W, __mmask8 __U, void const *__P) { return (__m128d) __builtin_ia32_loadupd128_mask ((__v2df *) __P, @@ -5630,7 +5442,7 @@ _mm_mask_loadu_pd (__m128d __W, __mmask8 __U, void const *__P) (__mmask8) __U); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_loadu_pd (__mmask8 __U, void const *__P) { return (__m128d) __builtin_ia32_loadupd128_mask ((__v2df *) __P, @@ -5639,7 +5451,7 @@ _mm_maskz_loadu_pd (__mmask8 __U, void const *__P) (__mmask8) __U); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_loadu_pd (__m256d __W, __mmask8 __U, void const *__P) { return (__m256d) __builtin_ia32_loadupd256_mask ((__v4df *) __P, @@ -5647,7 +5459,7 @@ _mm256_mask_loadu_pd (__m256d __W, __mmask8 __U, void const *__P) (__mmask8) __U); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_loadu_pd (__mmask8 __U, void const *__P) { return (__m256d) __builtin_ia32_loadupd256_mask ((__v4df *) __P, @@ -5656,7 +5468,7 @@ _mm256_maskz_loadu_pd (__mmask8 __U, void const *__P) (__mmask8) __U); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_loadu_ps (__m128 __W, __mmask8 __U, void const *__P) { return (__m128) __builtin_ia32_loadups128_mask ((__v4sf *) __P, @@ -5664,7 +5476,7 @@ _mm_mask_loadu_ps (__m128 __W, __mmask8 __U, void const *__P) (__mmask8) __U); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_loadu_ps (__mmask8 __U, void const *__P) { return (__m128) __builtin_ia32_loadups128_mask ((__v4sf *) __P, @@ -5673,7 +5485,7 @@ _mm_maskz_loadu_ps (__mmask8 __U, void const *__P) (__mmask8) __U); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_loadu_ps (__m256 __W, __mmask8 __U, void const *__P) { return (__m256) __builtin_ia32_loadups256_mask ((__v8sf *) 
__P, @@ -5681,7 +5493,7 @@ _mm256_mask_loadu_ps (__m256 __W, __mmask8 __U, void const *__P) (__mmask8) __U); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_loadu_ps (__mmask8 __U, void const *__P) { return (__m256) __builtin_ia32_loadups256_mask ((__v8sf *) __P, @@ -5690,7 +5502,7 @@ _mm256_maskz_loadu_ps (__mmask8 __U, void const *__P) (__mmask8) __U); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_store_pd (void *__P, __mmask8 __U, __m128d __A) { __builtin_ia32_storeapd128_mask ((__v2df *) __P, @@ -5698,7 +5510,7 @@ _mm_mask_store_pd (void *__P, __mmask8 __U, __m128d __A) (__mmask8) __U); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_store_pd (void *__P, __mmask8 __U, __m256d __A) { __builtin_ia32_storeapd256_mask ((__v4df *) __P, @@ -5706,7 +5518,7 @@ _mm256_mask_store_pd (void *__P, __mmask8 __U, __m256d __A) (__mmask8) __U); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_store_ps (void *__P, __mmask8 __U, __m128 __A) { __builtin_ia32_storeaps128_mask ((__v4sf *) __P, @@ -5714,7 +5526,7 @@ _mm_mask_store_ps (void *__P, __mmask8 __U, __m128 __A) (__mmask8) __U); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_store_ps (void *__P, __mmask8 __U, __m256 __A) { __builtin_ia32_storeaps256_mask ((__v8sf *) __P, @@ -5722,7 +5534,7 @@ _mm256_mask_store_ps (void *__P, __mmask8 __U, __m256 __A) (__mmask8) __U); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_storeu_epi64 (void *__P, __mmask8 __U, __m128i __A) { __builtin_ia32_storedqudi128_mask ((__v2di *) __P, @@ -5730,7 +5542,7 @@ _mm_mask_storeu_epi64 (void *__P, __mmask8 __U, __m128i __A) (__mmask8) __U); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_storeu_epi64 (void *__P, __mmask8 __U, __m256i __A) { __builtin_ia32_storedqudi256_mask ((__v4di *) __P, @@ -5738,7 +5550,7 @@ _mm256_mask_storeu_epi64 (void *__P, __mmask8 __U, __m256i __A) (__mmask8) __U); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_storeu_epi32 (void *__P, __mmask8 __U, __m128i __A) { __builtin_ia32_storedqusi128_mask ((__v4si *) __P, @@ -5746,7 +5558,7 @@ _mm_mask_storeu_epi32 (void *__P, __mmask8 __U, __m128i __A) (__mmask8) __U); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_storeu_epi32 (void *__P, __mmask8 __U, __m256i __A) { __builtin_ia32_storedqusi256_mask ((__v8si *) __P, @@ -5754,7 +5566,7 @@ _mm256_mask_storeu_epi32 (void *__P, __mmask8 __U, __m256i __A) (__mmask8) __U); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_storeu_pd (void *__P, __mmask8 __U, __m128d __A) { __builtin_ia32_storeupd128_mask ((__v2df *) __P, @@ -5762,7 +5574,7 @@ _mm_mask_storeu_pd (void *__P, __mmask8 __U, __m128d __A) (__mmask8) __U); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_storeu_pd (void *__P, __mmask8 __U, __m256d __A) { __builtin_ia32_storeupd256_mask ((__v4df *) __P, @@ -5770,7 +5582,7 @@ _mm256_mask_storeu_pd (void *__P, __mmask8 __U, __m256d __A) (__mmask8) __U); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_storeu_ps (void 
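/* Illustrative usage sketch (aside, not patch lines): the masked load/store
 * wrappers above only touch memory lanes whose mask bit is set, which makes
 * loop tails safe without over-reading or over-writing. Assumes
 * AVX512F+AVX512VL. */
#include <immintrin.h>

void scale_tail(float *dst, const float *src, int n_tail /* 0..4 */) {
  __mmask8 k = (__mmask8)((1u << n_tail) - 1u);   /* select the low n_tail lanes    */
  __m128 v = _mm_maskz_loadu_ps(k, src);          /* masked-off lanes are not read  */
  v = _mm_mul_ps(v, _mm_set1_ps(2.0f));
  _mm_mask_storeu_ps(dst, k, v);                  /* only n_tail floats are written */
}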
*__P, __mmask8 __U, __m128 __A) { __builtin_ia32_storeups128_mask ((__v4sf *) __P, @@ -5778,7 +5590,7 @@ _mm_mask_storeu_ps (void *__P, __mmask8 __U, __m128 __A) (__mmask8) __U); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_storeu_ps (void *__P, __mmask8 __U, __m256 __A) { __builtin_ia32_storeups256_mask ((__v8sf *) __P, @@ -5787,7 +5599,7 @@ _mm256_mask_storeu_ps (void *__P, __mmask8 __U, __m256 __A) } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_unpackhi_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, @@ -5795,7 +5607,7 @@ _mm_mask_unpackhi_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) (__v2df)__W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_unpackhi_pd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, @@ -5803,7 +5615,7 @@ _mm_maskz_unpackhi_pd(__mmask8 __U, __m128d __A, __m128d __B) (__v2df)_mm_setzero_pd()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_unpackhi_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, @@ -5811,7 +5623,7 @@ _mm256_mask_unpackhi_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) (__v4df)__W); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_unpackhi_pd(__mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, @@ -5819,7 +5631,7 @@ _mm256_maskz_unpackhi_pd(__mmask8 __U, __m256d __A, __m256d __B) (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_unpackhi_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, @@ -5827,7 +5639,7 @@ _mm_mask_unpackhi_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) (__v4sf)__W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_unpackhi_ps(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, @@ -5835,7 +5647,7 @@ _mm_maskz_unpackhi_ps(__mmask8 __U, __m128 __A, __m128 __B) (__v4sf)_mm_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_unpackhi_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, @@ -5843,7 +5655,7 @@ _mm256_mask_unpackhi_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_unpackhi_ps(__mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, @@ -5851,7 +5663,7 @@ _mm256_maskz_unpackhi_ps(__mmask8 __U, __m256 __A, __m256 __B) (__v8sf)_mm256_setzero_ps()); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_unpacklo_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, @@ -5859,7 +5671,7 @@ _mm_mask_unpacklo_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) (__v2df)__W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static 
__inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_unpacklo_pd(__mmask8 __U, __m128d __A, __m128d __B) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, @@ -5867,7 +5679,7 @@ _mm_maskz_unpacklo_pd(__mmask8 __U, __m128d __A, __m128d __B) (__v2df)_mm_setzero_pd()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_unpacklo_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, @@ -5875,7 +5687,7 @@ _mm256_mask_unpacklo_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) (__v4df)__W); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_unpacklo_pd(__mmask8 __U, __m256d __A, __m256d __B) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, @@ -5883,7 +5695,7 @@ _mm256_maskz_unpacklo_pd(__mmask8 __U, __m256d __A, __m256d __B) (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_unpacklo_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, @@ -5891,7 +5703,7 @@ _mm_mask_unpacklo_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) (__v4sf)__W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_unpacklo_ps(__mmask8 __U, __m128 __A, __m128 __B) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, @@ -5899,7 +5711,7 @@ _mm_maskz_unpacklo_ps(__mmask8 __U, __m128 __A, __m128 __B) (__v4sf)_mm_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_unpacklo_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, @@ -5907,7 +5719,7 @@ _mm256_mask_unpacklo_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_unpacklo_ps(__mmask8 __U, __m256 __A, __m256 __B) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, @@ -5915,7 +5727,7 @@ _mm256_maskz_unpacklo_ps(__mmask8 __U, __m256 __A, __m256 __B) (__v8sf)_mm256_setzero_ps()); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_rcp14_pd (__m128d __A) { return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A, @@ -5924,7 +5736,7 @@ _mm_rcp14_pd (__m128d __A) (__mmask8) -1); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_rcp14_pd (__m128d __W, __mmask8 __U, __m128d __A) { return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A, @@ -5932,7 +5744,7 @@ _mm_mask_rcp14_pd (__m128d __W, __mmask8 __U, __m128d __A) (__mmask8) __U); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_rcp14_pd (__mmask8 __U, __m128d __A) { return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A, @@ -5941,7 +5753,7 @@ _mm_maskz_rcp14_pd (__mmask8 __U, __m128d __A) (__mmask8) __U); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_rcp14_pd (__m256d __A) { return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A, @@ -5950,7 +5762,7 @@ _mm256_rcp14_pd (__m256d __A) (__mmask8) -1); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_rcp14_pd (__m256d __W, 
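/* Illustrative usage sketch (aside, not patch lines): the masked unpack
 * wrappers above are plain selects layered over the AVX unpack intrinsics:
 * compute the unpack, then blend with W (mask form) or with zero (maskz form)
 * under the __mmask8. Assumes AVX512F+AVX512VL. */
#include <immintrin.h>

__m256d unpack_sketch(__m256d w, __m256d a, __m256d b, __mmask8 k) {
  __m256d lo = _mm256_mask_unpacklo_pd(w, k, a, b);  /* k==0 lanes keep w      */
  __m256d hi = _mm256_maskz_unpackhi_pd(k, a, b);    /* k==0 lanes become zero */
  return _mm256_add_pd(lo, hi);
}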
__mmask8 __U, __m256d __A) { return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A, @@ -5958,7 +5770,7 @@ _mm256_mask_rcp14_pd (__m256d __W, __mmask8 __U, __m256d __A) (__mmask8) __U); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_rcp14_pd (__mmask8 __U, __m256d __A) { return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A, @@ -5967,7 +5779,7 @@ _mm256_maskz_rcp14_pd (__mmask8 __U, __m256d __A) (__mmask8) __U); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_rcp14_ps (__m128 __A) { return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A, @@ -5976,7 +5788,7 @@ _mm_rcp14_ps (__m128 __A) (__mmask8) -1); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_rcp14_ps (__m128 __W, __mmask8 __U, __m128 __A) { return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A, @@ -5984,7 +5796,7 @@ _mm_mask_rcp14_ps (__m128 __W, __mmask8 __U, __m128 __A) (__mmask8) __U); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_rcp14_ps (__mmask8 __U, __m128 __A) { return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A, @@ -5993,7 +5805,7 @@ _mm_maskz_rcp14_ps (__mmask8 __U, __m128 __A) (__mmask8) __U); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_rcp14_ps (__m256 __A) { return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A, @@ -6002,7 +5814,7 @@ _mm256_rcp14_ps (__m256 __A) (__mmask8) -1); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_rcp14_ps (__m256 __W, __mmask8 __U, __m256 __A) { return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A, @@ -6010,7 +5822,7 @@ _mm256_mask_rcp14_ps (__m256 __W, __mmask8 __U, __m256 __A) (__mmask8) __U); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_rcp14_ps (__mmask8 __U, __m256 __A) { return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A, @@ -6019,47 +5831,47 @@ _mm256_maskz_rcp14_ps (__mmask8 __U, __m256 __A) (__mmask8) __U); } -#define _mm_mask_permute_pd(W, U, X, C) __extension__ ({ \ +#define _mm_mask_permute_pd(W, U, X, C) \ (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ (__v2df)_mm_permute_pd((X), (C)), \ - (__v2df)(__m128d)(W)); }) + (__v2df)(__m128d)(W)) -#define _mm_maskz_permute_pd(U, X, C) __extension__ ({ \ +#define _mm_maskz_permute_pd(U, X, C) \ (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ (__v2df)_mm_permute_pd((X), (C)), \ - (__v2df)_mm_setzero_pd()); }) + (__v2df)_mm_setzero_pd()) -#define _mm256_mask_permute_pd(W, U, X, C) __extension__ ({ \ +#define _mm256_mask_permute_pd(W, U, X, C) \ (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ (__v4df)_mm256_permute_pd((X), (C)), \ - (__v4df)(__m256d)(W)); }) + (__v4df)(__m256d)(W)) -#define _mm256_maskz_permute_pd(U, X, C) __extension__ ({ \ +#define _mm256_maskz_permute_pd(U, X, C) \ (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ (__v4df)_mm256_permute_pd((X), (C)), \ - (__v4df)_mm256_setzero_pd()); }) + (__v4df)_mm256_setzero_pd()) -#define _mm_mask_permute_ps(W, U, X, C) __extension__ ({ \ +#define _mm_mask_permute_ps(W, U, X, C) \ (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ (__v4sf)_mm_permute_ps((X), (C)), \ - (__v4sf)(__m128)(W)); }) + (__v4sf)(__m128)(W)) -#define _mm_maskz_permute_ps(U, X, C) __extension__ ({ \ +#define 
_mm_maskz_permute_ps(U, X, C) \ (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ (__v4sf)_mm_permute_ps((X), (C)), \ - (__v4sf)_mm_setzero_ps()); }) + (__v4sf)_mm_setzero_ps()) -#define _mm256_mask_permute_ps(W, U, X, C) __extension__ ({ \ +#define _mm256_mask_permute_ps(W, U, X, C) \ (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ (__v8sf)_mm256_permute_ps((X), (C)), \ - (__v8sf)(__m256)(W)); }) + (__v8sf)(__m256)(W)) -#define _mm256_maskz_permute_ps(U, X, C) __extension__ ({ \ +#define _mm256_maskz_permute_ps(U, X, C) \ (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ (__v8sf)_mm256_permute_ps((X), (C)), \ - (__v8sf)_mm256_setzero_ps()); }) + (__v8sf)_mm256_setzero_ps()) -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_permutevar_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128i __C) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, @@ -6067,7 +5879,7 @@ _mm_mask_permutevar_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128i __C) (__v2df)__W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_permutevar_pd(__mmask8 __U, __m128d __A, __m128i __C) { return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, @@ -6075,7 +5887,7 @@ _mm_maskz_permutevar_pd(__mmask8 __U, __m128d __A, __m128i __C) (__v2df)_mm_setzero_pd()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_permutevar_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256i __C) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, @@ -6083,7 +5895,7 @@ _mm256_mask_permutevar_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256i __C) (__v4df)__W); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_permutevar_pd(__mmask8 __U, __m256d __A, __m256i __C) { return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, @@ -6091,7 +5903,7 @@ _mm256_maskz_permutevar_pd(__mmask8 __U, __m256d __A, __m256i __C) (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_permutevar_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128i __C) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, @@ -6099,7 +5911,7 @@ _mm_mask_permutevar_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128i __C) (__v4sf)__W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_permutevar_ps(__mmask8 __U, __m128 __A, __m128i __C) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, @@ -6107,7 +5919,7 @@ _mm_maskz_permutevar_ps(__mmask8 __U, __m128 __A, __m128i __C) (__v4sf)_mm_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_permutevar_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256i __C) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, @@ -6115,7 +5927,7 @@ _mm256_mask_permutevar_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256i __C) (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_permutevar_ps(__mmask8 __U, __m256 __A, __m256i __C) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, @@ -6123,115 +5935,115 @@ _mm256_maskz_permutevar_ps(__mmask8 __U, __m256 __A, __m256i __C) (__v8sf)_mm256_setzero_ps()); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS +static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 _mm_test_epi32_mask (__m128i 
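/* Illustrative usage sketch (aside, not patch lines): the masked permute and
 * permutevar wrappers above follow the same select-over-result pattern. The
 * immediate 0x5 below is just an example control value. Assumes
 * AVX512F+AVX512VL. */
#include <immintrin.h>

__m128 permute_sketch(__m128 w, __m128 x, __mmask8 k, __m128i var_ctrl) {
  __m128 a = _mm_mask_permute_ps(w, k, x, 0x5);        /* immediate lane control    */
  __m128 b = _mm_maskz_permutevar_ps(k, x, var_ctrl);  /* per-lane variable control */
  return _mm_add_ps(a, b);
}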
__A, __m128i __B) { - return _mm_cmpneq_epi32_mask (_mm_and_si128 (__A, __B), _mm_setzero_di()); + return _mm_cmpneq_epi32_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128()); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS +static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 _mm_mask_test_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B) { return _mm_mask_cmpneq_epi32_mask (__U, _mm_and_si128 (__A, __B), - _mm_setzero_di()); + _mm_setzero_si128()); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS +static __inline__ __mmask8 __DEFAULT_FN_ATTRS256 _mm256_test_epi32_mask (__m256i __A, __m256i __B) { return _mm256_cmpneq_epi32_mask (_mm256_and_si256 (__A, __B), _mm256_setzero_si256()); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS +static __inline__ __mmask8 __DEFAULT_FN_ATTRS256 _mm256_mask_test_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B) { return _mm256_mask_cmpneq_epi32_mask (__U, _mm256_and_si256 (__A, __B), _mm256_setzero_si256()); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS +static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 _mm_test_epi64_mask (__m128i __A, __m128i __B) { - return _mm_cmpneq_epi64_mask (_mm_and_si128 (__A, __B), _mm_setzero_di()); + return _mm_cmpneq_epi64_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128()); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS +static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 _mm_mask_test_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B) { return _mm_mask_cmpneq_epi64_mask (__U, _mm_and_si128 (__A, __B), - _mm_setzero_di()); + _mm_setzero_si128()); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS +static __inline__ __mmask8 __DEFAULT_FN_ATTRS256 _mm256_test_epi64_mask (__m256i __A, __m256i __B) { return _mm256_cmpneq_epi64_mask (_mm256_and_si256 (__A, __B), _mm256_setzero_si256()); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS +static __inline__ __mmask8 __DEFAULT_FN_ATTRS256 _mm256_mask_test_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B) { return _mm256_mask_cmpneq_epi64_mask (__U, _mm256_and_si256 (__A, __B), _mm256_setzero_si256()); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS +static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 _mm_testn_epi32_mask (__m128i __A, __m128i __B) { - return _mm_cmpeq_epi32_mask (_mm_and_si128 (__A, __B), _mm_setzero_di()); + return _mm_cmpeq_epi32_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128()); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS +static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 _mm_mask_testn_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B) { return _mm_mask_cmpeq_epi32_mask (__U, _mm_and_si128 (__A, __B), - _mm_setzero_di()); + _mm_setzero_si128()); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS +static __inline__ __mmask8 __DEFAULT_FN_ATTRS256 _mm256_testn_epi32_mask (__m256i __A, __m256i __B) { return _mm256_cmpeq_epi32_mask (_mm256_and_si256 (__A, __B), _mm256_setzero_si256()); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS +static __inline__ __mmask8 __DEFAULT_FN_ATTRS256 _mm256_mask_testn_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B) { return _mm256_mask_cmpeq_epi32_mask (__U, _mm256_and_si256 (__A, __B), _mm256_setzero_si256()); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS +static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 _mm_testn_epi64_mask (__m128i __A, __m128i __B) { - return _mm_cmpeq_epi64_mask (_mm_and_si128 (__A, __B), _mm_setzero_di()); + return _mm_cmpeq_epi64_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128()); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS +static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 
_mm_mask_testn_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B) { return _mm_mask_cmpeq_epi64_mask (__U, _mm_and_si128 (__A, __B), - _mm_setzero_di()); + _mm_setzero_si128()); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS +static __inline__ __mmask8 __DEFAULT_FN_ATTRS256 _mm256_testn_epi64_mask (__m256i __A, __m256i __B) { return _mm256_cmpeq_epi64_mask (_mm256_and_si256 (__A, __B), _mm256_setzero_si256()); } -static __inline__ __mmask8 __DEFAULT_FN_ATTRS +static __inline__ __mmask8 __DEFAULT_FN_ATTRS256 _mm256_mask_testn_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B) { return _mm256_mask_cmpeq_epi64_mask (__U, _mm256_and_si256 (__A, __B), _mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_unpackhi_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -6239,7 +6051,7 @@ _mm_mask_unpackhi_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_unpackhi_epi32(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -6247,7 +6059,7 @@ _mm_maskz_unpackhi_epi32(__mmask8 __U, __m128i __A, __m128i __B) (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_unpackhi_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -6255,7 +6067,7 @@ _mm256_mask_unpackhi_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_unpackhi_epi32(__mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -6263,7 +6075,7 @@ _mm256_maskz_unpackhi_epi32(__mmask8 __U, __m256i __A, __m256i __B) (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_unpackhi_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, @@ -6271,15 +6083,15 @@ _mm_mask_unpackhi_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_unpackhi_epi64(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, (__v2di)_mm_unpackhi_epi64(__A, __B), - (__v2di)_mm_setzero_di()); + (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_unpackhi_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, @@ -6287,7 +6099,7 @@ _mm256_mask_unpackhi_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_unpackhi_epi64(__mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, @@ -6295,7 +6107,7 @@ _mm256_maskz_unpackhi_epi64(__mmask8 __U, __m256i __A, __m256i __B) (__v4di)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 
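/* Illustrative usage sketch (aside, not patch lines): besides the attribute
 * rename, the hunks above replace the internal _mm_setzero_di() helper with
 * the public _mm_setzero_si128(); behaviour is unchanged. test sets a mask
 * bit where (a & b) is non-zero in that lane, testn where it is zero. */
#include <immintrin.h>

__mmask8 test_sketch(__m128i a, __m128i b) {
  __mmask8 nz = _mm_test_epi32_mask(a, b);   /* bit i set if lane i of (a & b) != 0  */
  __mmask8 z  = _mm_testn_epi32_mask(a, b);  /* bit i set if lane i of (a & b) == 0  */
  return (__mmask8)(nz ^ z);                 /* complementary over the 4 lanes: 0x0f */
}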
_mm_mask_unpacklo_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -6303,7 +6115,7 @@ _mm_mask_unpacklo_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_unpacklo_epi32(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -6311,7 +6123,7 @@ _mm_maskz_unpacklo_epi32(__mmask8 __U, __m128i __A, __m128i __B) (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_unpacklo_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -6319,7 +6131,7 @@ _mm256_mask_unpacklo_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_unpacklo_epi32(__mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -6327,7 +6139,7 @@ _mm256_maskz_unpacklo_epi32(__mmask8 __U, __m256i __A, __m256i __B) (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_unpacklo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, @@ -6335,15 +6147,15 @@ _mm_mask_unpacklo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_unpacklo_epi64(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, (__v2di)_mm_unpacklo_epi64(__A, __B), - (__v2di)_mm_setzero_di()); + (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_unpacklo_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, @@ -6351,7 +6163,7 @@ _mm256_mask_unpacklo_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_unpacklo_epi64(__mmask8 __U, __m256i __A, __m256i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, @@ -6359,7 +6171,7 @@ _mm256_maskz_unpacklo_epi64(__mmask8 __U, __m256i __A, __m256i __B) (__v4di)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_sra_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -6367,7 +6179,7 @@ _mm_mask_sra_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_sra_epi32(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -6375,7 +6187,7 @@ _mm_maskz_sra_epi32(__mmask8 __U, __m128i __A, __m128i __B) (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_sra_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) { return 
(__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -6383,7 +6195,7 @@ _mm256_mask_sra_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_sra_epi32(__mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -6391,7 +6203,7 @@ _mm256_maskz_sra_epi32(__mmask8 __U, __m256i __A, __m128i __B) (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_srai_epi32(__m128i __W, __mmask8 __U, __m128i __A, int __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -6399,7 +6211,7 @@ _mm_mask_srai_epi32(__m128i __W, __mmask8 __U, __m128i __A, int __B) (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_srai_epi32(__mmask8 __U, __m128i __A, int __B) { return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, @@ -6407,7 +6219,7 @@ _mm_maskz_srai_epi32(__mmask8 __U, __m128i __A, int __B) (__v4si)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_srai_epi32(__m256i __W, __mmask8 __U, __m256i __A, int __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -6415,7 +6227,7 @@ _mm256_mask_srai_epi32(__m256i __W, __mmask8 __U, __m256i __A, int __B) (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_srai_epi32(__mmask8 __U, __m256i __A, int __B) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, @@ -6423,13 +6235,13 @@ _mm256_maskz_srai_epi32(__mmask8 __U, __m256i __A, int __B) (__v8si)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_sra_epi64(__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_psraq128((__v2di)__A, (__v2di)__B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_sra_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \ @@ -6437,21 +6249,21 @@ _mm_mask_sra_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_sra_epi64(__mmask8 __U, __m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \ (__v2di)_mm_sra_epi64(__A, __B), \ - (__v2di)_mm_setzero_di()); + (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sra_epi64(__m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_psraq256((__v4di) __A, (__v2di) __B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_sra_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \ @@ -6459,7 +6271,7 @@ _mm256_mask_sra_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_sra_epi64(__mmask8 __U, __m256i __A, __m128i __B) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \ @@ -6467,13 +6279,13 @@ _mm256_maskz_sra_epi64(__mmask8 __U, __m256i __A, __m128i __B) 
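/* Illustrative usage sketch (aside, not patch lines): the arithmetic
 * right-shift wrappers above come in two flavours, sra (count in the low
 * 64 bits of an xmm register) and srai (immediate count), each with
 * mask/maskz selects. Assumes AVX512F+AVX512VL. */
#include <immintrin.h>

__m256i sra_sketch(__m256i w, __m256i a, __mmask8 k) {
  __m256i by_imm = _mm256_mask_srai_epi32(w, k, a, 3);                 /* a >> 3, keep w when k==0 */
  __m256i by_xmm = _mm256_maskz_sra_epi32(k, a, _mm_cvtsi32_si128(3)); /* count taken from xmm     */
  return _mm256_add_epi32(by_imm, by_xmm);
}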
(__v4di)_mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_srai_epi64(__m128i __A, int __imm) { return (__m128i)__builtin_ia32_psraqi128((__v2di)__A, __imm); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_srai_epi64(__m128i __W, __mmask8 __U, __m128i __A, int __imm) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \ @@ -6481,21 +6293,21 @@ _mm_mask_srai_epi64(__m128i __W, __mmask8 __U, __m128i __A, int __imm) (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_srai_epi64(__mmask8 __U, __m128i __A, int __imm) { return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \ (__v2di)_mm_srai_epi64(__A, __imm), \ - (__v2di)_mm_setzero_di()); + (__v2di)_mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srai_epi64(__m256i __A, int __imm) { return (__m256i)__builtin_ia32_psraqi256((__v4di)__A, __imm); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_srai_epi64(__m256i __W, __mmask8 __U, __m256i __A, int __imm) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \ @@ -6503,7 +6315,7 @@ _mm256_mask_srai_epi64(__m256i __W, __mmask8 __U, __m256i __A, int __imm) (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_srai_epi64(__mmask8 __U, __m256i __A, int __imm) { return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \ @@ -6511,198 +6323,178 @@ _mm256_maskz_srai_epi64(__mmask8 __U, __m256i __A, int __imm) (__v4di)_mm256_setzero_si256()); } -#define _mm_ternarylogic_epi32(A, B, C, imm) __extension__ ({ \ +#define _mm_ternarylogic_epi32(A, B, C, imm) \ (__m128i)__builtin_ia32_pternlogd128_mask((__v4si)(__m128i)(A), \ (__v4si)(__m128i)(B), \ (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm_mask_ternarylogic_epi32(A, U, B, C, imm) __extension__ ({ \ +#define _mm_mask_ternarylogic_epi32(A, U, B, C, imm) \ (__m128i)__builtin_ia32_pternlogd128_mask((__v4si)(__m128i)(A), \ (__v4si)(__m128i)(B), \ (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm_maskz_ternarylogic_epi32(U, A, B, C, imm) __extension__ ({ \ +#define _mm_maskz_ternarylogic_epi32(U, A, B, C, imm) \ (__m128i)__builtin_ia32_pternlogd128_maskz((__v4si)(__m128i)(A), \ (__v4si)(__m128i)(B), \ (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm256_ternarylogic_epi32(A, B, C, imm) __extension__ ({ \ +#define _mm256_ternarylogic_epi32(A, B, C, imm) \ (__m256i)__builtin_ia32_pternlogd256_mask((__v8si)(__m256i)(A), \ (__v8si)(__m256i)(B), \ (__v8si)(__m256i)(C), (int)(imm), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm256_mask_ternarylogic_epi32(A, U, B, C, imm) __extension__ ({ \ +#define _mm256_mask_ternarylogic_epi32(A, U, B, C, imm) \ (__m256i)__builtin_ia32_pternlogd256_mask((__v8si)(__m256i)(A), \ (__v8si)(__m256i)(B), \ (__v8si)(__m256i)(C), (int)(imm), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm256_maskz_ternarylogic_epi32(U, A, B, C, imm) __extension__ ({ \ +#define _mm256_maskz_ternarylogic_epi32(U, A, B, C, imm) \ (__m256i)__builtin_ia32_pternlogd256_maskz((__v8si)(__m256i)(A), \ (__v8si)(__m256i)(B), \ (__v8si)(__m256i)(C), (int)(imm), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define 
_mm_ternarylogic_epi64(A, B, C, imm) __extension__ ({ \ +#define _mm_ternarylogic_epi64(A, B, C, imm) \ (__m128i)__builtin_ia32_pternlogq128_mask((__v2di)(__m128i)(A), \ (__v2di)(__m128i)(B), \ (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm_mask_ternarylogic_epi64(A, U, B, C, imm) __extension__ ({ \ +#define _mm_mask_ternarylogic_epi64(A, U, B, C, imm) \ (__m128i)__builtin_ia32_pternlogq128_mask((__v2di)(__m128i)(A), \ (__v2di)(__m128i)(B), \ (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm_maskz_ternarylogic_epi64(U, A, B, C, imm) __extension__ ({ \ +#define _mm_maskz_ternarylogic_epi64(U, A, B, C, imm) \ (__m128i)__builtin_ia32_pternlogq128_maskz((__v2di)(__m128i)(A), \ (__v2di)(__m128i)(B), \ (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm256_ternarylogic_epi64(A, B, C, imm) __extension__ ({ \ +#define _mm256_ternarylogic_epi64(A, B, C, imm) \ (__m256i)__builtin_ia32_pternlogq256_mask((__v4di)(__m256i)(A), \ (__v4di)(__m256i)(B), \ (__v4di)(__m256i)(C), (int)(imm), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm256_mask_ternarylogic_epi64(A, U, B, C, imm) __extension__ ({ \ +#define _mm256_mask_ternarylogic_epi64(A, U, B, C, imm) \ (__m256i)__builtin_ia32_pternlogq256_mask((__v4di)(__m256i)(A), \ (__v4di)(__m256i)(B), \ (__v4di)(__m256i)(C), (int)(imm), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm256_maskz_ternarylogic_epi64(U, A, B, C, imm) __extension__ ({ \ +#define _mm256_maskz_ternarylogic_epi64(U, A, B, C, imm) \ (__m256i)__builtin_ia32_pternlogq256_maskz((__v4di)(__m256i)(A), \ (__v4di)(__m256i)(B), \ (__v4di)(__m256i)(C), (int)(imm), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm256_shuffle_f32x4(A, B, imm) __extension__ ({ \ - (__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \ - (__v8sf)(__m256)(B), \ - 0 + ((((imm) >> 0) & 0x1) * 4), \ - 1 + ((((imm) >> 0) & 0x1) * 4), \ - 2 + ((((imm) >> 0) & 0x1) * 4), \ - 3 + ((((imm) >> 0) & 0x1) * 4), \ - 8 + ((((imm) >> 1) & 0x1) * 4), \ - 9 + ((((imm) >> 1) & 0x1) * 4), \ - 10 + ((((imm) >> 1) & 0x1) * 4), \ - 11 + ((((imm) >> 1) & 0x1) * 4)); }) +#define _mm256_shuffle_f32x4(A, B, imm) \ + (__m256)__builtin_ia32_shuf_f32x4_256((__v8sf)(__m256)(A), \ + (__v8sf)(__m256)(B), (int)(imm)) -#define _mm256_mask_shuffle_f32x4(W, U, A, B, imm) __extension__ ({ \ +#define _mm256_mask_shuffle_f32x4(W, U, A, B, imm) \ (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ (__v8sf)_mm256_shuffle_f32x4((A), (B), (imm)), \ - (__v8sf)(__m256)(W)); }) + (__v8sf)(__m256)(W)) -#define _mm256_maskz_shuffle_f32x4(U, A, B, imm) __extension__ ({ \ +#define _mm256_maskz_shuffle_f32x4(U, A, B, imm) \ (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ (__v8sf)_mm256_shuffle_f32x4((A), (B), (imm)), \ - (__v8sf)_mm256_setzero_ps()); }) + (__v8sf)_mm256_setzero_ps()) -#define _mm256_shuffle_f64x2(A, B, imm) __extension__ ({ \ - (__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \ - (__v4df)(__m256d)(B), \ - 0 + ((((imm) >> 0) & 0x1) * 2), \ - 1 + ((((imm) >> 0) & 0x1) * 2), \ - 4 + ((((imm) >> 1) & 0x1) * 2), \ - 5 + ((((imm) >> 1) & 0x1) * 2)); }) +#define _mm256_shuffle_f64x2(A, B, imm) \ + (__m256d)__builtin_ia32_shuf_f64x2_256((__v4df)(__m256d)(A), \ + (__v4df)(__m256d)(B), (int)(imm)) -#define _mm256_mask_shuffle_f64x2(W, U, A, B, imm) __extension__ ({ \ +#define _mm256_mask_shuffle_f64x2(W, U, A, B, imm) \ (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ (__v4df)_mm256_shuffle_f64x2((A), (B), (imm)), \ - 
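/* Illustrative usage sketch (aside, not patch lines): the ternarylogic
 * macros above take an 8-bit truth table; for every bit position the result
 * bit is imm's bit at index (a<<2)|(b<<1)|c. The immediates below are common
 * choices (0x96 = three-way XOR, 0xE8 = majority), assumed here rather than
 * taken from the patch. */
#include <immintrin.h>

__m128i ternlog_sketch(__m128i a, __m128i b, __m128i c) {
  __m128i parity   = _mm_ternarylogic_epi32(a, b, c, 0x96);  /* a ^ b ^ c                   */
  __m128i majority = _mm_ternarylogic_epi32(a, b, c, 0xE8);  /* (a & b) | (a & c) | (b & c) */
  return _mm_xor_si128(parity, majority);
}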
(__v4df)(__m256)(W)); }) + (__v4df)(__m256d)(W)) -#define _mm256_maskz_shuffle_f64x2(U, A, B, imm) __extension__ ({ \ +#define _mm256_maskz_shuffle_f64x2(U, A, B, imm) \ (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ (__v4df)_mm256_shuffle_f64x2((A), (B), (imm)), \ - (__v4df)_mm256_setzero_pd()); }) + (__v4df)_mm256_setzero_pd()) -#define _mm256_shuffle_i32x4(A, B, imm) __extension__ ({ \ - (__m256i)__builtin_shufflevector((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), \ - 0 + ((((imm) >> 0) & 0x1) * 2), \ - 1 + ((((imm) >> 0) & 0x1) * 2), \ - 4 + ((((imm) >> 1) & 0x1) * 2), \ - 5 + ((((imm) >> 1) & 0x1) * 2)); }) +#define _mm256_shuffle_i32x4(A, B, imm) \ + (__m256i)__builtin_ia32_shuf_i32x4_256((__v8si)(__m256i)(A), \ + (__v8si)(__m256i)(B), (int)(imm)) -#define _mm256_mask_shuffle_i32x4(W, U, A, B, imm) __extension__ ({ \ +#define _mm256_mask_shuffle_i32x4(W, U, A, B, imm) \ (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ (__v8si)_mm256_shuffle_i32x4((A), (B), (imm)), \ - (__v8si)(__m256)(W)); }) + (__v8si)(__m256i)(W)) -#define _mm256_maskz_shuffle_i32x4(U, A, B, imm) __extension__ ({ \ +#define _mm256_maskz_shuffle_i32x4(U, A, B, imm) \ (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ (__v8si)_mm256_shuffle_i32x4((A), (B), (imm)), \ - (__v8si)_mm256_setzero_si256()); }) + (__v8si)_mm256_setzero_si256()) -#define _mm256_shuffle_i64x2(A, B, imm) __extension__ ({ \ - (__m256i)__builtin_shufflevector((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), \ - 0 + ((((imm) >> 0) & 0x1) * 2), \ - 1 + ((((imm) >> 0) & 0x1) * 2), \ - 4 + ((((imm) >> 1) & 0x1) * 2), \ - 5 + ((((imm) >> 1) & 0x1) * 2)); }) +#define _mm256_shuffle_i64x2(A, B, imm) \ + (__m256i)__builtin_ia32_shuf_i64x2_256((__v4di)(__m256i)(A), \ + (__v4di)(__m256i)(B), (int)(imm)) -#define _mm256_mask_shuffle_i64x2(W, U, A, B, imm) __extension__ ({ \ +#define _mm256_mask_shuffle_i64x2(W, U, A, B, imm) \ (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ (__v4di)_mm256_shuffle_i64x2((A), (B), (imm)), \ - (__v4di)(__m256)(W)); }) + (__v4di)(__m256i)(W)) -#define _mm256_maskz_shuffle_i64x2(U, A, B, imm) __extension__ ({ \ +#define _mm256_maskz_shuffle_i64x2(U, A, B, imm) \ (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ (__v4di)_mm256_shuffle_i64x2((A), (B), (imm)), \ - (__v4di)_mm256_setzero_si256()); }) + (__v4di)_mm256_setzero_si256()) -#define _mm_mask_shuffle_pd(W, U, A, B, M) __extension__ ({ \ +#define _mm_mask_shuffle_pd(W, U, A, B, M) \ (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ (__v2df)_mm_shuffle_pd((A), (B), (M)), \ - (__v2df)(__m128d)(W)); }) + (__v2df)(__m128d)(W)) -#define _mm_maskz_shuffle_pd(U, A, B, M) __extension__ ({ \ +#define _mm_maskz_shuffle_pd(U, A, B, M) \ (__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ (__v2df)_mm_shuffle_pd((A), (B), (M)), \ - (__v2df)_mm_setzero_pd()); }) + (__v2df)_mm_setzero_pd()) -#define _mm256_mask_shuffle_pd(W, U, A, B, M) __extension__ ({ \ +#define _mm256_mask_shuffle_pd(W, U, A, B, M) \ (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ (__v4df)_mm256_shuffle_pd((A), (B), (M)), \ - (__v4df)(__m256d)(W)); }) + (__v4df)(__m256d)(W)) -#define _mm256_maskz_shuffle_pd(U, A, B, M) __extension__ ({ \ +#define _mm256_maskz_shuffle_pd(U, A, B, M) \ (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ (__v4df)_mm256_shuffle_pd((A), (B), (M)), \ - (__v4df)_mm256_setzero_pd()); }) + (__v4df)_mm256_setzero_pd()) -#define _mm_mask_shuffle_ps(W, U, A, B, M) __extension__ ({ \ +#define _mm_mask_shuffle_ps(W, U, A, B, M) \ 
(__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ (__v4sf)_mm_shuffle_ps((A), (B), (M)), \ - (__v4sf)(__m128)(W)); }) + (__v4sf)(__m128)(W)) -#define _mm_maskz_shuffle_ps(U, A, B, M) __extension__ ({ \ +#define _mm_maskz_shuffle_ps(U, A, B, M) \ (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ (__v4sf)_mm_shuffle_ps((A), (B), (M)), \ - (__v4sf)_mm_setzero_ps()); }) + (__v4sf)_mm_setzero_ps()) -#define _mm256_mask_shuffle_ps(W, U, A, B, M) __extension__ ({ \ +#define _mm256_mask_shuffle_ps(W, U, A, B, M) \ (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ (__v8sf)_mm256_shuffle_ps((A), (B), (M)), \ - (__v8sf)(__m256)(W)); }) + (__v8sf)(__m256)(W)) -#define _mm256_maskz_shuffle_ps(U, A, B, M) __extension__ ({ \ +#define _mm256_maskz_shuffle_ps(U, A, B, M) \ (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ (__v8sf)_mm256_shuffle_ps((A), (B), (M)), \ - (__v8sf)_mm256_setzero_ps()); }) + (__v8sf)_mm256_setzero_ps()) -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_rsqrt14_pd (__m128d __A) { return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A, @@ -6711,7 +6503,7 @@ _mm_rsqrt14_pd (__m128d __A) (__mmask8) -1); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt14_pd (__m128d __W, __mmask8 __U, __m128d __A) { return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A, @@ -6719,7 +6511,7 @@ _mm_mask_rsqrt14_pd (__m128d __W, __mmask8 __U, __m128d __A) (__mmask8) __U); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_rsqrt14_pd (__mmask8 __U, __m128d __A) { return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A, @@ -6728,7 +6520,7 @@ _mm_maskz_rsqrt14_pd (__mmask8 __U, __m128d __A) (__mmask8) __U); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_rsqrt14_pd (__m256d __A) { return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A, @@ -6737,7 +6529,7 @@ _mm256_rsqrt14_pd (__m256d __A) (__mmask8) -1); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_rsqrt14_pd (__m256d __W, __mmask8 __U, __m256d __A) { return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A, @@ -6745,7 +6537,7 @@ _mm256_mask_rsqrt14_pd (__m256d __W, __mmask8 __U, __m256d __A) (__mmask8) __U); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_rsqrt14_pd (__mmask8 __U, __m256d __A) { return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A, @@ -6754,7 +6546,7 @@ _mm256_maskz_rsqrt14_pd (__mmask8 __U, __m256d __A) (__mmask8) __U); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_rsqrt14_ps (__m128 __A) { return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A, @@ -6763,7 +6555,7 @@ _mm_rsqrt14_ps (__m128 __A) (__mmask8) -1); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt14_ps (__m128 __W, __mmask8 __U, __m128 __A) { return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A, @@ -6771,7 +6563,7 @@ _mm_mask_rsqrt14_ps (__m128 __W, __mmask8 __U, __m128 __A) (__mmask8) __U); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_rsqrt14_ps (__mmask8 __U, __m128 __A) { return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A, @@ -6780,7 +6572,7 @@ _mm_maskz_rsqrt14_ps 
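/* Illustrative usage sketch (aside, not patch lines): per the shufflevector
 * expansion being removed above, the 256-bit shuffle_{f,i}32x4/64x2 macros
 * pick whole 128-bit lanes: imm bit 0 chooses the lane taken from A (low
 * half of the result), imm bit 1 the lane taken from B (high half). */
#include <immintrin.h>

__m256 lane_shuffle_sketch(__m256 a, __m256 b) {
  /* 0x3: high lane of a into the low half, high lane of b into the high half. */
  return _mm256_shuffle_f32x4(a, b, 0x3);
}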
(__mmask8 __U, __m128 __A) (__mmask8) __U); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_rsqrt14_ps (__m256 __A) { return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A, @@ -6789,7 +6581,7 @@ _mm256_rsqrt14_ps (__m256 __A) (__mmask8) -1); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_rsqrt14_ps (__m256 __W, __mmask8 __U, __m256 __A) { return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A, @@ -6797,7 +6589,7 @@ _mm256_mask_rsqrt14_ps (__m256 __W, __mmask8 __U, __m256 __A) (__mmask8) __U); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_rsqrt14_ps (__mmask8 __U, __m256 __A) { return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A, @@ -6806,14 +6598,14 @@ _mm256_maskz_rsqrt14_ps (__mmask8 __U, __m256 __A) (__mmask8) __U); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_broadcast_f32x4(__m128 __A) { return (__m256)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A, 0, 1, 2, 3, 0, 1, 2, 3); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_broadcast_f32x4(__m256 __O, __mmask8 __M, __m128 __A) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__M, @@ -6821,7 +6613,7 @@ _mm256_mask_broadcast_f32x4(__m256 __O, __mmask8 __M, __m128 __A) (__v8sf)__O); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_broadcast_f32x4 (__mmask8 __M, __m128 __A) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__M, @@ -6829,14 +6621,14 @@ _mm256_maskz_broadcast_f32x4 (__mmask8 __M, __m128 __A) (__v8sf)_mm256_setzero_ps()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_broadcast_i32x4(__m128i __A) { return (__m256i)__builtin_shufflevector((__v4si)__A, (__v4si)__A, 0, 1, 2, 3, 0, 1, 2, 3); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_broadcast_i32x4(__m256i __O, __mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, @@ -6844,7 +6636,7 @@ _mm256_mask_broadcast_i32x4(__m256i __O, __mmask8 __M, __m128i __A) (__v8si)__O); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_broadcast_i32x4(__mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, @@ -6852,7 +6644,7 @@ _mm256_maskz_broadcast_i32x4(__mmask8 __M, __m128i __A) (__v8si)_mm256_setzero_si256()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_broadcastsd_pd (__m256d __O, __mmask8 __M, __m128d __A) { return (__m256d)__builtin_ia32_selectpd_256(__M, @@ -6860,7 +6652,7 @@ _mm256_mask_broadcastsd_pd (__m256d __O, __mmask8 __M, __m128d __A) (__v4df) __O); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A) { return (__m256d)__builtin_ia32_selectpd_256(__M, @@ -6868,7 +6660,7 @@ _mm256_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A) (__v4df) _mm256_setzero_pd()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_broadcastss_ps (__m128 __O, __mmask8 __M, __m128 __A) { return (__m128)__builtin_ia32_selectps_128(__M, @@ -6876,7 +6668,7 @@ 
_mm_mask_broadcastss_ps (__m128 __O, __mmask8 __M, __m128 __A) (__v4sf) __O); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_broadcastss_ps (__mmask8 __M, __m128 __A) { return (__m128)__builtin_ia32_selectps_128(__M, @@ -6884,7 +6676,7 @@ _mm_maskz_broadcastss_ps (__mmask8 __M, __m128 __A) (__v4sf) _mm_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_broadcastss_ps (__m256 __O, __mmask8 __M, __m128 __A) { return (__m256)__builtin_ia32_selectps_256(__M, @@ -6892,7 +6684,7 @@ _mm256_mask_broadcastss_ps (__m256 __O, __mmask8 __M, __m128 __A) (__v8sf) __O); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_broadcastss_ps (__mmask8 __M, __m128 __A) { return (__m256)__builtin_ia32_selectps_256(__M, @@ -6900,7 +6692,7 @@ _mm256_maskz_broadcastss_ps (__mmask8 __M, __m128 __A) (__v8sf) _mm256_setzero_ps()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_broadcastd_epi32 (__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_selectd_128(__M, @@ -6908,7 +6700,7 @@ _mm_mask_broadcastd_epi32 (__m128i __O, __mmask8 __M, __m128i __A) (__v4si) __O); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_selectd_128(__M, @@ -6916,7 +6708,7 @@ _mm_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A) (__v4si) _mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_broadcastd_epi32 (__m256i __O, __mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectd_256(__M, @@ -6924,7 +6716,7 @@ _mm256_mask_broadcastd_epi32 (__m256i __O, __mmask8 __M, __m128i __A) (__v8si) __O); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectd_256(__M, @@ -6932,7 +6724,7 @@ _mm256_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A) (__v8si) _mm256_setzero_si256()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_broadcastq_epi64 (__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_selectq_128(__M, @@ -6940,7 +6732,7 @@ _mm_mask_broadcastq_epi64 (__m128i __O, __mmask8 __M, __m128i __A) (__v2di) __O); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) { return (__m128i)__builtin_ia32_selectq_128(__M, @@ -6948,7 +6740,7 @@ _mm_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) (__v2di) _mm_setzero_si128()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_broadcastq_epi64 (__m256i __O, __mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectq_256(__M, @@ -6956,7 +6748,7 @@ _mm256_mask_broadcastq_epi64 (__m256i __O, __mmask8 __M, __m128i __A) (__v4di) __O); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) { return (__m256i)__builtin_ia32_selectq_256(__M, @@ -6964,7 +6756,7 @@ _mm256_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) (__v4di) _mm256_setzero_si256()); } -static 
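/* Illustrative usage sketch (aside, not patch lines): the broadcast wrappers
 * above replicate a 128-bit vector, or a single element, across the wider
 * register, again with mask/maskz selects on top. Assumes AVX512F+AVX512VL. */
#include <immintrin.h>

__m256 broadcast_sketch(__m128 quad, __m128 one, __m256 old, __mmask8 k) {
  __m256 lanes = _mm256_broadcast_f32x4(quad);            /* quad repeated twice        */
  __m256 splat = _mm256_mask_broadcastss_ps(old, k, one); /* element 0 of one, else old */
  return _mm256_add_ps(lanes, splat);
}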
__inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsepi32_epi8 (__m128i __A) { return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A, @@ -6972,14 +6764,14 @@ _mm_cvtsepi32_epi8 (__m128i __A) (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtsepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A, (__v16qi) __O, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtsepi32_epi8 (__mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A, @@ -6987,13 +6779,13 @@ _mm_maskz_cvtsepi32_epi8 (__mmask8 __M, __m128i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) { __builtin_ia32_pmovsdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm256_cvtsepi32_epi8 (__m256i __A) { return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A, @@ -7001,14 +6793,14 @@ _mm256_cvtsepi32_epi8 (__m256i __A) (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtsepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A) { return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A, (__v16qi) __O, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtsepi32_epi8 (__mmask8 __M, __m256i __A) { return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A, @@ -7016,13 +6808,13 @@ _mm256_maskz_cvtsepi32_epi8 (__mmask8 __M, __m256i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS128 _mm256_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) { __builtin_ia32_pmovsdb256mem_mask ((__v16qi *) __P, (__v8si) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsepi32_epi16 (__m128i __A) { return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A, @@ -7030,7 +6822,7 @@ _mm_cvtsepi32_epi16 (__m128i __A) (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtsepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A, @@ -7038,7 +6830,7 @@ _mm_mask_cvtsepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A) __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtsepi32_epi16 (__mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A, @@ -7046,13 +6838,13 @@ _mm_maskz_cvtsepi32_epi16 (__mmask8 __M, __m128i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_cvtsepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) { __builtin_ia32_pmovsdw128mem_mask ((__v8hi *) __P, (__v4si) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_cvtsepi32_epi16 (__m256i __A) { return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A, @@ -7060,14 +6852,14 @@ _mm256_cvtsepi32_epi16 (__m256i __A) (__mmask8) -1); } -static __inline__ __m128i 
__DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtsepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A) { return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A, (__v8hi) __O, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtsepi32_epi16 (__mmask8 __M, __m256i __A) { return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A, @@ -7075,13 +6867,13 @@ _mm256_maskz_cvtsepi32_epi16 (__mmask8 __M, __m256i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_cvtsepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) { __builtin_ia32_pmovsdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsepi64_epi8 (__m128i __A) { return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A, @@ -7089,14 +6881,14 @@ _mm_cvtsepi64_epi8 (__m128i __A) (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A, (__v16qi) __O, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtsepi64_epi8 (__mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A, @@ -7104,13 +6896,13 @@ _mm_maskz_cvtsepi64_epi8 (__mmask8 __M, __m128i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) { __builtin_ia32_pmovsqb128mem_mask ((__v16qi *) __P, (__v2di) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_cvtsepi64_epi8 (__m256i __A) { return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A, @@ -7118,14 +6910,14 @@ _mm256_cvtsepi64_epi8 (__m256i __A) (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A) { return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A, (__v16qi) __O, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtsepi64_epi8 (__mmask8 __M, __m256i __A) { return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A, @@ -7133,13 +6925,13 @@ _mm256_maskz_cvtsepi64_epi8 (__mmask8 __M, __m256i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) { __builtin_ia32_pmovsqb256mem_mask ((__v16qi *) __P, (__v4di) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsepi64_epi32 (__m128i __A) { return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A, @@ -7147,14 +6939,14 @@ _mm_cvtsepi64_epi32 (__m128i __A) (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtsepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A, (__v4si) __O, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtsepi64_epi32 (__mmask8 __M, __m128i __A) 
{ return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A, @@ -7162,13 +6954,13 @@ _mm_maskz_cvtsepi64_epi32 (__mmask8 __M, __m128i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_cvtsepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A) { __builtin_ia32_pmovsqd128mem_mask ((__v4si *) __P, (__v2di) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_cvtsepi64_epi32 (__m256i __A) { return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A, @@ -7176,7 +6968,7 @@ _mm256_cvtsepi64_epi32 (__m256i __A) (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtsepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A) { return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A, @@ -7184,7 +6976,7 @@ _mm256_mask_cvtsepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A) __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtsepi64_epi32 (__mmask8 __M, __m256i __A) { return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A, @@ -7192,13 +6984,13 @@ _mm256_maskz_cvtsepi64_epi32 (__mmask8 __M, __m256i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_cvtsepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A) { __builtin_ia32_pmovsqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsepi64_epi16 (__m128i __A) { return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A, @@ -7206,14 +6998,14 @@ _mm_cvtsepi64_epi16 (__m128i __A) (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A, (__v8hi) __O, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtsepi64_epi16 (__mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A, @@ -7221,13 +7013,13 @@ _mm_maskz_cvtsepi64_epi16 (__mmask8 __M, __m128i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) { __builtin_ia32_pmovsqw128mem_mask ((__v8hi *) __P, (__v2di) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_cvtsepi64_epi16 (__m256i __A) { return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A, @@ -7235,14 +7027,14 @@ _mm256_cvtsepi64_epi16 (__m256i __A) (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A) { return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A, (__v8hi) __O, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtsepi64_epi16 (__mmask8 __M, __m256i __A) { return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A, @@ -7250,13 +7042,13 @@ _mm256_maskz_cvtsepi64_epi16 (__mmask8 __M, __m256i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_cvtsepi64_storeu_epi16 (void * 
__P, __mmask8 __M, __m256i __A) { __builtin_ia32_pmovsqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtusepi32_epi8 (__m128i __A) { return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A, @@ -7264,7 +7056,7 @@ _mm_cvtusepi32_epi8 (__m128i __A) (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtusepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A, @@ -7272,7 +7064,7 @@ _mm_mask_cvtusepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A) __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtusepi32_epi8 (__mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A, @@ -7280,13 +7072,13 @@ _mm_maskz_cvtusepi32_epi8 (__mmask8 __M, __m128i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) { __builtin_ia32_pmovusdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_cvtusepi32_epi8 (__m256i __A) { return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A, @@ -7294,7 +7086,7 @@ _mm256_cvtusepi32_epi8 (__m256i __A) (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtusepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A) { return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A, @@ -7302,7 +7094,7 @@ _mm256_mask_cvtusepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A) __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtusepi32_epi8 (__mmask8 __M, __m256i __A) { return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A, @@ -7310,13 +7102,13 @@ _mm256_maskz_cvtusepi32_epi8 (__mmask8 __M, __m256i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) { __builtin_ia32_pmovusdb256mem_mask ((__v16qi*) __P, (__v8si) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtusepi32_epi16 (__m128i __A) { return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A, @@ -7324,14 +7116,14 @@ _mm_cvtusepi32_epi16 (__m128i __A) (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtusepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A, (__v8hi) __O, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtusepi32_epi16 (__mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A, @@ -7339,13 +7131,13 @@ _mm_maskz_cvtusepi32_epi16 (__mmask8 __M, __m128i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_cvtusepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) { __builtin_ia32_pmovusdw128mem_mask ((__v8hi *) __P, (__v4si) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 
_mm256_cvtusepi32_epi16 (__m256i __A) { return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A, @@ -7353,14 +7145,14 @@ _mm256_cvtusepi32_epi16 (__m256i __A) (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtusepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A) { return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A, (__v8hi) __O, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtusepi32_epi16 (__mmask8 __M, __m256i __A) { return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A, @@ -7368,13 +7160,13 @@ _mm256_maskz_cvtusepi32_epi16 (__mmask8 __M, __m256i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_cvtusepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) { __builtin_ia32_pmovusdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtusepi64_epi8 (__m128i __A) { return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A, @@ -7382,7 +7174,7 @@ _mm_cvtusepi64_epi8 (__m128i __A) (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A, @@ -7390,7 +7182,7 @@ _mm_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A) __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtusepi64_epi8 (__mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A, @@ -7398,13 +7190,13 @@ _mm_maskz_cvtusepi64_epi8 (__mmask8 __M, __m128i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) { __builtin_ia32_pmovusqb128mem_mask ((__v16qi *) __P, (__v2di) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_cvtusepi64_epi8 (__m256i __A) { return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A, @@ -7412,7 +7204,7 @@ _mm256_cvtusepi64_epi8 (__m256i __A) (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A) { return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A, @@ -7420,7 +7212,7 @@ _mm256_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A) __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtusepi64_epi8 (__mmask8 __M, __m256i __A) { return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A, @@ -7428,13 +7220,13 @@ _mm256_maskz_cvtusepi64_epi8 (__mmask8 __M, __m256i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) { __builtin_ia32_pmovusqb256mem_mask ((__v16qi *) __P, (__v4di) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtusepi64_epi32 (__m128i __A) { return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A, @@ -7442,14 +7234,14 @@ _mm_cvtusepi64_epi32 (__m128i __A) (__mmask8) -1); } -static 
__inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtusepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A, (__v4si) __O, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtusepi64_epi32 (__mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A, @@ -7457,13 +7249,13 @@ _mm_maskz_cvtusepi64_epi32 (__mmask8 __M, __m128i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_cvtusepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A) { __builtin_ia32_pmovusqd128mem_mask ((__v4si *) __P, (__v2di) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_cvtusepi64_epi32 (__m256i __A) { return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A, @@ -7471,14 +7263,14 @@ _mm256_cvtusepi64_epi32 (__m256i __A) (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtusepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A) { return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A, (__v4si) __O, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtusepi64_epi32 (__mmask8 __M, __m256i __A) { return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A, @@ -7486,13 +7278,13 @@ _mm256_maskz_cvtusepi64_epi32 (__mmask8 __M, __m256i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_cvtusepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A) { __builtin_ia32_pmovusqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtusepi64_epi16 (__m128i __A) { return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A, @@ -7500,14 +7292,14 @@ _mm_cvtusepi64_epi16 (__m128i __A) (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A, (__v8hi) __O, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtusepi64_epi16 (__mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A, @@ -7515,13 +7307,13 @@ _mm_maskz_cvtusepi64_epi16 (__mmask8 __M, __m128i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_cvtusepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) { __builtin_ia32_pmovusqw128mem_mask ((__v8hi *) __P, (__v2di) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_cvtusepi64_epi16 (__m256i __A) { return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A, @@ -7529,14 +7321,14 @@ _mm256_cvtusepi64_epi16 (__m256i __A) (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A) { return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A, (__v8hi) __O, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i 
__DEFAULT_FN_ATTRS256 _mm256_maskz_cvtusepi64_epi16 (__mmask8 __M, __m256i __A) { return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A, @@ -7544,28 +7336,28 @@ _mm256_maskz_cvtusepi64_epi16 (__mmask8 __M, __m256i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_cvtusepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) { - return __builtin_ia32_pmovusqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M); + __builtin_ia32_pmovusqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtepi32_epi8 (__m128i __A) { - return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A, - (__v16qi)_mm_undefined_si128(), - (__mmask8) -1); + return (__m128i)__builtin_shufflevector( + __builtin_convertvector((__v4si)__A, __v4qi), (__v4qi){0, 0, 0, 0}, 0, 1, + 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A, (__v16qi) __O, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtepi32_epi8 (__mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A, @@ -7574,28 +7366,29 @@ _mm_maskz_cvtepi32_epi8 (__mmask8 __M, __m128i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS256 _mm_mask_cvtepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) { __builtin_ia32_pmovdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_cvtepi32_epi8 (__m256i __A) { - return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A, - (__v16qi)_mm_undefined_si128(), - (__mmask8) -1); + return (__m128i)__builtin_shufflevector( + __builtin_convertvector((__v8si)__A, __v8qi), + (__v8qi){0, 0, 0, 0, 0, 0, 0, 0}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A) { return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A, (__v16qi) __O, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepi32_epi8 (__mmask8 __M, __m256i __A) { return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A, @@ -7603,28 +7396,28 @@ _mm256_maskz_cvtepi32_epi8 (__mmask8 __M, __m256i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) { __builtin_ia32_pmovdb256mem_mask ((__v16qi *) __P, (__v8si) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtepi32_epi16 (__m128i __A) { - return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A, - (__v8hi) _mm_setzero_si128 (), - (__mmask8) -1); + return (__m128i)__builtin_shufflevector( + __builtin_convertvector((__v4si)__A, __v4hi), (__v4hi){0, 0, 0, 0}, 0, 1, + 2, 3, 4, 5, 6, 7); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i) 
__builtin_ia32_pmovdw128_mask ((__v4si) __A, (__v8hi) __O, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtepi32_epi16 (__mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A, @@ -7632,28 +7425,26 @@ _mm_maskz_cvtepi32_epi16 (__mmask8 __M, __m128i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_cvtepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) { __builtin_ia32_pmovdw128mem_mask ((__v8hi *) __P, (__v4si) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_cvtepi32_epi16 (__m256i __A) { - return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A, - (__v8hi)_mm_setzero_si128 (), - (__mmask8) -1); + return (__m128i)__builtin_convertvector((__v8si)__A, __v8hi); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A) { return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A, (__v8hi) __O, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepi32_epi16 (__mmask8 __M, __m256i __A) { return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A, @@ -7661,28 +7452,28 @@ _mm256_maskz_cvtepi32_epi16 (__mmask8 __M, __m256i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) { __builtin_ia32_pmovdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtepi64_epi8 (__m128i __A) { - return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A, - (__v16qi) _mm_undefined_si128(), - (__mmask8) -1); + return (__m128i)__builtin_shufflevector( + __builtin_convertvector((__v2di)__A, __v2qi), (__v2qi){0, 0}, 0, 1, 2, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A, (__v16qi) __O, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtepi64_epi8 (__mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A, @@ -7690,28 +7481,28 @@ _mm_maskz_cvtepi64_epi8 (__mmask8 __M, __m128i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) { __builtin_ia32_pmovqb128mem_mask ((__v16qi *) __P, (__v2di) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_cvtepi64_epi8 (__m256i __A) { - return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A, - (__v16qi) _mm_undefined_si128(), - (__mmask8) -1); + return (__m128i)__builtin_shufflevector( + __builtin_convertvector((__v4di)__A, __v4qi), (__v4qi){0, 0, 0, 0}, 0, 1, + 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A) { return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A, (__v16qi) __O, __M); } -static 
__inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepi64_epi8 (__mmask8 __M, __m256i __A) { return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A, @@ -7719,28 +7510,27 @@ _mm256_maskz_cvtepi64_epi8 (__mmask8 __M, __m256i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) { __builtin_ia32_pmovqb256mem_mask ((__v16qi *) __P, (__v4di) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtepi64_epi32 (__m128i __A) { - return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A, - (__v4si)_mm_undefined_si128(), - (__mmask8) -1); + return (__m128i)__builtin_shufflevector( + __builtin_convertvector((__v2di)__A, __v2si), (__v2si){0, 0}, 0, 1, 2, 3); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A, (__v4si) __O, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtepi64_epi32 (__mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A, @@ -7748,50 +7538,49 @@ _mm_maskz_cvtepi64_epi32 (__mmask8 __M, __m128i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_cvtepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A) { __builtin_ia32_pmovqd128mem_mask ((__v4si *) __P, (__v2di) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_cvtepi64_epi32 (__m256i __A) { - return (__m128i) __builtin_ia32_pmovqd256_mask ((__v4di) __A, - (__v4si) _mm_undefined_si128(), - (__mmask8) -1); + return (__m128i)__builtin_convertvector((__v4di)__A, __v4si); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A) { - return (__m128i) __builtin_ia32_pmovqd256_mask ((__v4di) __A, - (__v4si) __O, __M); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, + (__v4si)_mm256_cvtepi64_epi32(__A), + (__v4si)__O); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepi64_epi32 (__mmask8 __M, __m256i __A) { - return (__m128i) __builtin_ia32_pmovqd256_mask ((__v4di) __A, - (__v4si) _mm_setzero_si128 (), - __M); + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, + (__v4si)_mm256_cvtepi64_epi32(__A), + (__v4si)_mm_setzero_si128()); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A) { __builtin_ia32_pmovqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtepi64_epi16 (__m128i __A) { - return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A, - (__v8hi) _mm_undefined_si128(), - (__mmask8) -1); + return (__m128i)__builtin_shufflevector( + __builtin_convertvector((__v2di)__A, __v2hi), (__v2hi){0, 0}, 0, 1, 2, 3, + 3, 3, 3, 3); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A) { return (__m128i) 
__builtin_ia32_pmovqw128_mask ((__v2di) __A, @@ -7799,7 +7588,7 @@ _mm_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A) __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtepi64_epi16 (__mmask8 __M, __m128i __A) { return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A, @@ -7807,28 +7596,28 @@ _mm_maskz_cvtepi64_epi16 (__mmask8 __M, __m128i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) { __builtin_ia32_pmovqw128mem_mask ((__v8hi *) __P, (__v2di) __A, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_cvtepi64_epi16 (__m256i __A) { - return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A, - (__v8hi)_mm_undefined_si128(), - (__mmask8) -1); + return (__m128i)__builtin_shufflevector( + __builtin_convertvector((__v4di)__A, __v4hi), (__v4hi){0, 0, 0, 0}, 0, 1, + 2, 3, 4, 5, 6, 7); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A) { return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A, (__v8hi) __O, __M); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtepi64_epi16 (__mmask8 __M, __m256i __A) { return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A, @@ -7836,479 +7625,410 @@ _mm256_maskz_cvtepi64_epi16 (__mmask8 __M, __m256i __A) __M); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) { __builtin_ia32_pmovqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M); } -#define _mm256_extractf32x4_ps(A, imm) __extension__ ({ \ - (__m128)__builtin_shufflevector((__v8sf)(__m256)(A), \ - (__v8sf)_mm256_undefined_ps(), \ - ((imm) & 1) ? 4 : 0, \ - ((imm) & 1) ? 5 : 1, \ - ((imm) & 1) ? 6 : 2, \ - ((imm) & 1) ? 7 : 3); }) - -#define _mm256_mask_extractf32x4_ps(W, U, A, imm) __extension__ ({ \ - (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ - (__v4sf)_mm256_extractf32x4_ps((A), (imm)), \ - (__v4sf)(W)); }) - -#define _mm256_maskz_extractf32x4_ps(U, A, imm) __extension__ ({ \ - (__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ - (__v4sf)_mm256_extractf32x4_ps((A), (imm)), \ - (__v4sf)_mm_setzero_ps()); }) - -#define _mm256_extracti32x4_epi32(A, imm) __extension__ ({ \ - (__m128i)__builtin_shufflevector((__v8si)(__m256)(A), \ - (__v8si)_mm256_undefined_si256(), \ - ((imm) & 1) ? 4 : 0, \ - ((imm) & 1) ? 5 : 1, \ - ((imm) & 1) ? 6 : 2, \ - ((imm) & 1) ? 7 : 3); }) - -#define _mm256_mask_extracti32x4_epi32(W, U, A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ - (__v4si)_mm256_extracti32x4_epi32((A), (imm)), \ - (__v4si)(W)); }) - -#define _mm256_maskz_extracti32x4_epi32(U, A, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ - (__v4si)_mm256_extracti32x4_epi32((A), (imm)), \ - (__v4si)_mm_setzero_si128()); }) - -#define _mm256_insertf32x4(A, B, imm) __extension__ ({ \ - (__m256)__builtin_shufflevector((__v8sf)(A), \ - (__v8sf)_mm256_castps128_ps256((__m128)(B)), \ - ((imm) & 0x1) ? 0 : 8, \ - ((imm) & 0x1) ? 1 : 9, \ - ((imm) & 0x1) ? 2 : 10, \ - ((imm) & 0x1) ? 3 : 11, \ - ((imm) & 0x1) ? 8 : 4, \ - ((imm) & 0x1) ? 9 : 5, \ - ((imm) & 0x1) ? 10 : 6, \ - ((imm) & 0x1) ? 
11 : 7); }) - -#define _mm256_mask_insertf32x4(W, U, A, B, imm) __extension__ ({ \ +#define _mm256_extractf32x4_ps(A, imm) \ + (__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \ + (int)(imm), \ + (__v4sf)_mm_undefined_ps(), \ + (__mmask8)-1) + +#define _mm256_mask_extractf32x4_ps(W, U, A, imm) \ + (__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \ + (int)(imm), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U)) + +#define _mm256_maskz_extractf32x4_ps(U, A, imm) \ + (__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \ + (int)(imm), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U)) + +#define _mm256_extracti32x4_epi32(A, imm) \ + (__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \ + (int)(imm), \ + (__v4si)_mm_undefined_si128(), \ + (__mmask8)-1) + +#define _mm256_mask_extracti32x4_epi32(W, U, A, imm) \ + (__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \ + (int)(imm), \ + (__v4si)(__m128i)(W), \ + (__mmask8)(U)) + +#define _mm256_maskz_extracti32x4_epi32(U, A, imm) \ + (__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \ + (int)(imm), \ + (__v4si)_mm_setzero_si128(), \ + (__mmask8)(U)) + +#define _mm256_insertf32x4(A, B, imm) \ + (__m256)__builtin_ia32_insertf32x4_256((__v8sf)(__m256)(A), \ + (__v4sf)(__m128)(B), (int)(imm)) + +#define _mm256_mask_insertf32x4(W, U, A, B, imm) \ (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ (__v8sf)_mm256_insertf32x4((A), (B), (imm)), \ - (__v8sf)(W)); }) + (__v8sf)(__m256)(W)) -#define _mm256_maskz_insertf32x4(U, A, B, imm) __extension__ ({ \ +#define _mm256_maskz_insertf32x4(U, A, B, imm) \ (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ (__v8sf)_mm256_insertf32x4((A), (B), (imm)), \ - (__v8sf)_mm256_setzero_ps()); }) - -#define _mm256_inserti32x4(A, B, imm) __extension__ ({ \ - (__m256i)__builtin_shufflevector((__v8si)(A), \ - (__v8si)_mm256_castsi128_si256((__m128i)(B)), \ - ((imm) & 0x1) ? 0 : 8, \ - ((imm) & 0x1) ? 1 : 9, \ - ((imm) & 0x1) ? 2 : 10, \ - ((imm) & 0x1) ? 3 : 11, \ - ((imm) & 0x1) ? 8 : 4, \ - ((imm) & 0x1) ? 9 : 5, \ - ((imm) & 0x1) ? 10 : 6, \ - ((imm) & 0x1) ? 
11 : 7); }) - -#define _mm256_mask_inserti32x4(W, U, A, B, imm) __extension__ ({ \ + (__v8sf)_mm256_setzero_ps()) + +#define _mm256_inserti32x4(A, B, imm) \ + (__m256i)__builtin_ia32_inserti32x4_256((__v8si)(__m256i)(A), \ + (__v4si)(__m128i)(B), (int)(imm)) + +#define _mm256_mask_inserti32x4(W, U, A, B, imm) \ (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ (__v8si)_mm256_inserti32x4((A), (B), (imm)), \ - (__v8si)(W)); }) + (__v8si)(__m256i)(W)) -#define _mm256_maskz_inserti32x4(U, A, B, imm) __extension__ ({ \ +#define _mm256_maskz_inserti32x4(U, A, B, imm) \ (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ (__v8si)_mm256_inserti32x4((A), (B), (imm)), \ - (__v8si)_mm256_setzero_si256()); }) + (__v8si)_mm256_setzero_si256()) -#define _mm_getmant_pd(A, B, C) __extension__({\ +#define _mm_getmant_pd(A, B, C) \ (__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \ (int)(((C)<<2) | (B)), \ (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm_mask_getmant_pd(W, U, A, B, C) __extension__({\ +#define _mm_mask_getmant_pd(W, U, A, B, C) \ (__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \ (int)(((C)<<2) | (B)), \ (__v2df)(__m128d)(W), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm_maskz_getmant_pd(U, A, B, C) __extension__({\ +#define _mm_maskz_getmant_pd(U, A, B, C) \ (__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \ (int)(((C)<<2) | (B)), \ (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm256_getmant_pd(A, B, C) __extension__ ({ \ +#define _mm256_getmant_pd(A, B, C) \ (__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \ (int)(((C)<<2) | (B)), \ (__v4df)_mm256_setzero_pd(), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm256_mask_getmant_pd(W, U, A, B, C) __extension__ ({ \ +#define _mm256_mask_getmant_pd(W, U, A, B, C) \ (__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \ (int)(((C)<<2) | (B)), \ (__v4df)(__m256d)(W), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm256_maskz_getmant_pd(U, A, B, C) __extension__ ({ \ +#define _mm256_maskz_getmant_pd(U, A, B, C) \ (__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \ (int)(((C)<<2) | (B)), \ (__v4df)_mm256_setzero_pd(), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm_getmant_ps(A, B, C) __extension__ ({ \ +#define _mm_getmant_ps(A, B, C) \ (__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \ (int)(((C)<<2) | (B)), \ (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm_mask_getmant_ps(W, U, A, B, C) __extension__ ({ \ +#define _mm_mask_getmant_ps(W, U, A, B, C) \ (__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \ (int)(((C)<<2) | (B)), \ (__v4sf)(__m128)(W), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm_maskz_getmant_ps(U, A, B, C) __extension__ ({ \ +#define _mm_maskz_getmant_ps(U, A, B, C) \ (__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \ (int)(((C)<<2) | (B)), \ (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm256_getmant_ps(A, B, C) __extension__ ({ \ +#define _mm256_getmant_ps(A, B, C) \ (__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \ (int)(((C)<<2) | (B)), \ (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)-1); }) + (__mmask8)-1) -#define _mm256_mask_getmant_ps(W, U, A, B, C) __extension__ ({ \ +#define _mm256_mask_getmant_ps(W, U, A, B, C) \ (__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \ (int)(((C)<<2) | (B)), \ (__v8sf)(__m256)(W), \ - (__mmask8)(U)); }) + 
(__mmask8)(U)) -#define _mm256_maskz_getmant_ps(U, A, B, C) __extension__ ({ \ +#define _mm256_maskz_getmant_ps(U, A, B, C) \ (__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \ (int)(((C)<<2) | (B)), \ (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm_mmask_i64gather_pd(v1_old, mask, index, addr, scale) __extension__ ({\ +#define _mm_mmask_i64gather_pd(v1_old, mask, index, addr, scale) \ (__m128d)__builtin_ia32_gather3div2df((__v2df)(__m128d)(v1_old), \ (double const *)(addr), \ (__v2di)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale)); }) + (__mmask8)(mask), (int)(scale)) -#define _mm_mmask_i64gather_epi64(v1_old, mask, index, addr, scale) __extension__ ({\ +#define _mm_mmask_i64gather_epi64(v1_old, mask, index, addr, scale) \ (__m128i)__builtin_ia32_gather3div2di((__v2di)(__m128i)(v1_old), \ (long long const *)(addr), \ (__v2di)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale)); }) + (__mmask8)(mask), (int)(scale)) -#define _mm256_mmask_i64gather_pd(v1_old, mask, index, addr, scale) __extension__ ({\ +#define _mm256_mmask_i64gather_pd(v1_old, mask, index, addr, scale) \ (__m256d)__builtin_ia32_gather3div4df((__v4df)(__m256d)(v1_old), \ (double const *)(addr), \ (__v4di)(__m256i)(index), \ - (__mmask8)(mask), (int)(scale)); }) + (__mmask8)(mask), (int)(scale)) -#define _mm256_mmask_i64gather_epi64(v1_old, mask, index, addr, scale) __extension__ ({\ +#define _mm256_mmask_i64gather_epi64(v1_old, mask, index, addr, scale) \ (__m256i)__builtin_ia32_gather3div4di((__v4di)(__m256i)(v1_old), \ (long long const *)(addr), \ (__v4di)(__m256i)(index), \ - (__mmask8)(mask), (int)(scale)); }) + (__mmask8)(mask), (int)(scale)) -#define _mm_mmask_i64gather_ps(v1_old, mask, index, addr, scale) __extension__ ({\ +#define _mm_mmask_i64gather_ps(v1_old, mask, index, addr, scale) \ (__m128)__builtin_ia32_gather3div4sf((__v4sf)(__m128)(v1_old), \ (float const *)(addr), \ (__v2di)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale)); }) + (__mmask8)(mask), (int)(scale)) -#define _mm_mmask_i64gather_epi32(v1_old, mask, index, addr, scale) __extension__ ({\ +#define _mm_mmask_i64gather_epi32(v1_old, mask, index, addr, scale) \ (__m128i)__builtin_ia32_gather3div4si((__v4si)(__m128i)(v1_old), \ (int const *)(addr), \ (__v2di)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale)); }) + (__mmask8)(mask), (int)(scale)) -#define _mm256_mmask_i64gather_ps(v1_old, mask, index, addr, scale) __extension__ ({\ +#define _mm256_mmask_i64gather_ps(v1_old, mask, index, addr, scale) \ (__m128)__builtin_ia32_gather3div8sf((__v4sf)(__m128)(v1_old), \ (float const *)(addr), \ (__v4di)(__m256i)(index), \ - (__mmask8)(mask), (int)(scale)); }) + (__mmask8)(mask), (int)(scale)) -#define _mm256_mmask_i64gather_epi32(v1_old, mask, index, addr, scale) __extension__ ({\ +#define _mm256_mmask_i64gather_epi32(v1_old, mask, index, addr, scale) \ (__m128i)__builtin_ia32_gather3div8si((__v4si)(__m128i)(v1_old), \ (int const *)(addr), \ (__v4di)(__m256i)(index), \ - (__mmask8)(mask), (int)(scale)); }) + (__mmask8)(mask), (int)(scale)) -#define _mm_mmask_i32gather_pd(v1_old, mask, index, addr, scale) __extension__ ({\ +#define _mm_mmask_i32gather_pd(v1_old, mask, index, addr, scale) \ (__m128d)__builtin_ia32_gather3siv2df((__v2df)(__m128d)(v1_old), \ (double const *)(addr), \ (__v4si)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale)); }) + (__mmask8)(mask), (int)(scale)) -#define _mm_mmask_i32gather_epi64(v1_old, mask, index, addr, scale) __extension__ ({\ +#define 
_mm_mmask_i32gather_epi64(v1_old, mask, index, addr, scale) \ (__m128i)__builtin_ia32_gather3siv2di((__v2di)(__m128i)(v1_old), \ (long long const *)(addr), \ (__v4si)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale)); }) + (__mmask8)(mask), (int)(scale)) -#define _mm256_mmask_i32gather_pd(v1_old, mask, index, addr, scale) __extension__ ({\ +#define _mm256_mmask_i32gather_pd(v1_old, mask, index, addr, scale) \ (__m256d)__builtin_ia32_gather3siv4df((__v4df)(__m256d)(v1_old), \ (double const *)(addr), \ (__v4si)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale)); }) + (__mmask8)(mask), (int)(scale)) -#define _mm256_mmask_i32gather_epi64(v1_old, mask, index, addr, scale) __extension__ ({\ +#define _mm256_mmask_i32gather_epi64(v1_old, mask, index, addr, scale) \ (__m256i)__builtin_ia32_gather3siv4di((__v4di)(__m256i)(v1_old), \ (long long const *)(addr), \ (__v4si)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale)); }) + (__mmask8)(mask), (int)(scale)) -#define _mm_mmask_i32gather_ps(v1_old, mask, index, addr, scale) __extension__ ({\ +#define _mm_mmask_i32gather_ps(v1_old, mask, index, addr, scale) \ (__m128)__builtin_ia32_gather3siv4sf((__v4sf)(__m128)(v1_old), \ (float const *)(addr), \ (__v4si)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale)); }) + (__mmask8)(mask), (int)(scale)) -#define _mm_mmask_i32gather_epi32(v1_old, mask, index, addr, scale) __extension__ ({\ +#define _mm_mmask_i32gather_epi32(v1_old, mask, index, addr, scale) \ (__m128i)__builtin_ia32_gather3siv4si((__v4si)(__m128i)(v1_old), \ (int const *)(addr), \ (__v4si)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale)); }) + (__mmask8)(mask), (int)(scale)) -#define _mm256_mmask_i32gather_ps(v1_old, mask, index, addr, scale) __extension__ ({\ +#define _mm256_mmask_i32gather_ps(v1_old, mask, index, addr, scale) \ (__m256)__builtin_ia32_gather3siv8sf((__v8sf)(__m256)(v1_old), \ (float const *)(addr), \ (__v8si)(__m256i)(index), \ - (__mmask8)(mask), (int)(scale)); }) + (__mmask8)(mask), (int)(scale)) -#define _mm256_mmask_i32gather_epi32(v1_old, mask, index, addr, scale) __extension__ ({\ +#define _mm256_mmask_i32gather_epi32(v1_old, mask, index, addr, scale) \ (__m256i)__builtin_ia32_gather3siv8si((__v8si)(__m256i)(v1_old), \ (int const *)(addr), \ (__v8si)(__m256i)(index), \ - (__mmask8)(mask), (int)(scale)); }) + (__mmask8)(mask), (int)(scale)) -#define _mm256_permutex_pd(X, C) __extension__ ({ \ - (__m256d)__builtin_shufflevector((__v4df)(__m256d)(X), \ - (__v4df)_mm256_undefined_pd(), \ - ((C) >> 0) & 0x3, ((C) >> 2) & 0x3, \ - ((C) >> 4) & 0x3, ((C) >> 6) & 0x3); }) +#define _mm256_permutex_pd(X, C) \ + (__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(X), (int)(C)) -#define _mm256_mask_permutex_pd(W, U, X, C) __extension__ ({ \ +#define _mm256_mask_permutex_pd(W, U, X, C) \ (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ (__v4df)_mm256_permutex_pd((X), (C)), \ - (__v4df)(__m256d)(W)); }) + (__v4df)(__m256d)(W)) -#define _mm256_maskz_permutex_pd(U, X, C) __extension__ ({ \ +#define _mm256_maskz_permutex_pd(U, X, C) \ (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ (__v4df)_mm256_permutex_pd((X), (C)), \ - (__v4df)_mm256_setzero_pd()); }) + (__v4df)_mm256_setzero_pd()) -#define _mm256_permutex_epi64(X, C) __extension__ ({ \ - (__m256i)__builtin_shufflevector((__v4di)(__m256i)(X), \ - (__v4di)_mm256_undefined_si256(), \ - ((C) >> 0) & 0x3, ((C) >> 2) & 0x3, \ - ((C) >> 4) & 0x3, ((C) >> 6) & 0x3); }) +#define _mm256_permutex_epi64(X, C) \ + (__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(X), 
(int)(C)) -#define _mm256_mask_permutex_epi64(W, U, X, C) __extension__ ({ \ +#define _mm256_mask_permutex_epi64(W, U, X, C) \ (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ (__v4di)_mm256_permutex_epi64((X), (C)), \ - (__v4di)(__m256i)(W)); }) + (__v4di)(__m256i)(W)) -#define _mm256_maskz_permutex_epi64(U, X, C) __extension__ ({ \ +#define _mm256_maskz_permutex_epi64(U, X, C) \ (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ (__v4di)_mm256_permutex_epi64((X), (C)), \ - (__v4di)_mm256_setzero_si256()); }) + (__v4di)_mm256_setzero_si256()) -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_permutexvar_pd (__m256i __X, __m256d __Y) { - return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y, - (__v4di) __X, - (__v4df) _mm256_undefined_si256 (), - (__mmask8) -1); + return (__m256d)__builtin_ia32_permvardf256((__v4df)__Y, (__v4di)__X); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_permutexvar_pd (__m256d __W, __mmask8 __U, __m256i __X, __m256d __Y) { - return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y, - (__v4di) __X, - (__v4df) __W, - (__mmask8) __U); + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_permutexvar_pd(__X, __Y), + (__v4df)__W); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maskz_permutexvar_pd (__mmask8 __U, __m256i __X, __m256d __Y) { - return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y, - (__v4di) __X, - (__v4df) _mm256_setzero_pd (), - (__mmask8) __U); + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_permutexvar_pd(__X, __Y), + (__v4df)_mm256_setzero_pd()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_permutexvar_epi64 (__mmask8 __M, __m256i __X, __m256i __Y) +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_permutexvar_epi64 ( __m256i __X, __m256i __Y) { - return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y, - (__v4di) __X, - (__v4di) _mm256_setzero_si256 (), - (__mmask8) __M); + return (__m256i)__builtin_ia32_permvardi256((__v4di) __Y, (__v4di) __X); } -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_permutexvar_epi64 ( __m256i __X, __m256i __Y) +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_permutexvar_epi64 (__mmask8 __M, __m256i __X, __m256i __Y) { - return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y, - (__v4di) __X, - (__v4di) _mm256_undefined_si256 (), - (__mmask8) -1); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, + (__v4di)_mm256_permutexvar_epi64(__X, __Y), + (__v4di)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_permutexvar_epi64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) { - return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y, - (__v4di) __X, - (__v4di) __W, - __M); + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, + (__v4di)_mm256_permutexvar_epi64(__X, __Y), + (__v4di)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_mask_permutexvar_ps (__m256 __W, __mmask8 __U, __m256i __X, - __m256 __Y) -{ - return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y, - (__v8si) __X, - (__v8sf) __W, - (__mmask8) __U); -} +#define _mm256_permutexvar_ps(A, B) _mm256_permutevar8x32_ps((B), (A)) -static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_maskz_permutexvar_ps (__mmask8 __U, __m256i __X, 
__m256 __Y) +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_permutexvar_ps(__m256 __W, __mmask8 __U, __m256i __X, __m256 __Y) { - return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y, - (__v8si) __X, - (__v8sf) _mm256_setzero_ps (), - (__mmask8) __U); + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_permutexvar_ps(__X, __Y), + (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_permutexvar_ps (__m256i __X, __m256 __Y) +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_permutexvar_ps(__mmask8 __U, __m256i __X, __m256 __Y) { - return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y, - (__v8si) __X, - (__v8sf) _mm256_undefined_si256 (), - (__mmask8) -1); + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_permutexvar_ps(__X, __Y), + (__v8sf)_mm256_setzero_ps()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_maskz_permutexvar_epi32 (__mmask8 __M, __m256i __X, __m256i __Y) -{ - return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y, - (__v8si) __X, - (__v8si) _mm256_setzero_si256 (), - __M); -} +#define _mm256_permutexvar_epi32(A, B) _mm256_permutevar8x32_epi32((B), (A)) -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_mask_permutexvar_epi32 (__m256i __W, __mmask8 __M, __m256i __X, - __m256i __Y) +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_permutexvar_epi32(__m256i __W, __mmask8 __M, __m256i __X, + __m256i __Y) { - return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y, - (__v8si) __X, - (__v8si) __W, - (__mmask8) __M); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, + (__v8si)_mm256_permutexvar_epi32(__X, __Y), + (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_permutexvar_epi32 (__m256i __X, __m256i __Y) +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_permutexvar_epi32(__mmask8 __M, __m256i __X, __m256i __Y) { - return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y, - (__v8si) __X, - (__v8si) _mm256_undefined_si256(), - (__mmask8) -1); + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, + (__v8si)_mm256_permutexvar_epi32(__X, __Y), + (__v8si)_mm256_setzero_si256()); } -#define _mm_alignr_epi32(A, B, imm) __extension__ ({ \ - (__m128i)__builtin_shufflevector((__v4si)(__m128i)(B), \ - (__v4si)(__m128i)(A), \ - ((int)(imm) & 0x3) + 0, \ - ((int)(imm) & 0x3) + 1, \ - ((int)(imm) & 0x3) + 2, \ - ((int)(imm) & 0x3) + 3); }) +#define _mm_alignr_epi32(A, B, imm) \ + (__m128i)__builtin_ia32_alignd128((__v4si)(__m128i)(A), \ + (__v4si)(__m128i)(B), (int)(imm)) -#define _mm_mask_alignr_epi32(W, U, A, B, imm) __extension__ ({ \ +#define _mm_mask_alignr_epi32(W, U, A, B, imm) \ (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ (__v4si)_mm_alignr_epi32((A), (B), (imm)), \ - (__v4si)(__m128i)(W)); }) + (__v4si)(__m128i)(W)) -#define _mm_maskz_alignr_epi32(U, A, B, imm) __extension__ ({ \ +#define _mm_maskz_alignr_epi32(U, A, B, imm) \ (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ (__v4si)_mm_alignr_epi32((A), (B), (imm)), \ - (__v4si)_mm_setzero_si128()); }) - -#define _mm256_alignr_epi32(A, B, imm) __extension__ ({ \ - (__m256i)__builtin_shufflevector((__v8si)(__m256i)(B), \ - (__v8si)(__m256i)(A), \ - ((int)(imm) & 0x7) + 0, \ - ((int)(imm) & 0x7) + 1, \ - ((int)(imm) & 0x7) + 2, \ - ((int)(imm) & 0x7) + 3, \ - ((int)(imm) & 0x7) + 4, \ - ((int)(imm) & 0x7) + 5, \ - ((int)(imm) & 0x7) + 6, \ - ((int)(imm) & 0x7) + 7); }) - -#define _mm256_mask_alignr_epi32(W, U, A, B, 
imm) __extension__ ({ \ + (__v4si)_mm_setzero_si128()) + +#define _mm256_alignr_epi32(A, B, imm) \ + (__m256i)__builtin_ia32_alignd256((__v8si)(__m256i)(A), \ + (__v8si)(__m256i)(B), (int)(imm)) + +#define _mm256_mask_alignr_epi32(W, U, A, B, imm) \ (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ (__v8si)_mm256_alignr_epi32((A), (B), (imm)), \ - (__v8si)(__m256i)(W)); }) + (__v8si)(__m256i)(W)) -#define _mm256_maskz_alignr_epi32(U, A, B, imm) __extension__ ({ \ +#define _mm256_maskz_alignr_epi32(U, A, B, imm) \ (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ (__v8si)_mm256_alignr_epi32((A), (B), (imm)), \ - (__v8si)_mm256_setzero_si256()); }) + (__v8si)_mm256_setzero_si256()) -#define _mm_alignr_epi64(A, B, imm) __extension__ ({ \ - (__m128i)__builtin_shufflevector((__v2di)(__m128i)(B), \ - (__v2di)(__m128i)(A), \ - ((int)(imm) & 0x1) + 0, \ - ((int)(imm) & 0x1) + 1); }) +#define _mm_alignr_epi64(A, B, imm) \ + (__m128i)__builtin_ia32_alignq128((__v2di)(__m128i)(A), \ + (__v2di)(__m128i)(B), (int)(imm)) -#define _mm_mask_alignr_epi64(W, U, A, B, imm) __extension__ ({ \ +#define _mm_mask_alignr_epi64(W, U, A, B, imm) \ (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ (__v2di)_mm_alignr_epi64((A), (B), (imm)), \ - (__v2di)(__m128i)(W)); }) + (__v2di)(__m128i)(W)) -#define _mm_maskz_alignr_epi64(U, A, B, imm) __extension__ ({ \ +#define _mm_maskz_alignr_epi64(U, A, B, imm) \ (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ (__v2di)_mm_alignr_epi64((A), (B), (imm)), \ - (__v2di)_mm_setzero_di()); }) + (__v2di)_mm_setzero_si128()) -#define _mm256_alignr_epi64(A, B, imm) __extension__ ({ \ - (__m256i)__builtin_shufflevector((__v4di)(__m256i)(B), \ - (__v4di)(__m256i)(A), \ - ((int)(imm) & 0x3) + 0, \ - ((int)(imm) & 0x3) + 1, \ - ((int)(imm) & 0x3) + 2, \ - ((int)(imm) & 0x3) + 3); }) +#define _mm256_alignr_epi64(A, B, imm) \ + (__m256i)__builtin_ia32_alignq256((__v4di)(__m256i)(A), \ + (__v4di)(__m256i)(B), (int)(imm)) -#define _mm256_mask_alignr_epi64(W, U, A, B, imm) __extension__ ({ \ +#define _mm256_mask_alignr_epi64(W, U, A, B, imm) \ (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ (__v4di)_mm256_alignr_epi64((A), (B), (imm)), \ - (__v4di)(__m256i)(W)); }) + (__v4di)(__m256i)(W)) -#define _mm256_maskz_alignr_epi64(U, A, B, imm) __extension__ ({ \ +#define _mm256_maskz_alignr_epi64(U, A, B, imm) \ (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ (__v4di)_mm256_alignr_epi64((A), (B), (imm)), \ - (__v4di)_mm256_setzero_si256()); }) + (__v4di)_mm256_setzero_si256()) -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_movehdup_ps (__m128 __W, __mmask8 __U, __m128 __A) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, @@ -8316,7 +8036,7 @@ _mm_mask_movehdup_ps (__m128 __W, __mmask8 __U, __m128 __A) (__v4sf)__W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_movehdup_ps (__mmask8 __U, __m128 __A) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, @@ -8324,7 +8044,7 @@ _mm_maskz_movehdup_ps (__mmask8 __U, __m128 __A) (__v4sf)_mm_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_movehdup_ps (__m256 __W, __mmask8 __U, __m256 __A) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, @@ -8332,7 +8052,7 @@ _mm256_mask_movehdup_ps (__m256 __W, __mmask8 __U, __m256 __A) (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 
__DEFAULT_FN_ATTRS256 _mm256_maskz_movehdup_ps (__mmask8 __U, __m256 __A) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, @@ -8340,7 +8060,7 @@ _mm256_maskz_movehdup_ps (__mmask8 __U, __m256 __A) (__v8sf)_mm256_setzero_ps()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_moveldup_ps (__m128 __W, __mmask8 __U, __m128 __A) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, @@ -8348,7 +8068,7 @@ _mm_mask_moveldup_ps (__m128 __W, __mmask8 __U, __m128 __A) (__v4sf)__W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_moveldup_ps (__mmask8 __U, __m128 __A) { return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, @@ -8356,7 +8076,7 @@ _mm_maskz_moveldup_ps (__mmask8 __U, __m128 __A) (__v4sf)_mm_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_moveldup_ps (__m256 __W, __mmask8 __U, __m256 __A) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, @@ -8364,7 +8084,7 @@ _mm256_mask_moveldup_ps (__m256 __W, __mmask8 __U, __m256 __A) (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_moveldup_ps (__mmask8 __U, __m256 __A) { return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, @@ -8372,27 +8092,27 @@ _mm256_maskz_moveldup_ps (__mmask8 __U, __m256 __A) (__v8sf)_mm256_setzero_ps()); } -#define _mm256_mask_shuffle_epi32(W, U, A, I) __extension__({\ +#define _mm256_mask_shuffle_epi32(W, U, A, I) \ (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ (__v8si)_mm256_shuffle_epi32((A), (I)), \ - (__v8si)(__m256i)(W)); }) + (__v8si)(__m256i)(W)) -#define _mm256_maskz_shuffle_epi32(U, A, I) __extension__({\ +#define _mm256_maskz_shuffle_epi32(U, A, I) \ (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ (__v8si)_mm256_shuffle_epi32((A), (I)), \ - (__v8si)_mm256_setzero_si256()); }) + (__v8si)_mm256_setzero_si256()) -#define _mm_mask_shuffle_epi32(W, U, A, I) __extension__({\ +#define _mm_mask_shuffle_epi32(W, U, A, I) \ (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ (__v4si)_mm_shuffle_epi32((A), (I)), \ - (__v4si)(__m128i)(W)); }) + (__v4si)(__m128i)(W)) -#define _mm_maskz_shuffle_epi32(U, A, I) __extension__({\ +#define _mm_maskz_shuffle_epi32(U, A, I) \ (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ (__v4si)_mm_shuffle_epi32((A), (I)), \ - (__v4si)_mm_setzero_si128()); }) + (__v4si)_mm_setzero_si128()) -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_mov_pd (__m128d __W, __mmask8 __U, __m128d __A) { return (__m128d) __builtin_ia32_selectpd_128 ((__mmask8) __U, @@ -8400,7 +8120,7 @@ _mm_mask_mov_pd (__m128d __W, __mmask8 __U, __m128d __A) (__v2df) __W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maskz_mov_pd (__mmask8 __U, __m128d __A) { return (__m128d) __builtin_ia32_selectpd_128 ((__mmask8) __U, @@ -8408,7 +8128,7 @@ _mm_maskz_mov_pd (__mmask8 __U, __m128d __A) (__v2df) _mm_setzero_pd ()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_mask_mov_pd (__m256d __W, __mmask8 __U, __m256d __A) { return (__m256d) __builtin_ia32_selectpd_256 ((__mmask8) __U, @@ -8416,7 +8136,7 @@ _mm256_mask_mov_pd (__m256d __W, __mmask8 __U, __m256d __A) (__v4df) __W); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d 
__DEFAULT_FN_ATTRS256 _mm256_maskz_mov_pd (__mmask8 __U, __m256d __A) { return (__m256d) __builtin_ia32_selectpd_256 ((__mmask8) __U, @@ -8424,7 +8144,7 @@ _mm256_maskz_mov_pd (__mmask8 __U, __m256d __A) (__v4df) _mm256_setzero_pd ()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_mov_ps (__m128 __W, __mmask8 __U, __m128 __A) { return (__m128) __builtin_ia32_selectps_128 ((__mmask8) __U, @@ -8432,7 +8152,7 @@ _mm_mask_mov_ps (__m128 __W, __mmask8 __U, __m128 __A) (__v4sf) __W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_mov_ps (__mmask8 __U, __m128 __A) { return (__m128) __builtin_ia32_selectps_128 ((__mmask8) __U, @@ -8440,7 +8160,7 @@ _mm_maskz_mov_ps (__mmask8 __U, __m128 __A) (__v4sf) _mm_setzero_ps ()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_mov_ps (__m256 __W, __mmask8 __U, __m256 __A) { return (__m256) __builtin_ia32_selectps_256 ((__mmask8) __U, @@ -8448,7 +8168,7 @@ _mm256_mask_mov_ps (__m256 __W, __mmask8 __U, __m256 __A) (__v8sf) __W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_mov_ps (__mmask8 __U, __m256 __A) { return (__m256) __builtin_ia32_selectps_256 ((__mmask8) __U, @@ -8456,7 +8176,7 @@ _mm256_maskz_mov_ps (__mmask8 __U, __m256 __A) (__v8sf) _mm256_setzero_ps ()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtph_ps (__m128 __W, __mmask8 __U, __m128i __A) { return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A, @@ -8464,7 +8184,7 @@ _mm_mask_cvtph_ps (__m128 __W, __mmask8 __U, __m128i __A) (__mmask8) __U); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_cvtph_ps (__mmask8 __U, __m128i __A) { return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A, @@ -8473,7 +8193,7 @@ _mm_maskz_cvtph_ps (__mmask8 __U, __m128i __A) (__mmask8) __U); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_mask_cvtph_ps (__m256 __W, __mmask8 __U, __m128i __A) { return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A, @@ -8481,7 +8201,7 @@ _mm256_mask_cvtph_ps (__m256 __W, __mmask8 __U, __m128i __A) (__mmask8) __U); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtph_ps (__mmask8 __U, __m128i __A) { return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A, @@ -8490,7 +8210,7 @@ _mm256_maskz_cvtph_ps (__mmask8 __U, __m128i __A) (__mmask8) __U); } -static __inline __m128i __DEFAULT_FN_ATTRS +static __inline __m128i __DEFAULT_FN_ATTRS128 _mm_mask_cvtps_ph (__m128i __W, __mmask8 __U, __m128 __A) { return (__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf) __A, _MM_FROUND_CUR_DIRECTION, @@ -8498,7 +8218,7 @@ _mm_mask_cvtps_ph (__m128i __W, __mmask8 __U, __m128 __A) (__mmask8) __U); } -static __inline __m128i __DEFAULT_FN_ATTRS +static __inline __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_cvtps_ph (__mmask8 __U, __m128 __A) { return (__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf) __A, _MM_FROUND_CUR_DIRECTION, @@ -8506,17 +8226,17 @@ _mm_maskz_cvtps_ph (__mmask8 __U, __m128 __A) (__mmask8) __U); } -#define _mm_mask_cvt_roundps_ph(W, U, A, I) __extension__ ({ \ +#define _mm_mask_cvt_roundps_ph(W, U, A, I) \ (__m128i)__builtin_ia32_vcvtps2ph_mask((__v4sf)(__m128)(A), (int)(I), \ (__v8hi)(__m128i)(W), \ - 
(__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm_maskz_cvt_roundps_ph(U, A, I) __extension__ ({ \ +#define _mm_maskz_cvt_roundps_ph(U, A, I) \ (__m128i)__builtin_ia32_vcvtps2ph_mask((__v4sf)(__m128)(A), (int)(I), \ (__v8hi)_mm_setzero_si128(), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -static __inline __m128i __DEFAULT_FN_ATTRS +static __inline __m128i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtps_ph (__m128i __W, __mmask8 __U, __m256 __A) { return (__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf) __A, _MM_FROUND_CUR_DIRECTION, @@ -8524,24 +8244,25 @@ _mm256_mask_cvtps_ph (__m128i __W, __mmask8 __U, __m256 __A) (__mmask8) __U); } -static __inline __m128i __DEFAULT_FN_ATTRS +static __inline __m128i __DEFAULT_FN_ATTRS256 _mm256_maskz_cvtps_ph ( __mmask8 __U, __m256 __A) { return (__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf) __A, _MM_FROUND_CUR_DIRECTION, (__v8hi) _mm_setzero_si128(), (__mmask8) __U); } -#define _mm256_mask_cvt_roundps_ph(W, U, A, I) __extension__ ({ \ +#define _mm256_mask_cvt_roundps_ph(W, U, A, I) \ (__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I), \ (__v8hi)(__m128i)(W), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#define _mm256_maskz_cvt_roundps_ph(U, A, I) __extension__ ({ \ +#define _mm256_maskz_cvt_roundps_ph(U, A, I) \ (__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I), \ (__v8hi)_mm_setzero_si128(), \ - (__mmask8)(U)); }) + (__mmask8)(U)) -#undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 #endif /* __AVX512VLINTRIN_H */ diff --git a/c_headers/avx512vlvbmi2intrin.h b/c_headers/avx512vlvbmi2intrin.h index d1ec4976f2..baaf565463 100644 --- a/c_headers/avx512vlvbmi2intrin.h +++ b/c_headers/avx512vlvbmi2intrin.h @@ -29,130 +29,120 @@ #define __AVX512VLVBMI2INTRIN_H /* Define the default attributes for the functions in this file. 
*/ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vbmi2"))) +#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vbmi2"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vbmi2"), __min_vector_width__(256))) -static __inline __m128i __DEFAULT_FN_ATTRS -_mm128_setzero_hi(void) { - return (__m128i)(__v8hi){ 0, 0, 0, 0, 0, 0, 0, 0 }; -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_mask_compress_epi16(__m128i __S, __mmask8 __U, __m128i __D) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_compress_epi16(__m128i __S, __mmask8 __U, __m128i __D) { return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D, (__v8hi) __S, __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_maskz_compress_epi16(__mmask8 __U, __m128i __D) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_compress_epi16(__mmask8 __U, __m128i __D) { return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D, - (__v8hi) _mm128_setzero_hi(), + (__v8hi) _mm_setzero_si128(), __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_mask_compress_epi8(__m128i __S, __mmask16 __U, __m128i __D) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_compress_epi8(__m128i __S, __mmask16 __U, __m128i __D) { return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D, (__v16qi) __S, __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_maskz_compress_epi8(__mmask16 __U, __m128i __D) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_compress_epi8(__mmask16 __U, __m128i __D) { return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D, - (__v16qi) _mm128_setzero_hi(), + (__v16qi) _mm_setzero_si128(), __U); } -static __inline__ void __DEFAULT_FN_ATTRS -_mm128_mask_compressstoreu_epi16(void *__P, __mmask8 __U, __m128i __D) +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_mask_compressstoreu_epi16(void *__P, __mmask8 __U, __m128i __D) { __builtin_ia32_compressstorehi128_mask ((__v8hi *) __P, (__v8hi) __D, __U); } -static __inline__ void __DEFAULT_FN_ATTRS -_mm128_mask_compressstoreu_epi8(void *__P, __mmask16 __U, __m128i __D) +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_mask_compressstoreu_epi8(void *__P, __mmask16 __U, __m128i __D) { __builtin_ia32_compressstoreqi128_mask ((__v16qi *) __P, (__v16qi) __D, __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_mask_expand_epi16(__m128i __S, __mmask8 __U, __m128i __D) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_expand_epi16(__m128i __S, __mmask8 __U, __m128i __D) { return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D, (__v8hi) __S, __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_maskz_expand_epi16(__mmask8 __U, __m128i __D) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_expand_epi16(__mmask8 __U, __m128i __D) { return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D, - (__v8hi) _mm128_setzero_hi(), + (__v8hi) _mm_setzero_si128(), __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_mask_expand_epi8(__m128i __S, __mmask16 __U, __m128i __D) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_expand_epi8(__m128i __S, __mmask16 __U, __m128i __D) { return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D, (__v16qi) __S, __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_maskz_expand_epi8(__mmask16 __U, __m128i __D) +static 
__inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_expand_epi8(__mmask16 __U, __m128i __D) { return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D, - (__v16qi) _mm128_setzero_hi(), + (__v16qi) _mm_setzero_si128(), __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_mask_expandloadu_epi16(__m128i __S, __mmask8 __U, void const *__P) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_expandloadu_epi16(__m128i __S, __mmask8 __U, void const *__P) { return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P, (__v8hi) __S, __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_maskz_expandloadu_epi16(__mmask8 __U, void const *__P) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_expandloadu_epi16(__mmask8 __U, void const *__P) { return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P, - (__v8hi) _mm128_setzero_hi(), + (__v8hi) _mm_setzero_si128(), __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_mask_expandloadu_epi8(__m128i __S, __mmask16 __U, void const *__P) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_expandloadu_epi8(__m128i __S, __mmask16 __U, void const *__P) { return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P, (__v16qi) __S, __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_maskz_expandloadu_epi8(__mmask16 __U, void const *__P) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_expandloadu_epi8(__mmask16 __U, void const *__P) { return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P, - (__v16qi) _mm128_setzero_hi(), + (__v16qi) _mm_setzero_si128(), __U); } -static __inline __m256i __DEFAULT_FN_ATTRS -_mm256_setzero_hi(void) { - return (__m256i)(__v16hi){ 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0 }; -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_compress_epi16(__m256i __S, __mmask16 __U, __m256i __D) { return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D, @@ -160,15 +150,15 @@ _mm256_mask_compress_epi16(__m256i __S, __mmask16 __U, __m256i __D) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_compress_epi16(__mmask16 __U, __m256i __D) { return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D, - (__v16hi) _mm256_setzero_hi(), + (__v16hi) _mm256_setzero_si256(), __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_compress_epi8(__m256i __S, __mmask32 __U, __m256i __D) { return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D, @@ -176,29 +166,29 @@ _mm256_mask_compress_epi8(__m256i __S, __mmask32 __U, __m256i __D) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_compress_epi8(__mmask32 __U, __m256i __D) { return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D, - (__v32qi) _mm256_setzero_hi(), + (__v32qi) _mm256_setzero_si256(), __U); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_compressstoreu_epi16(void *__P, __mmask16 __U, __m256i __D) { __builtin_ia32_compressstorehi256_mask ((__v16hi *) __P, (__v16hi) __D, __U); } -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_mask_compressstoreu_epi8(void *__P, __mmask32 __U, __m256i __D) { __builtin_ia32_compressstoreqi256_mask ((__v32qi *) __P, (__v32qi) __D, __U); } -static 
__inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_expand_epi16(__m256i __S, __mmask16 __U, __m256i __D) { return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D, @@ -206,15 +196,15 @@ _mm256_mask_expand_epi16(__m256i __S, __mmask16 __U, __m256i __D) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_expand_epi16(__mmask16 __U, __m256i __D) { return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D, - (__v16hi) _mm256_setzero_hi(), + (__v16hi) _mm256_setzero_si256(), __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_expand_epi8(__m256i __S, __mmask32 __U, __m256i __D) { return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D, @@ -222,15 +212,15 @@ _mm256_mask_expand_epi8(__m256i __S, __mmask32 __U, __m256i __D) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_expand_epi8(__mmask32 __U, __m256i __D) { return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D, - (__v32qi) _mm256_setzero_hi(), + (__v32qi) _mm256_setzero_si256(), __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_expandloadu_epi16(__m256i __S, __mmask16 __U, void const *__P) { return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P, @@ -238,15 +228,15 @@ _mm256_mask_expandloadu_epi16(__m256i __S, __mmask16 __U, void const *__P) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_expandloadu_epi16(__mmask16 __U, void const *__P) { return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P, - (__v16hi) _mm256_setzero_hi(), + (__v16hi) _mm256_setzero_si256(), __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_expandloadu_epi8(__m256i __S, __mmask32 __U, void const *__P) { return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P, @@ -254,171 +244,183 @@ _mm256_mask_expandloadu_epi8(__m256i __S, __mmask32 __U, void const *__P) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P) { return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P, - (__v32qi) _mm256_setzero_hi(), + (__v32qi) _mm256_setzero_si256(), __U); } -#define _mm256_mask_shldi_epi64(S, U, A, B, I) __extension__ ({ \ - (__m256i)__builtin_ia32_vpshldq256_mask((__v4di)(A), \ - (__v4di)(B), \ - (int)(I), \ - (__v4di)(S), \ - (__mmask8)(U)); }) +#define _mm256_shldi_epi64(A, B, I) \ + (__m256i)__builtin_ia32_vpshldq256((__v4di)(__m256i)(A), \ + (__v4di)(__m256i)(B), (int)(I)) + +#define _mm256_mask_shldi_epi64(S, U, A, B, I) \ + (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_shldi_epi64((A), (B), (I)), \ + (__v4di)(__m256i)(S)) #define _mm256_maskz_shldi_epi64(U, A, B, I) \ - _mm256_mask_shldi_epi64(_mm256_setzero_hi(), (U), (A), (B), (I)) + (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_shldi_epi64((A), (B), (I)), \ + (__v4di)_mm256_setzero_si256()) -#define _mm256_shldi_epi64(A, B, I) \ - _mm256_mask_shldi_epi64(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I)) +#define _mm_shldi_epi64(A, B, I) \ + (__m128i)__builtin_ia32_vpshldq128((__v2di)(__m128i)(A), \ + (__v2di)(__m128i)(B), 
(int)(I)) -#define _mm128_mask_shldi_epi64(S, U, A, B, I) __extension__ ({ \ - (__m128i)__builtin_ia32_vpshldq128_mask((__v2di)(A), \ - (__v2di)(B), \ - (int)(I), \ - (__v2di)(S), \ - (__mmask8)(U)); }) +#define _mm_mask_shldi_epi64(S, U, A, B, I) \ + (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ + (__v2di)_mm_shldi_epi64((A), (B), (I)), \ + (__v2di)(__m128i)(S)) -#define _mm128_maskz_shldi_epi64(U, A, B, I) \ - _mm128_mask_shldi_epi64(_mm128_setzero_hi(), (U), (A), (B), (I)) +#define _mm_maskz_shldi_epi64(U, A, B, I) \ + (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ + (__v2di)_mm_shldi_epi64((A), (B), (I)), \ + (__v2di)_mm_setzero_si128()) -#define _mm128_shldi_epi64(A, B, I) \ - _mm128_mask_shldi_epi64(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I)) +#define _mm256_shldi_epi32(A, B, I) \ + (__m256i)__builtin_ia32_vpshldd256((__v8si)(__m256i)(A), \ + (__v8si)(__m256i)(B), (int)(I)) -#define _mm256_mask_shldi_epi32(S, U, A, B, I) __extension__ ({ \ - (__m256i)__builtin_ia32_vpshldd256_mask((__v8si)(A), \ - (__v8si)(B), \ - (int)(I), \ - (__v8si)(S), \ - (__mmask8)(U)); }) +#define _mm256_mask_shldi_epi32(S, U, A, B, I) \ + (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_shldi_epi32((A), (B), (I)), \ + (__v8si)(__m256i)(S)) #define _mm256_maskz_shldi_epi32(U, A, B, I) \ - _mm256_mask_shldi_epi32(_mm256_setzero_hi(), (U), (A), (B), (I)) + (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_shldi_epi32((A), (B), (I)), \ + (__v8si)_mm256_setzero_si256()) -#define _mm256_shldi_epi32(A, B, I) \ - _mm256_mask_shldi_epi32(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I)) +#define _mm_shldi_epi32(A, B, I) \ + (__m128i)__builtin_ia32_vpshldd128((__v4si)(__m128i)(A), \ + (__v4si)(__m128i)(B), (int)(I)) -#define _mm128_mask_shldi_epi32(S, U, A, B, I) __extension__ ({ \ - (__m128i)__builtin_ia32_vpshldd128_mask((__v4si)(A), \ - (__v4si)(B), \ - (int)(I), \ - (__v4si)(S), \ - (__mmask8)(U)); }) +#define _mm_mask_shldi_epi32(S, U, A, B, I) \ + (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ + (__v4si)_mm_shldi_epi32((A), (B), (I)), \ + (__v4si)(__m128i)(S)) -#define _mm128_maskz_shldi_epi32(U, A, B, I) \ - _mm128_mask_shldi_epi32(_mm128_setzero_hi(), (U), (A), (B), (I)) +#define _mm_maskz_shldi_epi32(U, A, B, I) \ + (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ + (__v4si)_mm_shldi_epi32((A), (B), (I)), \ + (__v4si)_mm_setzero_si128()) -#define _mm128_shldi_epi32(A, B, I) \ - _mm128_mask_shldi_epi32(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I)) +#define _mm256_shldi_epi16(A, B, I) \ + (__m256i)__builtin_ia32_vpshldw256((__v16hi)(__m256i)(A), \ + (__v16hi)(__m256i)(B), (int)(I)) -#define _mm256_mask_shldi_epi16(S, U, A, B, I) __extension__ ({ \ - (__m256i)__builtin_ia32_vpshldw256_mask((__v16hi)(A), \ - (__v16hi)(B), \ - (int)(I), \ - (__v16hi)(S), \ - (__mmask16)(U)); }) +#define _mm256_mask_shldi_epi16(S, U, A, B, I) \ + (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_shldi_epi16((A), (B), (I)), \ + (__v16hi)(__m256i)(S)) #define _mm256_maskz_shldi_epi16(U, A, B, I) \ - _mm256_mask_shldi_epi16(_mm256_setzero_hi(), (U), (A), (B), (I)) + (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_shldi_epi16((A), (B), (I)), \ + (__v16hi)_mm256_setzero_si256()) -#define _mm256_shldi_epi16(A, B, I) \ - _mm256_mask_shldi_epi16(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I)) +#define _mm_shldi_epi16(A, B, I) \ + (__m128i)__builtin_ia32_vpshldw128((__v8hi)(__m128i)(A), \ + 
(__v8hi)(__m128i)(B), (int)(I)) -#define _mm128_mask_shldi_epi16(S, U, A, B, I) __extension__ ({ \ - (__m128i)__builtin_ia32_vpshldw128_mask((__v8hi)(A), \ - (__v8hi)(B), \ - (int)(I), \ - (__v8hi)(S), \ - (__mmask8)(U)); }) +#define _mm_mask_shldi_epi16(S, U, A, B, I) \ + (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + (__v8hi)_mm_shldi_epi16((A), (B), (I)), \ + (__v8hi)(__m128i)(S)) -#define _mm128_maskz_shldi_epi16(U, A, B, I) \ - _mm128_mask_shldi_epi16(_mm128_setzero_hi(), (U), (A), (B), (I)) +#define _mm_maskz_shldi_epi16(U, A, B, I) \ + (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + (__v8hi)_mm_shldi_epi16((A), (B), (I)), \ + (__v8hi)_mm_setzero_si128()) -#define _mm128_shldi_epi16(A, B, I) \ - _mm128_mask_shldi_epi16(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I)) +#define _mm256_shrdi_epi64(A, B, I) \ + (__m256i)__builtin_ia32_vpshrdq256((__v4di)(__m256i)(A), \ + (__v4di)(__m256i)(B), (int)(I)) -#define _mm256_mask_shrdi_epi64(S, U, A, B, I) __extension__ ({ \ - (__m256i)__builtin_ia32_vpshrdq256_mask((__v4di)(A), \ - (__v4di)(B), \ - (int)(I), \ - (__v4di)(S), \ - (__mmask8)(U)); }) +#define _mm256_mask_shrdi_epi64(S, U, A, B, I) \ + (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_shrdi_epi64((A), (B), (I)), \ + (__v4di)(__m256i)(S)) #define _mm256_maskz_shrdi_epi64(U, A, B, I) \ - _mm256_mask_shrdi_epi64(_mm256_setzero_hi(), (U), (A), (B), (I)) + (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_shrdi_epi64((A), (B), (I)), \ + (__v4di)_mm256_setzero_si256()) -#define _mm256_shrdi_epi64(A, B, I) \ - _mm256_mask_shrdi_epi64(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I)) +#define _mm_shrdi_epi64(A, B, I) \ + (__m128i)__builtin_ia32_vpshrdq128((__v2di)(__m128i)(A), \ + (__v2di)(__m128i)(B), (int)(I)) -#define _mm128_mask_shrdi_epi64(S, U, A, B, I) __extension__ ({ \ - (__m128i)__builtin_ia32_vpshrdq128_mask((__v2di)(A), \ - (__v2di)(B), \ - (int)(I), \ - (__v2di)(S), \ - (__mmask8)(U)); }) +#define _mm_mask_shrdi_epi64(S, U, A, B, I) \ + (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ + (__v2di)_mm_shrdi_epi64((A), (B), (I)), \ + (__v2di)(__m128i)(S)) -#define _mm128_maskz_shrdi_epi64(U, A, B, I) \ - _mm128_mask_shrdi_epi64(_mm128_setzero_hi(), (U), (A), (B), (I)) +#define _mm_maskz_shrdi_epi64(U, A, B, I) \ + (__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ + (__v2di)_mm_shrdi_epi64((A), (B), (I)), \ + (__v2di)_mm_setzero_si128()) -#define _mm128_shrdi_epi64(A, B, I) \ - _mm128_mask_shrdi_epi64(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I)) +#define _mm256_shrdi_epi32(A, B, I) \ + (__m256i)__builtin_ia32_vpshrdd256((__v8si)(__m256i)(A), \ + (__v8si)(__m256i)(B), (int)(I)) -#define _mm256_mask_shrdi_epi32(S, U, A, B, I) __extension__ ({ \ - (__m256i)__builtin_ia32_vpshrdd256_mask((__v8si)(A), \ - (__v8si)(B), \ - (int)(I), \ - (__v8si)(S), \ - (__mmask8)(U)); }) +#define _mm256_mask_shrdi_epi32(S, U, A, B, I) \ + (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_shrdi_epi32((A), (B), (I)), \ + (__v8si)(__m256i)(S)) #define _mm256_maskz_shrdi_epi32(U, A, B, I) \ - _mm256_mask_shrdi_epi32(_mm256_setzero_hi(), (U), (A), (B), (I)) + (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_shrdi_epi32((A), (B), (I)), \ + (__v8si)_mm256_setzero_si256()) -#define _mm256_shrdi_epi32(A, B, I) \ - _mm256_mask_shrdi_epi32(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I)) +#define _mm_shrdi_epi32(A, B, I) \ + (__m128i)__builtin_ia32_vpshrdd128((__v4si)(__m128i)(A), \ 
+ (__v4si)(__m128i)(B), (int)(I)) -#define _mm128_mask_shrdi_epi32(S, U, A, B, I) __extension__ ({ \ - (__m128i)__builtin_ia32_vpshrdd128_mask((__v4si)(A), \ - (__v4si)(B), \ - (int)(I), \ - (__v4si)(S), \ - (__mmask8)(U)); }) +#define _mm_mask_shrdi_epi32(S, U, A, B, I) \ + (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ + (__v4si)_mm_shrdi_epi32((A), (B), (I)), \ + (__v4si)(__m128i)(S)) -#define _mm128_maskz_shrdi_epi32(U, A, B, I) \ - _mm128_mask_shrdi_epi32(_mm128_setzero_hi(), (U), (A), (B), (I)) +#define _mm_maskz_shrdi_epi32(U, A, B, I) \ + (__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ + (__v4si)_mm_shrdi_epi32((A), (B), (I)), \ + (__v4si)_mm_setzero_si128()) -#define _mm128_shrdi_epi32(A, B, I) \ - _mm128_mask_shrdi_epi32(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I)) +#define _mm256_shrdi_epi16(A, B, I) \ + (__m256i)__builtin_ia32_vpshrdw256((__v16hi)(__m256i)(A), \ + (__v16hi)(__m256i)(B), (int)(I)) -#define _mm256_mask_shrdi_epi16(S, U, A, B, I) __extension__ ({ \ - (__m256i)__builtin_ia32_vpshrdw256_mask((__v16hi)(A), \ - (__v16hi)(B), \ - (int)(I), \ - (__v16hi)(S), \ - (__mmask16)(U)); }) +#define _mm256_mask_shrdi_epi16(S, U, A, B, I) \ + (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \ + (__v16hi)(__m256i)(S)) #define _mm256_maskz_shrdi_epi16(U, A, B, I) \ - _mm256_mask_shrdi_epi16(_mm256_setzero_hi(), (U), (A), (B), (I)) - -#define _mm256_shrdi_epi16(A, B, I) \ - _mm256_mask_shrdi_epi16(_mm256_undefined_si256(), (__mmask8)(-1), (A), (B), (I)) + (__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \ + (__v16hi)_mm256_setzero_si256()) -#define _mm128_mask_shrdi_epi16(S, U, A, B, I) __extension__ ({ \ - (__m128i)__builtin_ia32_vpshrdw128_mask((__v8hi)(A), \ - (__v8hi)(B), \ - (int)(I), \ - (__v8hi)(S), \ - (__mmask8)(U)); }) +#define _mm_shrdi_epi16(A, B, I) \ + (__m128i)__builtin_ia32_vpshrdw128((__v8hi)(__m128i)(A), \ + (__v8hi)(__m128i)(B), (int)(I)) -#define _mm128_maskz_shrdi_epi16(U, A, B, I) \ - _mm128_mask_shrdi_epi16(_mm128_setzero_hi(), (U), (A), (B), (I)) +#define _mm_mask_shrdi_epi16(S, U, A, B, I) \ + (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + (__v8hi)_mm_shrdi_epi16((A), (B), (I)), \ + (__v8hi)(__m128i)(S)) -#define _mm128_shrdi_epi16(A, B, I) \ - _mm128_mask_shrdi_epi16(_mm_undefined_si128(), (__mmask8)(-1), (A), (B), (I)) +#define _mm_maskz_shrdi_epi16(U, A, B, I) \ + (__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + (__v8hi)_mm_shrdi_epi16((A), (B), (I)), \ + (__v8hi)_mm_setzero_si128()) -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_shldv_epi64(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) { return (__m256i) __builtin_ia32_vpshldvq256_mask ((__v4di) __S, @@ -427,7 +429,7 @@ _mm256_mask_shldv_epi64(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_shldv_epi64(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) { return (__m256i) __builtin_ia32_vpshldvq256_maskz ((__v4di) __S, @@ -436,7 +438,7 @@ _mm256_maskz_shldv_epi64(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_shldv_epi64(__m256i __S, __m256i __A, __m256i __B) { return (__m256i) __builtin_ia32_vpshldvq256_mask ((__v4di) __S, @@ -445,8 +447,8 @@ _mm256_shldv_epi64(__m256i 
__S, __m256i __A, __m256i __B) (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_mask_shldv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_shldv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i) __builtin_ia32_vpshldvq128_mask ((__v2di) __S, (__v2di) __A, @@ -454,8 +456,8 @@ _mm128_mask_shldv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_maskz_shldv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_shldv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) { return (__m128i) __builtin_ia32_vpshldvq128_maskz ((__v2di) __S, (__v2di) __A, @@ -463,8 +465,8 @@ _mm128_maskz_shldv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_shldv_epi64(__m128i __S, __m128i __A, __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_shldv_epi64(__m128i __S, __m128i __A, __m128i __B) { return (__m128i) __builtin_ia32_vpshldvq128_mask ((__v2di) __S, (__v2di) __A, @@ -472,7 +474,7 @@ _mm128_shldv_epi64(__m128i __S, __m128i __A, __m128i __B) (__mmask8) -1); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_shldv_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) { return (__m256i) __builtin_ia32_vpshldvd256_mask ((__v8si) __S, @@ -481,7 +483,7 @@ _mm256_mask_shldv_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_shldv_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) { return (__m256i) __builtin_ia32_vpshldvd256_maskz ((__v8si) __S, @@ -490,7 +492,7 @@ _mm256_maskz_shldv_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_shldv_epi32(__m256i __S, __m256i __A, __m256i __B) { return (__m256i) __builtin_ia32_vpshldvd256_mask ((__v8si) __S, @@ -499,8 +501,8 @@ _mm256_shldv_epi32(__m256i __S, __m256i __A, __m256i __B) (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_mask_shldv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_shldv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i) __builtin_ia32_vpshldvd128_mask ((__v4si) __S, (__v4si) __A, @@ -508,8 +510,8 @@ _mm128_mask_shldv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_maskz_shldv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_shldv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) { return (__m128i) __builtin_ia32_vpshldvd128_maskz ((__v4si) __S, (__v4si) __A, @@ -517,8 +519,8 @@ _mm128_maskz_shldv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_shldv_epi32(__m128i __S, __m128i __A, __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_shldv_epi32(__m128i __S, __m128i __A, __m128i __B) { return (__m128i) __builtin_ia32_vpshldvd128_mask ((__v4si) __S, (__v4si) __A, @@ -526,7 +528,7 @@ _mm128_shldv_epi32(__m128i __S, __m128i __A, __m128i __B) 
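The avx512vlvbmi2intrin.h hunks in this patch follow its recurring pattern: the old statement-expression macros (__extension__ ({ ... })) become plain expression macros, the nonstandard _mm128_* spellings are renamed to the usual _mm_* prefix, and each mask/maskz form is rebuilt as the unmasked builtin blended through __builtin_ia32_select*. A minimal caller-side sketch of the funnel-shift intrinsics under the new names, assuming a compiler invoked with -mavx512vl -mavx512vbmi2; the function and variable names below are invented for illustration:

#include <immintrin.h>

__m128i shldi_demo(__m128i src, __mmask8 k, __m128i a, __m128i b)
{
  /* Unmasked form: every 32-bit lane gets the funnel shift left by 4. */
  __m128i full = _mm_shldi_epi32(a, b, 4);
  /* Merge-masked form: lanes whose mask bit is 0 keep the value from src. */
  __m128i merged = _mm_mask_shldi_epi32(src, k, a, b, 4);
  /* Zero-masked form: lanes whose mask bit is 0 are zeroed. */
  __m128i zeroed = _mm_maskz_shldi_epi32(k, a, b, 4);
  /* Combine the three results so none is optimized away in this sketch. */
  return _mm_xor_si128(_mm_xor_si128(full, merged), zeroed);
}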
(__mmask8) -1); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_shldv_epi16(__m256i __S, __mmask16 __U, __m256i __A, __m256i __B) { return (__m256i) __builtin_ia32_vpshldvw256_mask ((__v16hi) __S, @@ -535,7 +537,7 @@ _mm256_mask_shldv_epi16(__m256i __S, __mmask16 __U, __m256i __A, __m256i __B) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_shldv_epi16(__mmask16 __U, __m256i __S, __m256i __A, __m256i __B) { return (__m256i) __builtin_ia32_vpshldvw256_maskz ((__v16hi) __S, @@ -544,7 +546,7 @@ _mm256_maskz_shldv_epi16(__mmask16 __U, __m256i __S, __m256i __A, __m256i __B) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_shldv_epi16(__m256i __S, __m256i __A, __m256i __B) { return (__m256i) __builtin_ia32_vpshldvw256_mask ((__v16hi) __S, @@ -553,8 +555,8 @@ _mm256_shldv_epi16(__m256i __S, __m256i __A, __m256i __B) (__mmask16) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_mask_shldv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_shldv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i) __builtin_ia32_vpshldvw128_mask ((__v8hi) __S, (__v8hi) __A, @@ -562,8 +564,8 @@ _mm128_mask_shldv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_maskz_shldv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_shldv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) { return (__m128i) __builtin_ia32_vpshldvw128_maskz ((__v8hi) __S, (__v8hi) __A, @@ -571,8 +573,8 @@ _mm128_maskz_shldv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_shldv_epi16(__m128i __S, __m128i __A, __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_shldv_epi16(__m128i __S, __m128i __A, __m128i __B) { return (__m128i) __builtin_ia32_vpshldvw128_mask ((__v8hi) __S, (__v8hi) __A, @@ -580,7 +582,7 @@ _mm128_shldv_epi16(__m128i __S, __m128i __A, __m128i __B) (__mmask8) -1); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_shrdv_epi64(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) { return (__m256i) __builtin_ia32_vpshrdvq256_mask ((__v4di) __S, @@ -589,7 +591,7 @@ _mm256_mask_shrdv_epi64(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_shrdv_epi64(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) { return (__m256i) __builtin_ia32_vpshrdvq256_maskz ((__v4di) __S, @@ -598,7 +600,7 @@ _mm256_maskz_shrdv_epi64(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_shrdv_epi64(__m256i __S, __m256i __A, __m256i __B) { return (__m256i) __builtin_ia32_vpshrdvq256_mask ((__v4di) __S, @@ -607,8 +609,8 @@ _mm256_shrdv_epi64(__m256i __S, __m256i __A, __m256i __B) (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_mask_shrdv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_shrdv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { return 
(__m128i) __builtin_ia32_vpshrdvq128_mask ((__v2di) __S, (__v2di) __A, @@ -616,8 +618,8 @@ _mm128_mask_shrdv_epi64(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_maskz_shrdv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_shrdv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) { return (__m128i) __builtin_ia32_vpshrdvq128_maskz ((__v2di) __S, (__v2di) __A, @@ -625,8 +627,8 @@ _mm128_maskz_shrdv_epi64(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_shrdv_epi64(__m128i __S, __m128i __A, __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_shrdv_epi64(__m128i __S, __m128i __A, __m128i __B) { return (__m128i) __builtin_ia32_vpshrdvq128_mask ((__v2di) __S, (__v2di) __A, @@ -634,7 +636,7 @@ _mm128_shrdv_epi64(__m128i __S, __m128i __A, __m128i __B) (__mmask8) -1); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_shrdv_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) { return (__m256i) __builtin_ia32_vpshrdvd256_mask ((__v8si) __S, @@ -643,7 +645,7 @@ _mm256_mask_shrdv_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_shrdv_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) { return (__m256i) __builtin_ia32_vpshrdvd256_maskz ((__v8si) __S, @@ -652,7 +654,7 @@ _mm256_maskz_shrdv_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_shrdv_epi32(__m256i __S, __m256i __A, __m256i __B) { return (__m256i) __builtin_ia32_vpshrdvd256_mask ((__v8si) __S, @@ -661,8 +663,8 @@ _mm256_shrdv_epi32(__m256i __S, __m256i __A, __m256i __B) (__mmask8) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_mask_shrdv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_shrdv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i) __builtin_ia32_vpshrdvd128_mask ((__v4si) __S, (__v4si) __A, @@ -670,8 +672,8 @@ _mm128_mask_shrdv_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_maskz_shrdv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_shrdv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) { return (__m128i) __builtin_ia32_vpshrdvd128_maskz ((__v4si) __S, (__v4si) __A, @@ -679,8 +681,8 @@ _mm128_maskz_shrdv_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_shrdv_epi32(__m128i __S, __m128i __A, __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_shrdv_epi32(__m128i __S, __m128i __A, __m128i __B) { return (__m128i) __builtin_ia32_vpshrdvd128_mask ((__v4si) __S, (__v4si) __A, @@ -688,7 +690,7 @@ _mm128_shrdv_epi32(__m128i __S, __m128i __A, __m128i __B) (__mmask8) -1); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_shrdv_epi16(__m256i __S, __mmask16 __U, __m256i __A, __m256i __B) { return (__m256i) __builtin_ia32_vpshrdvw256_mask ((__v16hi) __S, @@ -697,7 +699,7 @@ _mm256_mask_shrdv_epi16(__m256i 
__S, __mmask16 __U, __m256i __A, __m256i __B) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_shrdv_epi16(__mmask16 __U, __m256i __S, __m256i __A, __m256i __B) { return (__m256i) __builtin_ia32_vpshrdvw256_maskz ((__v16hi) __S, @@ -706,7 +708,7 @@ _mm256_maskz_shrdv_epi16(__mmask16 __U, __m256i __S, __m256i __A, __m256i __B) __U); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_shrdv_epi16(__m256i __S, __m256i __A, __m256i __B) { return (__m256i) __builtin_ia32_vpshrdvw256_mask ((__v16hi) __S, @@ -715,8 +717,8 @@ _mm256_shrdv_epi16(__m256i __S, __m256i __A, __m256i __B) (__mmask16) -1); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_mask_shrdv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_shrdv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { return (__m128i) __builtin_ia32_vpshrdvw128_mask ((__v8hi) __S, (__v8hi) __A, @@ -724,8 +726,8 @@ _mm128_mask_shrdv_epi16(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_maskz_shrdv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_shrdv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) { return (__m128i) __builtin_ia32_vpshrdvw128_maskz ((__v8hi) __S, (__v8hi) __A, @@ -733,8 +735,8 @@ _mm128_maskz_shrdv_epi16(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) __U); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_shrdv_epi16(__m128i __S, __m128i __A, __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_shrdv_epi16(__m128i __S, __m128i __A, __m128i __B) { return (__m128i) __builtin_ia32_vpshrdvw128_mask ((__v8hi) __S, (__v8hi) __A, @@ -743,6 +745,7 @@ _mm128_shrdv_epi16(__m128i __S, __m128i __A, __m128i __B) } -#undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 #endif diff --git a/c_headers/avx512vlvnniintrin.h b/c_headers/avx512vlvnniintrin.h index 745ae8b7ad..62382268ec 100644 --- a/c_headers/avx512vlvnniintrin.h +++ b/c_headers/avx512vlvnniintrin.h @@ -29,226 +29,195 @@ #define __AVX512VLVNNIINTRIN_H /* Define the default attributes for the functions in this file. 
*/ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"))) +#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(256))) -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A, + (__v8si)__B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_vpdpbusd256_mask ((__v8si) __S, - (__v8si) __A, - (__v8si) __B, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256(__U, + (__v8si)_mm256_dpbusd_epi32(__S, __A, __B), + (__v8si)__S); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_vpdpbusd256_maskz ((__v8si) __S, - (__v8si) __A, - (__v8si) __B, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256(__U, + (__v8si)_mm256_dpbusd_epi32(__S, __A, __B), + (__v8si)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B) +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_vpdpbusd256_mask ((__v8si) __S, - (__v8si) __A, - (__v8si) __B, - (__mmask8) -1); + return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A, + (__v8si)__B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_vpdpbusds256_mask ((__v8si) __S, - (__v8si) __A, - (__v8si) __B, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256(__U, + (__v8si)_mm256_dpbusds_epi32(__S, __A, __B), + (__v8si)__S); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_vpdpbusds256_maskz ((__v8si) __S, - (__v8si) __A, - (__v8si) __B, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256(__U, + (__v8si)_mm256_dpbusds_epi32(__S, __A, __B), + (__v8si)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B) +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_vpdpbusds256_mask ((__v8si) __S, - (__v8si) __A, - (__v8si) __B, - (__mmask8) -1); + return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A, + (__v8si)__B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_vpdpwssd256_mask ((__v8si) __S, - (__v8si) __A, - (__v8si) __B, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256(__U, + (__v8si)_mm256_dpwssd_epi32(__S, __A, __B), + (__v8si)__S); } -static __inline__ __m256i 
__DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_vpdpwssd256_maskz ((__v8si) __S, - (__v8si) __A, - (__v8si) __B, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256(__U, + (__v8si)_mm256_dpwssd_epi32(__S, __A, __B), + (__v8si)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B) +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_vpdpwssd256_mask ((__v8si) __S, - (__v8si) __A, - (__v8si) __B, - (__mmask8) -1); + return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A, + (__v8si)__B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_vpdpwssds256_mask ((__v8si) __S, - (__v8si) __A, - (__v8si) __B, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256(__U, + (__v8si)_mm256_dpwssds_epi32(__S, __A, __B), + (__v8si)__S); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) { - return (__m256i) __builtin_ia32_vpdpwssds256_maskz ((__v8si) __S, - (__v8si) __A, - (__v8si) __B, - (__mmask8) __U); + return (__m256i)__builtin_ia32_selectd_256(__U, + (__v8si)_mm256_dpwssds_epi32(__S, __A, __B), + (__v8si)_mm256_setzero_si256()); } -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B) { - return (__m256i) __builtin_ia32_vpdpwssds256_mask ((__v8si) __S, - (__v8si) __A, - (__v8si) __B, - (__mmask8) -1); + return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A, + (__v4si)__B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_vpdpbusd128_mask ((__v4si) __S, - (__v4si) __A, - (__v4si) __B, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128(__U, + (__v4si)_mm_dpbusd_epi32(__S, __A, __B), + (__v4si)__S); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_vpdpbusd128_maskz ((__v4si) __S, - (__v4si) __A, - (__v4si) __B, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128(__U, + (__v4si)_mm_dpbusd_epi32(__S, __A, __B), + (__v4si)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_vpdpbusd128_mask ((__v4si) __S, - (__v4si) __A, - (__v4si) __B, - (__mmask8) -1); + return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A, + (__v4si)__B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS 
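The avx512vlvnniintrin.h changes in this part of the patch are the same rework applied to the VNNI dot-product intrinsics: __DEFAULT_FN_ATTRS splits into 128- and 256-bit variants carrying __min_vector_width__, the _mm128_* entry points are renamed to _mm_*, and every mask/maskz wrapper is now the unmasked builtin routed through __builtin_ia32_selectd_128/256. In terms of the public API, the composition looks roughly like the sketch below, assuming -mavx512vl -mavx512vnni; the helper name is invented for illustration:

#include <immintrin.h>

__m256i dpbusd_demo(__m256i acc, __mmask8 k, __m256i a, __m256i b)
{
  /* Unmasked VNNI multiply-accumulate: acc plus the u8*s8 dot product in every lane. */
  __m256i full = _mm256_dpbusd_epi32(acc, a, b);
  /* Merge-masked form: equals full where k is 1, keeps acc where k is 0. */
  __m256i merged = _mm256_mask_dpbusd_epi32(acc, k, a, b);
  /* The same blend, spelled with the generic mask-move intrinsic. */
  __m256i byhand = _mm256_mask_mov_epi32(acc, k, full);
  /* merged and byhand should hold identical values, so this is all zeros. */
  return _mm256_sub_epi32(merged, byhand);
}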
-_mm128_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_vpdpbusds128_mask ((__v4si) __S, - (__v4si) __A, - (__v4si) __B, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128(__U, + (__v4si)_mm_dpbusds_epi32(__S, __A, __B), + (__v4si)__S); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_vpdpbusds128_maskz ((__v4si) __S, - (__v4si) __A, - (__v4si) __B, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128(__U, + (__v4si)_mm_dpbusds_epi32(__S, __A, __B), + (__v4si)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_vpdpbusds128_mask ((__v4si) __S, - (__v4si) __A, - (__v4si) __B, - (__mmask8) -1); + return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A, + (__v4si)__B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_vpdpwssd128_mask ((__v4si) __S, - (__v4si) __A, - (__v4si) __B, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128(__U, + (__v4si)_mm_dpwssd_epi32(__S, __A, __B), + (__v4si)__S); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_vpdpwssd128_maskz ((__v4si) __S, - (__v4si) __A, - (__v4si) __B, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128(__U, + (__v4si)_mm_dpwssd_epi32(__S, __A, __B), + (__v4si)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_vpdpwssd128_mask ((__v4si) __S, - (__v4si) __A, - (__v4si) __B, - (__mmask8) -1); + return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A, + (__v4si)__B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_vpdpwssds128_mask ((__v4si) __S, - (__v4si) __A, - (__v4si) __B, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128(__U, + (__v4si)_mm_dpwssds_epi32(__S, __A, __B), + (__v4si)__S); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) { - return (__m128i) __builtin_ia32_vpdpwssds128_maskz 
((__v4si) __S, - (__v4si) __A, - (__v4si) __B, - (__mmask8) __U); + return (__m128i)__builtin_ia32_selectd_128(__U, + (__v4si)_mm_dpwssds_epi32(__S, __A, __B), + (__v4si)_mm_setzero_si128()); } -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm128_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B) -{ - return (__m128i) __builtin_ia32_vpdpwssds128_mask ((__v4si) __S, - (__v4si) __A, - (__v4si) __B, - (__mmask8) -1); -} - - -#undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 #endif diff --git a/c_headers/avx512vnniintrin.h b/c_headers/avx512vnniintrin.h index 0c6badd231..620ef5a789 100644 --- a/c_headers/avx512vnniintrin.h +++ b/c_headers/avx512vnniintrin.h @@ -29,118 +29,101 @@ #define __AVX512VNNIINTRIN_H /* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vnni"))) +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vnni"), __min_vector_width__(512))) static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_vpdpbusd512((__v16si)__S, (__v16si)__A, + (__v16si)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpbusd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_vpdpbusd512_mask ((__v16si) __S, - (__v16si) __A, - (__v16si) __B, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512(__U, + (__v16si)_mm512_dpbusd_epi32(__S, __A, __B), + (__v16si)__S); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbusd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_vpdpbusd512_maskz ((__v16si) __S, - (__v16si) __A, - (__v16si) __B, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512(__U, + (__v16si)_mm512_dpbusd_epi32(__S, __A, __B), + (__v16si)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B) +_mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_vpdpbusd512_mask ((__v16si) __S, - (__v16si) __A, - (__v16si) __B, - (__mmask16) -1); + return (__m512i)__builtin_ia32_vpdpbusds512((__v16si)__S, (__v16si)__A, + (__v16si)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpbusds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_vpdpbusds512_mask ((__v16si) __S, - (__v16si) __A, - (__v16si) __B, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512(__U, + (__v16si)_mm512_dpbusds_epi32(__S, __A, __B), + (__v16si)__S); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_vpdpbusds512_maskz ((__v16si) __S, - (__v16si) __A, - (__v16si) __B, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512(__U, + (__v16si)_mm512_dpbusds_epi32(__S, __A, __B), + (__v16si)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B) +_mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_vpdpbusds512_mask ((__v16si) __S, - (__v16si) __A, - (__v16si) __B, - (__mmask16) -1); + return (__m512i)__builtin_ia32_vpdpwssd512((__v16si)__S, (__v16si)__A, + (__v16si)__B); } static __inline__ __m512i 
__DEFAULT_FN_ATTRS _mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_vpdpwssd512_mask ((__v16si) __S, - (__v16si) __A, - (__v16si) __B, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512(__U, + (__v16si)_mm512_dpwssd_epi32(__S, __A, __B), + (__v16si)__S); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_vpdpwssd512_maskz ((__v16si) __S, - (__v16si) __A, - (__v16si) __B, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512(__U, + (__v16si)_mm512_dpwssd_epi32(__S, __A, __B), + (__v16si)_mm512_setzero_si512()); } static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B) +_mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_vpdpwssd512_mask ((__v16si) __S, - (__v16si) __A, - (__v16si) __B, - (__mmask16) -1); + return (__m512i)__builtin_ia32_vpdpwssds512((__v16si)__S, (__v16si)__A, + (__v16si)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_vpdpwssds512_mask ((__v16si) __S, - (__v16si) __A, - (__v16si) __B, - (__mmask16) __U); + return (__m512i)__builtin_ia32_selectd_512(__U, + (__v16si)_mm512_dpwssds_epi32(__S, __A, __B), + (__v16si)__S); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) { - return (__m512i) __builtin_ia32_vpdpwssds512_maskz ((__v16si) __S, - (__v16si) __A, - (__v16si) __B, - (__mmask16) __U); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B) -{ - return (__m512i) __builtin_ia32_vpdpwssds512_mask ((__v16si) __S, - (__v16si) __A, - (__v16si) __B, - (__mmask16) -1); + return (__m512i)__builtin_ia32_selectd_512(__U, + (__v16si)_mm512_dpwssds_epi32(__S, __A, __B), + (__v16si)_mm512_setzero_si512()); } - #undef __DEFAULT_FN_ATTRS #endif diff --git a/c_headers/avx512vpopcntdqintrin.h b/c_headers/avx512vpopcntdqintrin.h index 34ab84932e..c99f594569 100644 --- a/c_headers/avx512vpopcntdqintrin.h +++ b/c_headers/avx512vpopcntdqintrin.h @@ -1,5 +1,4 @@ -/*===------------- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics - *------------------=== +/*===----- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics-------------=== * * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -32,8 +31,7 @@ /* Define the default attributes for the functions in this file. 
*/ #define __DEFAULT_FN_ATTRS \ - __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntd" \ - "q"))) + __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq"), __min_vector_width__(512))) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi64(__m512i __A) { return (__m512i)__builtin_ia32_vpopcntq_512((__v8di)__A); diff --git a/c_headers/avx512vpopcntdqvlintrin.h b/c_headers/avx512vpopcntdqvlintrin.h index c2058a8f51..681a75fa07 100644 --- a/c_headers/avx512vpopcntdqvlintrin.h +++ b/c_headers/avx512vpopcntdqvlintrin.h @@ -1,5 +1,4 @@ -/*===------------- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics - *------------------=== +/*===---- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics -------------=== * * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -31,69 +30,76 @@ #define __AVX512VPOPCNTDQVLINTRIN_H /* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS \ - __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq,avx512vl"))) +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq,avx512vl"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq,avx512vl"), __min_vector_width__(256))) -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_popcnt_epi64(__m128i __A) { +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_popcnt_epi64(__m128i __A) { return (__m128i)__builtin_ia32_vpopcntq_128((__v2di)__A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_popcnt_epi64(__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_selectq_128( (__mmask8)__U, (__v2di)_mm_popcnt_epi64(__A), (__v2di)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_popcnt_epi64(__mmask8 __U, __m128i __A) { return _mm_mask_popcnt_epi64((__m128i)_mm_setzero_si128(), __U, __A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_popcnt_epi32(__m128i __A) { +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_popcnt_epi32(__m128i __A) { return (__m128i)__builtin_ia32_vpopcntd_128((__v4si)__A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_popcnt_epi32(__m128i __W, __mmask8 __U, __m128i __A) { return (__m128i)__builtin_ia32_selectd_128( (__mmask8)__U, (__v4si)_mm_popcnt_epi32(__A), (__v4si)__W); } -static __inline__ __m128i __DEFAULT_FN_ATTRS +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_popcnt_epi32(__mmask8 __U, __m128i __A) { return _mm_mask_popcnt_epi32((__m128i)_mm_setzero_si128(), __U, __A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS _mm256_popcnt_epi64(__m256i __A) { +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_popcnt_epi64(__m256i __A) { return (__m256i)__builtin_ia32_vpopcntq_256((__v4di)__A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_popcnt_epi64(__m256i __W, __mmask8 __U, __m256i __A) { return (__m256i)__builtin_ia32_selectq_256( (__mmask8)__U, (__v4di)_mm256_popcnt_epi64(__A), (__v4di)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_popcnt_epi64(__mmask8 __U, __m256i __A) { return _mm256_mask_popcnt_epi64((__m256i)_mm256_setzero_si256(), __U, __A); } -static __inline__ __m256i 
__DEFAULT_FN_ATTRS _mm256_popcnt_epi32(__m256i __A) { +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_popcnt_epi32(__m256i __A) { return (__m256i)__builtin_ia32_vpopcntd_256((__v8si)__A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_popcnt_epi32(__m256i __W, __mmask8 __U, __m256i __A) { return (__m256i)__builtin_ia32_selectd_256( (__mmask8)__U, (__v8si)_mm256_popcnt_epi32(__A), (__v8si)__W); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_popcnt_epi32(__mmask8 __U, __m256i __A) { return _mm256_mask_popcnt_epi32((__m256i)_mm256_setzero_si256(), __U, __A); } -#undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 #endif diff --git a/c_headers/avxintrin.h b/c_headers/avxintrin.h index dff5897b6b..cb15396b3f 100644 --- a/c_headers/avxintrin.h +++ b/c_headers/avxintrin.h @@ -50,10 +50,11 @@ typedef double __m256d __attribute__((__vector_size__(32))); typedef long long __m256i __attribute__((__vector_size__(32))); /* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx"))) +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(256))) +#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(128))) /* Arithmetic */ -/// \brief Adds two 256-bit vectors of [4 x double]. +/// Adds two 256-bit vectors of [4 x double]. /// /// \headerfile <x86intrin.h> /// @@ -71,7 +72,7 @@ _mm256_add_pd(__m256d __a, __m256d __b) return (__m256d)((__v4df)__a+(__v4df)__b); } -/// \brief Adds two 256-bit vectors of [8 x float]. +/// Adds two 256-bit vectors of [8 x float]. /// /// \headerfile <x86intrin.h> /// @@ -89,7 +90,7 @@ _mm256_add_ps(__m256 __a, __m256 __b) return (__m256)((__v8sf)__a+(__v8sf)__b); } -/// \brief Subtracts two 256-bit vectors of [4 x double]. +/// Subtracts two 256-bit vectors of [4 x double]. /// /// \headerfile <x86intrin.h> /// @@ -107,7 +108,7 @@ _mm256_sub_pd(__m256d __a, __m256d __b) return (__m256d)((__v4df)__a-(__v4df)__b); } -/// \brief Subtracts two 256-bit vectors of [8 x float]. +/// Subtracts two 256-bit vectors of [8 x float]. /// /// \headerfile <x86intrin.h> /// @@ -125,7 +126,7 @@ _mm256_sub_ps(__m256 __a, __m256 __b) return (__m256)((__v8sf)__a-(__v8sf)__b); } -/// \brief Adds the even-indexed values and subtracts the odd-indexed values of +/// Adds the even-indexed values and subtracts the odd-indexed values of /// two 256-bit vectors of [4 x double]. /// /// \headerfile <x86intrin.h> @@ -144,7 +145,7 @@ _mm256_addsub_pd(__m256d __a, __m256d __b) return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b); } -/// \brief Adds the even-indexed values and subtracts the odd-indexed values of +/// Adds the even-indexed values and subtracts the odd-indexed values of /// two 256-bit vectors of [8 x float]. /// /// \headerfile <x86intrin.h> @@ -163,7 +164,7 @@ _mm256_addsub_ps(__m256 __a, __m256 __b) return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b); } -/// \brief Divides two 256-bit vectors of [4 x double]. +/// Divides two 256-bit vectors of [4 x double]. /// /// \headerfile <x86intrin.h> /// @@ -181,7 +182,7 @@ _mm256_div_pd(__m256d __a, __m256d __b) return (__m256d)((__v4df)__a/(__v4df)__b); } -/// \brief Divides two 256-bit vectors of [8 x float]. 
+/// Divides two 256-bit vectors of [8 x float]. /// /// \headerfile <x86intrin.h> /// @@ -199,7 +200,7 @@ _mm256_div_ps(__m256 __a, __m256 __b) return (__m256)((__v8sf)__a/(__v8sf)__b); } -/// \brief Compares two 256-bit vectors of [4 x double] and returns the greater +/// Compares two 256-bit vectors of [4 x double] and returns the greater /// of each pair of values. /// /// \headerfile <x86intrin.h> @@ -218,7 +219,7 @@ _mm256_max_pd(__m256d __a, __m256d __b) return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b); } -/// \brief Compares two 256-bit vectors of [8 x float] and returns the greater +/// Compares two 256-bit vectors of [8 x float] and returns the greater /// of each pair of values. /// /// \headerfile <x86intrin.h> @@ -237,7 +238,7 @@ _mm256_max_ps(__m256 __a, __m256 __b) return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b); } -/// \brief Compares two 256-bit vectors of [4 x double] and returns the lesser +/// Compares two 256-bit vectors of [4 x double] and returns the lesser /// of each pair of values. /// /// \headerfile <x86intrin.h> @@ -256,7 +257,7 @@ _mm256_min_pd(__m256d __a, __m256d __b) return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b); } -/// \brief Compares two 256-bit vectors of [8 x float] and returns the lesser +/// Compares two 256-bit vectors of [8 x float] and returns the lesser /// of each pair of values. /// /// \headerfile <x86intrin.h> @@ -275,7 +276,7 @@ _mm256_min_ps(__m256 __a, __m256 __b) return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b); } -/// \brief Multiplies two 256-bit vectors of [4 x double]. +/// Multiplies two 256-bit vectors of [4 x double]. /// /// \headerfile <x86intrin.h> /// @@ -293,7 +294,7 @@ _mm256_mul_pd(__m256d __a, __m256d __b) return (__m256d)((__v4df)__a * (__v4df)__b); } -/// \brief Multiplies two 256-bit vectors of [8 x float]. +/// Multiplies two 256-bit vectors of [8 x float]. /// /// \headerfile <x86intrin.h> /// @@ -311,7 +312,7 @@ _mm256_mul_ps(__m256 __a, __m256 __b) return (__m256)((__v8sf)__a * (__v8sf)__b); } -/// \brief Calculates the square roots of the values in a 256-bit vector of +/// Calculates the square roots of the values in a 256-bit vector of /// [4 x double]. /// /// \headerfile <x86intrin.h> @@ -328,7 +329,7 @@ _mm256_sqrt_pd(__m256d __a) return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a); } -/// \brief Calculates the square roots of the values in a 256-bit vector of +/// Calculates the square roots of the values in a 256-bit vector of /// [8 x float]. /// /// \headerfile <x86intrin.h> @@ -345,7 +346,7 @@ _mm256_sqrt_ps(__m256 __a) return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a); } -/// \brief Calculates the reciprocal square roots of the values in a 256-bit +/// Calculates the reciprocal square roots of the values in a 256-bit /// vector of [8 x float]. /// /// \headerfile <x86intrin.h> @@ -362,7 +363,7 @@ _mm256_rsqrt_ps(__m256 __a) return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a); } -/// \brief Calculates the reciprocals of the values in a 256-bit vector of +/// Calculates the reciprocals of the values in a 256-bit vector of /// [8 x float]. /// /// \headerfile <x86intrin.h> @@ -379,7 +380,7 @@ _mm256_rcp_ps(__m256 __a) return (__m256)__builtin_ia32_rcpps256((__v8sf)__a); } -/// \brief Rounds the values in a 256-bit vector of [4 x double] as specified +/// Rounds the values in a 256-bit vector of [4 x double] as specified /// by the byte operand. 
The source values are rounded to integer values and /// returned as 64-bit double-precision floating-point values. /// @@ -408,10 +409,10 @@ _mm256_rcp_ps(__m256 __a) /// 10: Upward (toward positive infinity). \n /// 11: Truncated. /// \returns A 256-bit vector of [4 x double] containing the rounded values. -#define _mm256_round_pd(V, M) __extension__ ({ \ - (__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)); }) +#define _mm256_round_pd(V, M) \ + (__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)) -/// \brief Rounds the values stored in a 256-bit vector of [8 x float] as +/// Rounds the values stored in a 256-bit vector of [8 x float] as /// specified by the byte operand. The source values are rounded to integer /// values and returned as floating-point values. /// @@ -440,10 +441,10 @@ _mm256_rcp_ps(__m256 __a) /// 10: Upward (toward positive infinity). \n /// 11: Truncated. /// \returns A 256-bit vector of [8 x float] containing the rounded values. -#define _mm256_round_ps(V, M) __extension__ ({ \ - (__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)); }) +#define _mm256_round_ps(V, M) \ + (__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)) -/// \brief Rounds up the values stored in a 256-bit vector of [4 x double]. The +/// Rounds up the values stored in a 256-bit vector of [4 x double]. The /// source values are rounded up to integer values and returned as 64-bit /// double-precision floating-point values. /// @@ -460,7 +461,7 @@ _mm256_rcp_ps(__m256 __a) /// \returns A 256-bit vector of [4 x double] containing the rounded up values. #define _mm256_ceil_pd(V) _mm256_round_pd((V), _MM_FROUND_CEIL) -/// \brief Rounds down the values stored in a 256-bit vector of [4 x double]. +/// Rounds down the values stored in a 256-bit vector of [4 x double]. /// The source values are rounded down to integer values and returned as /// 64-bit double-precision floating-point values. /// @@ -478,7 +479,7 @@ _mm256_rcp_ps(__m256 __a) /// values. #define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR) -/// \brief Rounds up the values stored in a 256-bit vector of [8 x float]. The +/// Rounds up the values stored in a 256-bit vector of [8 x float]. The /// source values are rounded up to integer values and returned as /// floating-point values. /// @@ -495,7 +496,7 @@ _mm256_rcp_ps(__m256 __a) /// \returns A 256-bit vector of [8 x float] containing the rounded up values. #define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL) -/// \brief Rounds down the values stored in a 256-bit vector of [8 x float]. The +/// Rounds down the values stored in a 256-bit vector of [8 x float]. The /// source values are rounded down to integer values and returned as /// floating-point values. /// @@ -513,7 +514,7 @@ _mm256_rcp_ps(__m256 __a) #define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR) /* Logical */ -/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double]. +/// Performs a bitwise AND of two 256-bit vectors of [4 x double]. /// /// \headerfile <x86intrin.h> /// @@ -531,7 +532,7 @@ _mm256_and_pd(__m256d __a, __m256d __b) return (__m256d)((__v4du)__a & (__v4du)__b); } -/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float]. +/// Performs a bitwise AND of two 256-bit vectors of [8 x float]. 
/// /// \headerfile <x86intrin.h> /// @@ -549,7 +550,7 @@ _mm256_and_ps(__m256 __a, __m256 __b) return (__m256)((__v8su)__a & (__v8su)__b); } -/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double], using +/// Performs a bitwise AND of two 256-bit vectors of [4 x double], using /// the one's complement of the values contained in the first source operand. /// /// \headerfile <x86intrin.h> @@ -570,7 +571,7 @@ _mm256_andnot_pd(__m256d __a, __m256d __b) return (__m256d)(~(__v4du)__a & (__v4du)__b); } -/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float], using +/// Performs a bitwise AND of two 256-bit vectors of [8 x float], using /// the one's complement of the values contained in the first source operand. /// /// \headerfile <x86intrin.h> @@ -591,7 +592,7 @@ _mm256_andnot_ps(__m256 __a, __m256 __b) return (__m256)(~(__v8su)__a & (__v8su)__b); } -/// \brief Performs a bitwise OR of two 256-bit vectors of [4 x double]. +/// Performs a bitwise OR of two 256-bit vectors of [4 x double]. /// /// \headerfile <x86intrin.h> /// @@ -609,7 +610,7 @@ _mm256_or_pd(__m256d __a, __m256d __b) return (__m256d)((__v4du)__a | (__v4du)__b); } -/// \brief Performs a bitwise OR of two 256-bit vectors of [8 x float]. +/// Performs a bitwise OR of two 256-bit vectors of [8 x float]. /// /// \headerfile <x86intrin.h> /// @@ -627,7 +628,7 @@ _mm256_or_ps(__m256 __a, __m256 __b) return (__m256)((__v8su)__a | (__v8su)__b); } -/// \brief Performs a bitwise XOR of two 256-bit vectors of [4 x double]. +/// Performs a bitwise XOR of two 256-bit vectors of [4 x double]. /// /// \headerfile <x86intrin.h> /// @@ -645,7 +646,7 @@ _mm256_xor_pd(__m256d __a, __m256d __b) return (__m256d)((__v4du)__a ^ (__v4du)__b); } -/// \brief Performs a bitwise XOR of two 256-bit vectors of [8 x float]. +/// Performs a bitwise XOR of two 256-bit vectors of [8 x float]. /// /// \headerfile <x86intrin.h> /// @@ -664,7 +665,7 @@ _mm256_xor_ps(__m256 __a, __m256 __b) } /* Horizontal arithmetic */ -/// \brief Horizontally adds the adjacent pairs of values contained in two +/// Horizontally adds the adjacent pairs of values contained in two /// 256-bit vectors of [4 x double]. /// /// \headerfile <x86intrin.h> @@ -687,7 +688,7 @@ _mm256_hadd_pd(__m256d __a, __m256d __b) return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b); } -/// \brief Horizontally adds the adjacent pairs of values contained in two +/// Horizontally adds the adjacent pairs of values contained in two /// 256-bit vectors of [8 x float]. /// /// \headerfile <x86intrin.h> @@ -710,7 +711,7 @@ _mm256_hadd_ps(__m256 __a, __m256 __b) return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b); } -/// \brief Horizontally subtracts the adjacent pairs of values contained in two +/// Horizontally subtracts the adjacent pairs of values contained in two /// 256-bit vectors of [4 x double]. /// /// \headerfile <x86intrin.h> @@ -733,7 +734,7 @@ _mm256_hsub_pd(__m256d __a, __m256d __b) return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b); } -/// \brief Horizontally subtracts the adjacent pairs of values contained in two +/// Horizontally subtracts the adjacent pairs of values contained in two /// 256-bit vectors of [8 x float]. 
/// /// \headerfile <x86intrin.h> @@ -757,7 +758,7 @@ _mm256_hsub_ps(__m256 __a, __m256 __b) } /* Vector permutations */ -/// \brief Copies the values in a 128-bit vector of [2 x double] as specified +/// Copies the values in a 128-bit vector of [2 x double] as specified /// by the 128-bit integer vector operand. /// /// \headerfile <x86intrin.h> @@ -780,13 +781,13 @@ _mm256_hsub_ps(__m256 __a, __m256 __b) /// 1: Bits [127:64] of the source are copied to bits [127:64] of the /// returned vector. /// \returns A 128-bit vector of [2 x double] containing the copied values. -static __inline __m128d __DEFAULT_FN_ATTRS +static __inline __m128d __DEFAULT_FN_ATTRS128 _mm_permutevar_pd(__m128d __a, __m128i __c) { return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c); } -/// \brief Copies the values in a 256-bit vector of [4 x double] as specified +/// Copies the values in a 256-bit vector of [4 x double] as specified /// by the 256-bit integer vector operand. /// /// \headerfile <x86intrin.h> @@ -825,7 +826,7 @@ _mm256_permutevar_pd(__m256d __a, __m256i __c) return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c); } -/// \brief Copies the values stored in a 128-bit vector of [4 x float] as +/// Copies the values stored in a 128-bit vector of [4 x float] as /// specified by the 128-bit integer vector operand. /// \headerfile <x86intrin.h> /// @@ -873,13 +874,13 @@ _mm256_permutevar_pd(__m256d __a, __m256i __c) /// 11: Bits [127:96] of the source are copied to bits [127:96] of the /// returned vector. /// \returns A 128-bit vector of [4 x float] containing the copied values. -static __inline __m128 __DEFAULT_FN_ATTRS +static __inline __m128 __DEFAULT_FN_ATTRS128 _mm_permutevar_ps(__m128 __a, __m128i __c) { return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c); } -/// \brief Copies the values stored in a 256-bit vector of [8 x float] as +/// Copies the values stored in a 256-bit vector of [8 x float] as /// specified by the 256-bit integer vector operand. /// /// \headerfile <x86intrin.h> @@ -970,7 +971,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c) return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c); } -/// \brief Copies the values in a 128-bit vector of [2 x double] as specified +/// Copies the values in a 128-bit vector of [2 x double] as specified /// by the immediate integer operand. /// /// \headerfile <x86intrin.h> @@ -997,12 +998,10 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c) /// 1: Bits [127:64] of the source are copied to bits [127:64] of the /// returned vector. /// \returns A 128-bit vector of [2 x double] containing the copied values. -#define _mm_permute_pd(A, C) __extension__ ({ \ - (__m128d)__builtin_shufflevector((__v2df)(__m128d)(A), \ - (__v2df)_mm_undefined_pd(), \ - ((C) >> 0) & 0x1, ((C) >> 1) & 0x1); }) +#define _mm_permute_pd(A, C) \ + (__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C)) -/// \brief Copies the values in a 256-bit vector of [4 x double] as specified by +/// Copies the values in a 256-bit vector of [4 x double] as specified by /// the immediate integer operand. /// /// \headerfile <x86intrin.h> @@ -1039,15 +1038,10 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c) /// 1: Bits [255:192] of the source are copied to bits [255:192] of the /// returned vector. /// \returns A 256-bit vector of [4 x double] containing the copied values. 
-#define _mm256_permute_pd(A, C) __extension__ ({ \ - (__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \ - (__v4df)_mm256_undefined_pd(), \ - 0 + (((C) >> 0) & 0x1), \ - 0 + (((C) >> 1) & 0x1), \ - 2 + (((C) >> 2) & 0x1), \ - 2 + (((C) >> 3) & 0x1)); }) - -/// \brief Copies the values in a 128-bit vector of [4 x float] as specified by +#define _mm256_permute_pd(A, C) \ + (__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C)) + +/// Copies the values in a 128-bit vector of [4 x float] as specified by /// the immediate integer operand. /// /// \headerfile <x86intrin.h> @@ -1100,13 +1094,10 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c) /// 11: Bits [127:96] of the source are copied to bits [127:96] of the /// returned vector. /// \returns A 128-bit vector of [4 x float] containing the copied values. -#define _mm_permute_ps(A, C) __extension__ ({ \ - (__m128)__builtin_shufflevector((__v4sf)(__m128)(A), \ - (__v4sf)_mm_undefined_ps(), \ - ((C) >> 0) & 0x3, ((C) >> 2) & 0x3, \ - ((C) >> 4) & 0x3, ((C) >> 6) & 0x3); }) +#define _mm_permute_ps(A, C) \ + (__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C)) -/// \brief Copies the values in a 256-bit vector of [8 x float] as specified by +/// Copies the values in a 256-bit vector of [8 x float] as specified by /// the immediate integer operand. /// /// \headerfile <x86intrin.h> @@ -1120,7 +1111,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c) /// \param A /// A 256-bit vector of [8 x float]. /// \param C -/// An immediate integer operand specifying how the values are to be \n +/// An immediate integer operand specifying how the values are to be /// copied. \n /// Bits [1:0]: \n /// 00: Bits [31:0] of the source are copied to bits [31:0] of the @@ -1150,7 +1141,7 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c) /// 11: Bits [127:96] of the source are copied to bits [95:64] of the /// returned vector. \n /// Bits [7:6]: \n -/// 00: Bits [31:qq0] of the source are copied to bits [127:96] of the +/// 00: Bits [31:0] of the source are copied to bits [127:96] of the /// returned vector. \n /// 01: Bits [63:32] of the source are copied to bits [127:96] of the /// returned vector. \n @@ -1195,19 +1186,10 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c) /// 11: Bits [255:224] of the source are copied to bits [255:224] of the /// returned vector. /// \returns A 256-bit vector of [8 x float] containing the copied values. -#define _mm256_permute_ps(A, C) __extension__ ({ \ - (__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \ - (__v8sf)_mm256_undefined_ps(), \ - 0 + (((C) >> 0) & 0x3), \ - 0 + (((C) >> 2) & 0x3), \ - 0 + (((C) >> 4) & 0x3), \ - 0 + (((C) >> 6) & 0x3), \ - 4 + (((C) >> 0) & 0x3), \ - 4 + (((C) >> 2) & 0x3), \ - 4 + (((C) >> 4) & 0x3), \ - 4 + (((C) >> 6) & 0x3)); }) - -/// \brief Permutes 128-bit data values stored in two 256-bit vectors of +#define _mm256_permute_ps(A, C) \ + (__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C)) + +/// Permutes 128-bit data values stored in two 256-bit vectors of /// [4 x double], as specified by the immediate integer operand. /// /// \headerfile <x86intrin.h> @@ -1244,11 +1226,11 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c) /// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the /// destination. /// \returns A 256-bit vector of [4 x double] containing the copied values. 
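A usage sketch for the 128-bit lane selector documented above (example values are mine, assuming an AVX build): selector 0x01 swaps the two lanes of a single vector, and 0x20 pairs the low lane of the first operand with the low lane of the second.

#include <immintrin.h>
#include <stdio.h>

/* Illustration only: bits [1:0] choose the low 128 bits of the result,
   bits [5:4] the high 128 bits, per the control-bit description above. */
int main(void) {
  __m256d a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
  __m256d b = _mm256_setr_pd(5.0, 6.0, 7.0, 8.0);
  double out[4];

  _mm256_storeu_pd(out, _mm256_permute2f128_pd(a, a, 0x01));
  printf("swap:    %g %g %g %g\n", out[0], out[1], out[2], out[3]);  /* 3 4 1 2 */

  _mm256_storeu_pd(out, _mm256_permute2f128_pd(a, b, 0x20));
  printf("low|low: %g %g %g %g\n", out[0], out[1], out[2], out[3]);  /* 1 2 5 6 */
  return 0;
}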
-#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ \ +#define _mm256_permute2f128_pd(V1, V2, M) \ (__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \ - (__v4df)(__m256d)(V2), (M)); }) + (__v4df)(__m256d)(V2), (int)(M)) -/// \brief Permutes 128-bit data values stored in two 256-bit vectors of +/// Permutes 128-bit data values stored in two 256-bit vectors of /// [8 x float], as specified by the immediate integer operand. /// /// \headerfile <x86intrin.h> @@ -1285,11 +1267,11 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c) /// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the /// destination. /// \returns A 256-bit vector of [8 x float] containing the copied values. -#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ \ +#define _mm256_permute2f128_ps(V1, V2, M) \ (__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \ - (__v8sf)(__m256)(V2), (M)); }) + (__v8sf)(__m256)(V2), (int)(M)) -/// \brief Permutes 128-bit data values stored in two 256-bit integer vectors, +/// Permutes 128-bit data values stored in two 256-bit integer vectors, /// as specified by the immediate integer operand. /// /// \headerfile <x86intrin.h> @@ -1325,12 +1307,12 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c) /// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the /// destination. /// \returns A 256-bit integer vector containing the copied values. -#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ \ +#define _mm256_permute2f128_si256(V1, V2, M) \ (__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \ - (__v8si)(__m256i)(V2), (M)); }) + (__v8si)(__m256i)(V2), (int)(M)) /* Vector Blend */ -/// \brief Merges 64-bit double-precision data values stored in either of the +/// Merges 64-bit double-precision data values stored in either of the /// two 256-bit vectors of [4 x double], as specified by the immediate /// integer operand. /// @@ -1354,15 +1336,11 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c) /// destination. When a mask bit is 1, the corresponding 64-bit element in /// operand \a V2 is copied to the same position in the destination. /// \returns A 256-bit vector of [4 x double] containing the copied values. -#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \ - (__m256d)__builtin_shufflevector((__v4df)(__m256d)(V1), \ - (__v4df)(__m256d)(V2), \ - (((M) & 0x01) ? 4 : 0), \ - (((M) & 0x02) ? 5 : 1), \ - (((M) & 0x04) ? 6 : 2), \ - (((M) & 0x08) ? 7 : 3)); }) - -/// \brief Merges 32-bit single-precision data values stored in either of the +#define _mm256_blend_pd(V1, V2, M) \ + (__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \ + (__v4df)(__m256d)(V2), (int)(M)) + +/// Merges 32-bit single-precision data values stored in either of the /// two 256-bit vectors of [8 x float], as specified by the immediate /// integer operand. /// @@ -1386,19 +1364,11 @@ _mm256_permutevar_ps(__m256 __a, __m256i __c) /// destination. When a mask bit is 1, the corresponding 32-bit element in /// operand \a V2 is copied to the same position in the destination. /// \returns A 256-bit vector of [8 x float] containing the copied values. -#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \ - (__m256)__builtin_shufflevector((__v8sf)(__m256)(V1), \ - (__v8sf)(__m256)(V2), \ - (((M) & 0x01) ? 8 : 0), \ - (((M) & 0x02) ? 9 : 1), \ - (((M) & 0x04) ? 10 : 2), \ - (((M) & 0x08) ? 11 : 3), \ - (((M) & 0x10) ? 12 : 4), \ - (((M) & 0x20) ? 13 : 5), \ - (((M) & 0x40) ? 14 : 6), \ - (((M) & 0x80) ? 
15 : 7)); }) - -/// \brief Merges 64-bit double-precision data values stored in either of the +#define _mm256_blend_ps(V1, V2, M) \ + (__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \ + (__v8sf)(__m256)(V2), (int)(M)) + +/// Merges 64-bit double-precision data values stored in either of the /// two 256-bit vectors of [4 x double], as specified by the 256-bit vector /// operand. /// @@ -1426,7 +1396,7 @@ _mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c) (__v4df)__a, (__v4df)__b, (__v4df)__c); } -/// \brief Merges 32-bit single-precision data values stored in either of the +/// Merges 32-bit single-precision data values stored in either of the /// two 256-bit vectors of [8 x float], as specified by the 256-bit vector /// operand. /// @@ -1455,7 +1425,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) } /* Vector Dot Product */ -/// \brief Computes two dot products in parallel, using the lower and upper +/// Computes two dot products in parallel, using the lower and upper /// halves of two [8 x float] vectors as input to the two computations, and /// returning the two dot products in the lower and upper halves of the /// [8 x float] result. @@ -1492,12 +1462,12 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// is set to zero. The bitmask is applied in the same way to each of the /// two parallel dot product computations. /// \returns A 256-bit vector of [8 x float] containing the two dot products. -#define _mm256_dp_ps(V1, V2, M) __extension__ ({ \ +#define _mm256_dp_ps(V1, V2, M) \ (__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \ - (__v8sf)(__m256)(V2), (M)); }) + (__v8sf)(__m256)(V2), (M)) /* Vector shuffle */ -/// \brief Selects 8 float values from the 256-bit operands of [8 x float], as +/// Selects 8 float values from the 256-bit operands of [8 x float], as /// specified by the immediate value operand. /// /// The four selected elements in each operand are copied to the destination @@ -1546,19 +1516,11 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// 10: Bits [95:64] and [223:192] are copied from the selected operand. \n /// 11: Bits [127:96] and [255:224] are copied from the selected operand. /// \returns A 256-bit vector of [8 x float] containing the shuffled values. -#define _mm256_shuffle_ps(a, b, mask) __extension__ ({ \ - (__m256)__builtin_shufflevector((__v8sf)(__m256)(a), \ - (__v8sf)(__m256)(b), \ - 0 + (((mask) >> 0) & 0x3), \ - 0 + (((mask) >> 2) & 0x3), \ - 8 + (((mask) >> 4) & 0x3), \ - 8 + (((mask) >> 6) & 0x3), \ - 4 + (((mask) >> 0) & 0x3), \ - 4 + (((mask) >> 2) & 0x3), \ - 12 + (((mask) >> 4) & 0x3), \ - 12 + (((mask) >> 6) & 0x3)); }) - -/// \brief Selects four double-precision values from the 256-bit operands of +#define _mm256_shuffle_ps(a, b, mask) \ + (__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \ + (__v8sf)(__m256)(b), (int)(mask)) + +/// Selects four double-precision values from the 256-bit operands of /// [4 x double], as specified by the immediate value operand. /// /// The selected elements from the first 256-bit operand are copied to bits @@ -1600,13 +1562,9 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the /// destination. /// \returns A 256-bit vector of [4 x double] containing the shuffled values. 
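A usage sketch for the blend and shuffle macros above (example values are mine): _mm256_blend_pd picks each result element from one of its two operands according to the immediate mask, while _mm256_shuffle_pd selects within each 128-bit lane as the comment describes.

#include <immintrin.h>
#include <stdio.h>

/* Illustration only, assuming an AVX-enabled build. */
int main(void) {
  __m256d a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
  __m256d b = _mm256_setr_pd(5.0, 6.0, 7.0, 8.0);
  double out[4];

  /* mask 0b1010: elements 1 and 3 come from b, the rest from a */
  _mm256_storeu_pd(out, _mm256_blend_pd(a, b, 0xA));
  printf("blend:   %g %g %g %g\n", out[0], out[1], out[2], out[3]);  /* 1 6 3 8 */

  /* mask 0: low element of each lane from a, then from b (interleave lows) */
  _mm256_storeu_pd(out, _mm256_shuffle_pd(a, b, 0x0));
  printf("shuffle: %g %g %g %g\n", out[0], out[1], out[2], out[3]);  /* 1 5 3 7 */
  return 0;
}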
-#define _mm256_shuffle_pd(a, b, mask) __extension__ ({ \ - (__m256d)__builtin_shufflevector((__v4df)(__m256d)(a), \ - (__v4df)(__m256d)(b), \ - 0 + (((mask) >> 0) & 0x1), \ - 4 + (((mask) >> 1) & 0x1), \ - 2 + (((mask) >> 2) & 0x1), \ - 6 + (((mask) >> 3) & 0x1)); }) +#define _mm256_shuffle_pd(a, b, mask) \ + (__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \ + (__v4df)(__m256d)(b), (int)(mask)) /* Compare */ #define _CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */ @@ -1642,7 +1600,7 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) #define _CMP_GT_OQ 0x1e /* Greater-than (ordered, non-signaling) */ #define _CMP_TRUE_US 0x1f /* True (unordered, signaling) */ -/// \brief Compares each of the corresponding double-precision values of two +/// Compares each of the corresponding double-precision values of two /// 128-bit vectors of [2 x double], using the operation specified by the /// immediate integer operand. /// @@ -1665,44 +1623,44 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// \param c /// An immediate integer operand, with bits [4:0] specifying which comparison /// operation to use: \n -/// 0x00 : Equal (ordered, non-signaling) -/// 0x01 : Less-than (ordered, signaling) -/// 0x02 : Less-than-or-equal (ordered, signaling) -/// 0x03 : Unordered (non-signaling) -/// 0x04 : Not-equal (unordered, non-signaling) -/// 0x05 : Not-less-than (unordered, signaling) -/// 0x06 : Not-less-than-or-equal (unordered, signaling) -/// 0x07 : Ordered (non-signaling) -/// 0x08 : Equal (unordered, non-signaling) -/// 0x09 : Not-greater-than-or-equal (unordered, signaling) -/// 0x0a : Not-greater-than (unordered, signaling) -/// 0x0b : False (ordered, non-signaling) -/// 0x0c : Not-equal (ordered, non-signaling) -/// 0x0d : Greater-than-or-equal (ordered, signaling) -/// 0x0e : Greater-than (ordered, signaling) -/// 0x0f : True (unordered, non-signaling) -/// 0x10 : Equal (ordered, signaling) -/// 0x11 : Less-than (ordered, non-signaling) -/// 0x12 : Less-than-or-equal (ordered, non-signaling) -/// 0x13 : Unordered (signaling) -/// 0x14 : Not-equal (unordered, signaling) -/// 0x15 : Not-less-than (unordered, non-signaling) -/// 0x16 : Not-less-than-or-equal (unordered, non-signaling) -/// 0x17 : Ordered (signaling) -/// 0x18 : Equal (unordered, signaling) -/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling) -/// 0x1a : Not-greater-than (unordered, non-signaling) -/// 0x1b : False (ordered, signaling) -/// 0x1c : Not-equal (ordered, signaling) -/// 0x1d : Greater-than-or-equal (ordered, non-signaling) -/// 0x1e : Greater-than (ordered, non-signaling) -/// 0x1f : True (unordered, signaling) +/// 0x00: Equal (ordered, non-signaling) \n +/// 0x01: Less-than (ordered, signaling) \n +/// 0x02: Less-than-or-equal (ordered, signaling) \n +/// 0x03: Unordered (non-signaling) \n +/// 0x04: Not-equal (unordered, non-signaling) \n +/// 0x05: Not-less-than (unordered, signaling) \n +/// 0x06: Not-less-than-or-equal (unordered, signaling) \n +/// 0x07: Ordered (non-signaling) \n +/// 0x08: Equal (unordered, non-signaling) \n +/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n +/// 0x0A: Not-greater-than (unordered, signaling) \n +/// 0x0B: False (ordered, non-signaling) \n +/// 0x0C: Not-equal (ordered, non-signaling) \n +/// 0x0D: Greater-than-or-equal (ordered, signaling) \n +/// 0x0E: Greater-than (ordered, signaling) \n +/// 0x0F: True (unordered, non-signaling) \n +/// 0x10: Equal (ordered, signaling) \n +/// 0x11: Less-than (ordered, non-signaling) \n +/// 0x12: 
Less-than-or-equal (ordered, non-signaling) \n +/// 0x13: Unordered (signaling) \n +/// 0x14: Not-equal (unordered, signaling) \n +/// 0x15: Not-less-than (unordered, non-signaling) \n +/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n +/// 0x17: Ordered (signaling) \n +/// 0x18: Equal (unordered, signaling) \n +/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n +/// 0x1A: Not-greater-than (unordered, non-signaling) \n +/// 0x1B: False (ordered, signaling) \n +/// 0x1C: Not-equal (ordered, signaling) \n +/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n +/// 0x1E: Greater-than (ordered, non-signaling) \n +/// 0x1F: True (unordered, signaling) /// \returns A 128-bit vector of [2 x double] containing the comparison results. -#define _mm_cmp_pd(a, b, c) __extension__ ({ \ +#define _mm_cmp_pd(a, b, c) \ (__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \ - (__v2df)(__m128d)(b), (c)); }) + (__v2df)(__m128d)(b), (c)) -/// \brief Compares each of the corresponding values of two 128-bit vectors of +/// Compares each of the corresponding values of two 128-bit vectors of /// [4 x float], using the operation specified by the immediate integer /// operand. /// @@ -1725,44 +1683,44 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// \param c /// An immediate integer operand, with bits [4:0] specifying which comparison /// operation to use: \n -/// 0x00 : Equal (ordered, non-signaling) -/// 0x01 : Less-than (ordered, signaling) -/// 0x02 : Less-than-or-equal (ordered, signaling) -/// 0x03 : Unordered (non-signaling) -/// 0x04 : Not-equal (unordered, non-signaling) -/// 0x05 : Not-less-than (unordered, signaling) -/// 0x06 : Not-less-than-or-equal (unordered, signaling) -/// 0x07 : Ordered (non-signaling) -/// 0x08 : Equal (unordered, non-signaling) -/// 0x09 : Not-greater-than-or-equal (unordered, signaling) -/// 0x0a : Not-greater-than (unordered, signaling) -/// 0x0b : False (ordered, non-signaling) -/// 0x0c : Not-equal (ordered, non-signaling) -/// 0x0d : Greater-than-or-equal (ordered, signaling) -/// 0x0e : Greater-than (ordered, signaling) -/// 0x0f : True (unordered, non-signaling) -/// 0x10 : Equal (ordered, signaling) -/// 0x11 : Less-than (ordered, non-signaling) -/// 0x12 : Less-than-or-equal (ordered, non-signaling) -/// 0x13 : Unordered (signaling) -/// 0x14 : Not-equal (unordered, signaling) -/// 0x15 : Not-less-than (unordered, non-signaling) -/// 0x16 : Not-less-than-or-equal (unordered, non-signaling) -/// 0x17 : Ordered (signaling) -/// 0x18 : Equal (unordered, signaling) -/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling) -/// 0x1a : Not-greater-than (unordered, non-signaling) -/// 0x1b : False (ordered, signaling) -/// 0x1c : Not-equal (ordered, signaling) -/// 0x1d : Greater-than-or-equal (ordered, non-signaling) -/// 0x1e : Greater-than (ordered, non-signaling) -/// 0x1f : True (unordered, signaling) +/// 0x00: Equal (ordered, non-signaling) \n +/// 0x01: Less-than (ordered, signaling) \n +/// 0x02: Less-than-or-equal (ordered, signaling) \n +/// 0x03: Unordered (non-signaling) \n +/// 0x04: Not-equal (unordered, non-signaling) \n +/// 0x05: Not-less-than (unordered, signaling) \n +/// 0x06: Not-less-than-or-equal (unordered, signaling) \n +/// 0x07: Ordered (non-signaling) \n +/// 0x08: Equal (unordered, non-signaling) \n +/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n +/// 0x0A: Not-greater-than (unordered, signaling) \n +/// 0x0B: False (ordered, non-signaling) \n +/// 0x0C: Not-equal (ordered, 
non-signaling) \n +/// 0x0D: Greater-than-or-equal (ordered, signaling) \n +/// 0x0E: Greater-than (ordered, signaling) \n +/// 0x0F: True (unordered, non-signaling) \n +/// 0x10: Equal (ordered, signaling) \n +/// 0x11: Less-than (ordered, non-signaling) \n +/// 0x12: Less-than-or-equal (ordered, non-signaling) \n +/// 0x13: Unordered (signaling) \n +/// 0x14: Not-equal (unordered, signaling) \n +/// 0x15: Not-less-than (unordered, non-signaling) \n +/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n +/// 0x17: Ordered (signaling) \n +/// 0x18: Equal (unordered, signaling) \n +/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n +/// 0x1A: Not-greater-than (unordered, non-signaling) \n +/// 0x1B: False (ordered, signaling) \n +/// 0x1C: Not-equal (ordered, signaling) \n +/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n +/// 0x1E: Greater-than (ordered, non-signaling) \n +/// 0x1F: True (unordered, signaling) /// \returns A 128-bit vector of [4 x float] containing the comparison results. -#define _mm_cmp_ps(a, b, c) __extension__ ({ \ +#define _mm_cmp_ps(a, b, c) \ (__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \ - (__v4sf)(__m128)(b), (c)); }) + (__v4sf)(__m128)(b), (c)) -/// \brief Compares each of the corresponding double-precision values of two +/// Compares each of the corresponding double-precision values of two /// 256-bit vectors of [4 x double], using the operation specified by the /// immediate integer operand. /// @@ -1785,44 +1743,44 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// \param c /// An immediate integer operand, with bits [4:0] specifying which comparison /// operation to use: \n -/// 0x00 : Equal (ordered, non-signaling) -/// 0x01 : Less-than (ordered, signaling) -/// 0x02 : Less-than-or-equal (ordered, signaling) -/// 0x03 : Unordered (non-signaling) -/// 0x04 : Not-equal (unordered, non-signaling) -/// 0x05 : Not-less-than (unordered, signaling) -/// 0x06 : Not-less-than-or-equal (unordered, signaling) -/// 0x07 : Ordered (non-signaling) -/// 0x08 : Equal (unordered, non-signaling) -/// 0x09 : Not-greater-than-or-equal (unordered, signaling) -/// 0x0a : Not-greater-than (unordered, signaling) -/// 0x0b : False (ordered, non-signaling) -/// 0x0c : Not-equal (ordered, non-signaling) -/// 0x0d : Greater-than-or-equal (ordered, signaling) -/// 0x0e : Greater-than (ordered, signaling) -/// 0x0f : True (unordered, non-signaling) -/// 0x10 : Equal (ordered, signaling) -/// 0x11 : Less-than (ordered, non-signaling) -/// 0x12 : Less-than-or-equal (ordered, non-signaling) -/// 0x13 : Unordered (signaling) -/// 0x14 : Not-equal (unordered, signaling) -/// 0x15 : Not-less-than (unordered, non-signaling) -/// 0x16 : Not-less-than-or-equal (unordered, non-signaling) -/// 0x17 : Ordered (signaling) -/// 0x18 : Equal (unordered, signaling) -/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling) -/// 0x1a : Not-greater-than (unordered, non-signaling) -/// 0x1b : False (ordered, signaling) -/// 0x1c : Not-equal (ordered, signaling) -/// 0x1d : Greater-than-or-equal (ordered, non-signaling) -/// 0x1e : Greater-than (ordered, non-signaling) -/// 0x1f : True (unordered, signaling) +/// 0x00: Equal (ordered, non-signaling) \n +/// 0x01: Less-than (ordered, signaling) \n +/// 0x02: Less-than-or-equal (ordered, signaling) \n +/// 0x03: Unordered (non-signaling) \n +/// 0x04: Not-equal (unordered, non-signaling) \n +/// 0x05: Not-less-than (unordered, signaling) \n +/// 0x06: Not-less-than-or-equal (unordered, signaling) \n +/// 
0x07: Ordered (non-signaling) \n +/// 0x08: Equal (unordered, non-signaling) \n +/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n +/// 0x0A: Not-greater-than (unordered, signaling) \n +/// 0x0B: False (ordered, non-signaling) \n +/// 0x0C: Not-equal (ordered, non-signaling) \n +/// 0x0D: Greater-than-or-equal (ordered, signaling) \n +/// 0x0E: Greater-than (ordered, signaling) \n +/// 0x0F: True (unordered, non-signaling) \n +/// 0x10: Equal (ordered, signaling) \n +/// 0x11: Less-than (ordered, non-signaling) \n +/// 0x12: Less-than-or-equal (ordered, non-signaling) \n +/// 0x13: Unordered (signaling) \n +/// 0x14: Not-equal (unordered, signaling) \n +/// 0x15: Not-less-than (unordered, non-signaling) \n +/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n +/// 0x17: Ordered (signaling) \n +/// 0x18: Equal (unordered, signaling) \n +/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n +/// 0x1A: Not-greater-than (unordered, non-signaling) \n +/// 0x1B: False (ordered, signaling) \n +/// 0x1C: Not-equal (ordered, signaling) \n +/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n +/// 0x1E: Greater-than (ordered, non-signaling) \n +/// 0x1F: True (unordered, signaling) /// \returns A 256-bit vector of [4 x double] containing the comparison results. -#define _mm256_cmp_pd(a, b, c) __extension__ ({ \ +#define _mm256_cmp_pd(a, b, c) \ (__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \ - (__v4df)(__m256d)(b), (c)); }) + (__v4df)(__m256d)(b), (c)) -/// \brief Compares each of the corresponding values of two 256-bit vectors of +/// Compares each of the corresponding values of two 256-bit vectors of /// [8 x float], using the operation specified by the immediate integer /// operand. /// @@ -1845,44 +1803,44 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// \param c /// An immediate integer operand, with bits [4:0] specifying which comparison /// operation to use: \n -/// 0x00 : Equal (ordered, non-signaling) -/// 0x01 : Less-than (ordered, signaling) -/// 0x02 : Less-than-or-equal (ordered, signaling) -/// 0x03 : Unordered (non-signaling) -/// 0x04 : Not-equal (unordered, non-signaling) -/// 0x05 : Not-less-than (unordered, signaling) -/// 0x06 : Not-less-than-or-equal (unordered, signaling) -/// 0x07 : Ordered (non-signaling) -/// 0x08 : Equal (unordered, non-signaling) -/// 0x09 : Not-greater-than-or-equal (unordered, signaling) -/// 0x0a : Not-greater-than (unordered, signaling) -/// 0x0b : False (ordered, non-signaling) -/// 0x0c : Not-equal (ordered, non-signaling) -/// 0x0d : Greater-than-or-equal (ordered, signaling) -/// 0x0e : Greater-than (ordered, signaling) -/// 0x0f : True (unordered, non-signaling) -/// 0x10 : Equal (ordered, signaling) -/// 0x11 : Less-than (ordered, non-signaling) -/// 0x12 : Less-than-or-equal (ordered, non-signaling) -/// 0x13 : Unordered (signaling) -/// 0x14 : Not-equal (unordered, signaling) -/// 0x15 : Not-less-than (unordered, non-signaling) -/// 0x16 : Not-less-than-or-equal (unordered, non-signaling) -/// 0x17 : Ordered (signaling) -/// 0x18 : Equal (unordered, signaling) -/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling) -/// 0x1a : Not-greater-than (unordered, non-signaling) -/// 0x1b : False (ordered, signaling) -/// 0x1c : Not-equal (ordered, signaling) -/// 0x1d : Greater-than-or-equal (ordered, non-signaling) -/// 0x1e : Greater-than (ordered, non-signaling) -/// 0x1f : True (unordered, signaling) +/// 0x00: Equal (ordered, non-signaling) \n +/// 0x01: Less-than (ordered, 
signaling) \n +/// 0x02: Less-than-or-equal (ordered, signaling) \n +/// 0x03: Unordered (non-signaling) \n +/// 0x04: Not-equal (unordered, non-signaling) \n +/// 0x05: Not-less-than (unordered, signaling) \n +/// 0x06: Not-less-than-or-equal (unordered, signaling) \n +/// 0x07: Ordered (non-signaling) \n +/// 0x08: Equal (unordered, non-signaling) \n +/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n +/// 0x0A: Not-greater-than (unordered, signaling) \n +/// 0x0B: False (ordered, non-signaling) \n +/// 0x0C: Not-equal (ordered, non-signaling) \n +/// 0x0D: Greater-than-or-equal (ordered, signaling) \n +/// 0x0E: Greater-than (ordered, signaling) \n +/// 0x0F: True (unordered, non-signaling) \n +/// 0x10: Equal (ordered, signaling) \n +/// 0x11: Less-than (ordered, non-signaling) \n +/// 0x12: Less-than-or-equal (ordered, non-signaling) \n +/// 0x13: Unordered (signaling) \n +/// 0x14: Not-equal (unordered, signaling) \n +/// 0x15: Not-less-than (unordered, non-signaling) \n +/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n +/// 0x17: Ordered (signaling) \n +/// 0x18: Equal (unordered, signaling) \n +/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n +/// 0x1A: Not-greater-than (unordered, non-signaling) \n +/// 0x1B: False (ordered, signaling) \n +/// 0x1C: Not-equal (ordered, signaling) \n +/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n +/// 0x1E: Greater-than (ordered, non-signaling) \n +/// 0x1F: True (unordered, signaling) /// \returns A 256-bit vector of [8 x float] containing the comparison results. -#define _mm256_cmp_ps(a, b, c) __extension__ ({ \ +#define _mm256_cmp_ps(a, b, c) \ (__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \ - (__v8sf)(__m256)(b), (c)); }) + (__v8sf)(__m256)(b), (c)) -/// \brief Compares each of the corresponding scalar double-precision values of +/// Compares each of the corresponding scalar double-precision values of /// two 128-bit vectors of [2 x double], using the operation specified by the /// immediate integer operand. 
/// @@ -1904,44 +1862,44 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// \param c /// An immediate integer operand, with bits [4:0] specifying which comparison /// operation to use: \n -/// 0x00 : Equal (ordered, non-signaling) -/// 0x01 : Less-than (ordered, signaling) -/// 0x02 : Less-than-or-equal (ordered, signaling) -/// 0x03 : Unordered (non-signaling) -/// 0x04 : Not-equal (unordered, non-signaling) -/// 0x05 : Not-less-than (unordered, signaling) -/// 0x06 : Not-less-than-or-equal (unordered, signaling) -/// 0x07 : Ordered (non-signaling) -/// 0x08 : Equal (unordered, non-signaling) -/// 0x09 : Not-greater-than-or-equal (unordered, signaling) -/// 0x0a : Not-greater-than (unordered, signaling) -/// 0x0b : False (ordered, non-signaling) -/// 0x0c : Not-equal (ordered, non-signaling) -/// 0x0d : Greater-than-or-equal (ordered, signaling) -/// 0x0e : Greater-than (ordered, signaling) -/// 0x0f : True (unordered, non-signaling) -/// 0x10 : Equal (ordered, signaling) -/// 0x11 : Less-than (ordered, non-signaling) -/// 0x12 : Less-than-or-equal (ordered, non-signaling) -/// 0x13 : Unordered (signaling) -/// 0x14 : Not-equal (unordered, signaling) -/// 0x15 : Not-less-than (unordered, non-signaling) -/// 0x16 : Not-less-than-or-equal (unordered, non-signaling) -/// 0x17 : Ordered (signaling) -/// 0x18 : Equal (unordered, signaling) -/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling) -/// 0x1a : Not-greater-than (unordered, non-signaling) -/// 0x1b : False (ordered, signaling) -/// 0x1c : Not-equal (ordered, signaling) -/// 0x1d : Greater-than-or-equal (ordered, non-signaling) -/// 0x1e : Greater-than (ordered, non-signaling) -/// 0x1f : True (unordered, signaling) +/// 0x00: Equal (ordered, non-signaling) \n +/// 0x01: Less-than (ordered, signaling) \n +/// 0x02: Less-than-or-equal (ordered, signaling) \n +/// 0x03: Unordered (non-signaling) \n +/// 0x04: Not-equal (unordered, non-signaling) \n +/// 0x05: Not-less-than (unordered, signaling) \n +/// 0x06: Not-less-than-or-equal (unordered, signaling) \n +/// 0x07: Ordered (non-signaling) \n +/// 0x08: Equal (unordered, non-signaling) \n +/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n +/// 0x0A: Not-greater-than (unordered, signaling) \n +/// 0x0B: False (ordered, non-signaling) \n +/// 0x0C: Not-equal (ordered, non-signaling) \n +/// 0x0D: Greater-than-or-equal (ordered, signaling) \n +/// 0x0E: Greater-than (ordered, signaling) \n +/// 0x0F: True (unordered, non-signaling) \n +/// 0x10: Equal (ordered, signaling) \n +/// 0x11: Less-than (ordered, non-signaling) \n +/// 0x12: Less-than-or-equal (ordered, non-signaling) \n +/// 0x13: Unordered (signaling) \n +/// 0x14: Not-equal (unordered, signaling) \n +/// 0x15: Not-less-than (unordered, non-signaling) \n +/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n +/// 0x17: Ordered (signaling) \n +/// 0x18: Equal (unordered, signaling) \n +/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n +/// 0x1A: Not-greater-than (unordered, non-signaling) \n +/// 0x1B: False (ordered, signaling) \n +/// 0x1C: Not-equal (ordered, signaling) \n +/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n +/// 0x1E: Greater-than (ordered, non-signaling) \n +/// 0x1F: True (unordered, signaling) /// \returns A 128-bit vector of [2 x double] containing the comparison results. 
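A usage sketch combining _mm256_cmp_pd with one of the _CMP_* predicates defined above and _mm256_blendv_pd (example values are mine, assuming an AVX build): the compare produces an all-ones lane wherever the predicate holds, and blendv consumes that mask through the sign bit.

#include <immintrin.h>
#include <stdio.h>

/* Illustration only: replace negative elements with 0.0. */
int main(void) {
  __m256d x    = _mm256_setr_pd(-1.5, 2.0, -0.25, 4.0);
  __m256d zero = _mm256_setzero_pd();
  __m256d neg  = _mm256_cmp_pd(x, zero, _CMP_LT_OQ);   /* all-ones where x < 0 */
  __m256d r    = _mm256_blendv_pd(x, zero, neg);       /* take 0.0 where mask set */
  double out[4];
  _mm256_storeu_pd(out, r);
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  /* expect 0 2 0 4 */
  return 0;
}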
-#define _mm_cmp_sd(a, b, c) __extension__ ({ \ +#define _mm_cmp_sd(a, b, c) \ (__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \ - (__v2df)(__m128d)(b), (c)); }) + (__v2df)(__m128d)(b), (c)) -/// \brief Compares each of the corresponding scalar values of two 128-bit +/// Compares each of the corresponding scalar values of two 128-bit /// vectors of [4 x float], using the operation specified by the immediate /// integer operand. /// @@ -1963,44 +1921,44 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// \param c /// An immediate integer operand, with bits [4:0] specifying which comparison /// operation to use: \n -/// 0x00 : Equal (ordered, non-signaling) -/// 0x01 : Less-than (ordered, signaling) -/// 0x02 : Less-than-or-equal (ordered, signaling) -/// 0x03 : Unordered (non-signaling) -/// 0x04 : Not-equal (unordered, non-signaling) -/// 0x05 : Not-less-than (unordered, signaling) -/// 0x06 : Not-less-than-or-equal (unordered, signaling) -/// 0x07 : Ordered (non-signaling) -/// 0x08 : Equal (unordered, non-signaling) -/// 0x09 : Not-greater-than-or-equal (unordered, signaling) -/// 0x0a : Not-greater-than (unordered, signaling) -/// 0x0b : False (ordered, non-signaling) -/// 0x0c : Not-equal (ordered, non-signaling) -/// 0x0d : Greater-than-or-equal (ordered, signaling) -/// 0x0e : Greater-than (ordered, signaling) -/// 0x0f : True (unordered, non-signaling) -/// 0x10 : Equal (ordered, signaling) -/// 0x11 : Less-than (ordered, non-signaling) -/// 0x12 : Less-than-or-equal (ordered, non-signaling) -/// 0x13 : Unordered (signaling) -/// 0x14 : Not-equal (unordered, signaling) -/// 0x15 : Not-less-than (unordered, non-signaling) -/// 0x16 : Not-less-than-or-equal (unordered, non-signaling) -/// 0x17 : Ordered (signaling) -/// 0x18 : Equal (unordered, signaling) -/// 0x19 : Not-greater-than-or-equal (unordered, non-signaling) -/// 0x1a : Not-greater-than (unordered, non-signaling) -/// 0x1b : False (ordered, signaling) -/// 0x1c : Not-equal (ordered, signaling) -/// 0x1d : Greater-than-or-equal (ordered, non-signaling) -/// 0x1e : Greater-than (ordered, non-signaling) -/// 0x1f : True (unordered, signaling) +/// 0x00: Equal (ordered, non-signaling) \n +/// 0x01: Less-than (ordered, signaling) \n +/// 0x02: Less-than-or-equal (ordered, signaling) \n +/// 0x03: Unordered (non-signaling) \n +/// 0x04: Not-equal (unordered, non-signaling) \n +/// 0x05: Not-less-than (unordered, signaling) \n +/// 0x06: Not-less-than-or-equal (unordered, signaling) \n +/// 0x07: Ordered (non-signaling) \n +/// 0x08: Equal (unordered, non-signaling) \n +/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n +/// 0x0A: Not-greater-than (unordered, signaling) \n +/// 0x0B: False (ordered, non-signaling) \n +/// 0x0C: Not-equal (ordered, non-signaling) \n +/// 0x0D: Greater-than-or-equal (ordered, signaling) \n +/// 0x0E: Greater-than (ordered, signaling) \n +/// 0x0F: True (unordered, non-signaling) \n +/// 0x10: Equal (ordered, signaling) \n +/// 0x11: Less-than (ordered, non-signaling) \n +/// 0x12: Less-than-or-equal (ordered, non-signaling) \n +/// 0x13: Unordered (signaling) \n +/// 0x14: Not-equal (unordered, signaling) \n +/// 0x15: Not-less-than (unordered, non-signaling) \n +/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n +/// 0x17: Ordered (signaling) \n +/// 0x18: Equal (unordered, signaling) \n +/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n +/// 0x1A: Not-greater-than (unordered, non-signaling) \n +/// 0x1B: False (ordered, signaling) \n +/// 0x1C: 
Not-equal (ordered, signaling) \n +/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n +/// 0x1E: Greater-than (ordered, non-signaling) \n +/// 0x1F: True (unordered, signaling) /// \returns A 128-bit vector of [4 x float] containing the comparison results. -#define _mm_cmp_ss(a, b, c) __extension__ ({ \ +#define _mm_cmp_ss(a, b, c) \ (__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \ - (__v4sf)(__m128)(b), (c)); }) + (__v4sf)(__m128)(b), (c)) -/// \brief Takes a [8 x i32] vector and returns the vector element value +/// Takes a [8 x i32] vector and returns the vector element value /// indexed by the immediate constant operand. /// /// \headerfile <x86intrin.h> @@ -2015,14 +1973,10 @@ _mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) /// element is extracted and returned. /// \returns A 32-bit integer containing the extracted 32 bits of extended /// packed data. -static __inline int __DEFAULT_FN_ATTRS -_mm256_extract_epi32(__m256i __a, const int __imm) -{ - __v8si __b = (__v8si)__a; - return __b[__imm & 7]; -} +#define _mm256_extract_epi32(X, N) \ + (int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)) -/// \brief Takes a [16 x i16] vector and returns the vector element value +/// Takes a [16 x i16] vector and returns the vector element value /// indexed by the immediate constant operand. /// /// \headerfile <x86intrin.h> @@ -2037,14 +1991,11 @@ _mm256_extract_epi32(__m256i __a, const int __imm) /// element is extracted and returned. /// \returns A 32-bit integer containing the extracted 16 bits of zero extended /// packed data. -static __inline int __DEFAULT_FN_ATTRS -_mm256_extract_epi16(__m256i __a, const int __imm) -{ - __v16hi __b = (__v16hi)__a; - return (unsigned short)__b[__imm & 15]; -} +#define _mm256_extract_epi16(X, N) \ + (int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \ + (int)(N)) -/// \brief Takes a [32 x i8] vector and returns the vector element value +/// Takes a [32 x i8] vector and returns the vector element value /// indexed by the immediate constant operand. /// /// \headerfile <x86intrin.h> @@ -2059,15 +2010,12 @@ _mm256_extract_epi16(__m256i __a, const int __imm) /// element is extracted and returned. /// \returns A 32-bit integer containing the extracted 8 bits of zero extended /// packed data. -static __inline int __DEFAULT_FN_ATTRS -_mm256_extract_epi8(__m256i __a, const int __imm) -{ - __v32qi __b = (__v32qi)__a; - return (unsigned char)__b[__imm & 31]; -} +#define _mm256_extract_epi8(X, N) \ + (int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \ + (int)(N)) #ifdef __x86_64__ -/// \brief Takes a [4 x i64] vector and returns the vector element value +/// Takes a [4 x i64] vector and returns the vector element value /// indexed by the immediate constant operand. /// /// \headerfile <x86intrin.h> @@ -2082,15 +2030,11 @@ _mm256_extract_epi8(__m256i __a, const int __imm) /// element is extracted and returned. /// \returns A 64-bit integer containing the extracted 64 bits of extended /// packed data. -static __inline long long __DEFAULT_FN_ATTRS -_mm256_extract_epi64(__m256i __a, const int __imm) -{ - __v4di __b = (__v4di)__a; - return __b[__imm & 3]; -} +#define _mm256_extract_epi64(X, N) \ + (long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N)) #endif -/// \brief Takes a [8 x i32] vector and replaces the vector element value +/// Takes a [8 x i32] vector and replaces the vector element value /// indexed by the immediate constant operand by a new value. 
Returns the /// modified vector. /// @@ -2108,16 +2052,12 @@ _mm256_extract_epi64(__m256i __a, const int __imm) /// replaced. /// \returns A copy of vector \a __a, after replacing its element indexed by /// \a __imm with \a __b. -static __inline __m256i __DEFAULT_FN_ATTRS -_mm256_insert_epi32(__m256i __a, int __b, int const __imm) -{ - __v8si __c = (__v8si)__a; - __c[__imm & 7] = __b; - return (__m256i)__c; -} +#define _mm256_insert_epi32(X, I, N) \ + (__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \ + (int)(I), (int)(N)) -/// \brief Takes a [16 x i16] vector and replaces the vector element value +/// Takes a [16 x i16] vector and replaces the vector element value /// indexed by the immediate constant operand with a new value. Returns the /// modified vector. /// @@ -2135,15 +2075,11 @@ _mm256_insert_epi32(__m256i __a, int __b, int const __imm) /// replaced. /// \returns A copy of vector \a __a, after replacing its element indexed by /// \a __imm with \a __b. -static __inline __m256i __DEFAULT_FN_ATTRS -_mm256_insert_epi16(__m256i __a, int __b, int const __imm) -{ - __v16hi __c = (__v16hi)__a; - __c[__imm & 15] = __b; - return (__m256i)__c; -} +#define _mm256_insert_epi16(X, I, N) \ + (__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \ + (int)(I), (int)(N)) -/// \brief Takes a [32 x i8] vector and replaces the vector element value +/// Takes a [32 x i8] vector and replaces the vector element value /// indexed by the immediate constant operand with a new value. Returns the /// modified vector. /// @@ -2161,16 +2097,12 @@ _mm256_insert_epi16(__m256i __a, int __b, int const __imm) /// replaced. /// \returns A copy of vector \a __a, after replacing its element indexed by /// \a __imm with \a __b. -static __inline __m256i __DEFAULT_FN_ATTRS -_mm256_insert_epi8(__m256i __a, int __b, int const __imm) -{ - __v32qi __c = (__v32qi)__a; - __c[__imm & 31] = __b; - return (__m256i)__c; -} +#define _mm256_insert_epi8(X, I, N) \ + (__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \ + (int)(I), (int)(N)) #ifdef __x86_64__ -/// \brief Takes a [4 x i64] vector and replaces the vector element value +/// Takes a [4 x i64] vector and replaces the vector element value /// indexed by the immediate constant operand with a new value. Returns the /// modified vector. /// @@ -2188,17 +2120,13 @@ _mm256_insert_epi8(__m256i __a, int __b, int const __imm) /// replaced. /// \returns A copy of vector \a __a, after replacing its element indexed by /// \a __imm with \a __b. -static __inline __m256i __DEFAULT_FN_ATTRS -_mm256_insert_epi64(__m256i __a, long long __b, int const __imm) -{ - __v4di __c = (__v4di)__a; - __c[__imm & 3] = __b; - return (__m256i)__c; -} +#define _mm256_insert_epi64(X, I, N) \ + (__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \ + (long long)(I), (int)(N)) #endif /* Conversion */ -/// \brief Converts a vector of [4 x i32] into a vector of [4 x double]. +/// Converts a vector of [4 x i32] into a vector of [4 x double]. /// /// \headerfile <x86intrin.h> /// @@ -2213,7 +2141,7 @@ _mm256_cvtepi32_pd(__m128i __a) return (__m256d)__builtin_convertvector((__v4si)__a, __v4df); } -/// \brief Converts a vector of [8 x i32] into a vector of [8 x float]. +/// Converts a vector of [8 x i32] into a vector of [8 x float]. 
/// /// \headerfile <x86intrin.h> /// @@ -2225,10 +2153,10 @@ _mm256_cvtepi32_pd(__m128i __a) static __inline __m256 __DEFAULT_FN_ATTRS _mm256_cvtepi32_ps(__m256i __a) { - return (__m256)__builtin_ia32_cvtdq2ps256((__v8si) __a); + return (__m256)__builtin_convertvector((__v8si)__a, __v8sf); } -/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of +/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of /// [4 x float]. /// /// \headerfile <x86intrin.h> @@ -2244,7 +2172,7 @@ _mm256_cvtpd_ps(__m256d __a) return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a); } -/// \brief Converts a vector of [8 x float] into a vector of [8 x i32]. +/// Converts a vector of [8 x float] into a vector of [8 x i32]. /// /// \headerfile <x86intrin.h> /// @@ -2259,7 +2187,7 @@ _mm256_cvtps_epi32(__m256 __a) return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a); } -/// \brief Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4 +/// Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4 /// x double]. /// /// \headerfile <x86intrin.h> @@ -2275,7 +2203,7 @@ _mm256_cvtps_pd(__m128 __a) return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df); } -/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 +/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 /// x i32], truncating the result by rounding towards zero when it is /// inexact. /// @@ -2292,7 +2220,7 @@ _mm256_cvttpd_epi32(__m256d __a) return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a); } -/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 +/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 /// x i32]. When a conversion is inexact, the value returned is rounded /// according to the rounding control bits in the MXCSR register. /// @@ -2309,7 +2237,7 @@ _mm256_cvtpd_epi32(__m256d __a) return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a); } -/// \brief Converts a vector of [8 x float] into a vector of [8 x i32], +/// Converts a vector of [8 x float] into a vector of [8 x i32], /// truncating the result by rounding towards zero when it is inexact. /// /// \headerfile <x86intrin.h> @@ -2325,7 +2253,7 @@ _mm256_cvttps_epi32(__m256 __a) return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a); } -/// \brief Returns the first element of the input vector of [4 x double]. +/// Returns the first element of the input vector of [4 x double]. /// /// \headerfile <avxintrin.h> /// @@ -2341,7 +2269,7 @@ _mm256_cvtsd_f64(__m256d __a) return __a[0]; } -/// \brief Returns the first element of the input vector of [8 x i32]. +/// Returns the first element of the input vector of [8 x i32]. /// /// \headerfile <avxintrin.h> /// @@ -2358,7 +2286,7 @@ _mm256_cvtsi256_si32(__m256i __a) return __b[0]; } -/// \brief Returns the first element of the input vector of [8 x float]. +/// Returns the first element of the input vector of [8 x float]. /// /// \headerfile <avxintrin.h> /// @@ -2375,9 +2303,8 @@ _mm256_cvtss_f32(__m256 __a) } /* Vector replicate */ -/// \brief Moves and duplicates high-order (odd-indexed) values from a 256-bit -/// vector of [8 x float] to float values in a 256-bit vector of -/// [8 x float]. +/// Moves and duplicates odd-indexed values from a 256-bit vector of +/// [8 x float] to float values in a 256-bit vector of [8 x float]. 
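[Editorial note, not part of the diff: a short sketch contrasting the truncating and rounding float-to-int conversions documented in this hunk. The helper name and output-parameter style are assumptions, not header API.]

  #include <immintrin.h>

  /* Illustrative: _mm256_cvttps_epi32 truncates toward zero, while
   * _mm256_cvtps_epi32 rounds according to MXCSR (round-to-nearest-even
   * by default). */
  void convert_both(__m256 x, __m256i *truncated, __m256i *rounded) {
    *truncated = _mm256_cvttps_epi32(x); /* e.g. 2.7f -> 2 */
    *rounded   = _mm256_cvtps_epi32(x);  /* e.g. 2.7f -> 3 by default */
  }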
/// /// \headerfile <x86intrin.h> /// @@ -2401,8 +2328,8 @@ _mm256_movehdup_ps(__m256 __a) return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7); } -/// \brief Moves and duplicates low-order (even-indexed) values from a 256-bit -/// vector of [8 x float] to float values in a 256-bit vector of [8 x float]. +/// Moves and duplicates even-indexed values from a 256-bit vector of +/// [8 x float] to float values in a 256-bit vector of [8 x float]. /// /// \headerfile <x86intrin.h> /// @@ -2426,7 +2353,7 @@ _mm256_moveldup_ps(__m256 __a) return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6); } -/// \brief Moves and duplicates double-precision floating point values from a +/// Moves and duplicates double-precision floating point values from a /// 256-bit vector of [4 x double] to double-precision values in a 256-bit /// vector of [4 x double]. /// @@ -2449,7 +2376,7 @@ _mm256_movedup_pd(__m256d __a) } /* Unpack and Interleave */ -/// \brief Unpacks the odd-indexed vector elements from two 256-bit vectors of +/// Unpacks the odd-indexed vector elements from two 256-bit vectors of /// [4 x double] and interleaves them into a 256-bit vector of [4 x double]. /// /// \headerfile <x86intrin.h> @@ -2471,7 +2398,7 @@ _mm256_unpackhi_pd(__m256d __a, __m256d __b) return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2); } -/// \brief Unpacks the even-indexed vector elements from two 256-bit vectors of +/// Unpacks the even-indexed vector elements from two 256-bit vectors of /// [4 x double] and interleaves them into a 256-bit vector of [4 x double]. /// /// \headerfile <x86intrin.h> @@ -2493,7 +2420,7 @@ _mm256_unpacklo_pd(__m256d __a, __m256d __b) return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2); } -/// \brief Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the +/// Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the /// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit /// vector of [8 x float]. /// @@ -2520,7 +2447,7 @@ _mm256_unpackhi_ps(__m256 __a, __m256 __b) return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1); } -/// \brief Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the +/// Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the /// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit /// vector of [8 x float]. /// @@ -2548,7 +2475,7 @@ _mm256_unpacklo_ps(__m256 __a, __m256 __b) } /* Bit Test */ -/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an +/// Given two 128-bit floating-point vectors of [2 x double], perform an /// element-by-element comparison of the double-precision element in the /// first source vector and the corresponding element in the second source /// vector. @@ -2571,13 +2498,13 @@ _mm256_unpacklo_ps(__m256 __a, __m256 __b) /// \param __b /// A 128-bit vector of [2 x double]. /// \returns the ZF flag in the EFLAGS register. -static __inline int __DEFAULT_FN_ATTRS +static __inline int __DEFAULT_FN_ATTRS128 _mm_testz_pd(__m128d __a, __m128d __b) { return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b); } -/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an +/// Given two 128-bit floating-point vectors of [2 x double], perform an /// element-by-element comparison of the double-precision element in the /// first source vector and the corresponding element in the second source /// vector. 
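[Editorial note, not part of the diff: a sketch of the unpack intrinsics documented above. Worth remembering that the AVX forms interleave within each 128-bit half, not across the full 256 bits; the helper name is illustrative.]

  #include <immintrin.h>

  /* Illustrative: results are {a0,b0,a2,b2} and {a1,b1,a3,b3}. */
  void interleave_pd(__m256d a, __m256d b, __m256d *lo, __m256d *hi) {
    *lo = _mm256_unpacklo_pd(a, b); /* even-indexed element of each half */
    *hi = _mm256_unpackhi_pd(a, b); /* odd-indexed element of each half */
  }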
@@ -2600,13 +2527,13 @@ _mm_testz_pd(__m128d __a, __m128d __b) /// \param __b /// A 128-bit vector of [2 x double]. /// \returns the CF flag in the EFLAGS register. -static __inline int __DEFAULT_FN_ATTRS +static __inline int __DEFAULT_FN_ATTRS128 _mm_testc_pd(__m128d __a, __m128d __b) { return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b); } -/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an +/// Given two 128-bit floating-point vectors of [2 x double], perform an /// element-by-element comparison of the double-precision element in the /// first source vector and the corresponding element in the second source /// vector. @@ -2630,13 +2557,13 @@ _mm_testc_pd(__m128d __a, __m128d __b) /// \param __b /// A 128-bit vector of [2 x double]. /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. -static __inline int __DEFAULT_FN_ATTRS +static __inline int __DEFAULT_FN_ATTRS128 _mm_testnzc_pd(__m128d __a, __m128d __b) { return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b); } -/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an +/// Given two 128-bit floating-point vectors of [4 x float], perform an /// element-by-element comparison of the single-precision element in the /// first source vector and the corresponding element in the second source /// vector. @@ -2659,13 +2586,13 @@ _mm_testnzc_pd(__m128d __a, __m128d __b) /// \param __b /// A 128-bit vector of [4 x float]. /// \returns the ZF flag. -static __inline int __DEFAULT_FN_ATTRS +static __inline int __DEFAULT_FN_ATTRS128 _mm_testz_ps(__m128 __a, __m128 __b) { return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b); } -/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an +/// Given two 128-bit floating-point vectors of [4 x float], perform an /// element-by-element comparison of the single-precision element in the /// first source vector and the corresponding element in the second source /// vector. @@ -2688,13 +2615,13 @@ _mm_testz_ps(__m128 __a, __m128 __b) /// \param __b /// A 128-bit vector of [4 x float]. /// \returns the CF flag. -static __inline int __DEFAULT_FN_ATTRS +static __inline int __DEFAULT_FN_ATTRS128 _mm_testc_ps(__m128 __a, __m128 __b) { return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b); } -/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an +/// Given two 128-bit floating-point vectors of [4 x float], perform an /// element-by-element comparison of the single-precision element in the /// first source vector and the corresponding element in the second source /// vector. @@ -2718,13 +2645,13 @@ _mm_testc_ps(__m128 __a, __m128 __b) /// \param __b /// A 128-bit vector of [4 x float]. /// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. -static __inline int __DEFAULT_FN_ATTRS +static __inline int __DEFAULT_FN_ATTRS128 _mm_testnzc_ps(__m128 __a, __m128 __b) { return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b); } -/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an +/// Given two 256-bit floating-point vectors of [4 x double], perform an /// element-by-element comparison of the double-precision elements in the /// first source vector and the corresponding elements in the second source /// vector. 
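[Editorial note, not part of the diff: unlike the integer PTEST forms, the float/double VTEST intrinsics documented here look only at the sign bit of each element. A sketch under that reading; the helper name is invented.]

  #include <immintrin.h>

  /* Illustrative: no lane of x is negative iff no sign bit is set in the
   * comparison mask, which _mm_testz_ps reports via ZF. */
  int all_lanes_nonnegative(__m128 x) {
    __m128 neg = _mm_cmplt_ps(x, _mm_setzero_ps()); /* all-ones where x < 0 */
    return _mm_testz_ps(neg, neg);                  /* 1 iff no sign bits set */
  }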
@@ -2753,7 +2680,7 @@ _mm256_testz_pd(__m256d __a, __m256d __b) return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b); } -/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an +/// Given two 256-bit floating-point vectors of [4 x double], perform an /// element-by-element comparison of the double-precision elements in the /// first source vector and the corresponding elements in the second source /// vector. @@ -2782,7 +2709,7 @@ _mm256_testc_pd(__m256d __a, __m256d __b) return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b); } -/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an +/// Given two 256-bit floating-point vectors of [4 x double], perform an /// element-by-element comparison of the double-precision elements in the /// first source vector and the corresponding elements in the second source /// vector. @@ -2812,7 +2739,7 @@ _mm256_testnzc_pd(__m256d __a, __m256d __b) return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b); } -/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an +/// Given two 256-bit floating-point vectors of [8 x float], perform an /// element-by-element comparison of the single-precision element in the /// first source vector and the corresponding element in the second source /// vector. @@ -2841,7 +2768,7 @@ _mm256_testz_ps(__m256 __a, __m256 __b) return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b); } -/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an +/// Given two 256-bit floating-point vectors of [8 x float], perform an /// element-by-element comparison of the single-precision element in the /// first source vector and the corresponding element in the second source /// vector. @@ -2870,7 +2797,7 @@ _mm256_testc_ps(__m256 __a, __m256 __b) return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b); } -/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an +/// Given two 256-bit floating-point vectors of [8 x float], perform an /// element-by-element comparison of the single-precision elements in the /// first source vector and the corresponding elements in the second source /// vector. @@ -2900,7 +2827,7 @@ _mm256_testnzc_ps(__m256 __a, __m256 __b) return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b); } -/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison +/// Given two 256-bit integer vectors, perform a bit-by-bit comparison /// of the two source vectors. /// /// The EFLAGS register is updated as follows: \n @@ -2926,7 +2853,7 @@ _mm256_testz_si256(__m256i __a, __m256i __b) return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b); } -/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison +/// Given two 256-bit integer vectors, perform a bit-by-bit comparison /// of the two source vectors. /// /// The EFLAGS register is updated as follows: \n @@ -2952,7 +2879,7 @@ _mm256_testc_si256(__m256i __a, __m256i __b) return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b); } -/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison +/// Given two 256-bit integer vectors, perform a bit-by-bit comparison /// of the two source vectors. 
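[Editorial note, not part of the diff: the 256-bit integer test intrinsics set ZF from (a AND b) and CF from (NOT a AND b), per the flag descriptions above. A common idiom, sketched with an assumed helper name.]

  #include <immintrin.h>

  /* Illustrative: test whether a vector is all-zero by passing it twice. */
  int is_all_zero(__m256i v) {
    return _mm256_testz_si256(v, v); /* returns 1 iff every bit of v is 0 */
  }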
/// /// The EFLAGS register is updated as follows: \n @@ -2980,7 +2907,7 @@ _mm256_testnzc_si256(__m256i __a, __m256i __b) } /* Vector extract sign mask */ -/// \brief Extracts the sign bits of double-precision floating point elements +/// Extracts the sign bits of double-precision floating point elements /// in a 256-bit vector of [4 x double] and writes them to the lower order /// bits of the return value. /// @@ -2998,7 +2925,7 @@ _mm256_movemask_pd(__m256d __a) return __builtin_ia32_movmskpd256((__v4df)__a); } -/// \brief Extracts the sign bits of double-precision floating point elements +/// Extracts the sign bits of single-precision floating point elements /// in a 256-bit vector of [8 x float] and writes them to the lower order /// bits of the return value. /// @@ -3007,7 +2934,7 @@ _mm256_movemask_pd(__m256d __a) /// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction. /// /// \param __a -/// A 256-bit vector of [8 x float] containing the double-precision floating +/// A 256-bit vector of [8 x float] containing the single-precision floating /// point values with sign bits to be extracted. /// \returns The sign bits from the operand, written to bits [7:0]. static __inline int __DEFAULT_FN_ATTRS @@ -3017,30 +2944,30 @@ _mm256_movemask_ps(__m256 __a) } /* Vector __zero */ -/// \brief Zeroes the contents of all XMM or YMM registers. +/// Zeroes the contents of all XMM or YMM registers. /// /// \headerfile <x86intrin.h> /// /// This intrinsic corresponds to the <c> VZEROALL </c> instruction. -static __inline void __DEFAULT_FN_ATTRS +static __inline void __attribute__((__always_inline__, __nodebug__, __target__("avx"))) _mm256_zeroall(void) { __builtin_ia32_vzeroall(); } -/// \brief Zeroes the upper 128 bits (bits 255:128) of all YMM registers. +/// Zeroes the upper 128 bits (bits 255:128) of all YMM registers. /// /// \headerfile <x86intrin.h> /// /// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction. -static __inline void __DEFAULT_FN_ATTRS +static __inline void __attribute__((__always_inline__, __nodebug__, __target__("avx"))) _mm256_zeroupper(void) { __builtin_ia32_vzeroupper(); } /* Vector load with broadcast */ -/// \brief Loads a scalar single-precision floating point value from the +/// Loads a scalar single-precision floating point value from the /// specified address pointed to by \a __a and broadcasts it to the elements /// of a [4 x float] vector. /// @@ -3052,14 +2979,14 @@ _mm256_zeroupper(void) /// The single-precision floating point value to be broadcast. /// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set /// equal to the broadcast value. -static __inline __m128 __DEFAULT_FN_ATTRS +static __inline __m128 __DEFAULT_FN_ATTRS128 _mm_broadcast_ss(float const *__a) { float __f = *__a; - return (__m128)(__v4sf){ __f, __f, __f, __f }; + return __extension__ (__m128)(__v4sf){ __f, __f, __f, __f }; } -/// \brief Loads a scalar double-precision floating point value from the +/// Loads a scalar double-precision floating point value from the /// specified address pointed to by \a __a and broadcasts it to the elements /// of a [4 x double] vector. 
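[Editorial note, not part of the diff: a sketch using _mm256_movemask_ps, which packs the eight sign bits into bits [7:0] of the return value. __builtin_popcount is a GCC/Clang builtin used only for brevity; the helper name is illustrative.]

  #include <immintrin.h>

  /* Illustrative: count how many elements of x are strictly negative. */
  int count_negative(__m256 x) {
    __m256 neg = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LT_OQ);
    return __builtin_popcount(_mm256_movemask_ps(neg));
  }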
/// @@ -3075,10 +3002,10 @@ static __inline __m256d __DEFAULT_FN_ATTRS _mm256_broadcast_sd(double const *__a) { double __d = *__a; - return (__m256d)(__v4df){ __d, __d, __d, __d }; + return __extension__ (__m256d)(__v4df){ __d, __d, __d, __d }; } -/// \brief Loads a scalar single-precision floating point value from the +/// Loads a scalar single-precision floating point value from the /// specified address pointed to by \a __a and broadcasts it to the elements /// of a [8 x float] vector. /// @@ -3094,10 +3021,10 @@ static __inline __m256 __DEFAULT_FN_ATTRS _mm256_broadcast_ss(float const *__a) { float __f = *__a; - return (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f }; + return __extension__ (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f }; } -/// \brief Loads the data from a 128-bit vector of [2 x double] from the +/// Loads the data from a 128-bit vector of [2 x double] from the /// specified address pointed to by \a __a and broadcasts it to 128-bit /// elements in a 256-bit vector of [4 x double]. /// @@ -3112,10 +3039,12 @@ _mm256_broadcast_ss(float const *__a) static __inline __m256d __DEFAULT_FN_ATTRS _mm256_broadcast_pd(__m128d const *__a) { - return (__m256d)__builtin_ia32_vbroadcastf128_pd256((__v2df const *)__a); + __m128d __b = _mm_loadu_pd((const double *)__a); + return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b, + 0, 1, 0, 1); } -/// \brief Loads the data from a 128-bit vector of [4 x float] from the +/// Loads the data from a 128-bit vector of [4 x float] from the /// specified address pointed to by \a __a and broadcasts it to 128-bit /// elements in a 256-bit vector of [8 x float]. /// @@ -3130,11 +3059,13 @@ _mm256_broadcast_pd(__m128d const *__a) static __inline __m256 __DEFAULT_FN_ATTRS _mm256_broadcast_ps(__m128 const *__a) { - return (__m256)__builtin_ia32_vbroadcastf128_ps256((__v4sf const *)__a); + __m128 __b = _mm_loadu_ps((const float *)__a); + return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b, + 0, 1, 2, 3, 0, 1, 2, 3); } /* SIMD load ops */ -/// \brief Loads 4 double-precision floating point values from a 32-byte aligned +/// Loads 4 double-precision floating point values from a 32-byte aligned /// memory location pointed to by \a __p into a vector of [4 x double]. /// /// \headerfile <x86intrin.h> @@ -3151,7 +3082,7 @@ _mm256_load_pd(double const *__p) return *(__m256d *)__p; } -/// \brief Loads 8 single-precision floating point values from a 32-byte aligned +/// Loads 8 single-precision floating point values from a 32-byte aligned /// memory location pointed to by \a __p into a vector of [8 x float]. /// /// \headerfile <x86intrin.h> @@ -3167,7 +3098,7 @@ _mm256_load_ps(float const *__p) return *(__m256 *)__p; } -/// \brief Loads 4 double-precision floating point values from an unaligned +/// Loads 4 double-precision floating point values from an unaligned /// memory location pointed to by \a __p into a vector of [4 x double]. /// /// \headerfile <x86intrin.h> @@ -3187,7 +3118,7 @@ _mm256_loadu_pd(double const *__p) return ((struct __loadu_pd*)__p)->__v; } -/// \brief Loads 8 single-precision floating point values from an unaligned +/// Loads 8 single-precision floating point values from an unaligned /// memory location pointed to by \a __p into a vector of [8 x float]. 
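[Editorial note, not part of the diff: the broadcast intrinsics take a pointer and splat either a scalar or an entire 128-bit block across the 256-bit result, matching the implementations shown above. Helper and parameter names are assumptions.]

  #include <immintrin.h>

  /* Illustrative: scalar splat vs. 128-bit block splat. */
  void splat(const float *s, const __m128d *pair,
             __m256 *all_same, __m256d *both_halves) {
    *all_same    = _mm256_broadcast_ss(s);    /* every float = *s */
    *both_halves = _mm256_broadcast_pd(pair); /* both 128-bit halves = *pair */
  }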
/// /// \headerfile <x86intrin.h> @@ -3207,7 +3138,7 @@ _mm256_loadu_ps(float const *__p) return ((struct __loadu_ps*)__p)->__v; } -/// \brief Loads 256 bits of integer data from a 32-byte aligned memory +/// Loads 256 bits of integer data from a 32-byte aligned memory /// location pointed to by \a __p into elements of a 256-bit integer vector. /// /// \headerfile <x86intrin.h> @@ -3224,7 +3155,7 @@ _mm256_load_si256(__m256i const *__p) return *__p; } -/// \brief Loads 256 bits of integer data from an unaligned memory location +/// Loads 256 bits of integer data from an unaligned memory location /// pointed to by \a __p into a 256-bit integer vector. /// /// \headerfile <x86intrin.h> @@ -3243,7 +3174,7 @@ _mm256_loadu_si256(__m256i const *__p) return ((struct __loadu_si256*)__p)->__v; } -/// \brief Loads 256 bits of integer data from an unaligned memory location +/// Loads 256 bits of integer data from an unaligned memory location /// pointed to by \a __p into a 256-bit integer vector. This intrinsic may /// perform better than \c _mm256_loadu_si256 when the data crosses a cache /// line boundary. @@ -3262,7 +3193,7 @@ _mm256_lddqu_si256(__m256i const *__p) } /* SIMD store ops */ -/// \brief Stores double-precision floating point values from a 256-bit vector +/// Stores double-precision floating point values from a 256-bit vector /// of [4 x double] to a 32-byte aligned memory location pointed to by /// \a __p. /// @@ -3281,7 +3212,7 @@ _mm256_store_pd(double *__p, __m256d __a) *(__m256d *)__p = __a; } -/// \brief Stores single-precision floating point values from a 256-bit vector +/// Stores single-precision floating point values from a 256-bit vector /// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p. /// /// \headerfile <x86intrin.h> @@ -3299,7 +3230,7 @@ _mm256_store_ps(float *__p, __m256 __a) *(__m256 *)__p = __a; } -/// \brief Stores double-precision floating point values from a 256-bit vector +/// Stores double-precision floating point values from a 256-bit vector /// of [4 x double] to an unaligned memory location pointed to by \a __p. /// /// \headerfile <x86intrin.h> @@ -3320,7 +3251,7 @@ _mm256_storeu_pd(double *__p, __m256d __a) ((struct __storeu_pd*)__p)->__v = __a; } -/// \brief Stores single-precision floating point values from a 256-bit vector +/// Stores single-precision floating point values from a 256-bit vector /// of [8 x float] to an unaligned memory location pointed to by \a __p. /// /// \headerfile <x86intrin.h> @@ -3340,7 +3271,7 @@ _mm256_storeu_ps(float *__p, __m256 __a) ((struct __storeu_ps*)__p)->__v = __a; } -/// \brief Stores integer values from a 256-bit integer vector to a 32-byte +/// Stores integer values from a 256-bit integer vector to a 32-byte /// aligned memory location pointed to by \a __p. /// /// \headerfile <x86intrin.h> @@ -3358,7 +3289,7 @@ _mm256_store_si256(__m256i *__p, __m256i __a) *__p = __a; } -/// \brief Stores integer values from a 256-bit integer vector to an unaligned +/// Stores integer values from a 256-bit integer vector to an unaligned /// memory location pointed to by \a __p. /// /// \headerfile <x86intrin.h> @@ -3379,7 +3310,7 @@ _mm256_storeu_si256(__m256i *__p, __m256i __a) } /* Conditional load ops */ -/// \brief Conditionally loads double-precision floating point elements from a +/// Conditionally loads double-precision floating point elements from a /// memory location pointed to by \a __p into a 128-bit vector of /// [2 x double], depending on the mask bits associated with each data /// element. 
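[Editorial note, not part of the diff: the aligned load/store forms in this hunk require a 32-byte aligned address, while the *_loadu_*/_storeu_* forms accept any alignment. A sketch assuming a C11 compiler for _Alignas; names are illustrative.]

  #include <immintrin.h>

  /* Illustrative: unaligned load, aligned store. */
  void copy4(const double *unaligned_src) {
    _Alignas(32) double dst[4];
    __m256d v = _mm256_loadu_pd(unaligned_src); /* any alignment accepted */
    _mm256_store_pd(dst, v);                    /* dst must be 32-byte aligned */
  }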
@@ -3397,13 +3328,13 @@ _mm256_storeu_si256(__m256i *__p, __m256i __a) /// corresponding value in the memory location is not loaded and the /// corresponding field in the return value is set to zero. /// \returns A 128-bit vector of [2 x double] containing the loaded values. -static __inline __m128d __DEFAULT_FN_ATTRS +static __inline __m128d __DEFAULT_FN_ATTRS128 _mm_maskload_pd(double const *__p, __m128i __m) { return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m); } -/// \brief Conditionally loads double-precision floating point elements from a +/// Conditionally loads double-precision floating point elements from a /// memory location pointed to by \a __p into a 256-bit vector of /// [4 x double], depending on the mask bits associated with each data /// element. @@ -3428,7 +3359,7 @@ _mm256_maskload_pd(double const *__p, __m256i __m) (__v4di)__m); } -/// \brief Conditionally loads single-precision floating point elements from a +/// Conditionally loads single-precision floating point elements from a /// memory location pointed to by \a __p into a 128-bit vector of /// [4 x float], depending on the mask bits associated with each data /// element. @@ -3446,13 +3377,13 @@ _mm256_maskload_pd(double const *__p, __m256i __m) /// corresponding value in the memory location is not loaded and the /// corresponding field in the return value is set to zero. /// \returns A 128-bit vector of [4 x float] containing the loaded values. -static __inline __m128 __DEFAULT_FN_ATTRS +static __inline __m128 __DEFAULT_FN_ATTRS128 _mm_maskload_ps(float const *__p, __m128i __m) { return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m); } -/// \brief Conditionally loads single-precision floating point elements from a +/// Conditionally loads single-precision floating point elements from a /// memory location pointed to by \a __p into a 256-bit vector of /// [8 x float], depending on the mask bits associated with each data /// element. @@ -3477,7 +3408,7 @@ _mm256_maskload_ps(float const *__p, __m256i __m) } /* Conditional store ops */ -/// \brief Moves single-precision floating point values from a 256-bit vector +/// Moves single-precision floating point values from a 256-bit vector /// of [8 x float] to a memory location pointed to by \a __p, according to /// the specified mask. /// @@ -3501,7 +3432,7 @@ _mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a) __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a); } -/// \brief Moves double-precision values from a 128-bit vector of [2 x double] +/// Moves double-precision values from a 128-bit vector of [2 x double] /// to a memory location pointed to by \a __p, according to the specified /// mask. /// @@ -3519,13 +3450,13 @@ _mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a) /// changed. /// \param __a /// A 128-bit vector of [2 x double] containing the values to be stored. -static __inline void __DEFAULT_FN_ATTRS +static __inline void __DEFAULT_FN_ATTRS128 _mm_maskstore_pd(double *__p, __m128i __m, __m128d __a) { __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a); } -/// \brief Moves double-precision values from a 256-bit vector of [4 x double] +/// Moves double-precision values from a 256-bit vector of [4 x double] /// to a memory location pointed to by \a __p, according to the specified /// mask. 
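[Editorial note, not part of the diff: for the mask load/store intrinsics, selection is controlled by the most significant bit of each mask element; unselected elements are neither read (maskload) nor written (maskstore). Sketch only, assumed helper name.]

  #include <immintrin.h>

  /* Illustrative: select doubles 0 and 2; unselected load lanes become 0.0,
   * unselected store lanes are left untouched in memory. */
  void masked_roundtrip(const double *src, double *dst) {
    __m256i m = _mm256_set_epi64x(0, -1, 0, -1); /* MSB set for lanes 0 and 2 */
    __m256d v = _mm256_maskload_pd(src, m);
    _mm256_maskstore_pd(dst, m, v);
  }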
/// @@ -3549,7 +3480,7 @@ _mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a) __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a); } -/// \brief Moves single-precision floating point values from a 128-bit vector +/// Moves single-precision floating point values from a 128-bit vector /// of [4 x float] to a memory location pointed to by \a __p, according to /// the specified mask. /// @@ -3567,14 +3498,14 @@ _mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a) /// changed. /// \param __a /// A 128-bit vector of [4 x float] containing the values to be stored. -static __inline void __DEFAULT_FN_ATTRS +static __inline void __DEFAULT_FN_ATTRS128 _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a) { __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a); } /* Cacheability support ops */ -/// \brief Moves integer data from a 256-bit integer vector to a 32-byte +/// Moves integer data from a 256-bit integer vector to a 32-byte /// aligned memory location. To minimize caching, the data is flagged as /// non-temporal (unlikely to be used again soon). /// @@ -3594,7 +3525,7 @@ _mm256_stream_si256(__m256i *__a, __m256i __b) __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a); } -/// \brief Moves double-precision values from a 256-bit vector of [4 x double] +/// Moves double-precision values from a 256-bit vector of [4 x double] /// to a 32-byte aligned memory location. To minimize caching, the data is /// flagged as non-temporal (unlikely to be used again soon). /// @@ -3614,7 +3545,7 @@ _mm256_stream_pd(double *__a, __m256d __b) __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a); } -/// \brief Moves single-precision floating point values from a 256-bit vector +/// Moves single-precision floating point values from a 256-bit vector /// of [8 x float] to a 32-byte aligned memory location. To minimize /// caching, the data is flagged as non-temporal (unlikely to be used again /// soon). @@ -3636,7 +3567,7 @@ _mm256_stream_ps(float *__p, __m256 __a) } /* Create vectors */ -/// \brief Create a 256-bit vector of [4 x double] with undefined values. +/// Create a 256-bit vector of [4 x double] with undefined values. /// /// \headerfile <x86intrin.h> /// @@ -3649,7 +3580,7 @@ _mm256_undefined_pd(void) return (__m256d)__builtin_ia32_undef256(); } -/// \brief Create a 256-bit vector of [8 x float] with undefined values. +/// Create a 256-bit vector of [8 x float] with undefined values. /// /// \headerfile <x86intrin.h> /// @@ -3662,7 +3593,7 @@ _mm256_undefined_ps(void) return (__m256)__builtin_ia32_undef256(); } -/// \brief Create a 256-bit integer vector with undefined values. +/// Create a 256-bit integer vector with undefined values. /// /// \headerfile <x86intrin.h> /// @@ -3675,7 +3606,7 @@ _mm256_undefined_si256(void) return (__m256i)__builtin_ia32_undef256(); } -/// \brief Constructs a 256-bit floating-point vector of [4 x double] +/// Constructs a 256-bit floating-point vector of [4 x double] /// initialized with the specified double-precision floating-point values. 
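[Editorial note, not part of the diff: the streaming stores above bypass the cache for data that will not be reused soon; the destination must be 32-byte aligned. The fence at the end (_mm_sfence, from the SSE headers) and the helper name are assumptions about typical usage, not requirements stated in this hunk.]

  #include <immintrin.h>

  /* Illustrative: fill a 32-byte aligned buffer with non-temporal stores. */
  void fill_nontemporal(double *dst_aligned32, double value, int n_vectors) {
    __m256d v = _mm256_set1_pd(value);
    for (int i = 0; i < n_vectors; ++i)
      _mm256_stream_pd(dst_aligned32 + 4 * i, v); /* 4 doubles per store */
    _mm_sfence(); /* order the streaming stores before later ordinary stores */
  }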
/// /// \headerfile <x86intrin.h> @@ -3699,10 +3630,10 @@ _mm256_undefined_si256(void) static __inline __m256d __DEFAULT_FN_ATTRS _mm256_set_pd(double __a, double __b, double __c, double __d) { - return (__m256d){ __d, __c, __b, __a }; + return __extension__ (__m256d){ __d, __c, __b, __a }; } -/// \brief Constructs a 256-bit floating-point vector of [8 x float] initialized +/// Constructs a 256-bit floating-point vector of [8 x float] initialized /// with the specified single-precision floating-point values. /// /// \headerfile <x86intrin.h> @@ -3739,10 +3670,10 @@ static __inline __m256 __DEFAULT_FN_ATTRS _mm256_set_ps(float __a, float __b, float __c, float __d, float __e, float __f, float __g, float __h) { - return (__m256){ __h, __g, __f, __e, __d, __c, __b, __a }; + return __extension__ (__m256){ __h, __g, __f, __e, __d, __c, __b, __a }; } -/// \brief Constructs a 256-bit integer vector initialized with the specified +/// Constructs a 256-bit integer vector initialized with the specified /// 32-bit integral values. /// /// \headerfile <x86intrin.h> @@ -3771,10 +3702,10 @@ static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_epi32(int __i0, int __i1, int __i2, int __i3, int __i4, int __i5, int __i6, int __i7) { - return (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 }; + return __extension__ (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 }; } -/// \brief Constructs a 256-bit integer vector initialized with the specified +/// Constructs a 256-bit integer vector initialized with the specified /// 16-bit integral values. /// /// \headerfile <x86intrin.h> @@ -3821,11 +3752,11 @@ _mm256_set_epi16(short __w15, short __w14, short __w13, short __w12, short __w07, short __w06, short __w05, short __w04, short __w03, short __w02, short __w01, short __w00) { - return (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06, + return __extension__ (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06, __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 }; } -/// \brief Constructs a 256-bit integer vector initialized with the specified +/// Constructs a 256-bit integer vector initialized with the specified /// 8-bit integral values. /// /// \headerfile <x86intrin.h> @@ -3908,7 +3839,7 @@ _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28, char __b07, char __b06, char __b05, char __b04, char __b03, char __b02, char __b01, char __b00) { - return (__m256i)(__v32qi){ + return __extension__ (__m256i)(__v32qi){ __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07, __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15, __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23, @@ -3916,7 +3847,7 @@ _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28, }; } -/// \brief Constructs a 256-bit integer vector initialized with the specified +/// Constructs a 256-bit integer vector initialized with the specified /// 64-bit integral values. 
/// /// \headerfile <x86intrin.h> @@ -3936,11 +3867,11 @@ _mm256_set_epi8(char __b31, char __b30, char __b29, char __b28, static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d) { - return (__m256i)(__v4di){ __d, __c, __b, __a }; + return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a }; } /* Create vectors with elements in reverse order */ -/// \brief Constructs a 256-bit floating-point vector of [4 x double], +/// Constructs a 256-bit floating-point vector of [4 x double], /// initialized in reverse order with the specified double-precision /// floating-point values. /// @@ -3965,10 +3896,10 @@ _mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d) static __inline __m256d __DEFAULT_FN_ATTRS _mm256_setr_pd(double __a, double __b, double __c, double __d) { - return (__m256d){ __a, __b, __c, __d }; + return _mm256_set_pd(__d, __c, __b, __a); } -/// \brief Constructs a 256-bit floating-point vector of [8 x float], +/// Constructs a 256-bit floating-point vector of [8 x float], /// initialized in reverse order with the specified single-precision /// float-point values. /// @@ -4006,10 +3937,10 @@ static __inline __m256 __DEFAULT_FN_ATTRS _mm256_setr_ps(float __a, float __b, float __c, float __d, float __e, float __f, float __g, float __h) { - return (__m256){ __a, __b, __c, __d, __e, __f, __g, __h }; + return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a); } -/// \brief Constructs a 256-bit integer vector, initialized in reverse order +/// Constructs a 256-bit integer vector, initialized in reverse order /// with the specified 32-bit integral values. /// /// \headerfile <x86intrin.h> @@ -4038,10 +3969,10 @@ static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3, int __i4, int __i5, int __i6, int __i7) { - return (__m256i)(__v8si){ __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 }; + return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0); } -/// \brief Constructs a 256-bit integer vector, initialized in reverse order +/// Constructs a 256-bit integer vector, initialized in reverse order /// with the specified 16-bit integral values. /// /// \headerfile <x86intrin.h> @@ -4088,11 +4019,13 @@ _mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12, short __w07, short __w06, short __w05, short __w04, short __w03, short __w02, short __w01, short __w00) { - return (__m256i)(__v16hi){ __w15, __w14, __w13, __w12, __w11, __w10, __w09, - __w08, __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00 }; + return _mm256_set_epi16(__w00, __w01, __w02, __w03, + __w04, __w05, __w06, __w07, + __w08, __w09, __w10, __w11, + __w12, __w13, __w14, __w15); } -/// \brief Constructs a 256-bit integer vector, initialized in reverse order +/// Constructs a 256-bit integer vector, initialized in reverse order /// with the specified 8-bit integral values. 
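[Editorial note, not part of the diff: the setr forms are now expressed as calls to the corresponding set forms with the arguments reversed, which makes the element ordering explicit. A small sketch; both calls build the same vector.]

  #include <immintrin.h>

  /* Illustrative: _mm256_set_pd lists elements from the highest down,
   * _mm256_setr_pd from element 0 up, so *a and *b are identical:
   * { 1.0, 2.0, 3.0, 4.0 } in element order. */
  void same_vector(__m256d *a, __m256d *b) {
    *a = _mm256_set_pd(4.0, 3.0, 2.0, 1.0);
    *b = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
  }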
/// /// \headerfile <x86intrin.h> @@ -4175,14 +4108,13 @@ _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28, char __b07, char __b06, char __b05, char __b04, char __b03, char __b02, char __b01, char __b00) { - return (__m256i)(__v32qi){ - __b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24, - __b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16, - __b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08, - __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00 }; + return _mm256_set_epi8(__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07, + __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15, + __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23, + __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31); } -/// \brief Constructs a 256-bit integer vector, initialized in reverse order +/// Constructs a 256-bit integer vector, initialized in reverse order /// with the specified 64-bit integral values. /// /// \headerfile <x86intrin.h> @@ -4202,11 +4134,11 @@ _mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28, static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d) { - return (__m256i)(__v4di){ __a, __b, __c, __d }; + return _mm256_set_epi64x(__d, __c, __b, __a); } /* Create vectors with repeated elements */ -/// \brief Constructs a 256-bit floating-point vector of [4 x double], with each +/// Constructs a 256-bit floating-point vector of [4 x double], with each /// of the four double-precision floating-point vector elements set to the /// specified double-precision floating-point value. /// @@ -4221,10 +4153,10 @@ _mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d) static __inline __m256d __DEFAULT_FN_ATTRS _mm256_set1_pd(double __w) { - return (__m256d){ __w, __w, __w, __w }; + return _mm256_set_pd(__w, __w, __w, __w); } -/// \brief Constructs a 256-bit floating-point vector of [8 x float], with each +/// Constructs a 256-bit floating-point vector of [8 x float], with each /// of the eight single-precision floating-point vector elements set to the /// specified single-precision floating-point value. /// @@ -4240,10 +4172,10 @@ _mm256_set1_pd(double __w) static __inline __m256 __DEFAULT_FN_ATTRS _mm256_set1_ps(float __w) { - return (__m256){ __w, __w, __w, __w, __w, __w, __w, __w }; + return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w); } -/// \brief Constructs a 256-bit integer vector of [8 x i32], with each of the +/// Constructs a 256-bit integer vector of [8 x i32], with each of the /// 32-bit integral vector elements set to the specified 32-bit integral /// value. /// @@ -4259,10 +4191,10 @@ _mm256_set1_ps(float __w) static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi32(int __i) { - return (__m256i)(__v8si){ __i, __i, __i, __i, __i, __i, __i, __i }; + return _mm256_set_epi32(__i, __i, __i, __i, __i, __i, __i, __i); } -/// \brief Constructs a 256-bit integer vector of [16 x i16], with each of the +/// Constructs a 256-bit integer vector of [16 x i16], with each of the /// 16-bit integral vector elements set to the specified 16-bit integral /// value. 
/// @@ -4277,11 +4209,11 @@ _mm256_set1_epi32(int __i) static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi16(short __w) { - return (__m256i)(__v16hi){ __w, __w, __w, __w, __w, __w, __w, __w, __w, __w, - __w, __w, __w, __w, __w, __w }; + return _mm256_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w, + __w, __w, __w, __w, __w, __w, __w, __w); } -/// \brief Constructs a 256-bit integer vector of [32 x i8], with each of the +/// Constructs a 256-bit integer vector of [32 x i8], with each of the /// 8-bit integral vector elements set to the specified 8-bit integral value. /// /// \headerfile <x86intrin.h> @@ -4295,12 +4227,13 @@ _mm256_set1_epi16(short __w) static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi8(char __b) { - return (__m256i)(__v32qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, - __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, - __b, __b, __b, __b, __b, __b, __b }; + return _mm256_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, + __b, __b, __b, __b, __b, __b, __b, __b, + __b, __b, __b, __b, __b, __b, __b, __b, + __b, __b, __b, __b, __b, __b, __b, __b); } -/// \brief Constructs a 256-bit integer vector of [4 x i64], with each of the +/// Constructs a 256-bit integer vector of [4 x i64], with each of the /// 64-bit integral vector elements set to the specified 64-bit integral /// value. /// @@ -4315,11 +4248,11 @@ _mm256_set1_epi8(char __b) static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set1_epi64x(long long __q) { - return (__m256i)(__v4di){ __q, __q, __q, __q }; + return _mm256_set_epi64x(__q, __q, __q, __q); } /* Create __zeroed vectors */ -/// \brief Constructs a 256-bit floating-point vector of [4 x double] with all +/// Constructs a 256-bit floating-point vector of [4 x double] with all /// vector elements initialized to zero. /// /// \headerfile <x86intrin.h> @@ -4330,10 +4263,10 @@ _mm256_set1_epi64x(long long __q) static __inline __m256d __DEFAULT_FN_ATTRS _mm256_setzero_pd(void) { - return (__m256d){ 0, 0, 0, 0 }; + return __extension__ (__m256d){ 0, 0, 0, 0 }; } -/// \brief Constructs a 256-bit floating-point vector of [8 x float] with all +/// Constructs a 256-bit floating-point vector of [8 x float] with all /// vector elements initialized to zero. /// /// \headerfile <x86intrin.h> @@ -4344,10 +4277,10 @@ _mm256_setzero_pd(void) static __inline __m256 __DEFAULT_FN_ATTRS _mm256_setzero_ps(void) { - return (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 }; + return __extension__ (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 }; } -/// \brief Constructs a 256-bit integer vector initialized to zero. +/// Constructs a 256-bit integer vector initialized to zero. /// /// \headerfile <x86intrin.h> /// @@ -4357,11 +4290,11 @@ _mm256_setzero_ps(void) static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setzero_si256(void) { - return (__m256i){ 0LL, 0LL, 0LL, 0LL }; + return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 }; } /* Cast between vector types */ -/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit +/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit /// floating-point vector of [8 x float]. /// /// \headerfile <x86intrin.h> @@ -4378,7 +4311,7 @@ _mm256_castpd_ps(__m256d __a) return (__m256)__a; } -/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit +/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit /// integer vector. 
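[Editorial note, not part of the diff: the _mm256_cast* intrinsics documented in this region only reinterpret register bits and emit no instructions. A sketch of a common pattern that combines them with set1; the function name is invented.]

  #include <immintrin.h>

  /* Illustrative: clear each element's sign bit using an integer mask
   * reinterpreted as floats (plain AVX, no AVX2 integer ops needed). */
  __m256 absolute_value(__m256 x) {
    __m256 mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff));
    return _mm256_and_ps(x, mask);
  }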
/// /// \headerfile <x86intrin.h> @@ -4395,7 +4328,7 @@ _mm256_castpd_si256(__m256d __a) return (__m256i)__a; } -/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit +/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit /// floating-point vector of [4 x double]. /// /// \headerfile <x86intrin.h> @@ -4412,7 +4345,7 @@ _mm256_castps_pd(__m256 __a) return (__m256d)__a; } -/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit +/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit /// integer vector. /// /// \headerfile <x86intrin.h> @@ -4429,7 +4362,7 @@ _mm256_castps_si256(__m256 __a) return (__m256i)__a; } -/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector +/// Casts a 256-bit integer vector into a 256-bit floating-point vector /// of [8 x float]. /// /// \headerfile <x86intrin.h> @@ -4446,7 +4379,7 @@ _mm256_castsi256_ps(__m256i __a) return (__m256)__a; } -/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector +/// Casts a 256-bit integer vector into a 256-bit floating-point vector /// of [4 x double]. /// /// \headerfile <x86intrin.h> @@ -4463,7 +4396,7 @@ _mm256_castsi256_pd(__m256i __a) return (__m256d)__a; } -/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of +/// Returns the lower 128 bits of a 256-bit floating-point vector of /// [4 x double] as a 128-bit floating-point vector of [2 x double]. /// /// \headerfile <x86intrin.h> @@ -4480,7 +4413,7 @@ _mm256_castpd256_pd128(__m256d __a) return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1); } -/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of +/// Returns the lower 128 bits of a 256-bit floating-point vector of /// [8 x float] as a 128-bit floating-point vector of [4 x float]. /// /// \headerfile <x86intrin.h> @@ -4497,7 +4430,7 @@ _mm256_castps256_ps128(__m256 __a) return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3); } -/// \brief Truncates a 256-bit integer vector into a 128-bit integer vector. +/// Truncates a 256-bit integer vector into a 128-bit integer vector. /// /// \headerfile <x86intrin.h> /// @@ -4513,7 +4446,7 @@ _mm256_castsi256_si128(__m256i __a) return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1); } -/// \brief Constructs a 256-bit floating-point vector of [4 x double] from a +/// Constructs a 256-bit floating-point vector of [4 x double] from a /// 128-bit floating-point vector of [2 x double]. /// /// The lower 128 bits contain the value of the source vector. The contents @@ -4534,7 +4467,7 @@ _mm256_castpd128_pd256(__m128d __a) return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1); } -/// \brief Constructs a 256-bit floating-point vector of [8 x float] from a +/// Constructs a 256-bit floating-point vector of [8 x float] from a /// 128-bit floating-point vector of [4 x float]. /// /// The lower 128 bits contain the value of the source vector. The contents @@ -4555,7 +4488,7 @@ _mm256_castps128_ps256(__m128 __a) return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1); } -/// \brief Constructs a 256-bit integer vector from a 128-bit integer vector. +/// Constructs a 256-bit integer vector from a 128-bit integer vector. /// /// The lower 128 bits contain the value of the source vector. The contents /// of the upper 128 bits are undefined. 
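[Editorial note, not part of the diff: as the docs above state, widening a 128-bit vector with the cast intrinsics leaves the upper 128 bits undefined; the _mm256_zext*128*256 forms documented next zero them instead. A sketch contrasting the two, with assumed helper names.]

  #include <immintrin.h>

  __m256i widen_zeroed(__m128i lo) {
    return _mm256_zextsi128_si256(lo);  /* upper 128 bits guaranteed zero */
  }
  __m256i widen_dont_care(__m128i lo) {
    return _mm256_castsi128_si256(lo);  /* upper 128 bits unspecified */
  }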
@@ -4574,7 +4507,7 @@ _mm256_castsi128_si256(__m128i __a) return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1); } -/// \brief Constructs a 256-bit floating-point vector of [4 x double] from a +/// Constructs a 256-bit floating-point vector of [4 x double] from a /// 128-bit floating-point vector of [2 x double]. The lower 128 bits /// contain the value of the source vector. The upper 128 bits are set /// to zero. @@ -4593,7 +4526,7 @@ _mm256_zextpd128_pd256(__m128d __a) return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3); } -/// \brief Constructs a 256-bit floating-point vector of [8 x float] from a +/// Constructs a 256-bit floating-point vector of [8 x float] from a /// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain /// the value of the source vector. The upper 128 bits are set to zero. /// @@ -4611,7 +4544,7 @@ _mm256_zextps128_ps256(__m128 __a) return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7); } -/// \brief Constructs a 256-bit integer vector from a 128-bit integer vector. +/// Constructs a 256-bit integer vector from a 128-bit integer vector. /// The lower 128 bits contain the value of the source vector. The upper /// 128 bits are set to zero. /// @@ -4634,7 +4567,7 @@ _mm256_zextsi128_si256(__m128i __a) We use macros rather than inlines because we only want to accept invocations where the immediate M is a constant expression. */ -/// \brief Constructs a new 256-bit vector of [8 x float] by first duplicating +/// Constructs a new 256-bit vector of [8 x float] by first duplicating /// a 256-bit vector of [8 x float] given in the first parameter, and then /// replacing either the upper or the lower 128 bits with the contents of a /// 128-bit vector of [4 x float] in the second parameter. @@ -4668,20 +4601,11 @@ _mm256_zextsi128_si256(__m128i __a) /// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the /// result. /// \returns A 256-bit vector of [8 x float] containing the interleaved values. -#define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \ - (__m256)__builtin_shufflevector( \ - (__v8sf)(__m256)(V1), \ - (__v8sf)_mm256_castps128_ps256((__m128)(V2)), \ - (((M) & 1) ? 0 : 8), \ - (((M) & 1) ? 1 : 9), \ - (((M) & 1) ? 2 : 10), \ - (((M) & 1) ? 3 : 11), \ - (((M) & 1) ? 8 : 4), \ - (((M) & 1) ? 9 : 5), \ - (((M) & 1) ? 10 : 6), \ - (((M) & 1) ? 11 : 7) );}) - -/// \brief Constructs a new 256-bit vector of [4 x double] by first duplicating +#define _mm256_insertf128_ps(V1, V2, M) \ + (__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \ + (__v4sf)(__m128)(V2), (int)(M)) + +/// Constructs a new 256-bit vector of [4 x double] by first duplicating /// a 256-bit vector of [4 x double] given in the first parameter, and then /// replacing either the upper or the lower 128 bits with the contents of a /// 128-bit vector of [2 x double] in the second parameter. @@ -4715,16 +4639,11 @@ _mm256_zextsi128_si256(__m128i __a) /// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the /// result. /// \returns A 256-bit vector of [4 x double] containing the interleaved values. -#define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \ - (__m256d)__builtin_shufflevector( \ - (__v4df)(__m256d)(V1), \ - (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \ - (((M) & 1) ? 0 : 4), \ - (((M) & 1) ? 1 : 5), \ - (((M) & 1) ? 4 : 2), \ - (((M) & 1) ? 
5 : 3) );}) - -/// \brief Constructs a new 256-bit integer vector by first duplicating a +#define _mm256_insertf128_pd(V1, V2, M) \ + (__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \ + (__v2df)(__m128d)(V2), (int)(M)) + +/// Constructs a new 256-bit integer vector by first duplicating a /// 256-bit integer vector given in the first parameter, and then replacing /// either the upper or the lower 128 bits with the contents of a 128-bit /// integer vector in the second parameter. @@ -4758,21 +4677,16 @@ _mm256_zextsi128_si256(__m128i __a) /// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the /// result. /// \returns A 256-bit integer vector containing the interleaved values. -#define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \ - (__m256i)__builtin_shufflevector( \ - (__v4di)(__m256i)(V1), \ - (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \ - (((M) & 1) ? 0 : 4), \ - (((M) & 1) ? 1 : 5), \ - (((M) & 1) ? 4 : 2), \ - (((M) & 1) ? 5 : 3) );}) +#define _mm256_insertf128_si256(V1, V2, M) \ + (__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \ + (__v4si)(__m128i)(V2), (int)(M)) /* Vector extract. We use macros rather than inlines because we only want to accept invocations where the immediate M is a constant expression. */ -/// \brief Extracts either the upper or the lower 128 bits from a 256-bit vector +/// Extracts either the upper or the lower 128 bits from a 256-bit vector /// of [8 x float], as determined by the immediate integer parameter, and /// returns the extracted bits as a 128-bit vector of [4 x float]. /// @@ -4793,16 +4707,10 @@ _mm256_zextsi128_si256(__m128i __a) /// result. \n /// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result. /// \returns A 128-bit vector of [4 x float] containing the extracted bits. -#define _mm256_extractf128_ps(V, M) __extension__ ({ \ - (__m128)__builtin_shufflevector( \ - (__v8sf)(__m256)(V), \ - (__v8sf)(_mm256_undefined_ps()), \ - (((M) & 1) ? 4 : 0), \ - (((M) & 1) ? 5 : 1), \ - (((M) & 1) ? 6 : 2), \ - (((M) & 1) ? 7 : 3) );}) - -/// \brief Extracts either the upper or the lower 128 bits from a 256-bit vector +#define _mm256_extractf128_ps(V, M) \ + (__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M)) + +/// Extracts either the upper or the lower 128 bits from a 256-bit vector /// of [4 x double], as determined by the immediate integer parameter, and /// returns the extracted bits as a 128-bit vector of [2 x double]. /// @@ -4823,14 +4731,10 @@ _mm256_zextsi128_si256(__m128i __a) /// result. \n /// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result. /// \returns A 128-bit vector of [2 x double] containing the extracted bits. -#define _mm256_extractf128_pd(V, M) __extension__ ({ \ - (__m128d)__builtin_shufflevector( \ - (__v4df)(__m256d)(V), \ - (__v4df)(_mm256_undefined_pd()), \ - (((M) & 1) ? 2 : 0), \ - (((M) & 1) ? 3 : 1) );}) - -/// \brief Extracts either the upper or the lower 128 bits from a 256-bit +#define _mm256_extractf128_pd(V, M) \ + (__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M)) + +/// Extracts either the upper or the lower 128 bits from a 256-bit /// integer vector, as determined by the immediate integer parameter, and /// returns the extracted bits as a 128-bit integer vector. /// @@ -4851,15 +4755,11 @@ _mm256_zextsi128_si256(__m128i __a) /// result. \n /// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result. 
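[Editorial note, not part of the diff: with the 128-bit insert/extract macros now expanding to immediate-taking builtins, bit 0 of the constant selects which half is read or replaced. A sketch that swaps the two halves of a [8 x float] vector; the helper name is illustrative.]

  #include <immintrin.h>

  __m256 swap_halves(__m256 v) {
    __m128 lo = _mm256_extractf128_ps(v, 0);
    __m128 hi = _mm256_extractf128_ps(v, 1);
    __m256 r  = _mm256_castps128_ps256(hi);  /* hi becomes the new low half */
    return _mm256_insertf128_ps(r, lo, 1);   /* lo becomes the new high half */
  }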
/// \returns A 128-bit integer vector containing the extracted bits. -#define _mm256_extractf128_si256(V, M) __extension__ ({ \ - (__m128i)__builtin_shufflevector( \ - (__v4di)(__m256i)(V), \ - (__v4di)(_mm256_undefined_si256()), \ - (((M) & 1) ? 2 : 0), \ - (((M) & 1) ? 3 : 1) );}) +#define _mm256_extractf128_si256(V, M) \ + (__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M)) /* SIMD load ops (unaligned) */ -/// \brief Loads two 128-bit floating-point vectors of [4 x float] from +/// Loads two 128-bit floating-point vectors of [4 x float] from /// unaligned memory locations and constructs a 256-bit floating-point vector /// of [8 x float] by concatenating the two 128-bit vectors. /// @@ -4887,7 +4787,7 @@ _mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo) return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1); } -/// \brief Loads two 128-bit floating-point vectors of [2 x double] from +/// Loads two 128-bit floating-point vectors of [2 x double] from /// unaligned memory locations and constructs a 256-bit floating-point vector /// of [4 x double] by concatenating the two 128-bit vectors. /// @@ -4915,7 +4815,7 @@ _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo) return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1); } -/// \brief Loads two 128-bit integer vectors from unaligned memory locations and +/// Loads two 128-bit integer vectors from unaligned memory locations and /// constructs a 256-bit integer vector by concatenating the two 128-bit /// vectors. /// @@ -4941,7 +4841,7 @@ _mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo) } /* SIMD store ops (unaligned) */ -/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point +/// Stores the upper and lower 128 bits of a 256-bit floating-point /// vector of [8 x float] into two different unaligned memory locations. /// /// \headerfile <x86intrin.h> @@ -4970,7 +4870,7 @@ _mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a) _mm_storeu_ps(__addr_hi, __v128); } -/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point +/// Stores the upper and lower 128 bits of a 256-bit floating-point /// vector of [4 x double] into two different unaligned memory locations. /// /// \headerfile <x86intrin.h> @@ -4999,7 +4899,7 @@ _mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a) _mm_storeu_pd(__addr_hi, __v128); } -/// \brief Stores the upper and lower 128 bits of a 256-bit integer vector into +/// Stores the upper and lower 128 bits of a 256-bit integer vector into /// two different unaligned memory locations. /// /// \headerfile <x86intrin.h> @@ -5028,7 +4928,7 @@ _mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a) _mm_storeu_si128(__addr_hi, __v128); } -/// \brief Constructs a 256-bit floating-point vector of [8 x float] by +/// Constructs a 256-bit floating-point vector of [8 x float] by /// concatenating two 128-bit floating-point vectors of [4 x float]. /// /// \headerfile <x86intrin.h> @@ -5049,7 +4949,7 @@ _mm256_set_m128 (__m128 __hi, __m128 __lo) return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7); } -/// \brief Constructs a 256-bit floating-point vector of [4 x double] by +/// Constructs a 256-bit floating-point vector of [4 x double] by /// concatenating two 128-bit floating-point vectors of [2 x double]. 
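[Editorial note, not part of the diff: the loadu2/set_m128 families build a 256-bit value from two 128-bit halves, high half listed first, as the implementations above show. Sketch with invented helper names.]

  #include <immintrin.h>

  __m256 gather_halves(const float *hi_addr, const float *lo_addr) {
    return _mm256_loadu2_m128(hi_addr, lo_addr); /* unaligned loads of each half */
  }
  __m256 join_halves(__m128 hi, __m128 lo) {
    return _mm256_set_m128(hi, lo); /* bits [255:128] = hi, bits [127:0] = lo */
  }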
/// /// \headerfile <x86intrin.h> @@ -5067,10 +4967,10 @@ _mm256_set_m128 (__m128 __hi, __m128 __lo) static __inline __m256d __DEFAULT_FN_ATTRS _mm256_set_m128d (__m128d __hi, __m128d __lo) { - return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo); + return (__m256d) __builtin_shufflevector((__v2df)__lo, (__v2df)__hi, 0, 1, 2, 3); } -/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit +/// Constructs a 256-bit integer vector by concatenating two 128-bit /// integer vectors. /// /// \headerfile <x86intrin.h> @@ -5087,10 +4987,10 @@ _mm256_set_m128d (__m128d __hi, __m128d __lo) static __inline __m256i __DEFAULT_FN_ATTRS _mm256_set_m128i (__m128i __hi, __m128i __lo) { - return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo); + return (__m256i) __builtin_shufflevector((__v2di)__lo, (__v2di)__hi, 0, 1, 2, 3); } -/// \brief Constructs a 256-bit floating-point vector of [8 x float] by +/// Constructs a 256-bit floating-point vector of [8 x float] by /// concatenating two 128-bit floating-point vectors of [4 x float]. This is /// similar to _mm256_set_m128, but the order of the input parameters is /// swapped. @@ -5113,7 +5013,7 @@ _mm256_setr_m128 (__m128 __lo, __m128 __hi) return _mm256_set_m128(__hi, __lo); } -/// \brief Constructs a 256-bit floating-point vector of [4 x double] by +/// Constructs a 256-bit floating-point vector of [4 x double] by /// concatenating two 128-bit floating-point vectors of [2 x double]. This is /// similar to _mm256_set_m128d, but the order of the input parameters is /// swapped. @@ -5133,10 +5033,10 @@ _mm256_setr_m128 (__m128 __lo, __m128 __hi) static __inline __m256d __DEFAULT_FN_ATTRS _mm256_setr_m128d (__m128d __lo, __m128d __hi) { - return (__m256d)_mm256_set_m128((__m128)__hi, (__m128)__lo); + return (__m256d)_mm256_set_m128d(__hi, __lo); } -/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit +/// Constructs a 256-bit integer vector by concatenating two 128-bit /// integer vectors. This is similar to _mm256_set_m128i, but the order of /// the input parameters is swapped. /// @@ -5154,9 +5054,10 @@ _mm256_setr_m128d (__m128d __lo, __m128d __hi) static __inline __m256i __DEFAULT_FN_ATTRS _mm256_setr_m128i (__m128i __lo, __m128i __hi) { - return (__m256i)_mm256_set_m128((__m128)__hi, (__m128)__lo); + return (__m256i)_mm256_set_m128i(__hi, __lo); } #undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS128 #endif /* __AVXINTRIN_H */ diff --git a/c_headers/bmiintrin.h b/c_headers/bmiintrin.h index e812a1632b..d03bef442a 100644 --- a/c_headers/bmiintrin.h +++ b/c_headers/bmiintrin.h @@ -49,7 +49,7 @@ to use it as a potentially faster version of BSF. */ #define __RELAXED_FN_ATTRS __attribute__((__always_inline__, __nodebug__)) -/// \brief Counts the number of trailing zero bits in the operand. +/// Counts the number of trailing zero bits in the operand. /// /// \headerfile <x86intrin.h> /// @@ -65,7 +65,7 @@ __tzcnt_u16(unsigned short __X) return __X ? __builtin_ctzs(__X) : 16; } -/// \brief Performs a bitwise AND of the second operand with the one's +/// Performs a bitwise AND of the second operand with the one's /// complement of the first operand. /// /// \headerfile <x86intrin.h> @@ -85,7 +85,7 @@ __andn_u32(unsigned int __X, unsigned int __Y) } /* AMD-specified, double-leading-underscore version of BEXTR */ -/// \brief Extracts the specified bits from the first operand and returns them +/// Extracts the specified bits from the first operand and returns them /// in the least significant bits of the result. 
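[Editorial note, not part of the diff: a sketch for the BMI helpers in this hunk, assuming a BMI-enabled build (e.g. clang -mbmi); helper names are invented.]

  #include <x86intrin.h>

  /* Illustrative: __tzcnt_u16 is well defined for a zero input (returns 16),
   * and __andn_u32 computes (~first) & second in one operation. */
  unsigned short lowest_set_bit_index16(unsigned short x) {
    return __tzcnt_u16(x);
  }
  unsigned int clear_selected(unsigned int mask, unsigned int value) {
    return __andn_u32(mask, value); /* value with mask's set bits cleared */
  }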
/// /// \headerfile <x86intrin.h> @@ -100,6 +100,7 @@ __andn_u32(unsigned int __X, unsigned int __Y) /// number of bits to be extracted. /// \returns An unsigned integer whose least significant bits contain the /// extracted bits. +/// \see _bextr_u32 static __inline__ unsigned int __DEFAULT_FN_ATTRS __bextr_u32(unsigned int __X, unsigned int __Y) { @@ -107,7 +108,7 @@ __bextr_u32(unsigned int __X, unsigned int __Y) } /* Intel-specified, single-leading-underscore version of BEXTR */ -/// \brief Extracts the specified bits from the first operand and returns them +/// Extracts the specified bits from the first operand and returns them /// in the least significant bits of the result. /// /// \headerfile <x86intrin.h> @@ -124,13 +125,14 @@ __bextr_u32(unsigned int __X, unsigned int __Y) /// Bits [7:0] specify the number of bits. /// \returns An unsigned integer whose least significant bits contain the /// extracted bits. +/// \see __bextr_u32 static __inline__ unsigned int __DEFAULT_FN_ATTRS _bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z) { return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8))); } -/// \brief Clears all bits in the source except for the least significant bit +/// Clears all bits in the source except for the least significant bit /// containing a value of 1 and returns the result. /// /// \headerfile <x86intrin.h> @@ -147,7 +149,7 @@ __blsi_u32(unsigned int __X) return __X & -__X; } -/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and +/// Creates a mask whose bits are set to 1, using bit 0 up to and /// including the least significant bit that is set to 1 in the source /// operand and returns the result. /// @@ -164,7 +166,7 @@ __blsmsk_u32(unsigned int __X) return __X ^ (__X - 1); } -/// \brief Clears the least significant bit that is set to 1 in the source +/// Clears the least significant bit that is set to 1 in the source /// operand and returns the result. /// /// \headerfile <x86intrin.h> @@ -181,7 +183,7 @@ __blsr_u32(unsigned int __X) return __X & (__X - 1); } -/// \brief Counts the number of trailing zero bits in the operand. +/// Counts the number of trailing zero bits in the operand. /// /// \headerfile <x86intrin.h> /// @@ -197,7 +199,7 @@ __tzcnt_u32(unsigned int __X) return __X ? __builtin_ctz(__X) : 32; } -/// \brief Counts the number of trailing zero bits in the operand. +/// Counts the number of trailing zero bits in the operand. /// /// \headerfile <x86intrin.h> /// @@ -226,7 +228,7 @@ _mm_tzcnt_32(unsigned int __X) #define _tzcnt_u64(a) (__tzcnt_u64((a))) -/// \brief Performs a bitwise AND of the second operand with the one's +/// Performs a bitwise AND of the second operand with the one's /// complement of the first operand. /// /// \headerfile <x86intrin.h> @@ -246,7 +248,7 @@ __andn_u64 (unsigned long long __X, unsigned long long __Y) } /* AMD-specified, double-leading-underscore version of BEXTR */ -/// \brief Extracts the specified bits from the first operand and returns them +/// Extracts the specified bits from the first operand and returns them /// in the least significant bits of the result. /// /// \headerfile <x86intrin.h> @@ -261,6 +263,7 @@ __andn_u64 (unsigned long long __X, unsigned long long __Y) /// the number of bits to be extracted. /// \returns An unsigned 64-bit integer whose least significant bits contain the /// extracted bits. 
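[Not part of the diff: a sketch of how the two BEXTR spellings cross-referenced by the new \see tags relate. The Intel-style _bextr_u32 takes an explicit start and length; the AMD-style __bextr_u32 takes them packed into one control word (start in bits [7:0], length in bits [15:8]), exactly as _bextr_u32's body above does. Requires -mbmi; the helper names are ours.]

#include <x86intrin.h>
#include <stdio.h>

static unsigned int field_intel(unsigned int x, unsigned int start, unsigned int len) {
  return _bextr_u32(x, start, len);                     /* explicit start/length */
}

static unsigned int field_amd(unsigned int x, unsigned int start, unsigned int len) {
  return __bextr_u32(x, (start & 0xff) | ((len & 0xff) << 8));  /* packed control word */
}

int main(void) {
  unsigned int v = 0xABCD1234u;
  /* bits [19:8] of v are 0xD12; both forms agree */
  printf("0x%x 0x%x\n", field_intel(v, 8, 12), field_amd(v, 8, 12));
  return 0;
}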
+/// \see _bextr_u64 static __inline__ unsigned long long __DEFAULT_FN_ATTRS __bextr_u64(unsigned long long __X, unsigned long long __Y) { @@ -268,7 +271,7 @@ __bextr_u64(unsigned long long __X, unsigned long long __Y) } /* Intel-specified, single-leading-underscore version of BEXTR */ -/// \brief Extracts the specified bits from the first operand and returns them +/// Extracts the specified bits from the first operand and returns them /// in the least significant bits of the result. /// /// \headerfile <x86intrin.h> @@ -285,13 +288,14 @@ __bextr_u64(unsigned long long __X, unsigned long long __Y) /// Bits [7:0] specify the number of bits. /// \returns An unsigned 64-bit integer whose least significant bits contain the /// extracted bits. +/// \see __bextr_u64 static __inline__ unsigned long long __DEFAULT_FN_ATTRS _bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z) { return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8))); } -/// \brief Clears all bits in the source except for the least significant bit +/// Clears all bits in the source except for the least significant bit /// containing a value of 1 and returns the result. /// /// \headerfile <x86intrin.h> @@ -308,7 +312,7 @@ __blsi_u64(unsigned long long __X) return __X & -__X; } -/// \brief Creates a mask whose bits are set to 1, using bit 0 up to and +/// Creates a mask whose bits are set to 1, using bit 0 up to and /// including the least significant bit that is set to 1 in the source /// operand and returns the result. /// @@ -325,7 +329,7 @@ __blsmsk_u64(unsigned long long __X) return __X ^ (__X - 1); } -/// \brief Clears the least significant bit that is set to 1 in the source +/// Clears the least significant bit that is set to 1 in the source /// operand and returns the result. /// /// \headerfile <x86intrin.h> @@ -342,7 +346,7 @@ __blsr_u64(unsigned long long __X) return __X & (__X - 1); } -/// \brief Counts the number of trailing zero bits in the operand. +/// Counts the number of trailing zero bits in the operand. /// /// \headerfile <x86intrin.h> /// @@ -358,7 +362,7 @@ __tzcnt_u64(unsigned long long __X) return __X ? __builtin_ctzll(__X) : 64; } -/// \brief Counts the number of trailing zero bits in the operand. +/// Counts the number of trailing zero bits in the operand. 
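[Not part of the diff: the plain-C identities that the __blsi/__blsmsk/__blsr bodies above are written in terms of, spelled out on a concrete value. Variable names are ours.]

#include <stdio.h>

int main(void) {
  unsigned int x = 0x2Cu;                    /* binary 101100 */
  unsigned int lowest_set   = x & -x;        /* __blsi_u32:   000100 = 4  */
  unsigned int mask_through = x ^ (x - 1);   /* __blsmsk_u32: 000111 = 7  */
  unsigned int clear_lowest = x & (x - 1);   /* __blsr_u32:   101000 = 40 */
  printf("%u %u %u\n", lowest_set, mask_through, clear_lowest);
  return 0;
}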
/// /// \headerfile <x86intrin.h> /// diff --git a/c_headers/cetintrin.h b/c_headers/cetintrin.h index 1256a3f63a..120c95424d 100644 --- a/c_headers/cetintrin.h +++ b/c_headers/cetintrin.h @@ -1,4 +1,4 @@ -/*===---- cetintrin.h - CET intrinsic ------------------------------------=== +/*===---- cetintrin.h - CET intrinsic --------------------------------------=== * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -42,6 +42,16 @@ static __inline__ void __DEFAULT_FN_ATTRS _incsspq(unsigned long long __a) { } #endif /* __x86_64__ */ +#ifdef __x86_64__ +static __inline__ void __DEFAULT_FN_ATTRS _inc_ssp(unsigned int __a) { + __builtin_ia32_incsspq(__a); +} +#else /* __x86_64__ */ +static __inline__ void __DEFAULT_FN_ATTRS _inc_ssp(unsigned int __a) { + __builtin_ia32_incsspd((int)__a); +} +#endif /* __x86_64__ */ + static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd(unsigned int __a) { return __builtin_ia32_rdsspd(__a); } @@ -52,6 +62,16 @@ static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq(unsigned long lo } #endif /* __x86_64__ */ +#ifdef __x86_64__ +static __inline__ unsigned long long __DEFAULT_FN_ATTRS _get_ssp(void) { + return __builtin_ia32_rdsspq(0); +} +#else /* __x86_64__ */ +static __inline__ unsigned int __DEFAULT_FN_ATTRS _get_ssp(void) { + return __builtin_ia32_rdsspd(0); +} +#endif /* __x86_64__ */ + static __inline__ void __DEFAULT_FN_ATTRS _saveprevssp() { __builtin_ia32_saveprevssp(); } diff --git a/c_headers/cldemoteintrin.h b/c_headers/cldemoteintrin.h new file mode 100644 index 0000000000..fa78148ebf --- /dev/null +++ b/c_headers/cldemoteintrin.h @@ -0,0 +1,42 @@ +/*===---- cldemoteintrin.h - CLDEMOTE intrinsic ----------------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ + +#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H +#error "Never use <cldemoteintrin.h> directly; include <x86intrin.h> instead." +#endif + +#ifndef __CLDEMOTEINTRIN_H +#define __CLDEMOTEINTRIN_H + +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("cldemote"))) + +static __inline__ void __DEFAULT_FN_ATTRS +_cldemote(const void * __P) { + __builtin_ia32_cldemote(__P); +} + +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/c_headers/clflushoptintrin.h b/c_headers/clflushoptintrin.h index f1f1330234..79bb4589fc 100644 --- a/c_headers/clflushoptintrin.h +++ b/c_headers/clflushoptintrin.h @@ -1,4 +1,4 @@ -/*===---- clflushoptintrin.h - CLFLUSHOPT intrinsic ------------------------------------=== +/*===---- clflushoptintrin.h - CLFLUSHOPT intrinsic ------------------------=== * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/c_headers/clwbintrin.h b/c_headers/clwbintrin.h index 2594a6c387..c09286ba67 100644 --- a/c_headers/clwbintrin.h +++ b/c_headers/clwbintrin.h @@ -31,7 +31,7 @@ /* Define the default attributes for the functions in this file. */ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("clwb"))) -/// \brief Writes back to memory the cache line (if modified) that contains the +/// Writes back to memory the cache line (if modified) that contains the /// linear address specified in \a __p from any level of the cache hierarchy in /// the cache coherence domain /// diff --git a/c_headers/clzerointrin.h b/c_headers/clzerointrin.h index ed7478ff87..f4e920839b 100644 --- a/c_headers/clzerointrin.h +++ b/c_headers/clzerointrin.h @@ -20,18 +20,18 @@ * *===-----------------------------------------------------------------------=== */ -#ifndef __X86INTRIN_H +#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H #error "Never use <clzerointrin.h> directly; include <x86intrin.h> instead." #endif -#ifndef _CLZEROINTRIN_H -#define _CLZEROINTRIN_H +#ifndef __CLZEROINTRIN_H +#define __CLZEROINTRIN_H /* Define the default attributes for the functions in this file. 
*/ #define __DEFAULT_FN_ATTRS \ __attribute__((__always_inline__, __nodebug__, __target__("clzero"))) -/// \brief Loads the cache line address and zero's out the cacheline +/// Loads the cache line address and zero's out the cacheline /// /// \headerfile <clzerointrin.h> /// @@ -45,6 +45,6 @@ _mm_clzero (void * __line) __builtin_ia32_clzero ((void *)__line); } -#undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS -#endif /* _CLZEROINTRIN_H */ +#endif /* __CLZEROINTRIN_H */ diff --git a/c_headers/cpuid.h b/c_headers/cpuid.h index 3ae90de0b9..fce6af52dd 100644 --- a/c_headers/cpuid.h +++ b/c_headers/cpuid.h @@ -156,6 +156,7 @@ #define bit_SMEP 0x00000080 #define bit_BMI2 0x00000100 #define bit_ENH_MOVSB 0x00000200 +#define bit_INVPCID 0x00000400 #define bit_RTM 0x00000800 #define bit_MPX 0x00004000 #define bit_AVX512F 0x00010000 @@ -166,7 +167,7 @@ #define bit_CLFLUSHOPT 0x00800000 #define bit_CLWB 0x01000000 #define bit_AVX512PF 0x04000000 -#define bit_AVX51SER 0x08000000 +#define bit_AVX512ER 0x08000000 #define bit_AVX512CD 0x10000000 #define bit_SHA 0x20000000 #define bit_AVX512BW 0x40000000 @@ -177,6 +178,7 @@ #define bit_AVX512VBMI 0x00000002 #define bit_PKU 0x00000004 #define bit_OSPKE 0x00000010 +#define bit_WAITPKG 0x00000020 #define bit_AVX512VBMI2 0x00000040 #define bit_SHSTK 0x00000080 #define bit_GFNI 0x00000100 @@ -186,10 +188,14 @@ #define bit_AVX512BITALG 0x00001000 #define bit_AVX512VPOPCNTDQ 0x00004000 #define bit_RDPID 0x00400000 +#define bit_CLDEMOTE 0x02000000 +#define bit_MOVDIRI 0x08000000 +#define bit_MOVDIR64B 0x10000000 /* Features in %edx for leaf 7 sub-leaf 0 */ #define bit_AVX5124VNNIW 0x00000004 #define bit_AVX5124FMAPS 0x00000008 +#define bit_PCONFIG 0x00040000 #define bit_IBT 0x00100000 /* Features in %eax for leaf 13 sub-leaf 1 */ @@ -197,6 +203,9 @@ #define bit_XSAVEC 0x00000002 #define bit_XSAVES 0x00000008 +/* Features in %eax for leaf 0x14 sub-leaf 0 */ +#define bit_PTWRITE 0x00000010 + /* Features in %ecx for leaf 0x80000001 */ #define bit_LAHF_LM 0x00000001 #define bit_ABM 0x00000020 @@ -215,8 +224,9 @@ #define bit_3DNOWP 0x40000000 #define bit_3DNOW 0x80000000 -/* Features in %ebx for leaf 0x80000001 */ +/* Features in %ebx for leaf 0x80000008 */ #define bit_CLZERO 0x00000001 +#define bit_WBNOINVD 0x00000200 #if __i386__ diff --git a/c_headers/cuda_wrappers/algorithm b/c_headers/cuda_wrappers/algorithm index cedd70762c..01af18360d 100644 --- a/c_headers/cuda_wrappers/algorithm +++ b/c_headers/cuda_wrappers/algorithm @@ -24,28 +24,36 @@ #ifndef __CLANG_CUDA_WRAPPERS_ALGORITHM #define __CLANG_CUDA_WRAPPERS_ALGORITHM -// This header defines __device__ overloads of std::min/max, but only if we're -// <= C++11. In C++14, these functions are constexpr, and so are implicitly -// __host__ __device__. +// This header defines __device__ overloads of std::min/max. // -// We don't support the initializer_list overloads because -// initializer_list::begin() and end() are not __host__ __device__ functions. +// Ideally we'd declare these functions only if we're <= C++11. In C++14, +// these functions are constexpr, and so are implicitly __host__ __device__. // -// When compiling in C++14 mode, we could force std::min/max to have different -// implementations for host and device, by declaring the device overloads -// before the constexpr overloads appear. We choose not to do this because - -// a) why write our own implementation when we can use one from the standard -// library? 
and -// b) libstdc++ is evil and declares min/max inside a header that is included -// *before* we include <algorithm>. So we'd have to unconditionally -// declare our __device__ overloads of min/max, but that would pollute -// things for people who choose not to include <algorithm>. +// However, the compiler being in C++14 mode does not imply that the standard +// library supports C++14. There is no macro we can test to check that the +// stdlib has constexpr std::min/max. Thus we have to unconditionally define +// our device overloads. +// +// A host+device function cannot be overloaded, and a constexpr function +// implicitly become host device if there's no explicitly host or device +// overload preceding it. So the simple thing to do would be to declare our +// device min/max overloads, and then #include_next <algorithm>. This way our +// device overloads would come first, and so if we have a C++14 stdlib, its +// min/max won't become host+device and conflict with our device overloads. +// +// But that also doesn't work. libstdc++ is evil and declares std::min/max in +// an internal header that is included *before* <algorithm>. Thus by the time +// we're inside of this file, std::min/max may already have been declared, and +// thus we can't prevent them from becoming host+device if they're constexpr. +// +// Therefore we perpetrate the following hack: We mark our __device__ overloads +// with __attribute__((enable_if(true, ""))). This causes the signature of the +// function to change without changing anything else about it. (Except that +// overload resolution will prefer it over the __host__ __device__ version +// rather than considering them equally good). #include_next <algorithm> -#if __cplusplus <= 201103L - // We need to define these overloads in exactly the namespace our standard // library uses (including the right inline namespace), otherwise they won't be // picked up by other functions in the standard library (e.g. functions in @@ -59,30 +67,43 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION #endif #endif +#pragma push_macro("_CPP14_CONSTEXPR") +#if __cplusplus >= 201402L +#define _CPP14_CONSTEXPR constexpr +#else +#define _CPP14_CONSTEXPR +#endif + template <class __T, class __Cmp> -inline __device__ const __T & +__attribute__((enable_if(true, ""))) +inline _CPP14_CONSTEXPR __host__ __device__ const __T & max(const __T &__a, const __T &__b, __Cmp __cmp) { return __cmp(__a, __b) ? __b : __a; } template <class __T> -inline __device__ const __T & +__attribute__((enable_if(true, ""))) +inline _CPP14_CONSTEXPR __host__ __device__ const __T & max(const __T &__a, const __T &__b) { return __a < __b ? __b : __a; } template <class __T, class __Cmp> -inline __device__ const __T & +__attribute__((enable_if(true, ""))) +inline _CPP14_CONSTEXPR __host__ __device__ const __T & min(const __T &__a, const __T &__b, __Cmp __cmp) { return __cmp(__b, __a) ? __b : __a; } template <class __T> -inline __device__ const __T & +__attribute__((enable_if(true, ""))) +inline _CPP14_CONSTEXPR __host__ __device__ const __T & min(const __T &__a, const __T &__b) { return __a < __b ? 
__a : __b; } +#pragma pop_macro("_CPP14_CONSTEXPR") + #ifdef _LIBCPP_END_NAMESPACE_STD _LIBCPP_END_NAMESPACE_STD #else @@ -92,5 +113,4 @@ _GLIBCXX_END_NAMESPACE_VERSION } // namespace std #endif -#endif // __cplusplus <= 201103L #endif // __CLANG_CUDA_WRAPPERS_ALGORITHM diff --git a/c_headers/emmintrin.h b/c_headers/emmintrin.h index b332eeec20..f0ea7cd05c 100644 --- a/c_headers/emmintrin.h +++ b/c_headers/emmintrin.h @@ -44,12 +44,11 @@ typedef unsigned char __v16qu __attribute__((__vector_size__(16))); * appear in the interface though. */ typedef signed char __v16qs __attribute__((__vector_size__(16))); -#include <f16cintrin.h> - /* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2"))) +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse2"), __min_vector_width__(64))) -/// \brief Adds lower double-precision values in both operands and returns the +/// Adds lower double-precision values in both operands and returns the /// sum in the lower 64 bits of the result. The upper 64 bits of the result /// are copied from the upper double-precision value of the first operand. /// @@ -71,7 +70,7 @@ _mm_add_sd(__m128d __a, __m128d __b) return __a; } -/// \brief Adds two 128-bit vectors of [2 x double]. +/// Adds two 128-bit vectors of [2 x double]. /// /// \headerfile <x86intrin.h> /// @@ -89,7 +88,7 @@ _mm_add_pd(__m128d __a, __m128d __b) return (__m128d)((__v2df)__a + (__v2df)__b); } -/// \brief Subtracts the lower double-precision value of the second operand +/// Subtracts the lower double-precision value of the second operand /// from the lower double-precision value of the first operand and returns /// the difference in the lower 64 bits of the result. The upper 64 bits of /// the result are copied from the upper double-precision value of the first @@ -113,7 +112,7 @@ _mm_sub_sd(__m128d __a, __m128d __b) return __a; } -/// \brief Subtracts two 128-bit vectors of [2 x double]. +/// Subtracts two 128-bit vectors of [2 x double]. /// /// \headerfile <x86intrin.h> /// @@ -131,7 +130,7 @@ _mm_sub_pd(__m128d __a, __m128d __b) return (__m128d)((__v2df)__a - (__v2df)__b); } -/// \brief Multiplies lower double-precision values in both operands and returns +/// Multiplies lower double-precision values in both operands and returns /// the product in the lower 64 bits of the result. The upper 64 bits of the /// result are copied from the upper double-precision value of the first /// operand. @@ -154,7 +153,7 @@ _mm_mul_sd(__m128d __a, __m128d __b) return __a; } -/// \brief Multiplies two 128-bit vectors of [2 x double]. +/// Multiplies two 128-bit vectors of [2 x double]. /// /// \headerfile <x86intrin.h> /// @@ -172,7 +171,7 @@ _mm_mul_pd(__m128d __a, __m128d __b) return (__m128d)((__v2df)__a * (__v2df)__b); } -/// \brief Divides the lower double-precision value of the first operand by the +/// Divides the lower double-precision value of the first operand by the /// lower double-precision value of the second operand and returns the /// quotient in the lower 64 bits of the result. 
The upper 64 bits of the /// result are copied from the upper double-precision value of the first @@ -196,7 +195,7 @@ _mm_div_sd(__m128d __a, __m128d __b) return __a; } -/// \brief Performs an element-by-element division of two 128-bit vectors of +/// Performs an element-by-element division of two 128-bit vectors of /// [2 x double]. /// /// \headerfile <x86intrin.h> @@ -215,7 +214,7 @@ _mm_div_pd(__m128d __a, __m128d __b) return (__m128d)((__v2df)__a / (__v2df)__b); } -/// \brief Calculates the square root of the lower double-precision value of +/// Calculates the square root of the lower double-precision value of /// the second operand and returns it in the lower 64 bits of the result. /// The upper 64 bits of the result are copied from the upper /// double-precision value of the first operand. @@ -238,10 +237,10 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a, __m128d __b) { __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b); - return (__m128d) { __c[0], __a[1] }; + return __extension__ (__m128d) { __c[0], __a[1] }; } -/// \brief Calculates the square root of the each of two values stored in a +/// Calculates the square root of the each of two values stored in a /// 128-bit vector of [2 x double]. /// /// \headerfile <x86intrin.h> @@ -258,7 +257,7 @@ _mm_sqrt_pd(__m128d __a) return __builtin_ia32_sqrtpd((__v2df)__a); } -/// \brief Compares lower 64-bit double-precision values of both operands, and +/// Compares lower 64-bit double-precision values of both operands, and /// returns the lesser of the pair of values in the lower 64-bits of the /// result. The upper 64 bits of the result are copied from the upper /// double-precision value of the first operand. @@ -282,7 +281,7 @@ _mm_min_sd(__m128d __a, __m128d __b) return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b); } -/// \brief Performs element-by-element comparison of the two 128-bit vectors of +/// Performs element-by-element comparison of the two 128-bit vectors of /// [2 x double] and returns the vector containing the lesser of each pair of /// values. /// @@ -302,7 +301,7 @@ _mm_min_pd(__m128d __a, __m128d __b) return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b); } -/// \brief Compares lower 64-bit double-precision values of both operands, and +/// Compares lower 64-bit double-precision values of both operands, and /// returns the greater of the pair of values in the lower 64-bits of the /// result. The upper 64 bits of the result are copied from the upper /// double-precision value of the first operand. @@ -326,7 +325,7 @@ _mm_max_sd(__m128d __a, __m128d __b) return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b); } -/// \brief Performs element-by-element comparison of the two 128-bit vectors of +/// Performs element-by-element comparison of the two 128-bit vectors of /// [2 x double] and returns the vector containing the greater of each pair /// of values. /// @@ -346,7 +345,7 @@ _mm_max_pd(__m128d __a, __m128d __b) return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b); } -/// \brief Performs a bitwise AND of two 128-bit vectors of [2 x double]. +/// Performs a bitwise AND of two 128-bit vectors of [2 x double]. /// /// \headerfile <x86intrin.h> /// @@ -364,7 +363,7 @@ _mm_and_pd(__m128d __a, __m128d __b) return (__m128d)((__v2du)__a & (__v2du)__b); } -/// \brief Performs a bitwise AND of two 128-bit vectors of [2 x double], using +/// Performs a bitwise AND of two 128-bit vectors of [2 x double], using /// the one's complement of the values contained in the first source operand. 
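[Not part of the diff: a small sketch of the scalar min/max behaviour the comments above describe: only the low lane is computed, and the high lane is carried over from the first operand. Requires -msse2; names are ours.]

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  __m128d a = _mm_set_pd(10.0, 5.0);   /* {5.0, 10.0} */
  __m128d b = _mm_set_pd(-1.0, 3.0);   /* {3.0, -1.0} */
  double out[2];
  _mm_storeu_pd(out, _mm_min_sd(a, b));
  printf("%g %g\n", out[0], out[1]);   /* 3 10: low = min(5,3), high = a's high lane */
  return 0;
}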
/// /// \headerfile <x86intrin.h> @@ -385,7 +384,7 @@ _mm_andnot_pd(__m128d __a, __m128d __b) return (__m128d)(~(__v2du)__a & (__v2du)__b); } -/// \brief Performs a bitwise OR of two 128-bit vectors of [2 x double]. +/// Performs a bitwise OR of two 128-bit vectors of [2 x double]. /// /// \headerfile <x86intrin.h> /// @@ -403,7 +402,7 @@ _mm_or_pd(__m128d __a, __m128d __b) return (__m128d)((__v2du)__a | (__v2du)__b); } -/// \brief Performs a bitwise XOR of two 128-bit vectors of [2 x double]. +/// Performs a bitwise XOR of two 128-bit vectors of [2 x double]. /// /// \headerfile <x86intrin.h> /// @@ -421,9 +420,9 @@ _mm_xor_pd(__m128d __a, __m128d __b) return (__m128d)((__v2du)__a ^ (__v2du)__b); } -/// \brief Compares each of the corresponding double-precision values of the -/// 128-bit vectors of [2 x double] for equality. Each comparison yields 0h -/// for false, FFFFFFFFFFFFFFFFh for true. +/// Compares each of the corresponding double-precision values of the +/// 128-bit vectors of [2 x double] for equality. Each comparison yields 0x0 +/// for false, 0xFFFFFFFFFFFFFFFF for true. /// /// \headerfile <x86intrin.h> /// @@ -440,10 +439,10 @@ _mm_cmpeq_pd(__m128d __a, __m128d __b) return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b); } -/// \brief Compares each of the corresponding double-precision values of the +/// Compares each of the corresponding double-precision values of the /// 128-bit vectors of [2 x double] to determine if the values in the first /// operand are less than those in the second operand. Each comparison -/// yields 0h for false, FFFFFFFFFFFFFFFFh for true. +/// yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. /// /// \headerfile <x86intrin.h> /// @@ -460,11 +459,11 @@ _mm_cmplt_pd(__m128d __a, __m128d __b) return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b); } -/// \brief Compares each of the corresponding double-precision values of the +/// Compares each of the corresponding double-precision values of the /// 128-bit vectors of [2 x double] to determine if the values in the first /// operand are less than or equal to those in the second operand. /// -/// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. +/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. /// /// \headerfile <x86intrin.h> /// @@ -481,11 +480,11 @@ _mm_cmple_pd(__m128d __a, __m128d __b) return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b); } -/// \brief Compares each of the corresponding double-precision values of the +/// Compares each of the corresponding double-precision values of the /// 128-bit vectors of [2 x double] to determine if the values in the first /// operand are greater than those in the second operand. /// -/// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. +/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. /// /// \headerfile <x86intrin.h> /// @@ -502,11 +501,11 @@ _mm_cmpgt_pd(__m128d __a, __m128d __b) return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a); } -/// \brief Compares each of the corresponding double-precision values of the +/// Compares each of the corresponding double-precision values of the /// 128-bit vectors of [2 x double] to determine if the values in the first /// operand are greater than or equal to those in the second operand. /// -/// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. +/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 
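[Not part of the diff: the 0x0 / 0xFFFFFFFFFFFFFFFF per-element masks documented above, used for the classic branchless select (here a per-element minimum built from _mm_cmplt_pd, _mm_and_pd, _mm_andnot_pd and _mm_or_pd). Requires -msse2; names are ours.]

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  __m128d a  = _mm_set_pd(4.0, 1.0);   /* {1.0, 4.0} */
  __m128d b  = _mm_set_pd(3.0, 2.0);   /* {2.0, 3.0} */
  __m128d lt = _mm_cmplt_pd(a, b);     /* {all-ones, all-zeros} */
  /* pick a where a < b, otherwise b */
  __m128d m  = _mm_or_pd(_mm_and_pd(lt, a), _mm_andnot_pd(lt, b));
  double out[2];
  _mm_storeu_pd(out, m);
  printf("%g %g\n", out[0], out[1]);   /* 1 3 */
  return 0;
}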
/// /// \headerfile <x86intrin.h> /// @@ -523,13 +522,13 @@ _mm_cmpge_pd(__m128d __a, __m128d __b) return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a); } -/// \brief Compares each of the corresponding double-precision values of the +/// Compares each of the corresponding double-precision values of the /// 128-bit vectors of [2 x double] to determine if the values in the first /// operand are ordered with respect to those in the second operand. /// /// A pair of double-precision values are "ordered" with respect to each -/// other if neither value is a NaN. Each comparison yields 0h for false, -/// FFFFFFFFFFFFFFFFh for true. +/// other if neither value is a NaN. Each comparison yields 0x0 for false, +/// 0xFFFFFFFFFFFFFFFF for true. /// /// \headerfile <x86intrin.h> /// @@ -546,13 +545,13 @@ _mm_cmpord_pd(__m128d __a, __m128d __b) return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b); } -/// \brief Compares each of the corresponding double-precision values of the +/// Compares each of the corresponding double-precision values of the /// 128-bit vectors of [2 x double] to determine if the values in the first /// operand are unordered with respect to those in the second operand. /// /// A pair of double-precision values are "unordered" with respect to each -/// other if one or both values are NaN. Each comparison yields 0h for false, -/// FFFFFFFFFFFFFFFFh for true. +/// other if one or both values are NaN. Each comparison yields 0x0 for +/// false, 0xFFFFFFFFFFFFFFFF for true. /// /// \headerfile <x86intrin.h> /// @@ -570,11 +569,11 @@ _mm_cmpunord_pd(__m128d __a, __m128d __b) return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b); } -/// \brief Compares each of the corresponding double-precision values of the +/// Compares each of the corresponding double-precision values of the /// 128-bit vectors of [2 x double] to determine if the values in the first /// operand are unequal to those in the second operand. /// -/// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. +/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. /// /// \headerfile <x86intrin.h> /// @@ -591,11 +590,11 @@ _mm_cmpneq_pd(__m128d __a, __m128d __b) return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b); } -/// \brief Compares each of the corresponding double-precision values of the +/// Compares each of the corresponding double-precision values of the /// 128-bit vectors of [2 x double] to determine if the values in the first /// operand are not less than those in the second operand. /// -/// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. +/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. /// /// \headerfile <x86intrin.h> /// @@ -612,11 +611,11 @@ _mm_cmpnlt_pd(__m128d __a, __m128d __b) return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b); } -/// \brief Compares each of the corresponding double-precision values of the +/// Compares each of the corresponding double-precision values of the /// 128-bit vectors of [2 x double] to determine if the values in the first /// operand are not less than or equal to those in the second operand. /// -/// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. +/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 
/// /// \headerfile <x86intrin.h> /// @@ -633,11 +632,11 @@ _mm_cmpnle_pd(__m128d __a, __m128d __b) return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b); } -/// \brief Compares each of the corresponding double-precision values of the +/// Compares each of the corresponding double-precision values of the /// 128-bit vectors of [2 x double] to determine if the values in the first /// operand are not greater than those in the second operand. /// -/// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. +/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. /// /// \headerfile <x86intrin.h> /// @@ -654,11 +653,11 @@ _mm_cmpngt_pd(__m128d __a, __m128d __b) return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a); } -/// \brief Compares each of the corresponding double-precision values of the +/// Compares each of the corresponding double-precision values of the /// 128-bit vectors of [2 x double] to determine if the values in the first /// operand are not greater than or equal to those in the second operand. /// -/// Each comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. +/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. /// /// \headerfile <x86intrin.h> /// @@ -675,10 +674,10 @@ _mm_cmpnge_pd(__m128d __a, __m128d __b) return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a); } -/// \brief Compares the lower double-precision floating-point values in each of +/// Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] for equality. /// -/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. +/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. /// /// \headerfile <x86intrin.h> /// @@ -698,12 +697,12 @@ _mm_cmpeq_sd(__m128d __a, __m128d __b) return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b); } -/// \brief Compares the lower double-precision floating-point values in each of +/// Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is less than the corresponding value in /// the second parameter. /// -/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. +/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. /// /// \headerfile <x86intrin.h> /// @@ -723,12 +722,12 @@ _mm_cmplt_sd(__m128d __a, __m128d __b) return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b); } -/// \brief Compares the lower double-precision floating-point values in each of +/// Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is less than or equal to the /// corresponding value in the second parameter. /// -/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. +/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 
/// /// \headerfile <x86intrin.h> /// @@ -748,12 +747,12 @@ _mm_cmple_sd(__m128d __a, __m128d __b) return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b); } -/// \brief Compares the lower double-precision floating-point values in each of +/// Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is greater than the corresponding value /// in the second parameter. /// -/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. +/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. /// /// \headerfile <x86intrin.h> /// @@ -771,15 +770,15 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a, __m128d __b) { __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a); - return (__m128d) { __c[0], __a[1] }; + return __extension__ (__m128d) { __c[0], __a[1] }; } -/// \brief Compares the lower double-precision floating-point values in each of +/// Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is greater than or equal to the /// corresponding value in the second parameter. /// -/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. +/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. /// /// \headerfile <x86intrin.h> /// @@ -797,16 +796,16 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a, __m128d __b) { __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a); - return (__m128d) { __c[0], __a[1] }; + return __extension__ (__m128d) { __c[0], __a[1] }; } -/// \brief Compares the lower double-precision floating-point values in each of +/// Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is "ordered" with respect to the /// corresponding value in the second parameter. /// -/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. A pair of -/// double-precision values are "ordered" with respect to each other if +/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair +/// of double-precision values are "ordered" with respect to each other if /// neither value is a NaN. /// /// \headerfile <x86intrin.h> @@ -827,14 +826,14 @@ _mm_cmpord_sd(__m128d __a, __m128d __b) return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b); } -/// \brief Compares the lower double-precision floating-point values in each of +/// Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is "unordered" with respect to the /// corresponding value in the second parameter. /// -/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. A pair of -/// double-precision values are "unordered" with respect to each other if one -/// or both values are NaN. +/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair +/// of double-precision values are "unordered" with respect to each other if +/// one or both values are NaN. 
/// /// \headerfile <x86intrin.h> /// @@ -855,12 +854,12 @@ _mm_cmpunord_sd(__m128d __a, __m128d __b) return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b); } -/// \brief Compares the lower double-precision floating-point values in each of +/// Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is unequal to the corresponding value in /// the second parameter. /// -/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. +/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. /// /// \headerfile <x86intrin.h> /// @@ -880,12 +879,12 @@ _mm_cmpneq_sd(__m128d __a, __m128d __b) return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b); } -/// \brief Compares the lower double-precision floating-point values in each of +/// Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is not less than the corresponding /// value in the second parameter. /// -/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. +/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. /// /// \headerfile <x86intrin.h> /// @@ -905,12 +904,12 @@ _mm_cmpnlt_sd(__m128d __a, __m128d __b) return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b); } -/// \brief Compares the lower double-precision floating-point values in each of +/// Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is not less than or equal to the /// corresponding value in the second parameter. /// -/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. +/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. /// /// \headerfile <x86intrin.h> /// @@ -930,12 +929,12 @@ _mm_cmpnle_sd(__m128d __a, __m128d __b) return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b); } -/// \brief Compares the lower double-precision floating-point values in each of +/// Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is not greater than the corresponding /// value in the second parameter. /// -/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. +/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. /// /// \headerfile <x86intrin.h> /// @@ -953,15 +952,15 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a, __m128d __b) { __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a); - return (__m128d) { __c[0], __a[1] }; + return __extension__ (__m128d) { __c[0], __a[1] }; } -/// \brief Compares the lower double-precision floating-point values in each of +/// Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is not greater than or equal to the /// corresponding value in the second parameter. /// -/// The comparison yields 0h for false, FFFFFFFFFFFFFFFFh for true. +/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 
/// /// \headerfile <x86intrin.h> /// @@ -979,10 +978,10 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a, __m128d __b) { __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a); - return (__m128d) { __c[0], __a[1] }; + return __extension__ (__m128d) { __c[0], __a[1] }; } -/// \brief Compares the lower double-precision floating-point values in each of +/// Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] for equality. /// /// The comparison yields 0 for false, 1 for true. If either of the two @@ -1006,7 +1005,7 @@ _mm_comieq_sd(__m128d __a, __m128d __b) return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b); } -/// \brief Compares the lower double-precision floating-point values in each of +/// Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is less than the corresponding value in /// the second parameter. @@ -1032,7 +1031,7 @@ _mm_comilt_sd(__m128d __a, __m128d __b) return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b); } -/// \brief Compares the lower double-precision floating-point values in each of +/// Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is less than or equal to the /// corresponding value in the second parameter. @@ -1058,7 +1057,7 @@ _mm_comile_sd(__m128d __a, __m128d __b) return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b); } -/// \brief Compares the lower double-precision floating-point values in each of +/// Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is greater than the corresponding value /// in the second parameter. @@ -1084,7 +1083,7 @@ _mm_comigt_sd(__m128d __a, __m128d __b) return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b); } -/// \brief Compares the lower double-precision floating-point values in each of +/// Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is greater than or equal to the /// corresponding value in the second parameter. @@ -1110,7 +1109,7 @@ _mm_comige_sd(__m128d __a, __m128d __b) return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b); } -/// \brief Compares the lower double-precision floating-point values in each of +/// Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is unequal to the corresponding value in /// the second parameter. @@ -1136,7 +1135,7 @@ _mm_comineq_sd(__m128d __a, __m128d __b) return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b); } -/// \brief Compares the lower double-precision floating-point values in each of +/// Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] for equality. The /// comparison yields 0 for false, 1 for true. 
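[Not part of the diff: the practical difference between the two scalar comparison families documented above. _mm_cmpeq_sd produces a bit mask in the low lane, while _mm_comieq_sd (and the _mm_ucomi* variants) return a plain 0/1 int. Requires -msse2; names are ours.]

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  __m128d a = _mm_set_sd(1.5);
  __m128d b = _mm_set_sd(1.5);
  __m128d mask = _mm_cmpeq_sd(a, b);                              /* low lane: all-ones */
  printf("mask low lane set: %d\n", _mm_movemask_pd(mask) & 1);   /* 1 */
  printf("comieq:            %d\n", _mm_comieq_sd(a, b));         /* 1 */
  return 0;
}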
/// @@ -1160,7 +1159,7 @@ _mm_ucomieq_sd(__m128d __a, __m128d __b) return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b); } -/// \brief Compares the lower double-precision floating-point values in each of +/// Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is less than the corresponding value in /// the second parameter. @@ -1186,7 +1185,7 @@ _mm_ucomilt_sd(__m128d __a, __m128d __b) return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b); } -/// \brief Compares the lower double-precision floating-point values in each of +/// Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is less than or equal to the /// corresponding value in the second parameter. @@ -1212,7 +1211,7 @@ _mm_ucomile_sd(__m128d __a, __m128d __b) return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b); } -/// \brief Compares the lower double-precision floating-point values in each of +/// Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is greater than the corresponding value /// in the second parameter. @@ -1238,7 +1237,7 @@ _mm_ucomigt_sd(__m128d __a, __m128d __b) return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b); } -/// \brief Compares the lower double-precision floating-point values in each of +/// Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is greater than or equal to the /// corresponding value in the second parameter. @@ -1264,7 +1263,7 @@ _mm_ucomige_sd(__m128d __a, __m128d __b) return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b); } -/// \brief Compares the lower double-precision floating-point values in each of +/// Compares the lower double-precision floating-point values in each of /// the two 128-bit floating-point vectors of [2 x double] to determine if /// the value in the first parameter is unequal to the corresponding value in /// the second parameter. @@ -1290,7 +1289,7 @@ _mm_ucomineq_sd(__m128d __a, __m128d __b) return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b); } -/// \brief Converts the two double-precision floating-point elements of a +/// Converts the two double-precision floating-point elements of a /// 128-bit vector of [2 x double] into two single-precision floating-point /// values, returned in the lower 64 bits of a 128-bit vector of [4 x float]. /// The upper 64 bits of the result vector are set to zero. @@ -1309,7 +1308,7 @@ _mm_cvtpd_ps(__m128d __a) return __builtin_ia32_cvtpd2ps((__v2df)__a); } -/// \brief Converts the lower two single-precision floating-point elements of a +/// Converts the lower two single-precision floating-point elements of a /// 128-bit vector of [4 x float] into two double-precision floating-point /// values, returned in a 128-bit vector of [2 x double]. The upper two /// elements of the input vector are unused. 
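[Not part of the diff: a round trip through the packed conversions documented above, with _mm_cvtpd_ps narrowing [2 x double] into the low half of [4 x float] and _mm_cvtps_pd widening it back. The chosen values are exactly representable in float, so the trip is lossless. Requires -msse2; names are ours.]

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  __m128d d  = _mm_set_pd(2.25, 1.5);        /* {1.5, 2.25} */
  __m128  f  = _mm_cvtpd_ps(d);              /* {1.5f, 2.25f, 0, 0} */
  __m128d d2 = _mm_cvtps_pd(f);              /* back to {1.5, 2.25} */
  double out[2];
  _mm_storeu_pd(out, d2);
  printf("%g %g\n", out[0], out[1]);         /* 1.5 2.25 */
  return 0;
}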
@@ -1330,7 +1329,7 @@ _mm_cvtps_pd(__m128 __a) __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df); } -/// \brief Converts the lower two integer elements of a 128-bit vector of +/// Converts the lower two integer elements of a 128-bit vector of /// [4 x i32] into two double-precision floating-point values, returned in a /// 128-bit vector of [2 x double]. /// @@ -1353,7 +1352,7 @@ _mm_cvtepi32_pd(__m128i __a) __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df); } -/// \brief Converts the two double-precision floating-point elements of a +/// Converts the two double-precision floating-point elements of a /// 128-bit vector of [2 x double] into two signed 32-bit integer values, /// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper /// 64 bits of the result vector are set to zero. @@ -1372,7 +1371,7 @@ _mm_cvtpd_epi32(__m128d __a) return __builtin_ia32_cvtpd2dq((__v2df)__a); } -/// \brief Converts the low-order element of a 128-bit vector of [2 x double] +/// Converts the low-order element of a 128-bit vector of [2 x double] /// into a 32-bit signed integer value. /// /// \headerfile <x86intrin.h> @@ -1389,7 +1388,7 @@ _mm_cvtsd_si32(__m128d __a) return __builtin_ia32_cvtsd2si((__v2df)__a); } -/// \brief Converts the lower double-precision floating-point element of a +/// Converts the lower double-precision floating-point element of a /// 128-bit vector of [2 x double], in the second parameter, into a /// single-precision floating-point value, returned in the lower 32 bits of a /// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are @@ -1414,7 +1413,7 @@ _mm_cvtsd_ss(__m128 __a, __m128d __b) return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b); } -/// \brief Converts a 32-bit signed integer value, in the second parameter, into +/// Converts a 32-bit signed integer value, in the second parameter, into /// a double-precision floating-point value, returned in the lower 64 bits of /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector /// are copied from the upper 64 bits of the first parameter. @@ -1438,7 +1437,7 @@ _mm_cvtsi32_sd(__m128d __a, int __b) return __a; } -/// \brief Converts the lower single-precision floating-point element of a +/// Converts the lower single-precision floating-point element of a /// 128-bit vector of [4 x float], in the second parameter, into a /// double-precision floating-point value, returned in the lower 64 bits of /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector @@ -1464,7 +1463,7 @@ _mm_cvtss_sd(__m128d __a, __m128 __b) return __a; } -/// \brief Converts the two double-precision floating-point elements of a +/// Converts the two double-precision floating-point elements of a /// 128-bit vector of [2 x double] into two signed 32-bit integer values, /// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. /// @@ -1487,7 +1486,7 @@ _mm_cvttpd_epi32(__m128d __a) return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a); } -/// \brief Converts the low-order element of a [2 x double] vector into a 32-bit +/// Converts the low-order element of a [2 x double] vector into a 32-bit /// signed integer value, truncating the result when it is inexact. 
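[Not part of the diff: the rounding difference between _mm_cvtsd_si32 (current rounding mode, round-to-nearest by default) and the truncating _mm_cvttsd_si32 described above. Requires -msse2.]

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  __m128d v = _mm_set_sd(2.7);
  printf("cvtsd_si32:  %d\n", _mm_cvtsd_si32(v));   /* 3: round to nearest */
  printf("cvttsd_si32: %d\n", _mm_cvttsd_si32(v));  /* 2: truncate toward zero */
  return 0;
}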
/// /// \headerfile <x86intrin.h> @@ -1505,7 +1504,7 @@ _mm_cvttsd_si32(__m128d __a) return __builtin_ia32_cvttsd2si((__v2df)__a); } -/// \brief Converts the two double-precision floating-point elements of a +/// Converts the two double-precision floating-point elements of a /// 128-bit vector of [2 x double] into two signed 32-bit integer values, /// returned in a 64-bit vector of [2 x i32]. /// @@ -1516,13 +1515,13 @@ _mm_cvttsd_si32(__m128d __a) /// \param __a /// A 128-bit vector of [2 x double]. /// \returns A 64-bit vector of [2 x i32] containing the converted values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) { return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a); } -/// \brief Converts the two double-precision floating-point elements of a +/// Converts the two double-precision floating-point elements of a /// 128-bit vector of [2 x double] into two signed 32-bit integer values, /// returned in a 64-bit vector of [2 x i32]. /// @@ -1536,13 +1535,13 @@ _mm_cvtpd_pi32(__m128d __a) /// \param __a /// A 128-bit vector of [2 x double]. /// \returns A 64-bit vector of [2 x i32] containing the converted values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a) { return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a); } -/// \brief Converts the two signed 32-bit integer elements of a 64-bit vector of +/// Converts the two signed 32-bit integer elements of a 64-bit vector of /// [2 x i32] into two double-precision floating-point values, returned in a /// 128-bit vector of [2 x double]. /// @@ -1553,13 +1552,13 @@ _mm_cvttpd_pi32(__m128d __a) /// \param __a /// A 64-bit vector of [2 x i32]. /// \returns A 128-bit vector of [2 x double] containing the converted values. -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a) { return __builtin_ia32_cvtpi2pd((__v2si)__a); } -/// \brief Returns the low-order element of a 128-bit vector of [2 x double] as +/// Returns the low-order element of a 128-bit vector of [2 x double] as /// a double-precision floating-point value. /// /// \headerfile <x86intrin.h> @@ -1576,7 +1575,7 @@ _mm_cvtsd_f64(__m128d __a) return __a[0]; } -/// \brief Loads a 128-bit floating-point vector of [2 x double] from an aligned +/// Loads a 128-bit floating-point vector of [2 x double] from an aligned /// memory location. /// /// \headerfile <x86intrin.h> @@ -1593,7 +1592,7 @@ _mm_load_pd(double const *__dp) return *(__m128d*)__dp; } -/// \brief Loads a double-precision floating-point value from a specified memory +/// Loads a double-precision floating-point value from a specified memory /// location and duplicates it to both vector elements of a 128-bit vector of /// [2 x double]. /// @@ -1612,12 +1611,12 @@ _mm_load1_pd(double const *__dp) double __u; } __attribute__((__packed__, __may_alias__)); double __u = ((struct __mm_load1_pd_struct*)__dp)->__u; - return (__m128d){ __u, __u }; + return __extension__ (__m128d){ __u, __u }; } #define _mm_load_pd1(dp) _mm_load1_pd(dp) -/// \brief Loads two double-precision values, in reverse order, from an aligned +/// Loads two double-precision values, in reverse order, from an aligned /// memory location into a 128-bit vector of [2 x double]. 
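[Not part of the diff: _mm_load1_pd in use. As the { __u, __u } return in its body above shows, it broadcasts a single scalar into both lanes. Requires -msse2; names are ours.]

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  double s = 3.5;
  __m128d v = _mm_load1_pd(&s);
  double out[2];
  _mm_storeu_pd(out, v);
  printf("%g %g\n", out[0], out[1]);   /* 3.5 3.5 */
  return 0;
}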
/// /// \headerfile <x86intrin.h> @@ -1638,7 +1637,7 @@ _mm_loadr_pd(double const *__dp) return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0); } -/// \brief Loads a 128-bit floating-point vector of [2 x double] from an +/// Loads a 128-bit floating-point vector of [2 x double] from an /// unaligned memory location. /// /// \headerfile <x86intrin.h> @@ -1658,7 +1657,7 @@ _mm_loadu_pd(double const *__dp) return ((struct __loadu_pd*)__dp)->__v; } -/// \brief Loads a 64-bit integer value to the low element of a 128-bit integer +/// Loads a 64-bit integer value to the low element of a 128-bit integer /// vector and clears the upper element. /// /// \headerfile <x86intrin.h> @@ -1676,10 +1675,10 @@ _mm_loadu_si64(void const *__a) long long __v; } __attribute__((__packed__, __may_alias__)); long long __u = ((struct __loadu_si64*)__a)->__v; - return (__m128i){__u, 0L}; + return __extension__ (__m128i)(__v2di){__u, 0L}; } -/// \brief Loads a 64-bit double-precision value to the low element of a +/// Loads a 64-bit double-precision value to the low element of a /// 128-bit integer vector and clears the upper element. /// /// \headerfile <x86intrin.h> @@ -1697,10 +1696,10 @@ _mm_load_sd(double const *__dp) double __u; } __attribute__((__packed__, __may_alias__)); double __u = ((struct __mm_load_sd_struct*)__dp)->__u; - return (__m128d){ __u, 0 }; + return __extension__ (__m128d){ __u, 0 }; } -/// \brief Loads a double-precision value into the high-order bits of a 128-bit +/// Loads a double-precision value into the high-order bits of a 128-bit /// vector of [2 x double]. The low-order bits are copied from the low-order /// bits of the first operand. /// @@ -1724,10 +1723,10 @@ _mm_loadh_pd(__m128d __a, double const *__dp) double __u; } __attribute__((__packed__, __may_alias__)); double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u; - return (__m128d){ __a[0], __u }; + return __extension__ (__m128d){ __a[0], __u }; } -/// \brief Loads a double-precision value into the low-order bits of a 128-bit +/// Loads a double-precision value into the low-order bits of a 128-bit /// vector of [2 x double]. The high-order bits are copied from the /// high-order bits of the first operand. /// @@ -1751,10 +1750,10 @@ _mm_loadl_pd(__m128d __a, double const *__dp) double __u; } __attribute__((__packed__, __may_alias__)); double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u; - return (__m128d){ __u, __a[1] }; + return __extension__ (__m128d){ __u, __a[1] }; } -/// \brief Constructs a 128-bit floating-point vector of [2 x double] with +/// Constructs a 128-bit floating-point vector of [2 x double] with /// unspecified content. This could be used as an argument to another /// intrinsic function where the argument is required but the value is not /// actually used. @@ -1771,7 +1770,7 @@ _mm_undefined_pd(void) return (__m128d)__builtin_ia32_undef128(); } -/// \brief Constructs a 128-bit floating-point vector of [2 x double]. The lower +/// Constructs a 128-bit floating-point vector of [2 x double]. The lower /// 64 bits of the vector are initialized with the specified double-precision /// floating-point value. The upper 64 bits are set to zero. 
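[Not part of the diff: composing a vector one lane at a time with _mm_loadl_pd and _mm_loadh_pd, matching the { __u, __a[1] } / { __a[0], __u } bodies above. Requires -msse2; names are ours.]

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  double lo = 1.0, hi = 2.0;
  __m128d v = _mm_setzero_pd();
  v = _mm_loadl_pd(v, &lo);            /* element 0 <- lo, element 1 kept */
  v = _mm_loadh_pd(v, &hi);            /* element 1 <- hi, element 0 kept */
  double out[2];
  _mm_storeu_pd(out, v);
  printf("%g %g\n", out[0], out[1]);   /* 1 2 */
  return 0;
}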
/// @@ -1788,10 +1787,10 @@ _mm_undefined_pd(void) static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w) { - return (__m128d){ __w, 0 }; + return __extension__ (__m128d){ __w, 0 }; } -/// \brief Constructs a 128-bit floating-point vector of [2 x double], with each +/// Constructs a 128-bit floating-point vector of [2 x double], with each /// of the two double-precision floating-point vector elements set to the /// specified double-precision floating-point value. /// @@ -1806,10 +1805,10 @@ _mm_set_sd(double __w) static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w) { - return (__m128d){ __w, __w }; + return __extension__ (__m128d){ __w, __w }; } -/// \brief Constructs a 128-bit floating-point vector of [2 x double], with each +/// Constructs a 128-bit floating-point vector of [2 x double], with each /// of the two double-precision floating-point vector elements set to the /// specified double-precision floating-point value. /// @@ -1827,7 +1826,7 @@ _mm_set_pd1(double __w) return _mm_set1_pd(__w); } -/// \brief Constructs a 128-bit floating-point vector of [2 x double] +/// Constructs a 128-bit floating-point vector of [2 x double] /// initialized with the specified double-precision floating-point values. /// /// \headerfile <x86intrin.h> @@ -1844,10 +1843,10 @@ _mm_set_pd1(double __w) static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w, double __x) { - return (__m128d){ __x, __w }; + return __extension__ (__m128d){ __x, __w }; } -/// \brief Constructs a 128-bit floating-point vector of [2 x double], +/// Constructs a 128-bit floating-point vector of [2 x double], /// initialized in reverse order with the specified double-precision /// floating-point values. /// @@ -1865,10 +1864,10 @@ _mm_set_pd(double __w, double __x) static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w, double __x) { - return (__m128d){ __w, __x }; + return __extension__ (__m128d){ __w, __x }; } -/// \brief Constructs a 128-bit floating-point vector of [2 x double] +/// Constructs a 128-bit floating-point vector of [2 x double] /// initialized to zero. /// /// \headerfile <x86intrin.h> @@ -1880,10 +1879,10 @@ _mm_setr_pd(double __w, double __x) static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void) { - return (__m128d){ 0, 0 }; + return __extension__ (__m128d){ 0, 0 }; } -/// \brief Constructs a 128-bit floating-point vector of [2 x double]. The lower +/// Constructs a 128-bit floating-point vector of [2 x double]. The lower /// 64 bits are set to the lower 64 bits of the second parameter. The upper /// 64 bits are set to the upper 64 bits of the first parameter. /// @@ -1901,10 +1900,11 @@ _mm_setzero_pd(void) static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a, __m128d __b) { - return (__m128d){ __b[0], __a[1] }; + __a[0] = __b[0]; + return __a; } -/// \brief Stores the lower 64 bits of a 128-bit vector of [2 x double] to a +/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a /// memory location. /// /// \headerfile <x86intrin.h> @@ -1924,7 +1924,7 @@ _mm_store_sd(double *__dp, __m128d __a) ((struct __mm_store_sd_struct*)__dp)->__u = __a[0]; } -/// \brief Moves packed double-precision values from a 128-bit vector of +/// Moves packed double-precision values from a 128-bit vector of /// [2 x double] to a memory location. 
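A short sketch of the [2 x double] constructors above (illustrative values only): _mm_set_pd takes the high element first, _mm_setr_pd takes memory order, and _mm_move_sd takes its low element from the second operand and its high element from the first:

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  __m128d a = _mm_set_pd(2.0, 1.0);    /* element 0 = 1.0, element 1 = 2.0 */
  __m128d b = _mm_setr_pd(1.0, 2.0);   /* the same vector, arguments reversed */

  __m128d m = _mm_move_sd(a, _mm_set_sd(9.0));   /* { 9.0, 2.0 } */

  double lo;
  _mm_store_sd(&lo, m);                /* writes only the low element */
  printf("lo=%f  a and b identical: %d\n", lo,
         _mm_movemask_pd(_mm_cmpeq_pd(a, b)) == 0x3);
  return 0;
}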
/// /// \headerfile <x86intrin.h> @@ -1943,7 +1943,7 @@ _mm_store_pd(double *__dp, __m128d __a) *(__m128d*)__dp = __a; } -/// \brief Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to +/// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to /// the upper and lower 64 bits of a memory location. /// /// \headerfile <x86intrin.h> @@ -1964,7 +1964,7 @@ _mm_store1_pd(double *__dp, __m128d __a) _mm_store_pd(__dp, __a); } -/// \brief Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to +/// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to /// the upper and lower 64 bits of a memory location. /// /// \headerfile <x86intrin.h> @@ -1981,10 +1981,10 @@ _mm_store1_pd(double *__dp, __m128d __a) static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp, __m128d __a) { - return _mm_store1_pd(__dp, __a); + _mm_store1_pd(__dp, __a); } -/// \brief Stores a 128-bit vector of [2 x double] into an unaligned memory +/// Stores a 128-bit vector of [2 x double] into an unaligned memory /// location. /// /// \headerfile <x86intrin.h> @@ -2005,7 +2005,7 @@ _mm_storeu_pd(double *__dp, __m128d __a) ((struct __storeu_pd*)__dp)->__v = __a; } -/// \brief Stores two double-precision values, in reverse order, from a 128-bit +/// Stores two double-precision values, in reverse order, from a 128-bit /// vector of [2 x double] to a 16-byte aligned memory location. /// /// \headerfile <x86intrin.h> @@ -2026,7 +2026,7 @@ _mm_storer_pd(double *__dp, __m128d __a) *(__m128d *)__dp = __a; } -/// \brief Stores the upper 64 bits of a 128-bit vector of [2 x double] to a +/// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a /// memory location. /// /// \headerfile <x86intrin.h> @@ -2046,7 +2046,7 @@ _mm_storeh_pd(double *__dp, __m128d __a) ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1]; } -/// \brief Stores the lower 64 bits of a 128-bit vector of [2 x double] to a +/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a /// memory location. /// /// \headerfile <x86intrin.h> @@ -2066,7 +2066,7 @@ _mm_storel_pd(double *__dp, __m128d __a) ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0]; } -/// \brief Adds the corresponding elements of two 128-bit vectors of [16 x i8], +/// Adds the corresponding elements of two 128-bit vectors of [16 x i8], /// saving the lower 8 bits of each sum in the corresponding element of a /// 128-bit result vector of [16 x i8]. /// @@ -2088,7 +2088,7 @@ _mm_add_epi8(__m128i __a, __m128i __b) return (__m128i)((__v16qu)__a + (__v16qu)__b); } -/// \brief Adds the corresponding elements of two 128-bit vectors of [8 x i16], +/// Adds the corresponding elements of two 128-bit vectors of [8 x i16], /// saving the lower 16 bits of each sum in the corresponding element of a /// 128-bit result vector of [8 x i16]. /// @@ -2110,7 +2110,7 @@ _mm_add_epi16(__m128i __a, __m128i __b) return (__m128i)((__v8hu)__a + (__v8hu)__b); } -/// \brief Adds the corresponding elements of two 128-bit vectors of [4 x i32], +/// Adds the corresponding elements of two 128-bit vectors of [4 x i32], /// saving the lower 32 bits of each sum in the corresponding element of a /// 128-bit result vector of [4 x i32]. /// @@ -2132,7 +2132,7 @@ _mm_add_epi32(__m128i __a, __m128i __b) return (__m128i)((__v4su)__a + (__v4su)__b); } -/// \brief Adds two signed or unsigned 64-bit integer values, returning the +/// Adds two signed or unsigned 64-bit integer values, returning the /// lower 64 bits of the sum. 
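The packed integer additions above wrap modulo the lane width; a sketch with hypothetical values showing _mm_add_epi32 wrapping in its top lane:

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  __m128i a = _mm_set_epi32(0x7FFFFFFF, 3, 2, 1);  /* highest element first */
  __m128i b = _mm_set_epi32(1, 30, 20, 10);
  __m128i sum = _mm_add_epi32(a, b);

  int out[4];
  _mm_storeu_si128((__m128i *)out, sum);
  /* Prints 11 22 33 -2147483648: the top lane overflowed and wrapped. */
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
  return 0;
}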
/// /// \headerfile <x86intrin.h> @@ -2144,13 +2144,13 @@ _mm_add_epi32(__m128i __a, __m128i __b) /// \param __b /// A 64-bit integer. /// \returns A 64-bit integer containing the sum of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b); } -/// \brief Adds the corresponding elements of two 128-bit vectors of [2 x i64], +/// Adds the corresponding elements of two 128-bit vectors of [2 x i64], /// saving the lower 64 bits of each sum in the corresponding element of a /// 128-bit result vector of [2 x i64]. /// @@ -2172,10 +2172,10 @@ _mm_add_epi64(__m128i __a, __m128i __b) return (__m128i)((__v2du)__a + (__v2du)__b); } -/// \brief Adds, with saturation, the corresponding elements of two 128-bit +/// Adds, with saturation, the corresponding elements of two 128-bit /// signed [16 x i8] vectors, saving each sum in the corresponding element of -/// a 128-bit result vector of [16 x i8]. Positive sums greater than 7Fh are -/// saturated to 7Fh. Negative sums less than 80h are saturated to 80h. +/// a 128-bit result vector of [16 x i8]. Positive sums greater than 0x7F are +/// saturated to 0x7F. Negative sums less than 0x80 are saturated to 0x80. /// /// \headerfile <x86intrin.h> /// @@ -2193,11 +2193,11 @@ _mm_adds_epi8(__m128i __a, __m128i __b) return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b); } -/// \brief Adds, with saturation, the corresponding elements of two 128-bit +/// Adds, with saturation, the corresponding elements of two 128-bit /// signed [8 x i16] vectors, saving each sum in the corresponding element of -/// a 128-bit result vector of [8 x i16]. Positive sums greater than 7FFFh -/// are saturated to 7FFFh. Negative sums less than 8000h are saturated to -/// 8000h. +/// a 128-bit result vector of [8 x i16]. Positive sums greater than 0x7FFF +/// are saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to +/// 0x8000. /// /// \headerfile <x86intrin.h> /// @@ -2215,10 +2215,10 @@ _mm_adds_epi16(__m128i __a, __m128i __b) return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b); } -/// \brief Adds, with saturation, the corresponding elements of two 128-bit +/// Adds, with saturation, the corresponding elements of two 128-bit /// unsigned [16 x i8] vectors, saving each sum in the corresponding element -/// of a 128-bit result vector of [16 x i8]. Positive sums greater than FFh -/// are saturated to FFh. Negative sums are saturated to 00h. +/// of a 128-bit result vector of [16 x i8]. Positive sums greater than 0xFF +/// are saturated to 0xFF. Negative sums are saturated to 0x00. /// /// \headerfile <x86intrin.h> /// @@ -2236,10 +2236,10 @@ _mm_adds_epu8(__m128i __a, __m128i __b) return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b); } -/// \brief Adds, with saturation, the corresponding elements of two 128-bit +/// Adds, with saturation, the corresponding elements of two 128-bit /// unsigned [8 x i16] vectors, saving each sum in the corresponding element -/// of a 128-bit result vector of [8 x i16]. Positive sums greater than FFFFh -/// are saturated to FFFFh. Negative sums are saturated to 0000h. +/// of a 128-bit result vector of [8 x i16]. Positive sums greater than +/// 0xFFFF are saturated to 0xFFFF. Negative sums are saturated to 0x0000. 
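By contrast, the saturating forms clamp instead of wrapping. A small comparison of _mm_add_epi8 and _mm_adds_epu8 on the same bytes (values chosen purely for illustration):

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  __m128i a = _mm_set1_epi8((char)250);
  __m128i b = _mm_set1_epi8(10);

  unsigned char wrap[16], sat[16];
  _mm_storeu_si128((__m128i *)wrap, _mm_add_epi8(a, b));   /* 250 + 10 wraps to 4 */
  _mm_storeu_si128((__m128i *)sat,  _mm_adds_epu8(a, b));  /* clamps to 0xFF = 255 */
  printf("wrapping: %d  saturating: %d\n", wrap[0], sat[0]);
  return 0;
}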
/// /// \headerfile <x86intrin.h> /// @@ -2257,7 +2257,7 @@ _mm_adds_epu16(__m128i __a, __m128i __b) return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b); } -/// \brief Computes the rounded avarages of corresponding elements of two +/// Computes the rounded avarages of corresponding elements of two /// 128-bit unsigned [16 x i8] vectors, saving each result in the /// corresponding element of a 128-bit result vector of [16 x i8]. /// @@ -2281,7 +2281,7 @@ _mm_avg_epu8(__m128i __a, __m128i __b) >> 1, __v16qu); } -/// \brief Computes the rounded avarages of corresponding elements of two +/// Computes the rounded avarages of corresponding elements of two /// 128-bit unsigned [8 x i16] vectors, saving each result in the /// corresponding element of a 128-bit result vector of [8 x i16]. /// @@ -2305,7 +2305,7 @@ _mm_avg_epu16(__m128i __a, __m128i __b) >> 1, __v8hu); } -/// \brief Multiplies the corresponding elements of two 128-bit signed [8 x i16] +/// Multiplies the corresponding elements of two 128-bit signed [8 x i16] /// vectors, producing eight intermediate 32-bit signed integer products, and /// adds the consecutive pairs of 32-bit products to form a 128-bit signed /// [4 x i32] vector. @@ -2331,7 +2331,7 @@ _mm_madd_epi16(__m128i __a, __m128i __b) return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b); } -/// \brief Compares corresponding elements of two 128-bit signed [8 x i16] +/// Compares corresponding elements of two 128-bit signed [8 x i16] /// vectors, saving the greater value from each comparison in the /// corresponding element of a 128-bit result vector of [8 x i16]. /// @@ -2351,7 +2351,7 @@ _mm_max_epi16(__m128i __a, __m128i __b) return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b); } -/// \brief Compares corresponding elements of two 128-bit unsigned [16 x i8] +/// Compares corresponding elements of two 128-bit unsigned [16 x i8] /// vectors, saving the greater value from each comparison in the /// corresponding element of a 128-bit result vector of [16 x i8]. /// @@ -2371,7 +2371,7 @@ _mm_max_epu8(__m128i __a, __m128i __b) return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b); } -/// \brief Compares corresponding elements of two 128-bit signed [8 x i16] +/// Compares corresponding elements of two 128-bit signed [8 x i16] /// vectors, saving the smaller value from each comparison in the /// corresponding element of a 128-bit result vector of [8 x i16]. /// @@ -2391,7 +2391,7 @@ _mm_min_epi16(__m128i __a, __m128i __b) return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b); } -/// \brief Compares corresponding elements of two 128-bit unsigned [16 x i8] +/// Compares corresponding elements of two 128-bit unsigned [16 x i8] /// vectors, saving the smaller value from each comparison in the /// corresponding element of a 128-bit result vector of [16 x i8]. /// @@ -2411,7 +2411,7 @@ _mm_min_epu8(__m128i __a, __m128i __b) return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b); } -/// \brief Multiplies the corresponding elements of two signed [8 x i16] +/// Multiplies the corresponding elements of two signed [8 x i16] /// vectors, saving the upper 16 bits of each 32-bit product in the /// corresponding element of a 128-bit signed [8 x i16] result vector. 
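A sketch of the multiply-add step described above: _mm_madd_epi16 multiplies 16-bit lane pairs and sums adjacent products into four 32-bit lanes, the core of a fixed-point dot product (hypothetical inputs; the final reduction is done in scalar code for brevity):

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
  __m128i b = _mm_set1_epi16(10);
  __m128i pairs = _mm_madd_epi16(a, b);   /* { 30, 70, 110, 150 } */

  int out[4];
  _mm_storeu_si128((__m128i *)out, pairs);
  printf("%d %d %d %d -> dot product %d\n", out[0], out[1], out[2], out[3],
         out[0] + out[1] + out[2] + out[3]);
  return 0;
}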
/// @@ -2431,7 +2431,7 @@ _mm_mulhi_epi16(__m128i __a, __m128i __b) return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b); } -/// \brief Multiplies the corresponding elements of two unsigned [8 x i16] +/// Multiplies the corresponding elements of two unsigned [8 x i16] /// vectors, saving the upper 16 bits of each 32-bit product in the /// corresponding element of a 128-bit unsigned [8 x i16] result vector. /// @@ -2451,7 +2451,7 @@ _mm_mulhi_epu16(__m128i __a, __m128i __b) return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b); } -/// \brief Multiplies the corresponding elements of two signed [8 x i16] +/// Multiplies the corresponding elements of two signed [8 x i16] /// vectors, saving the lower 16 bits of each 32-bit product in the /// corresponding element of a 128-bit signed [8 x i16] result vector. /// @@ -2471,7 +2471,7 @@ _mm_mullo_epi16(__m128i __a, __m128i __b) return (__m128i)((__v8hu)__a * (__v8hu)__b); } -/// \brief Multiplies 32-bit unsigned integer values contained in the lower bits +/// Multiplies 32-bit unsigned integer values contained in the lower bits /// of the two 64-bit integer vectors and returns the 64-bit unsigned /// product. /// @@ -2484,13 +2484,13 @@ _mm_mullo_epi16(__m128i __a, __m128i __b) /// \param __b /// A 64-bit integer containing one of the source operands. /// \returns A 64-bit integer vector containing the product of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a, __m64 __b) { return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b); } -/// \brief Multiplies 32-bit unsigned integer values contained in the lower +/// Multiplies 32-bit unsigned integer values contained in the lower /// bits of the corresponding elements of two [2 x i64] vectors, and returns /// the 64-bit products in the corresponding elements of a [2 x i64] vector. /// @@ -2509,7 +2509,7 @@ _mm_mul_epu32(__m128i __a, __m128i __b) return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b); } -/// \brief Computes the absolute differences of corresponding 8-bit integer +/// Computes the absolute differences of corresponding 8-bit integer /// values in two 128-bit vectors. Sums the first 8 absolute differences, and /// separately sums the second 8 absolute differences. Packs these two /// unsigned 16-bit integer sums into the upper and lower elements of a @@ -2531,7 +2531,7 @@ _mm_sad_epu8(__m128i __a, __m128i __b) return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b); } -/// \brief Subtracts the corresponding 8-bit integer values in the operands. +/// Subtracts the corresponding 8-bit integer values in the operands. /// /// \headerfile <x86intrin.h> /// @@ -2549,7 +2549,7 @@ _mm_sub_epi8(__m128i __a, __m128i __b) return (__m128i)((__v16qu)__a - (__v16qu)__b); } -/// \brief Subtracts the corresponding 16-bit integer values in the operands. +/// Subtracts the corresponding 16-bit integer values in the operands. /// /// \headerfile <x86intrin.h> /// @@ -2567,7 +2567,7 @@ _mm_sub_epi16(__m128i __a, __m128i __b) return (__m128i)((__v8hu)__a - (__v8hu)__b); } -/// \brief Subtracts the corresponding 32-bit integer values in the operands. +/// Subtracts the corresponding 32-bit integer values in the operands. 
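A sketch of _mm_sad_epu8: the absolute byte differences are summed separately for bytes 0-7 and bytes 8-15, and each 16-bit sum lands in the low bits of one 64-bit lane (uniform inputs keep the expected result easy to check):

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  __m128i a = _mm_set1_epi8(9);
  __m128i b = _mm_set1_epi8(5);
  __m128i sad = _mm_sad_epu8(a, b);   /* |9 - 5| summed over 8 bytes per half */

  long long out[2];
  _mm_storeu_si128((__m128i *)out, sad);
  printf("low half: %lld  high half: %lld\n", out[0], out[1]);  /* 32 and 32 */
  return 0;
}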
/// /// \headerfile <x86intrin.h> /// @@ -2585,7 +2585,7 @@ _mm_sub_epi32(__m128i __a, __m128i __b) return (__m128i)((__v4su)__a - (__v4su)__b); } -/// \brief Subtracts signed or unsigned 64-bit integer values and writes the +/// Subtracts signed or unsigned 64-bit integer values and writes the /// difference to the corresponding bits in the destination. /// /// \headerfile <x86intrin.h> @@ -2598,13 +2598,13 @@ _mm_sub_epi32(__m128i __a, __m128i __b) /// A 64-bit integer vector containing the subtrahend. /// \returns A 64-bit integer vector containing the difference of the values in /// the operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b); } -/// \brief Subtracts the corresponding elements of two [2 x i64] vectors. +/// Subtracts the corresponding elements of two [2 x i64] vectors. /// /// \headerfile <x86intrin.h> /// @@ -2622,10 +2622,10 @@ _mm_sub_epi64(__m128i __a, __m128i __b) return (__m128i)((__v2du)__a - (__v2du)__b); } -/// \brief Subtracts corresponding 8-bit signed integer values in the input and +/// Subtracts corresponding 8-bit signed integer values in the input and /// returns the differences in the corresponding bytes in the destination. -/// Differences greater than 7Fh are saturated to 7Fh, and differences less -/// than 80h are saturated to 80h. +/// Differences greater than 0x7F are saturated to 0x7F, and differences less +/// than 0x80 are saturated to 0x80. /// /// \headerfile <x86intrin.h> /// @@ -2643,10 +2643,10 @@ _mm_subs_epi8(__m128i __a, __m128i __b) return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b); } -/// \brief Subtracts corresponding 16-bit signed integer values in the input and +/// Subtracts corresponding 16-bit signed integer values in the input and /// returns the differences in the corresponding bytes in the destination. -/// Differences greater than 7FFFh are saturated to 7FFFh, and values less -/// than 8000h are saturated to 8000h. +/// Differences greater than 0x7FFF are saturated to 0x7FFF, and values less +/// than 0x8000 are saturated to 0x8000. /// /// \headerfile <x86intrin.h> /// @@ -2664,9 +2664,9 @@ _mm_subs_epi16(__m128i __a, __m128i __b) return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b); } -/// \brief Subtracts corresponding 8-bit unsigned integer values in the input +/// Subtracts corresponding 8-bit unsigned integer values in the input /// and returns the differences in the corresponding bytes in the -/// destination. Differences less than 00h are saturated to 00h. +/// destination. Differences less than 0x00 are saturated to 0x00. /// /// \headerfile <x86intrin.h> /// @@ -2684,9 +2684,9 @@ _mm_subs_epu8(__m128i __a, __m128i __b) return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b); } -/// \brief Subtracts corresponding 16-bit unsigned integer values in the input +/// Subtracts corresponding 16-bit unsigned integer values in the input /// and returns the differences in the corresponding bytes in the -/// destination. Differences less than 0000h are saturated to 0000h. +/// destination. Differences less than 0x0000 are saturated to 0x0000. /// /// \headerfile <x86intrin.h> /// @@ -2704,7 +2704,7 @@ _mm_subs_epu16(__m128i __a, __m128i __b) return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b); } -/// \brief Performs a bitwise AND of two 128-bit integer vectors. +/// Performs a bitwise AND of two 128-bit integer vectors. 
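One common combination of the bitwise intrinsics above is the branchless select idiom (mask & a) | (~mask & b), where _mm_andnot_si128 supplies the complemented half. A sketch with a hypothetical per-lane mask:

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  __m128i a    = _mm_setr_epi32(1, 2, 3, 4);
  __m128i b    = _mm_setr_epi32(10, 20, 30, 40);
  __m128i mask = _mm_setr_epi32(-1, 0, -1, 0);   /* all-ones lanes pick a */

  __m128i sel = _mm_or_si128(_mm_and_si128(mask, a),
                             _mm_andnot_si128(mask, b));
  int out[4];
  _mm_storeu_si128((__m128i *)out, sel);
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);   /* 1 20 3 40 */
  return 0;
}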
/// /// \headerfile <x86intrin.h> /// @@ -2722,7 +2722,7 @@ _mm_and_si128(__m128i __a, __m128i __b) return (__m128i)((__v2du)__a & (__v2du)__b); } -/// \brief Performs a bitwise AND of two 128-bit integer vectors, using the +/// Performs a bitwise AND of two 128-bit integer vectors, using the /// one's complement of the values contained in the first source operand. /// /// \headerfile <x86intrin.h> @@ -2741,7 +2741,7 @@ _mm_andnot_si128(__m128i __a, __m128i __b) { return (__m128i)(~(__v2du)__a & (__v2du)__b); } -/// \brief Performs a bitwise OR of two 128-bit integer vectors. +/// Performs a bitwise OR of two 128-bit integer vectors. /// /// \headerfile <x86intrin.h> /// @@ -2759,7 +2759,7 @@ _mm_or_si128(__m128i __a, __m128i __b) return (__m128i)((__v2du)__a | (__v2du)__b); } -/// \brief Performs a bitwise exclusive OR of two 128-bit integer vectors. +/// Performs a bitwise exclusive OR of two 128-bit integer vectors. /// /// \headerfile <x86intrin.h> /// @@ -2777,7 +2777,7 @@ _mm_xor_si128(__m128i __a, __m128i __b) return (__m128i)((__v2du)__a ^ (__v2du)__b); } -/// \brief Left-shifts the 128-bit integer vector operand by the specified +/// Left-shifts the 128-bit integer vector operand by the specified /// number of bytes. Low-order bits are cleared. /// /// \headerfile <x86intrin.h> @@ -2794,31 +2794,13 @@ _mm_xor_si128(__m128i __a, __m128i __b) /// An immediate value specifying the number of bytes to left-shift operand /// \a a. /// \returns A 128-bit integer vector containing the left-shifted value. -#define _mm_slli_si128(a, imm) __extension__ ({ \ - (__m128i)__builtin_shufflevector( \ - (__v16qi)_mm_setzero_si128(), \ - (__v16qi)(__m128i)(a), \ - ((char)(imm)&0xF0) ? 0 : 16 - (char)(imm), \ - ((char)(imm)&0xF0) ? 1 : 17 - (char)(imm), \ - ((char)(imm)&0xF0) ? 2 : 18 - (char)(imm), \ - ((char)(imm)&0xF0) ? 3 : 19 - (char)(imm), \ - ((char)(imm)&0xF0) ? 4 : 20 - (char)(imm), \ - ((char)(imm)&0xF0) ? 5 : 21 - (char)(imm), \ - ((char)(imm)&0xF0) ? 6 : 22 - (char)(imm), \ - ((char)(imm)&0xF0) ? 7 : 23 - (char)(imm), \ - ((char)(imm)&0xF0) ? 8 : 24 - (char)(imm), \ - ((char)(imm)&0xF0) ? 9 : 25 - (char)(imm), \ - ((char)(imm)&0xF0) ? 10 : 26 - (char)(imm), \ - ((char)(imm)&0xF0) ? 11 : 27 - (char)(imm), \ - ((char)(imm)&0xF0) ? 12 : 28 - (char)(imm), \ - ((char)(imm)&0xF0) ? 13 : 29 - (char)(imm), \ - ((char)(imm)&0xF0) ? 14 : 30 - (char)(imm), \ - ((char)(imm)&0xF0) ? 15 : 31 - (char)(imm)); }) +#define _mm_slli_si128(a, imm) \ + (__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)) #define _mm_bslli_si128(a, imm) \ - _mm_slli_si128((a), (imm)) + (__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)) -/// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand +/// Left-shifts each 16-bit value in the 128-bit integer vector operand /// by the specified number of bits. Low-order bits are cleared. /// /// \headerfile <x86intrin.h> @@ -2837,7 +2819,7 @@ _mm_slli_epi16(__m128i __a, int __count) return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count); } -/// \brief Left-shifts each 16-bit value in the 128-bit integer vector operand +/// Left-shifts each 16-bit value in the 128-bit integer vector operand /// by the specified number of bits. Low-order bits are cleared. 
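A sketch of the byte-granularity shift above; note that the count of _mm_slli_si128 (and the other *_si128 byte shifts) must be an integer constant, since it is encoded as an immediate:

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  __m128i a = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8,
                            9, 10, 11, 12, 13, 14, 15, 16);
  __m128i shifted = _mm_slli_si128(a, 4);   /* low four bytes become zero */

  unsigned char out[16];
  _mm_storeu_si128((__m128i *)out, shifted);
  printf("%d %d %d %d %d\n", out[0], out[1], out[2], out[3], out[4]);  /* 0 0 0 0 1 */
  return 0;
}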
/// /// \headerfile <x86intrin.h> @@ -2856,7 +2838,7 @@ _mm_sll_epi16(__m128i __a, __m128i __count) return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count); } -/// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand +/// Left-shifts each 32-bit value in the 128-bit integer vector operand /// by the specified number of bits. Low-order bits are cleared. /// /// \headerfile <x86intrin.h> @@ -2875,7 +2857,7 @@ _mm_slli_epi32(__m128i __a, int __count) return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count); } -/// \brief Left-shifts each 32-bit value in the 128-bit integer vector operand +/// Left-shifts each 32-bit value in the 128-bit integer vector operand /// by the specified number of bits. Low-order bits are cleared. /// /// \headerfile <x86intrin.h> @@ -2894,7 +2876,7 @@ _mm_sll_epi32(__m128i __a, __m128i __count) return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count); } -/// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand +/// Left-shifts each 64-bit value in the 128-bit integer vector operand /// by the specified number of bits. Low-order bits are cleared. /// /// \headerfile <x86intrin.h> @@ -2913,7 +2895,7 @@ _mm_slli_epi64(__m128i __a, int __count) return __builtin_ia32_psllqi128((__v2di)__a, __count); } -/// \brief Left-shifts each 64-bit value in the 128-bit integer vector operand +/// Left-shifts each 64-bit value in the 128-bit integer vector operand /// by the specified number of bits. Low-order bits are cleared. /// /// \headerfile <x86intrin.h> @@ -2932,7 +2914,7 @@ _mm_sll_epi64(__m128i __a, __m128i __count) return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count); } -/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand +/// Right-shifts each 16-bit value in the 128-bit integer vector operand /// by the specified number of bits. High-order bits are filled with the sign /// bit of the initial value. /// @@ -2952,7 +2934,7 @@ _mm_srai_epi16(__m128i __a, int __count) return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count); } -/// \brief Right-shifts each 16-bit value in the 128-bit integer vector operand +/// Right-shifts each 16-bit value in the 128-bit integer vector operand /// by the specified number of bits. High-order bits are filled with the sign /// bit of the initial value. /// @@ -2972,7 +2954,7 @@ _mm_sra_epi16(__m128i __a, __m128i __count) return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count); } -/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand +/// Right-shifts each 32-bit value in the 128-bit integer vector operand /// by the specified number of bits. High-order bits are filled with the sign /// bit of the initial value. /// @@ -2992,7 +2974,7 @@ _mm_srai_epi32(__m128i __a, int __count) return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count); } -/// \brief Right-shifts each 32-bit value in the 128-bit integer vector operand +/// Right-shifts each 32-bit value in the 128-bit integer vector operand /// by the specified number of bits. High-order bits are filled with the sign /// bit of the initial value. /// @@ -3012,7 +2994,7 @@ _mm_sra_epi32(__m128i __a, __m128i __count) return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count); } -/// \brief Right-shifts the 128-bit integer vector operand by the specified +/// Right-shifts the 128-bit integer vector operand by the specified /// number of bytes. High-order bits are cleared. 
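The difference between the arithmetic (_mm_srai_*) and logical (_mm_srli_*) right shifts above only shows on negative lanes; a small comparison sketch:

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  __m128i v = _mm_setr_epi32(-16, 16, -1, 256);

  int arith[4], logic[4];
  _mm_storeu_si128((__m128i *)arith, _mm_srai_epi32(v, 2));  /* shifts in the sign bit */
  _mm_storeu_si128((__m128i *)logic, _mm_srli_epi32(v, 2));  /* shifts in zeros */

  printf("arithmetic: %d %d %d %d\n", arith[0], arith[1], arith[2], arith[3]);
  printf("logical:    %d %d %d %d\n", logic[0], logic[1], logic[2], logic[3]);
  return 0;
}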
/// /// \headerfile <x86intrin.h> @@ -3029,31 +3011,13 @@ _mm_sra_epi32(__m128i __a, __m128i __count) /// An immediate value specifying the number of bytes to right-shift operand /// \a a. /// \returns A 128-bit integer vector containing the right-shifted value. -#define _mm_srli_si128(a, imm) __extension__ ({ \ - (__m128i)__builtin_shufflevector( \ - (__v16qi)(__m128i)(a), \ - (__v16qi)_mm_setzero_si128(), \ - ((char)(imm)&0xF0) ? 16 : (char)(imm) + 0, \ - ((char)(imm)&0xF0) ? 17 : (char)(imm) + 1, \ - ((char)(imm)&0xF0) ? 18 : (char)(imm) + 2, \ - ((char)(imm)&0xF0) ? 19 : (char)(imm) + 3, \ - ((char)(imm)&0xF0) ? 20 : (char)(imm) + 4, \ - ((char)(imm)&0xF0) ? 21 : (char)(imm) + 5, \ - ((char)(imm)&0xF0) ? 22 : (char)(imm) + 6, \ - ((char)(imm)&0xF0) ? 23 : (char)(imm) + 7, \ - ((char)(imm)&0xF0) ? 24 : (char)(imm) + 8, \ - ((char)(imm)&0xF0) ? 25 : (char)(imm) + 9, \ - ((char)(imm)&0xF0) ? 26 : (char)(imm) + 10, \ - ((char)(imm)&0xF0) ? 27 : (char)(imm) + 11, \ - ((char)(imm)&0xF0) ? 28 : (char)(imm) + 12, \ - ((char)(imm)&0xF0) ? 29 : (char)(imm) + 13, \ - ((char)(imm)&0xF0) ? 30 : (char)(imm) + 14, \ - ((char)(imm)&0xF0) ? 31 : (char)(imm) + 15); }) +#define _mm_srli_si128(a, imm) \ + (__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)) #define _mm_bsrli_si128(a, imm) \ - _mm_srli_si128((a), (imm)) + (__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)) -/// \brief Right-shifts each of 16-bit values in the 128-bit integer vector +/// Right-shifts each of 16-bit values in the 128-bit integer vector /// operand by the specified number of bits. High-order bits are cleared. /// /// \headerfile <x86intrin.h> @@ -3072,7 +3036,7 @@ _mm_srli_epi16(__m128i __a, int __count) return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count); } -/// \brief Right-shifts each of 16-bit values in the 128-bit integer vector +/// Right-shifts each of 16-bit values in the 128-bit integer vector /// operand by the specified number of bits. High-order bits are cleared. /// /// \headerfile <x86intrin.h> @@ -3091,7 +3055,7 @@ _mm_srl_epi16(__m128i __a, __m128i __count) return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count); } -/// \brief Right-shifts each of 32-bit values in the 128-bit integer vector +/// Right-shifts each of 32-bit values in the 128-bit integer vector /// operand by the specified number of bits. High-order bits are cleared. /// /// \headerfile <x86intrin.h> @@ -3110,7 +3074,7 @@ _mm_srli_epi32(__m128i __a, int __count) return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count); } -/// \brief Right-shifts each of 32-bit values in the 128-bit integer vector +/// Right-shifts each of 32-bit values in the 128-bit integer vector /// operand by the specified number of bits. High-order bits are cleared. /// /// \headerfile <x86intrin.h> @@ -3129,7 +3093,7 @@ _mm_srl_epi32(__m128i __a, __m128i __count) return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count); } -/// \brief Right-shifts each of 64-bit values in the 128-bit integer vector +/// Right-shifts each of 64-bit values in the 128-bit integer vector /// operand by the specified number of bits. High-order bits are cleared. /// /// \headerfile <x86intrin.h> @@ -3148,7 +3112,7 @@ _mm_srli_epi64(__m128i __a, int __count) return __builtin_ia32_psrlqi128((__v2di)__a, __count); } -/// \brief Right-shifts each of 64-bit values in the 128-bit integer vector +/// Right-shifts each of 64-bit values in the 128-bit integer vector /// operand by the specified number of bits. 
High-order bits are cleared. /// /// \headerfile <x86intrin.h> @@ -3167,8 +3131,8 @@ _mm_srl_epi64(__m128i __a, __m128i __count) return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count); } -/// \brief Compares each of the corresponding 8-bit values of the 128-bit -/// integer vectors for equality. Each comparison yields 0h for false, FFh +/// Compares each of the corresponding 8-bit values of the 128-bit +/// integer vectors for equality. Each comparison yields 0x0 for false, 0xFF /// for true. /// /// \headerfile <x86intrin.h> @@ -3186,9 +3150,9 @@ _mm_cmpeq_epi8(__m128i __a, __m128i __b) return (__m128i)((__v16qi)__a == (__v16qi)__b); } -/// \brief Compares each of the corresponding 16-bit values of the 128-bit -/// integer vectors for equality. Each comparison yields 0h for false, FFFFh -/// for true. +/// Compares each of the corresponding 16-bit values of the 128-bit +/// integer vectors for equality. Each comparison yields 0x0 for false, +/// 0xFFFF for true. /// /// \headerfile <x86intrin.h> /// @@ -3205,9 +3169,9 @@ _mm_cmpeq_epi16(__m128i __a, __m128i __b) return (__m128i)((__v8hi)__a == (__v8hi)__b); } -/// \brief Compares each of the corresponding 32-bit values of the 128-bit -/// integer vectors for equality. Each comparison yields 0h for false, -/// FFFFFFFFh for true. +/// Compares each of the corresponding 32-bit values of the 128-bit +/// integer vectors for equality. Each comparison yields 0x0 for false, +/// 0xFFFFFFFF for true. /// /// \headerfile <x86intrin.h> /// @@ -3224,10 +3188,10 @@ _mm_cmpeq_epi32(__m128i __a, __m128i __b) return (__m128i)((__v4si)__a == (__v4si)__b); } -/// \brief Compares each of the corresponding signed 8-bit values of the 128-bit +/// Compares each of the corresponding signed 8-bit values of the 128-bit /// integer vectors to determine if the values in the first operand are -/// greater than those in the second operand. Each comparison yields 0h for -/// false, FFh for true. +/// greater than those in the second operand. Each comparison yields 0x0 for +/// false, 0xFF for true. /// /// \headerfile <x86intrin.h> /// @@ -3246,11 +3210,11 @@ _mm_cmpgt_epi8(__m128i __a, __m128i __b) return (__m128i)((__v16qs)__a > (__v16qs)__b); } -/// \brief Compares each of the corresponding signed 16-bit values of the +/// Compares each of the corresponding signed 16-bit values of the /// 128-bit integer vectors to determine if the values in the first operand /// are greater than those in the second operand. /// -/// Each comparison yields 0h for false, FFFFh for true. +/// Each comparison yields 0x0 for false, 0xFFFF for true. /// /// \headerfile <x86intrin.h> /// @@ -3267,11 +3231,11 @@ _mm_cmpgt_epi16(__m128i __a, __m128i __b) return (__m128i)((__v8hi)__a > (__v8hi)__b); } -/// \brief Compares each of the corresponding signed 32-bit values of the +/// Compares each of the corresponding signed 32-bit values of the /// 128-bit integer vectors to determine if the values in the first operand /// are greater than those in the second operand. /// -/// Each comparison yields 0h for false, FFFFFFFFh for true. +/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true. 
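The 0x00/0xFF lane masks produced by the comparisons above are usually consumed with _mm_movemask_epi8; a sketch that locates a byte within a 16-byte block (the buffer contents are hypothetical):

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  const char data[17] = "find the x here!";          /* 16 bytes are loaded */
  __m128i block  = _mm_loadu_si128((const __m128i *)data);
  __m128i target = _mm_set1_epi8('x');

  int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(block, target));
  printf("match mask = 0x%04x\n", mask);   /* bit 9 set: data[9] == 'x' */
  return 0;
}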
/// /// \headerfile <x86intrin.h> /// @@ -3288,11 +3252,11 @@ _mm_cmpgt_epi32(__m128i __a, __m128i __b) return (__m128i)((__v4si)__a > (__v4si)__b); } -/// \brief Compares each of the corresponding signed 8-bit values of the 128-bit +/// Compares each of the corresponding signed 8-bit values of the 128-bit /// integer vectors to determine if the values in the first operand are less /// than those in the second operand. /// -/// Each comparison yields 0h for false, FFh for true. +/// Each comparison yields 0x0 for false, 0xFF for true. /// /// \headerfile <x86intrin.h> /// @@ -3309,11 +3273,11 @@ _mm_cmplt_epi8(__m128i __a, __m128i __b) return _mm_cmpgt_epi8(__b, __a); } -/// \brief Compares each of the corresponding signed 16-bit values of the +/// Compares each of the corresponding signed 16-bit values of the /// 128-bit integer vectors to determine if the values in the first operand /// are less than those in the second operand. /// -/// Each comparison yields 0h for false, FFFFh for true. +/// Each comparison yields 0x0 for false, 0xFFFF for true. /// /// \headerfile <x86intrin.h> /// @@ -3330,11 +3294,11 @@ _mm_cmplt_epi16(__m128i __a, __m128i __b) return _mm_cmpgt_epi16(__b, __a); } -/// \brief Compares each of the corresponding signed 32-bit values of the +/// Compares each of the corresponding signed 32-bit values of the /// 128-bit integer vectors to determine if the values in the first operand /// are less than those in the second operand. /// -/// Each comparison yields 0h for false, FFFFFFFFh for true. +/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true. /// /// \headerfile <x86intrin.h> /// @@ -3352,7 +3316,7 @@ _mm_cmplt_epi32(__m128i __a, __m128i __b) } #ifdef __x86_64__ -/// \brief Converts a 64-bit signed integer value from the second operand into a +/// Converts a 64-bit signed integer value from the second operand into a /// double-precision value and returns it in the lower element of a [2 x /// double] vector; the upper element of the returned vector is copied from /// the upper element of the first operand. @@ -3376,7 +3340,7 @@ _mm_cvtsi64_sd(__m128d __a, long long __b) return __a; } -/// \brief Converts the first (lower) element of a vector of [2 x double] into a +/// Converts the first (lower) element of a vector of [2 x double] into a /// 64-bit signed integer value, according to the current rounding mode. /// /// \headerfile <x86intrin.h> @@ -3393,7 +3357,7 @@ _mm_cvtsd_si64(__m128d __a) return __builtin_ia32_cvtsd2si64((__v2df)__a); } -/// \brief Converts the first (lower) element of a vector of [2 x double] into a +/// Converts the first (lower) element of a vector of [2 x double] into a /// 64-bit signed integer value, truncating the result when it is inexact. /// /// \headerfile <x86intrin.h> @@ -3412,7 +3376,7 @@ _mm_cvttsd_si64(__m128d __a) } #endif -/// \brief Converts a vector of [4 x i32] into a vector of [4 x float]. +/// Converts a vector of [4 x i32] into a vector of [4 x float]. /// /// \headerfile <x86intrin.h> /// @@ -3424,10 +3388,10 @@ _mm_cvttsd_si64(__m128d __a) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a) { - return __builtin_ia32_cvtdq2ps((__v4si)__a); + return (__m128)__builtin_convertvector((__v4si)__a, __v4sf); } -/// \brief Converts a vector of [4 x float] into a vector of [4 x i32]. +/// Converts a vector of [4 x float] into a vector of [4 x i32]. 
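A sketch contrasting the two float-to-int conversions above: _mm_cvtps_epi32 honours the current rounding mode (round-to-nearest-even by default, so 1.5f and 2.5f both become 2), while _mm_cvttps_epi32 always truncates toward zero:

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  __m128 f = _mm_setr_ps(1.5f, 2.5f, -1.5f, 3.7f);

  int rounded[4], truncated[4];
  _mm_storeu_si128((__m128i *)rounded,   _mm_cvtps_epi32(f));
  _mm_storeu_si128((__m128i *)truncated, _mm_cvttps_epi32(f));

  printf("rounded:   %d %d %d %d\n", rounded[0], rounded[1], rounded[2], rounded[3]);
  printf("truncated: %d %d %d %d\n", truncated[0], truncated[1], truncated[2], truncated[3]);
  return 0;
}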
/// /// \headerfile <x86intrin.h> /// @@ -3443,7 +3407,7 @@ _mm_cvtps_epi32(__m128 __a) return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a); } -/// \brief Converts a vector of [4 x float] into a vector of [4 x i32], +/// Converts a vector of [4 x float] into a vector of [4 x i32], /// truncating the result when it is inexact. /// /// \headerfile <x86intrin.h> @@ -3460,7 +3424,7 @@ _mm_cvttps_epi32(__m128 __a) return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a); } -/// \brief Returns a vector of [4 x i32] where the lowest element is the input +/// Returns a vector of [4 x i32] where the lowest element is the input /// operand and the remaining elements are zero. /// /// \headerfile <x86intrin.h> @@ -3473,11 +3437,11 @@ _mm_cvttps_epi32(__m128 __a) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a) { - return (__m128i)(__v4si){ __a, 0, 0, 0 }; + return __extension__ (__m128i)(__v4si){ __a, 0, 0, 0 }; } #ifdef __x86_64__ -/// \brief Returns a vector of [2 x i64] where the lower element is the input +/// Returns a vector of [2 x i64] where the lower element is the input /// operand and the upper element is zero. /// /// \headerfile <x86intrin.h> @@ -3490,11 +3454,11 @@ _mm_cvtsi32_si128(int __a) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a) { - return (__m128i){ __a, 0 }; + return __extension__ (__m128i)(__v2di){ __a, 0 }; } #endif -/// \brief Moves the least significant 32 bits of a vector of [4 x i32] to a +/// Moves the least significant 32 bits of a vector of [4 x i32] to a /// 32-bit signed integer value. /// /// \headerfile <x86intrin.h> @@ -3513,7 +3477,7 @@ _mm_cvtsi128_si32(__m128i __a) } #ifdef __x86_64__ -/// \brief Moves the least significant 64 bits of a vector of [2 x i64] to a +/// Moves the least significant 64 bits of a vector of [2 x i64] to a /// 64-bit signed integer value. /// /// \headerfile <x86intrin.h> @@ -3531,7 +3495,7 @@ _mm_cvtsi128_si64(__m128i __a) } #endif -/// \brief Moves packed integer values from an aligned 128-bit memory location +/// Moves packed integer values from an aligned 128-bit memory location /// to elements in a 128-bit integer vector. /// /// \headerfile <x86intrin.h> @@ -3547,7 +3511,7 @@ _mm_load_si128(__m128i const *__p) return *__p; } -/// \brief Moves packed integer values from an unaligned 128-bit memory location +/// Moves packed integer values from an unaligned 128-bit memory location /// to elements in a 128-bit integer vector. /// /// \headerfile <x86intrin.h> @@ -3566,7 +3530,7 @@ _mm_loadu_si128(__m128i const *__p) return ((struct __loadu_si128*)__p)->__v; } -/// \brief Returns a vector of [2 x i64] where the lower element is taken from +/// Returns a vector of [2 x i64] where the lower element is taken from /// the lower element of the operand, and the upper element is zero. /// /// \headerfile <x86intrin.h> @@ -3584,10 +3548,10 @@ _mm_loadl_epi64(__m128i const *__p) struct __mm_loadl_epi64_struct { long long __u; } __attribute__((__packed__, __may_alias__)); - return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0}; + return __extension__ (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0}; } -/// \brief Generates a 128-bit vector of [4 x i32] with unspecified content. +/// Generates a 128-bit vector of [4 x i32] with unspecified content. /// This could be used as an argument to another intrinsic function where the /// argument is required but the value is not actually used. 
/// @@ -3602,7 +3566,7 @@ _mm_undefined_si128(void) return (__m128i)__builtin_ia32_undef128(); } -/// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with +/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with /// the specified 64-bit integer values. /// /// \headerfile <x86intrin.h> @@ -3621,10 +3585,10 @@ _mm_undefined_si128(void) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1, long long __q0) { - return (__m128i){ __q0, __q1 }; + return __extension__ (__m128i)(__v2di){ __q0, __q1 }; } -/// \brief Initializes both 64-bit values in a 128-bit vector of [2 x i64] with +/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with /// the specified 64-bit integer values. /// /// \headerfile <x86intrin.h> @@ -3643,10 +3607,10 @@ _mm_set_epi64x(long long __q1, long long __q0) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1, __m64 __q0) { - return (__m128i){ (long long)__q0, (long long)__q1 }; + return _mm_set_epi64x((long long)__q1, (long long)__q0); } -/// \brief Initializes the 32-bit values in a 128-bit vector of [4 x i32] with +/// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with /// the specified 32-bit integer values. /// /// \headerfile <x86intrin.h> @@ -3671,10 +3635,10 @@ _mm_set_epi64(__m64 __q1, __m64 __q0) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2, int __i1, int __i0) { - return (__m128i)(__v4si){ __i0, __i1, __i2, __i3}; + return __extension__ (__m128i)(__v4si){ __i0, __i1, __i2, __i3}; } -/// \brief Initializes the 16-bit values in a 128-bit vector of [8 x i16] with +/// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with /// the specified 16-bit integer values. /// /// \headerfile <x86intrin.h> @@ -3711,10 +3675,10 @@ _mm_set_epi32(int __i3, int __i2, int __i1, int __i0) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0) { - return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 }; + return __extension__ (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 }; } -/// \brief Initializes the 8-bit values in a 128-bit vector of [16 x i8] with +/// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with /// the specified 8-bit integer values. /// /// \headerfile <x86intrin.h> @@ -3759,10 +3723,10 @@ _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0) { - return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 }; + return __extension__ (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 }; } -/// \brief Initializes both values in a 128-bit integer vector with the +/// Initializes both values in a 128-bit integer vector with the /// specified 64-bit integer value. 
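A sketch of the element ordering used by the constructors above: the _mm_set_* forms take the highest element first, while the _mm_setr_* forms further below take memory order, so the two calls here build the same vector:

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  __m128i a = _mm_set_epi32(4, 3, 2, 1);    /* element 0 is the last argument */
  __m128i b = _mm_setr_epi32(1, 2, 3, 4);   /* element 0 is the first argument */

  int av[4];
  _mm_storeu_si128((__m128i *)av, a);
  printf("a: %d %d %d %d\n", av[0], av[1], av[2], av[3]);    /* 1 2 3 4 */
  printf("low lane of b: %d\n", _mm_cvtsi128_si32(b));       /* 1 */
  return 0;
}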
/// /// \headerfile <x86intrin.h> @@ -3778,10 +3742,10 @@ _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q) { - return (__m128i){ __q, __q }; + return _mm_set_epi64x(__q, __q); } -/// \brief Initializes both values in a 128-bit vector of [2 x i64] with the +/// Initializes both values in a 128-bit vector of [2 x i64] with the /// specified 64-bit value. /// /// \headerfile <x86intrin.h> @@ -3797,10 +3761,10 @@ _mm_set1_epi64x(long long __q) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q) { - return (__m128i){ (long long)__q, (long long)__q }; + return _mm_set_epi64(__q, __q); } -/// \brief Initializes all values in a 128-bit vector of [4 x i32] with the +/// Initializes all values in a 128-bit vector of [4 x i32] with the /// specified 32-bit value. /// /// \headerfile <x86intrin.h> @@ -3816,10 +3780,10 @@ _mm_set1_epi64(__m64 __q) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i) { - return (__m128i)(__v4si){ __i, __i, __i, __i }; + return _mm_set_epi32(__i, __i, __i, __i); } -/// \brief Initializes all values in a 128-bit vector of [8 x i16] with the +/// Initializes all values in a 128-bit vector of [8 x i16] with the /// specified 16-bit value. /// /// \headerfile <x86intrin.h> @@ -3835,10 +3799,10 @@ _mm_set1_epi32(int __i) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w) { - return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w }; + return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w); } -/// \brief Initializes all values in a 128-bit vector of [16 x i8] with the +/// Initializes all values in a 128-bit vector of [16 x i8] with the /// specified 8-bit value. /// /// \headerfile <x86intrin.h> @@ -3854,10 +3818,10 @@ _mm_set1_epi16(short __w) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b) { - return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b }; + return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b); } -/// \brief Constructs a 128-bit integer vector, initialized in reverse order +/// Constructs a 128-bit integer vector, initialized in reverse order /// with the specified 64-bit integral values. /// /// \headerfile <x86intrin.h> @@ -3874,10 +3838,10 @@ _mm_set1_epi8(char __b) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0, __m64 __q1) { - return (__m128i){ (long long)__q0, (long long)__q1 }; + return _mm_set_epi64(__q1, __q0); } -/// \brief Constructs a 128-bit integer vector, initialized in reverse order +/// Constructs a 128-bit integer vector, initialized in reverse order /// with the specified 32-bit integral values. /// /// \headerfile <x86intrin.h> @@ -3897,10 +3861,10 @@ _mm_setr_epi64(__m64 __q0, __m64 __q1) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3) { - return (__m128i)(__v4si){ __i0, __i1, __i2, __i3}; + return _mm_set_epi32(__i3, __i2, __i1, __i0); } -/// \brief Constructs a 128-bit integer vector, initialized in reverse order +/// Constructs a 128-bit integer vector, initialized in reverse order /// with the specified 16-bit integral values. 
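A sketch of the splat constructors above: _mm_set1_epi16 broadcasts one constant into all eight lanes, the usual way to materialize a per-lane constant for the packed arithmetic earlier in this header (values here are arbitrary):

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  __m128i ones = _mm_set1_epi16(1);
  __m128i v    = _mm_setr_epi16(10, 20, 30, 40, 50, 60, 70, 80);

  short out[8];
  _mm_storeu_si128((__m128i *)out, _mm_add_epi16(v, ones));
  printf("%d %d ... %d\n", out[0], out[1], out[7]);   /* 11 21 ... 81 */
  return 0;
}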
/// /// \headerfile <x86intrin.h> @@ -3928,10 +3892,10 @@ _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7) { - return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 }; + return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0); } -/// \brief Constructs a 128-bit integer vector, initialized in reverse order +/// Constructs a 128-bit integer vector, initialized in reverse order /// with the specified 8-bit integral values. /// /// \headerfile <x86intrin.h> @@ -3975,10 +3939,10 @@ _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15) { - return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 }; + return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8, __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0); } -/// \brief Creates a 128-bit integer vector initialized to zero. +/// Creates a 128-bit integer vector initialized to zero. /// /// \headerfile <x86intrin.h> /// @@ -3989,10 +3953,10 @@ _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void) { - return (__m128i){ 0LL, 0LL }; + return __extension__ (__m128i)(__v2di){ 0LL, 0LL }; } -/// \brief Stores a 128-bit integer vector to a memory location aligned on a +/// Stores a 128-bit integer vector to a memory location aligned on a /// 128-bit boundary. /// /// \headerfile <x86intrin.h> @@ -4010,7 +3974,7 @@ _mm_store_si128(__m128i *__p, __m128i __b) *__p = __b; } -/// \brief Stores a 128-bit integer vector to an unaligned memory location. +/// Stores a 128-bit integer vector to an unaligned memory location. /// /// \headerfile <x86intrin.h> /// @@ -4029,7 +3993,7 @@ _mm_storeu_si128(__m128i *__p, __m128i __b) ((struct __storeu_si128*)__p)->__v = __b; } -/// \brief Moves bytes selected by the mask from the first operand to the +/// Moves bytes selected by the mask from the first operand to the /// specified unaligned memory location. When a mask bit is 1, the /// corresponding byte is written, otherwise it is not written. /// @@ -4056,7 +4020,7 @@ _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p) __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p); } -/// \brief Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to +/// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to /// a memory location. /// /// \headerfile <x86intrin.h> @@ -4078,7 +4042,7 @@ _mm_storel_epi64(__m128i *__p, __m128i __a) ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0]; } -/// \brief Stores a 128-bit floating point vector of [2 x double] to a 128-bit +/// Stores a 128-bit floating point vector of [2 x double] to a 128-bit /// aligned memory location. /// /// To minimize caching, the data is flagged as non-temporal (unlikely to be @@ -4098,7 +4062,7 @@ _mm_stream_pd(double *__p, __m128d __a) __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p); } -/// \brief Stores a 128-bit integer vector to a 128-bit aligned memory location. 
+/// Stores a 128-bit integer vector to a 128-bit aligned memory location. /// /// To minimize caching, the data is flagged as non-temporal (unlikely to be /// used again soon). @@ -4117,7 +4081,7 @@ _mm_stream_si128(__m128i *__p, __m128i __a) __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p); } -/// \brief Stores a 32-bit integer value in the specified memory location. +/// Stores a 32-bit integer value in the specified memory location. /// /// To minimize caching, the data is flagged as non-temporal (unlikely to be /// used again soon). @@ -4130,14 +4094,14 @@ _mm_stream_si128(__m128i *__p, __m128i __a) /// A pointer to the 32-bit memory location used to store the value. /// \param __a /// A 32-bit integer containing the value to be stored. -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("sse2"))) _mm_stream_si32(int *__p, int __a) { __builtin_ia32_movnti(__p, __a); } #ifdef __x86_64__ -/// \brief Stores a 64-bit integer value in the specified memory location. +/// Stores a 64-bit integer value in the specified memory location. /// /// To minimize caching, the data is flagged as non-temporal (unlikely to be /// used again soon). @@ -4150,7 +4114,7 @@ _mm_stream_si32(int *__p, int __a) /// A pointer to the 64-bit memory location used to store the value. /// \param __a /// A 64-bit integer containing the value to be stored. -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("sse2"))) _mm_stream_si64(long long *__p, long long __a) { __builtin_ia32_movnti64(__p, __a); @@ -4161,7 +4125,7 @@ _mm_stream_si64(long long *__p, long long __a) extern "C" { #endif -/// \brief The cache line containing \a __p is flushed and invalidated from all +/// The cache line containing \a __p is flushed and invalidated from all /// caches in the coherency domain. /// /// \headerfile <x86intrin.h> @@ -4173,7 +4137,7 @@ extern "C" { /// flushed. void _mm_clflush(void const * __p); -/// \brief Forces strong memory ordering (serialization) between load +/// Forces strong memory ordering (serialization) between load /// instructions preceding this instruction and load instructions following /// this instruction, ensuring the system completes all previous loads before /// executing subsequent loads. @@ -4184,7 +4148,7 @@ void _mm_clflush(void const * __p); /// void _mm_lfence(void); -/// \brief Forces strong memory ordering (serialization) between load and store +/// Forces strong memory ordering (serialization) between load and store /// instructions preceding this instruction and load and store instructions /// following this instruction, ensuring that the system completes all /// previous memory accesses before executing subsequent memory accesses. @@ -4199,7 +4163,7 @@ void _mm_mfence(void); } // extern "C" #endif -/// \brief Converts 16-bit signed integers from both 128-bit integer vector +/// Converts 16-bit signed integers from both 128-bit integer vector /// operands into 8-bit signed integers, and packs the results into the /// destination. Positive values greater than 0x7F are saturated to 0x7F. /// Negative values less than 0x80 are saturated to 0x80. 
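A sketch of the signed pack described above: _mm_packs_epi16 narrows two [8 x i16] vectors into a single [16 x i8] vector, saturating 300 to 127 and -300 to -128 (inputs are illustrative):

#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  __m128i a = _mm_setr_epi16(1, -2, 300, -300, 4, 5, 6, 7);
  __m128i b = _mm_set1_epi16(100);

  signed char out[16];
  _mm_storeu_si128((__m128i *)out, _mm_packs_epi16(a, b));   /* a -> bytes 0-7, b -> 8-15 */
  printf("%d %d %d %d ... %d\n", out[0], out[1], out[2], out[3], out[15]);
  /* 1 -2 127 -128 ... 100 */
  return 0;
}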
@@ -4227,7 +4191,7 @@ _mm_packs_epi16(__m128i __a, __m128i __b) return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b); } -/// \brief Converts 32-bit signed integers from both 128-bit integer vector +/// Converts 32-bit signed integers from both 128-bit integer vector /// operands into 16-bit signed integers, and packs the results into the /// destination. Positive values greater than 0x7FFF are saturated to 0x7FFF. /// Negative values less than 0x8000 are saturated to 0x8000. @@ -4255,7 +4219,7 @@ _mm_packs_epi32(__m128i __a, __m128i __b) return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b); } -/// \brief Converts 16-bit signed integers from both 128-bit integer vector +/// Converts 16-bit signed integers from both 128-bit integer vector /// operands into 8-bit unsigned integers, and packs the results into the /// destination. Values greater than 0xFF are saturated to 0xFF. Values less /// than 0x00 are saturated to 0x00. @@ -4283,7 +4247,7 @@ _mm_packus_epi16(__m128i __a, __m128i __b) return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b); } -/// \brief Extracts 16 bits from a 128-bit integer vector of [8 x i16], using +/// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using /// the immediate-value parameter as a selector. /// /// \headerfile <x86intrin.h> @@ -4305,14 +4269,11 @@ _mm_packus_epi16(__m128i __a, __m128i __b) /// 111: assign values from bits [127:112] of \a __a. /// \returns An integer, whose lower 16 bits are selected from the 128-bit /// integer vector parameter and the remaining bits are assigned zeros. -static __inline__ int __DEFAULT_FN_ATTRS -_mm_extract_epi16(__m128i __a, int __imm) -{ - __v8hi __b = (__v8hi)__a; - return (unsigned short)__b[__imm & 7]; -} +#define _mm_extract_epi16(a, imm) \ + (int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \ + (int)(imm)) -/// \brief Constructs a 128-bit integer vector by first making a copy of the +/// Constructs a 128-bit integer vector by first making a copy of the /// 128-bit integer vector parameter, and then inserting the lower 16 bits /// of an integer parameter into an offset specified by the immediate-value /// parameter. @@ -4332,15 +4293,11 @@ _mm_extract_epi16(__m128i __a, int __imm) /// An immediate value specifying the bit offset in the result at which the /// lower 16 bits of \a __b are written. /// \returns A 128-bit integer vector containing the constructed values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_insert_epi16(__m128i __a, int __b, int __imm) -{ - __v8hi __c = (__v8hi)__a; - __c[__imm & 7] = __b; - return (__m128i)__c; -} +#define _mm_insert_epi16(a, b, imm) \ + (__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \ + (int)(imm)) -/// \brief Copies the values of the most significant bits from each 8-bit +/// Copies the values of the most significant bits from each 8-bit /// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask /// value, zero-extends the value, and writes it to the destination. /// @@ -4358,7 +4315,7 @@ _mm_movemask_epi8(__m128i __a) return __builtin_ia32_pmovmskb128((__v16qi)__a); } -/// \brief Constructs a 128-bit integer vector by shuffling four 32-bit +/// Constructs a 128-bit integer vector by shuffling four 32-bit /// elements of a 128-bit integer vector parameter, using the immediate-value /// parameter as a specifier. /// @@ -4386,13 +4343,10 @@ _mm_movemask_epi8(__m128i __a) /// 10: assign values from bits [95:64] of \a a. 
\n /// 11: assign values from bits [127:96] of \a a. /// \returns A 128-bit integer vector containing the shuffled values. -#define _mm_shuffle_epi32(a, imm) __extension__ ({ \ - (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \ - (__v4si)_mm_undefined_si128(), \ - ((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \ - ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); }) +#define _mm_shuffle_epi32(a, imm) \ + (__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)) -/// \brief Constructs a 128-bit integer vector by shuffling four lower 16-bit +/// Constructs a 128-bit integer vector by shuffling four lower 16-bit /// elements of a 128-bit integer vector of [8 x i16], using the immediate /// value parameter as a specifier. /// @@ -4419,14 +4373,10 @@ _mm_movemask_epi8(__m128i __a) /// 10: assign values from bits [47:32] of \a a. \n /// 11: assign values from bits [63:48] of \a a. \n /// \returns A 128-bit integer vector containing the shuffled values. -#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \ - (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \ - (__v8hi)_mm_undefined_si128(), \ - ((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \ - ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3, \ - 4, 5, 6, 7); }) - -/// \brief Constructs a 128-bit integer vector by shuffling four upper 16-bit +#define _mm_shufflelo_epi16(a, imm) \ + (__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)) + +/// Constructs a 128-bit integer vector by shuffling four upper 16-bit /// elements of a 128-bit integer vector of [8 x i16], using the immediate /// value parameter as a specifier. /// @@ -4453,16 +4403,10 @@ _mm_movemask_epi8(__m128i __a) /// 10: assign values from bits [111:96] of \a a. \n /// 11: assign values from bits [127:112] of \a a. \n /// \returns A 128-bit integer vector containing the shuffled values. -#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \ - (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \ - (__v8hi)_mm_undefined_si128(), \ - 0, 1, 2, 3, \ - 4 + (((imm) >> 0) & 0x3), \ - 4 + (((imm) >> 2) & 0x3), \ - 4 + (((imm) >> 4) & 0x3), \ - 4 + (((imm) >> 6) & 0x3)); }) - -/// \brief Unpacks the high-order (index 8-15) values from two 128-bit vectors +#define _mm_shufflehi_epi16(a, imm) \ + (__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)) + +/// Unpacks the high-order (index 8-15) values from two 128-bit vectors /// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8]. /// /// \headerfile <x86intrin.h> @@ -4497,7 +4441,7 @@ _mm_unpackhi_epi8(__m128i __a, __m128i __b) return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); } -/// \brief Unpacks the high-order (index 4-7) values from two 128-bit vectors of +/// Unpacks the high-order (index 4-7) values from two 128-bit vectors of /// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16]. /// /// \headerfile <x86intrin.h> @@ -4524,7 +4468,7 @@ _mm_unpackhi_epi16(__m128i __a, __m128i __b) return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7); } -/// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors of +/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32]. 
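A brief aside on the immediate-selector shuffles rewritten above (now thin wrappers over the pshufd/pshuflw/pshufhw builtins): the immediate packs four 2-bit lane indices, lowest result lane first. A hypothetical sketch, not part of the header:

#include <emmintrin.h>

/* 0x1B == (0<<6)|(1<<4)|(2<<2)|3: pick lanes 3,2,1,0, i.e. reverse the vector. */
static __m128i reverse_epi32(__m128i v)
{
  return _mm_shuffle_epi32(v, 0x1B); /* result = { v[3], v[2], v[1], v[0] } */
}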
/// /// \headerfile <x86intrin.h> @@ -4547,7 +4491,7 @@ _mm_unpackhi_epi32(__m128i __a, __m128i __b) return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3); } -/// \brief Unpacks the high-order 64-bit elements from two 128-bit vectors of +/// Unpacks the high-order 64-bit elements from two 128-bit vectors of /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64]. /// /// \headerfile <x86intrin.h> @@ -4568,7 +4512,7 @@ _mm_unpackhi_epi64(__m128i __a, __m128i __b) return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1); } -/// \brief Unpacks the low-order (index 0-7) values from two 128-bit vectors of +/// Unpacks the low-order (index 0-7) values from two 128-bit vectors of /// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8]. /// /// \headerfile <x86intrin.h> @@ -4603,7 +4547,7 @@ _mm_unpacklo_epi8(__m128i __a, __m128i __b) return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7); } -/// \brief Unpacks the low-order (index 0-3) values from each of the two 128-bit +/// Unpacks the low-order (index 0-3) values from each of the two 128-bit /// vectors of [8 x i16] and interleaves them into a 128-bit vector of /// [8 x i16]. /// @@ -4631,7 +4575,7 @@ _mm_unpacklo_epi16(__m128i __a, __m128i __b) return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3); } -/// \brief Unpacks the low-order (index 0,1) values from two 128-bit vectors of +/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32]. /// /// \headerfile <x86intrin.h> @@ -4654,7 +4598,7 @@ _mm_unpacklo_epi32(__m128i __a, __m128i __b) return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1); } -/// \brief Unpacks the low-order 64-bit elements from two 128-bit vectors of +/// Unpacks the low-order 64-bit elements from two 128-bit vectors of /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64]. /// /// \headerfile <x86intrin.h> @@ -4675,7 +4619,7 @@ _mm_unpacklo_epi64(__m128i __a, __m128i __b) return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0); } -/// \brief Returns the lower 64 bits of a 128-bit integer vector as a 64-bit +/// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit /// integer. /// /// \headerfile <x86intrin.h> @@ -4692,7 +4636,7 @@ _mm_movepi64_pi64(__m128i __a) return (__m64)__a[0]; } -/// \brief Moves the 64-bit operand to a 128-bit integer vector, zeroing the +/// Moves the 64-bit operand to a 128-bit integer vector, zeroing the /// upper bits. /// /// \headerfile <x86intrin.h> @@ -4706,10 +4650,10 @@ _mm_movepi64_pi64(__m128i __a) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a) { - return (__m128i){ (long long)__a, 0 }; + return __extension__ (__m128i)(__v2di){ (long long)__a, 0 }; } -/// \brief Moves the lower 64 bits of a 128-bit integer vector to a 128-bit +/// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit /// integer vector, zeroing the upper bits. 
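As an aside, a common use of the unpack/interleave intrinsics above is widening: interleaving the low bytes with a zero vector yields zero-extended 16-bit lanes. A hypothetical sketch, not part of the header:

#include <emmintrin.h>

/* Zero-extend the low eight unsigned bytes of `bytes` into [8 x u16]. */
static __m128i widen_lo_u8_to_u16(__m128i bytes)
{
  return _mm_unpacklo_epi8(bytes, _mm_setzero_si128());
}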
/// /// \headerfile <x86intrin.h> @@ -4724,10 +4668,10 @@ _mm_movpi64_epi64(__m64 __a) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a) { - return __builtin_shufflevector((__v2di)__a, (__m128i){ 0 }, 0, 2); + return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2); } -/// \brief Unpacks the high-order 64-bit elements from two 128-bit vectors of +/// Unpacks the high-order 64-bit elements from two 128-bit vectors of /// [2 x double] and interleaves them into a 128-bit vector of [2 x /// double]. /// @@ -4748,7 +4692,7 @@ _mm_unpackhi_pd(__m128d __a, __m128d __b) return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1); } -/// \brief Unpacks the low-order 64-bit elements from two 128-bit vectors +/// Unpacks the low-order 64-bit elements from two 128-bit vectors /// of [2 x double] and interleaves them into a 128-bit vector of [2 x /// double]. /// @@ -4769,7 +4713,7 @@ _mm_unpacklo_pd(__m128d __a, __m128d __b) return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0); } -/// \brief Extracts the sign bits of the double-precision values in the 128-bit +/// Extracts the sign bits of the double-precision values in the 128-bit /// vector of [2 x double], zero-extends the value, and writes it to the /// low-order bits of the destination. /// @@ -4789,7 +4733,7 @@ _mm_movemask_pd(__m128d __a) } -/// \brief Constructs a 128-bit floating-point vector of [2 x double] from two +/// Constructs a 128-bit floating-point vector of [2 x double] from two /// 128-bit vector parameters of [2 x double], using the immediate-value /// parameter as a specifier. /// @@ -4813,12 +4757,11 @@ _mm_movemask_pd(__m128d __a) /// Bit[1] = 0: lower element of \a b copied to upper element of result. \n /// Bit[1] = 1: upper element of \a b copied to upper element of result. \n /// \returns A 128-bit vector of [2 x double] containing the shuffled values. -#define _mm_shuffle_pd(a, b, i) __extension__ ({ \ - (__m128d)__builtin_shufflevector((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \ - 0 + (((i) >> 0) & 0x1), \ - 2 + (((i) >> 1) & 0x1)); }) +#define _mm_shuffle_pd(a, b, i) \ + (__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \ + (int)(i)) -/// \brief Casts a 128-bit floating-point vector of [2 x double] into a 128-bit +/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit /// floating-point vector of [4 x float]. /// /// \headerfile <x86intrin.h> @@ -4835,7 +4778,7 @@ _mm_castpd_ps(__m128d __a) return (__m128)__a; } -/// \brief Casts a 128-bit floating-point vector of [2 x double] into a 128-bit +/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit /// integer vector. /// /// \headerfile <x86intrin.h> @@ -4852,7 +4795,7 @@ _mm_castpd_si128(__m128d __a) return (__m128i)__a; } -/// \brief Casts a 128-bit floating-point vector of [4 x float] into a 128-bit +/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit /// floating-point vector of [2 x double]. /// /// \headerfile <x86intrin.h> @@ -4869,7 +4812,7 @@ _mm_castps_pd(__m128 __a) return (__m128d)__a; } -/// \brief Casts a 128-bit floating-point vector of [4 x float] into a 128-bit +/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit /// integer vector. 
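Because the cast intrinsics here only reinterpret the 128 bits (no instruction is emitted), they are commonly paired with integer ops to manipulate floating-point bit patterns. A hypothetical sketch, not part of the header:

#include <emmintrin.h>

/* Clear the sign bit of each double lane, i.e. fabs on [2 x double]. */
static __m128d abs_pd(__m128d x)
{
  const __m128i mask = _mm_set1_epi64x(0x7fffffffffffffffLL);
  return _mm_castsi128_pd(_mm_and_si128(_mm_castpd_si128(x), mask));
}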
/// /// \headerfile <x86intrin.h> @@ -4886,7 +4829,7 @@ _mm_castps_si128(__m128 __a) return (__m128i)__a; } -/// \brief Casts a 128-bit integer vector into a 128-bit floating-point vector +/// Casts a 128-bit integer vector into a 128-bit floating-point vector /// of [4 x float]. /// /// \headerfile <x86intrin.h> @@ -4903,7 +4846,7 @@ _mm_castsi128_ps(__m128i __a) return (__m128)__a; } -/// \brief Casts a 128-bit integer vector into a 128-bit floating-point vector +/// Casts a 128-bit integer vector into a 128-bit floating-point vector /// of [2 x double]. /// /// \headerfile <x86intrin.h> @@ -4924,7 +4867,7 @@ _mm_castsi128_pd(__m128i __a) extern "C" { #endif -/// \brief Indicates that a spin loop is being executed for the purposes of +/// Indicates that a spin loop is being executed for the purposes of /// optimizing power consumption during the loop. /// /// \headerfile <x86intrin.h> @@ -4937,6 +4880,7 @@ void _mm_pause(void); } // extern "C" #endif #undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS_MMX #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) diff --git a/c_headers/f16cintrin.h b/c_headers/f16cintrin.h index b796cc8431..3d35f28eb3 100644 --- a/c_headers/f16cintrin.h +++ b/c_headers/f16cintrin.h @@ -21,18 +21,25 @@ *===-----------------------------------------------------------------------=== */ -#if !defined __X86INTRIN_H && !defined __EMMINTRIN_H && !defined __IMMINTRIN_H -#error "Never use <f16cintrin.h> directly; include <emmintrin.h> instead." +#if !defined __IMMINTRIN_H +#error "Never use <f16cintrin.h> directly; include <immintrin.h> instead." #endif #ifndef __F16CINTRIN_H #define __F16CINTRIN_H /* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS \ - __attribute__((__always_inline__, __nodebug__, __target__("f16c"))) +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, __target__("f16c"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 \ + __attribute__((__always_inline__, __nodebug__, __target__("f16c"), __min_vector_width__(256))) -/// \brief Converts a 16-bit half-precision float value into a 32-bit float +/* NOTE: Intel documents the 128-bit versions of these as being in emmintrin.h, + * but that's because icc can emulate these without f16c using a library call. + * Since we don't do that let's leave these in f16cintrin.h. + */ + +/// Converts a 16-bit half-precision float value into a 32-bit float /// value. /// /// \headerfile <x86intrin.h> @@ -42,7 +49,7 @@ /// \param __a /// A 16-bit half-precision float value. /// \returns The converted 32-bit float value. -static __inline float __DEFAULT_FN_ATTRS +static __inline float __DEFAULT_FN_ATTRS128 _cvtsh_ss(unsigned short __a) { __v8hi v = {(short)__a, 0, 0, 0, 0, 0, 0, 0}; @@ -50,7 +57,7 @@ _cvtsh_ss(unsigned short __a) return r[0]; } -/// \brief Converts a 32-bit single-precision float value to a 16-bit +/// Converts a 32-bit single-precision float value to a 16-bit /// half-precision float value. /// /// \headerfile <x86intrin.h> @@ -72,11 +79,11 @@ _cvtsh_ss(unsigned short __a) /// 011: Truncate \n /// 1XX: Use MXCSR.RC for rounding /// \returns The converted 16-bit half-precision float value. 
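For reference, a hypothetical round trip through the scalar F16C conversions described above (not part of the header; compile with -mf16c and include <immintrin.h>, per the new header guard):

#include <immintrin.h>

/* Quantize a float to half precision and back; 0x04 selects MXCSR.RC rounding. */
static float quantize_to_half(float x)
{
  unsigned short h = _cvtss_sh(x, 0x04);
  return _cvtsh_ss(h);
}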
-#define _cvtss_sh(a, imm) __extension__ ({ \ +#define _cvtss_sh(a, imm) \ (unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \ - (imm)))[0]); }) + (imm)))[0]) -/// \brief Converts a 128-bit vector containing 32-bit float values into a +/// Converts a 128-bit vector containing 32-bit float values into a /// 128-bit vector containing 16-bit half-precision float values. /// /// \headerfile <x86intrin.h> @@ -99,10 +106,10 @@ _cvtsh_ss(unsigned short __a) /// \returns A 128-bit vector containing converted 16-bit half-precision float /// values. The lower 64 bits are used to store the converted 16-bit /// half-precision floating-point values. -#define _mm_cvtps_ph(a, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)); }) +#define _mm_cvtps_ph(a, imm) \ + (__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)) -/// \brief Converts a 128-bit vector containing 16-bit half-precision float +/// Converts a 128-bit vector containing 16-bit half-precision float /// values into a 128-bit vector containing 32-bit float values. /// /// \headerfile <x86intrin.h> @@ -113,12 +120,57 @@ _cvtsh_ss(unsigned short __a) /// A 128-bit vector containing 16-bit half-precision float values. The lower /// 64 bits are used in the conversion. /// \returns A 128-bit vector of [4 x float] containing converted float values. -static __inline __m128 __DEFAULT_FN_ATTRS +static __inline __m128 __DEFAULT_FN_ATTRS128 _mm_cvtph_ps(__m128i __a) { return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__a); } -#undef __DEFAULT_FN_ATTRS +/// Converts a 256-bit vector of [8 x float] into a 128-bit vector +/// containing 16-bit half-precision float values. +/// +/// \headerfile <x86intrin.h> +/// +/// \code +/// __m128i _mm256_cvtps_ph(__m256 a, const int imm); +/// \endcode +/// +/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction. +/// +/// \param a +/// A 256-bit vector containing 32-bit single-precision float values to be +/// converted to 16-bit half-precision float values. +/// \param imm +/// An immediate value controlling rounding using bits [2:0]: \n +/// 000: Nearest \n +/// 001: Down \n +/// 010: Up \n +/// 011: Truncate \n +/// 1XX: Use MXCSR.RC for rounding +/// \returns A 128-bit vector containing the converted 16-bit half-precision +/// float values. +#define _mm256_cvtps_ph(a, imm) \ + (__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm)) + +/// Converts a 128-bit vector containing 16-bit half-precision float +/// values into a 256-bit vector of [8 x float]. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction. +/// +/// \param __a +/// A 128-bit vector containing 16-bit half-precision float values to be +/// converted to 32-bit single-precision float values. +/// \returns A vector of [8 x float] containing the converted 32-bit +/// single-precision float values. +static __inline __m256 __DEFAULT_FN_ATTRS256 +_mm256_cvtph_ps(__m128i __a) +{ + return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a); +} + +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 #endif /* __F16CINTRIN_H */ diff --git a/c_headers/fma4intrin.h b/c_headers/fma4intrin.h index 962b1a60a2..7bae2f4a31 100644 --- a/c_headers/fma4intrin.h +++ b/c_headers/fma4intrin.h @@ -31,200 +31,202 @@ #include <pmmintrin.h> /* Define the default attributes for the functions in this file. 
*/ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fma4"))) +#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma4"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma4"), __min_vector_width__(256))) -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_macc_ps(__m128 __A, __m128 __B, __m128 __C) { return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_macc_pd(__m128d __A, __m128d __B, __m128d __C) { return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_macc_ss(__m128 __A, __m128 __B, __m128 __C) { return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_macc_sd(__m128d __A, __m128d __B, __m128d __C) { return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_msub_ps(__m128 __A, __m128 __B, __m128 __C) { return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_msub_pd(__m128d __A, __m128d __B, __m128d __C) { return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_msub_ss(__m128 __A, __m128 __B, __m128 __C) { return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_msub_sd(__m128d __A, __m128d __B, __m128d __C) { return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, -(__v2df)__C); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C) { return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C) { return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C) { return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C) { return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, (__v2df)__C); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C) { return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C) { return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C); } -static 
__inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C) { return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C) { return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maddsub_ps(__m128 __A, __m128 __B, __m128 __C) { return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_maddsub_pd(__m128d __A, __m128d __B, __m128d __C) { return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_msubadd_ps(__m128 __A, __m128 __B, __m128 __C) { return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C) { return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C) { return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C) { return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C) { return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C) { return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C) { return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C) { return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C) { return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C) { return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_maddsub_ps(__m256 __A, __m256 __B, __m256 __C) { return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static 
__inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_maddsub_pd(__m256d __A, __m256d __B, __m256d __C) { return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_msubadd_ps(__m256 __A, __m256 __B, __m256 __C) { return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C) { return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C); } -#undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 #endif /* __FMA4INTRIN_H */ diff --git a/c_headers/fmaintrin.h b/c_headers/fmaintrin.h index 478a0ac81c..094d13afea 100644 --- a/c_headers/fmaintrin.h +++ b/c_headers/fmaintrin.h @@ -1,4 +1,4 @@ -/*===---- fma4intrin.h - FMA4 intrinsics -----------------------------------=== +/*===---- fmaintrin.h - FMA intrinsics -------------------------------------=== * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -29,200 +29,202 @@ #define __FMAINTRIN_H /* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fma"))) +#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(256))) -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C) { return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C) { return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C) { return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C) { return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C) { return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C) { return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C) { return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C) { return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, 
-(__v2df)__C); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C) { return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C) { return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C) { return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C) { return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C) { return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C) { return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C) { return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C) { return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C) { return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C) { return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C); } -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C) { return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); } -static __inline__ __m128d __DEFAULT_FN_ATTRS +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C) { return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C) { return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C) { return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C) { return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); } -static __inline__ __m256d __DEFAULT_FN_ATTRS 
+static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C) { return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C) { return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C) { return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C) { return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C) { return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C) { return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C) { return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C) { return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C) { return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C); } -#undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 #endif /* __FMAINTRIN_H */ diff --git a/c_headers/fxsrintrin.h b/c_headers/fxsrintrin.h index 786081ca8e..704b5ad60a 100644 --- a/c_headers/fxsrintrin.h +++ b/c_headers/fxsrintrin.h @@ -30,7 +30,7 @@ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fxsr"))) -/// \brief Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte +/// Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte /// memory region pointed to by the input parameter \a __p. /// /// \headerfile <x86intrin.h> @@ -43,10 +43,10 @@ static __inline__ void __DEFAULT_FN_ATTRS _fxsave(void *__p) { - return __builtin_ia32_fxsave(__p); + __builtin_ia32_fxsave(__p); } -/// \brief Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte +/// Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte /// memory region pointed to by the input parameter \a __p. The contents of /// this memory region should have been written to by a previous \c _fxsave /// or \c _fxsave64 intrinsic. 
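A usage sketch for the FXSR pair documented above (illustrative only, not part of the patch; the callback is an assumption). The save area must be 512 bytes and 16-byte aligned; compile with -mfxsr and include <x86intrin.h> or <immintrin.h>:

#include <immintrin.h>

/* Save the x87/MMX/XMM/MXCSR state, run some work, then restore it. */
static void run_with_fpu_checkpoint(void (*work)(void))
{
  unsigned char state[512] __attribute__((aligned(16)));
  _fxsave(state);
  work();
  _fxrstor(state);
}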
@@ -61,11 +61,11 @@ _fxsave(void *__p) static __inline__ void __DEFAULT_FN_ATTRS _fxrstor(void *__p) { - return __builtin_ia32_fxrstor(__p); + __builtin_ia32_fxrstor(__p); } #ifdef __x86_64__ -/// \brief Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte +/// Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte /// memory region pointed to by the input parameter \a __p. /// /// \headerfile <x86intrin.h> @@ -78,10 +78,10 @@ _fxrstor(void *__p) static __inline__ void __DEFAULT_FN_ATTRS _fxsave64(void *__p) { - return __builtin_ia32_fxsave64(__p); + __builtin_ia32_fxsave64(__p); } -/// \brief Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte +/// Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte /// memory region pointed to by the input parameter \a __p. The contents of /// this memory region should have been written to by a previous \c _fxsave /// or \c _fxsave64 intrinsic. @@ -96,7 +96,7 @@ _fxsave64(void *__p) static __inline__ void __DEFAULT_FN_ATTRS _fxrstor64(void *__p) { - return __builtin_ia32_fxrstor64(__p); + __builtin_ia32_fxrstor64(__p); } #endif diff --git a/c_headers/gfniintrin.h b/c_headers/gfniintrin.h index 20fadccfaa..804d4f3d06 100644 --- a/c_headers/gfniintrin.h +++ b/c_headers/gfniintrin.h @@ -29,104 +29,108 @@ #define __GFNIINTRIN_H -#define _mm_gf2p8affineinv_epi64_epi8(A, B, I) __extension__ ({ \ +#define _mm_gf2p8affineinv_epi64_epi8(A, B, I) \ (__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \ (__v16qi)(__m128i)(B), \ - (char)(I)); }) + (char)(I)) -#define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) __extension__ ({ \ +#define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \ (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ (__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I), \ - (__v16qi)(__m128i)(S)); }) + (__v16qi)(__m128i)(S)) -#define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) __extension__ ({ \ +#define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \ (__m128i)_mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(), \ - U, A, B, I); }) + U, A, B, I) -#define _mm256_gf2p8affineinv_epi64_epi8(A, B, I) __extension__ ({ \ +#define _mm256_gf2p8affineinv_epi64_epi8(A, B, I) \ (__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A), \ (__v32qi)(__m256i)(B), \ - (char)(I)); }) + (char)(I)) -#define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) __extension__ ({ \ +#define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \ (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ (__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I), \ - (__v32qi)(__m256i)(S)); }) + (__v32qi)(__m256i)(S)) -#define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) __extension__ ({ \ +#define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \ (__m256i)_mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \ - U, A, B, I); }) + U, A, B, I) -#define _mm512_gf2p8affineinv_epi64_epi8(A, B, I) __extension__ ({ \ +#define _mm512_gf2p8affineinv_epi64_epi8(A, B, I) \ (__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)(__m512i)(A), \ (__v64qi)(__m512i)(B), \ - (char)(I)); }) + (char)(I)) -#define _mm512_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) __extension__ ({ \ +#define _mm512_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \ (__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \ (__v64qi)_mm512_gf2p8affineinv_epi64_epi8(A, B, I), \ - (__v64qi)(__m512i)(S)); }) + (__v64qi)(__m512i)(S)) -#define _mm512_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) 
__extension__ ({ \ - (__m512i)_mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_qi(), \ - U, A, B, I); }) +#define _mm512_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \ + (__m512i)_mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_si512(), \ + U, A, B, I) -#define _mm_gf2p8affine_epi64_epi8(A, B, I) __extension__ ({ \ +#define _mm_gf2p8affine_epi64_epi8(A, B, I) \ (__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A), \ (__v16qi)(__m128i)(B), \ - (char)(I)); }) + (char)(I)) -#define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) __extension__ ({ \ +#define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \ (__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ (__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I), \ - (__v16qi)(__m128i)(S)); }) + (__v16qi)(__m128i)(S)) -#define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I) __extension__ ({ \ +#define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \ (__m128i)_mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(), \ - U, A, B, I); }) + U, A, B, I) -#define _mm256_gf2p8affine_epi64_epi8(A, B, I) __extension__ ({ \ +#define _mm256_gf2p8affine_epi64_epi8(A, B, I) \ (__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A), \ (__v32qi)(__m256i)(B), \ - (char)(I)); }) + (char)(I)) -#define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) __extension__ ({ \ +#define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \ (__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ (__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I), \ - (__v32qi)(__m256i)(S)); }) + (__v32qi)(__m256i)(S)) -#define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I) __extension__ ({ \ +#define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \ (__m256i)_mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(), \ - U, A, B, I); }) + U, A, B, I) -#define _mm512_gf2p8affine_epi64_epi8(A, B, I) __extension__ ({ \ +#define _mm512_gf2p8affine_epi64_epi8(A, B, I) \ (__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)(__m512i)(A), \ (__v64qi)(__m512i)(B), \ - (char)(I)); }) + (char)(I)) -#define _mm512_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) __extension__ ({ \ +#define _mm512_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \ (__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \ (__v64qi)_mm512_gf2p8affine_epi64_epi8(A, B, I), \ - (__v64qi)(__m512i)(S)); }) + (__v64qi)(__m512i)(S)) -#define _mm512_maskz_gf2p8affine_epi64_epi8(U, A, B, I) __extension__ ({ \ - (__m512i)_mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_qi(), \ - U, A, B, I); }) +#define _mm512_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \ + (__m512i)_mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_si512(), \ + U, A, B, I) /* Default attributes for simple form (no masking). */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("gfni"))) +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("gfni"), __min_vector_width__(128))) + +/* Default attributes for YMM unmasked form. */ +#define __DEFAULT_FN_ATTRS_Y __attribute__((__always_inline__, __nodebug__, __target__("avx,gfni"), __min_vector_width__(256))) /* Default attributes for ZMM forms. */ -#define __DEFAULT_FN_ATTRS_F __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni"))) +#define __DEFAULT_FN_ATTRS_Z __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni"), __min_vector_width__(512))) /* Default attributes for VLX forms. 
*/ -#define __DEFAULT_FN_ATTRS_VL __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"))) +#define __DEFAULT_FN_ATTRS_VL128 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS_VL256 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(256))) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_gf2p8mul_epi8(__m128i __A, __m128i __B) @@ -135,7 +139,7 @@ _mm_gf2p8mul_epi8(__m128i __A, __m128i __B) (__v16qi) __B); } -static __inline__ __m128i __DEFAULT_FN_ATTRS_VL +static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128 _mm_mask_gf2p8mul_epi8(__m128i __S, __mmask16 __U, __m128i __A, __m128i __B) { return (__m128i) __builtin_ia32_selectb_128(__U, @@ -143,21 +147,21 @@ _mm_mask_gf2p8mul_epi8(__m128i __S, __mmask16 __U, __m128i __A, __m128i __B) (__v16qi) __S); } -static __inline__ __m128i __DEFAULT_FN_ATTRS_VL +static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128 _mm_maskz_gf2p8mul_epi8(__mmask16 __U, __m128i __A, __m128i __B) { return _mm_mask_gf2p8mul_epi8((__m128i)_mm_setzero_si128(), __U, __A, __B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS_Y _mm256_gf2p8mul_epi8(__m256i __A, __m256i __B) { return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi((__v32qi) __A, (__v32qi) __B); } -static __inline__ __m256i __DEFAULT_FN_ATTRS_VL +static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256 _mm256_mask_gf2p8mul_epi8(__m256i __S, __mmask32 __U, __m256i __A, __m256i __B) { return (__m256i) __builtin_ia32_selectb_256(__U, @@ -165,21 +169,21 @@ _mm256_mask_gf2p8mul_epi8(__m256i __S, __mmask32 __U, __m256i __A, __m256i __B) (__v32qi) __S); } -static __inline__ __m256i __DEFAULT_FN_ATTRS_VL +static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256 _mm256_maskz_gf2p8mul_epi8(__mmask32 __U, __m256i __A, __m256i __B) { return _mm256_mask_gf2p8mul_epi8((__m256i)_mm256_setzero_si256(), __U, __A, __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS_F +static __inline__ __m512i __DEFAULT_FN_ATTRS_Z _mm512_gf2p8mul_epi8(__m512i __A, __m512i __B) { return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi((__v64qi) __A, (__v64qi) __B); } -static __inline__ __m512i __DEFAULT_FN_ATTRS_F +static __inline__ __m512i __DEFAULT_FN_ATTRS_Z _mm512_mask_gf2p8mul_epi8(__m512i __S, __mmask64 __U, __m512i __A, __m512i __B) { return (__m512i) __builtin_ia32_selectb_512(__U, @@ -187,16 +191,18 @@ _mm512_mask_gf2p8mul_epi8(__m512i __S, __mmask64 __U, __m512i __A, __m512i __B) (__v64qi) __S); } -static __inline__ __m512i __DEFAULT_FN_ATTRS_F +static __inline__ __m512i __DEFAULT_FN_ATTRS_Z _mm512_maskz_gf2p8mul_epi8(__mmask64 __U, __m512i __A, __m512i __B) { - return _mm512_mask_gf2p8mul_epi8((__m512i)_mm512_setzero_qi(), + return _mm512_mask_gf2p8mul_epi8((__m512i)_mm512_setzero_si512(), __U, __A, __B); } #undef __DEFAULT_FN_ATTRS -#undef __DEFAULT_FN_ATTRS_F -#undef __DEFAULT_FN_ATTRS_VL +#undef __DEFAULT_FN_ATTRS_Y +#undef __DEFAULT_FN_ATTRS_Z +#undef __DEFAULT_FN_ATTRS_VL128 +#undef __DEFAULT_FN_ATTRS_VL256 -#endif // __GFNIINTRIN_H +#endif /* __GFNIINTRIN_H */ diff --git a/c_headers/htmxlintrin.h b/c_headers/htmxlintrin.h index 28f7d025bb..049dbd61df 100644 --- a/c_headers/htmxlintrin.h +++ b/c_headers/htmxlintrin.h @@ -214,7 +214,7 @@ __TM_failure_code(void* const __TM_buff) /* These intrinsics are being made available for compatibility with the IBM XL compiler. 
For documentation please see the "z/OS XL - C/C++ Programming Guide" publically available on the web. */ + C/C++ Programming Guide" publicly available on the web. */ static __inline long __attribute__((__always_inline__, __nodebug__)) __TM_simple_begin () diff --git a/c_headers/ia32intrin.h b/c_headers/ia32intrin.h index 4928300103..f8972e3053 100644 --- a/c_headers/ia32intrin.h +++ b/c_headers/ia32intrin.h @@ -70,4 +70,9 @@ __rdtscp(unsigned int *__A) { #define _rdpmc(A) __rdpmc(A) +static __inline__ void __attribute__((__always_inline__, __nodebug__)) +_wbinvd(void) { + __builtin_ia32_wbinvd(); +} + #endif /* __IA32INTRIN_H */ diff --git a/c_headers/immintrin.h b/c_headers/immintrin.h index d3421dc86c..e7bfbf964d 100644 --- a/c_headers/immintrin.h +++ b/c_headers/immintrin.h @@ -68,55 +68,11 @@ #if !defined(_MSC_VER) || __has_feature(modules) || defined(__AVX2__) #include <avx2intrin.h> +#endif -/* The 256-bit versions of functions in f16cintrin.h. - Intel documents these as being in immintrin.h, and - they depend on typedefs from avxintrin.h. */ - -/// \brief Converts a 256-bit vector of [8 x float] into a 128-bit vector -/// containing 16-bit half-precision float values. -/// -/// \headerfile <x86intrin.h> -/// -/// \code -/// __m128i _mm256_cvtps_ph(__m256 a, const int imm); -/// \endcode -/// -/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction. -/// -/// \param a -/// A 256-bit vector containing 32-bit single-precision float values to be -/// converted to 16-bit half-precision float values. -/// \param imm -/// An immediate value controlling rounding using bits [2:0]: \n -/// 000: Nearest \n -/// 001: Down \n -/// 010: Up \n -/// 011: Truncate \n -/// 1XX: Use MXCSR.RC for rounding -/// \returns A 128-bit vector containing the converted 16-bit half-precision -/// float values. -#define _mm256_cvtps_ph(a, imm) __extension__ ({ \ - (__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm)); }) - -/// \brief Converts a 128-bit vector containing 16-bit half-precision float -/// values into a 256-bit vector of [8 x float]. -/// -/// \headerfile <x86intrin.h> -/// -/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction. -/// -/// \param __a -/// A 128-bit vector containing 16-bit half-precision float values to be -/// converted to 32-bit single-precision float values. -/// \returns A vector of [8 x float] containing the converted 32-bit -/// single-precision float values. -static __inline __m256 __attribute__((__always_inline__, __nodebug__, __target__("f16c"))) -_mm256_cvtph_ps(__m128i __a) -{ - return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a); -} -#endif /* __AVX2__ */ +#if !defined(_MSC_VER) || __has_feature(modules) || defined(__F16C__) +#include <f16cintrin.h> +#endif #if !defined(_MSC_VER) || __has_feature(modules) || defined(__VPCLMULQDQ__) #include <vpclmulqdqintrin.h> @@ -134,6 +90,10 @@ _mm256_cvtph_ps(__m128i __a) #include <lzcntintrin.h> #endif +#if !defined(_MSC_VER) || __has_feature(modules) || defined(__POPCNT__) +#include <popcntintrin.h> +#endif + #if !defined(_MSC_VER) || __has_feature(modules) || defined(__FMA__) #include <fmaintrin.h> #endif @@ -247,6 +207,18 @@ _mm256_cvtph_ps(__m128i __a) #include <gfniintrin.h> #endif +#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDPID__) +/// Returns the value of the IA32_TSC_AUX MSR (0xc0000103). +/// +/// \headerfile <immintrin.h> +/// +/// This intrinsic corresponds to the <c> RDPID </c> instruction. 
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("rdpid"))) +_rdpid_u32(void) { + return __builtin_ia32_rdpid(); +} +#endif // __RDPID__ + #if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDRND__) static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd"))) _rdrand16_step(unsigned short *__p) @@ -310,25 +282,25 @@ _readgsbase_u64(void) static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase"))) _writefsbase_u32(unsigned int __V) { - return __builtin_ia32_wrfsbase32(__V); + __builtin_ia32_wrfsbase32(__V); } static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase"))) _writefsbase_u64(unsigned long long __V) { - return __builtin_ia32_wrfsbase64(__V); + __builtin_ia32_wrfsbase64(__V); } static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase"))) _writegsbase_u32(unsigned int __V) { - return __builtin_ia32_wrgsbase32(__V); + __builtin_ia32_wrgsbase32(__V); } static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase"))) _writegsbase_u64(unsigned long long __V) { - return __builtin_ia32_wrgsbase64(__V); + __builtin_ia32_wrgsbase64(__V); } #endif @@ -371,4 +343,125 @@ _writegsbase_u64(unsigned long long __V) * whereas others are also available at all times. */ #include <adxintrin.h> +#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDSEED__) +#include <rdseedintrin.h> +#endif + +#if !defined(_MSC_VER) || __has_feature(modules) || defined(__WBNOINVD__) +#include <wbnoinvdintrin.h> +#endif + +#if !defined(_MSC_VER) || __has_feature(modules) || defined(__CLDEMOTE__) +#include <cldemoteintrin.h> +#endif + +#if !defined(_MSC_VER) || __has_feature(modules) || defined(__WAITPKG__) +#include <waitpkgintrin.h> +#endif + +#if !defined(_MSC_VER) || __has_feature(modules) || \ + defined(__MOVDIRI__) || defined(__MOVDIR64B__) +#include <movdirintrin.h> +#endif + +#if !defined(_MSC_VER) || __has_feature(modules) || defined(__PCONFIG__) +#include <pconfigintrin.h> +#endif + +#if !defined(_MSC_VER) || __has_feature(modules) || defined(__SGX__) +#include <sgxintrin.h> +#endif + +#if !defined(_MSC_VER) || __has_feature(modules) || defined(__PTWRITE__) +#include <ptwriteintrin.h> +#endif + +#if !defined(_MSC_VER) || __has_feature(modules) || defined(__INVPCID__) +#include <invpcidintrin.h> +#endif + +#ifdef _MSC_VER +/* Define the default attributes for these intrinsics */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__)) +#ifdef __cplusplus +extern "C" { +#endif +/*----------------------------------------------------------------------------*\ +|* Interlocked Exchange HLE +\*----------------------------------------------------------------------------*/ +#if defined(__i386__) || defined(__x86_64__) +static __inline__ long __DEFAULT_FN_ATTRS +_InterlockedExchange_HLEAcquire(long volatile *_Target, long _Value) { + __asm__ __volatile__(".byte 0xf2 ; lock ; xchg %0, %1" + : "+r" (_Value), "+m" (*_Target) :: "memory"); + return _Value; +} +static __inline__ long __DEFAULT_FN_ATTRS +_InterlockedExchange_HLERelease(long volatile *_Target, long _Value) { + __asm__ __volatile__(".byte 0xf3 ; lock ; xchg %0, %1" + : "+r" (_Value), "+m" (*_Target) :: "memory"); + return _Value; +} +#endif +#if defined(__x86_64__) +static __inline__ __int64 __DEFAULT_FN_ATTRS +_InterlockedExchange64_HLEAcquire(__int64 volatile *_Target, __int64 _Value) { + __asm__ __volatile__(".byte 0xf2 
; lock ; xchg %0, %1" + : "+r" (_Value), "+m" (*_Target) :: "memory"); + return _Value; +} +static __inline__ __int64 __DEFAULT_FN_ATTRS +_InterlockedExchange64_HLERelease(__int64 volatile *_Target, __int64 _Value) { + __asm__ __volatile__(".byte 0xf3 ; lock ; xchg %0, %1" + : "+r" (_Value), "+m" (*_Target) :: "memory"); + return _Value; +} +#endif +/*----------------------------------------------------------------------------*\ +|* Interlocked Compare Exchange HLE +\*----------------------------------------------------------------------------*/ +#if defined(__i386__) || defined(__x86_64__) +static __inline__ long __DEFAULT_FN_ATTRS +_InterlockedCompareExchange_HLEAcquire(long volatile *_Destination, + long _Exchange, long _Comparand) { + __asm__ __volatile__(".byte 0xf2 ; lock ; cmpxchg %2, %1" + : "+a" (_Comparand), "+m" (*_Destination) + : "r" (_Exchange) : "memory"); + return _Comparand; +} +static __inline__ long __DEFAULT_FN_ATTRS +_InterlockedCompareExchange_HLERelease(long volatile *_Destination, + long _Exchange, long _Comparand) { + __asm__ __volatile__(".byte 0xf3 ; lock ; cmpxchg %2, %1" + : "+a" (_Comparand), "+m" (*_Destination) + : "r" (_Exchange) : "memory"); + return _Comparand; +} +#endif +#if defined(__x86_64__) +static __inline__ __int64 __DEFAULT_FN_ATTRS +_InterlockedCompareExchange64_HLEAcquire(__int64 volatile *_Destination, + __int64 _Exchange, __int64 _Comparand) { + __asm__ __volatile__(".byte 0xf2 ; lock ; cmpxchg %2, %1" + : "+a" (_Comparand), "+m" (*_Destination) + : "r" (_Exchange) : "memory"); + return _Comparand; +} +static __inline__ __int64 __DEFAULT_FN_ATTRS +_InterlockedCompareExchange64_HLERelease(__int64 volatile *_Destination, + __int64 _Exchange, __int64 _Comparand) { + __asm__ __volatile__(".byte 0xf3 ; lock ; cmpxchg %2, %1" + : "+a" (_Comparand), "+m" (*_Destination) + : "r" (_Exchange) : "memory"); + return _Comparand; +} +#endif +#ifdef __cplusplus +} +#endif + +#undef __DEFAULT_FN_ATTRS + +#endif /* _MSC_VER */ + #endif /* __IMMINTRIN_H */ diff --git a/c_headers/intrin.h b/c_headers/intrin.h index b30aa215a4..91914214e2 100644 --- a/c_headers/intrin.h +++ b/c_headers/intrin.h @@ -38,7 +38,7 @@ #include <armintr.h> #endif -#if defined(_M_ARM64) +#if defined(__aarch64__) #include <arm64intr.h> #endif @@ -83,6 +83,7 @@ void __incfsdword(unsigned long); void __incfsword(unsigned long); unsigned long __indword(unsigned short); void __indwordstring(unsigned short, unsigned long *, unsigned long); +void __int2c(void); void __invlpg(void *); unsigned short __inword(unsigned short); void __inwordstring(unsigned short, unsigned short *, unsigned long); @@ -140,6 +141,7 @@ void __svm_stgi(void); void __svm_vmload(size_t); void __svm_vmrun(size_t); void __svm_vmsave(size_t); +void __ud2(void); unsigned __int64 __ull_rshift(unsigned __int64, int); void __vmx_off(void); void __vmx_vmptrst(unsigned __int64 *); @@ -161,25 +163,15 @@ static __inline__ unsigned char _BitScanForward(unsigned long *_Index, unsigned long _Mask); static __inline__ unsigned char _BitScanReverse(unsigned long *_Index, unsigned long _Mask); -static __inline__ unsigned char _bittest(long const *, long); -static __inline__ unsigned char _bittestandcomplement(long *, long); -static __inline__ unsigned char _bittestandreset(long *, long); -static __inline__ unsigned char _bittestandset(long *, long); void __cdecl _disable(void); void __cdecl _enable(void); long _InterlockedAddLargeStatistic(__int64 volatile *_Addend, long _Value); unsigned char _interlockedbittestandreset(long volatile *, 
long); unsigned char _interlockedbittestandset(long volatile *, long); -long _InterlockedCompareExchange_HLEAcquire(long volatile *, long, long); -long _InterlockedCompareExchange_HLERelease(long volatile *, long, long); -__int64 _InterlockedcompareExchange64_HLEAcquire(__int64 volatile *, __int64, - __int64); -__int64 _InterlockedCompareExchange64_HLERelease(__int64 volatile *, __int64, - __int64); void *_InterlockedCompareExchangePointer_HLEAcquire(void *volatile *, void *, void *); void *_InterlockedCompareExchangePointer_HLERelease(void *volatile *, void *, @@ -256,24 +248,15 @@ void __writegsbyte(unsigned long, unsigned char); void __writegsdword(unsigned long, unsigned long); void __writegsqword(unsigned long, unsigned __int64); void __writegsword(unsigned long, unsigned short); -static __inline__ -unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask); -static __inline__ -unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask); -static __inline__ unsigned char _bittest64(__int64 const *, __int64); -static __inline__ unsigned char _bittestandcomplement64(__int64 *, __int64); -static __inline__ unsigned char _bittestandreset64(__int64 *, __int64); -static __inline__ unsigned char _bittestandset64(__int64 *, __int64); long _InterlockedAnd_np(long volatile *_Value, long _Mask); short _InterlockedAnd16_np(short volatile *_Value, short _Mask); __int64 _InterlockedAnd64_np(__int64 volatile *_Value, __int64 _Mask); char _InterlockedAnd8_np(char volatile *_Value, char _Mask); unsigned char _interlockedbittestandreset64(__int64 volatile *, __int64); -static __inline__ unsigned char _interlockedbittestandset64(__int64 volatile *, __int64); long _InterlockedCompareExchange_np(long volatile *_Destination, long _Exchange, long _Comparand); @@ -287,10 +270,6 @@ unsigned char _InterlockedCompareExchange128_np(__int64 volatile *_Destination, __int64 *_ComparandResult); short _InterlockedCompareExchange16_np(short volatile *_Destination, short _Exchange, short _Comparand); -__int64 _InterlockedCompareExchange64_HLEAcquire(__int64 volatile *, __int64, - __int64); -__int64 _InterlockedCompareExchange64_HLERelease(__int64 volatile *, __int64, - __int64); __int64 _InterlockedCompareExchange64_np(__int64 volatile *_Destination, __int64 _Exchange, __int64 _Comparand); void *_InterlockedCompareExchangePointer_np(void *volatile *_Destination, @@ -320,7 +299,12 @@ unsigned __int64 _umul128(unsigned __int64, #endif /* __x86_64__ */ -#if defined(__x86_64__) || defined(__arm__) +#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) + +static __inline__ +unsigned char _BitScanForward64(unsigned long *_Index, unsigned __int64 _Mask); +static __inline__ +unsigned char _BitScanReverse64(unsigned long *_Index, unsigned __int64 _Mask); static __inline__ __int64 _InterlockedDecrement64(__int64 volatile *_Addend); @@ -342,78 +326,6 @@ __int64 _InterlockedAnd64(__int64 volatile *_Value, __int64 _Mask); #endif /*----------------------------------------------------------------------------*\ -|* Bit Counting and Testing -\*----------------------------------------------------------------------------*/ -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_bittest(long const *_BitBase, long _BitPos) { - return (*_BitBase >> _BitPos) & 1; -} -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_bittestandcomplement(long *_BitBase, long _BitPos) { - unsigned char _Res = (*_BitBase >> _BitPos) & 1; - *_BitBase = *_BitBase ^ (1 << _BitPos); - return _Res; -} -static 
__inline__ unsigned char __DEFAULT_FN_ATTRS -_bittestandreset(long *_BitBase, long _BitPos) { - unsigned char _Res = (*_BitBase >> _BitPos) & 1; - *_BitBase = *_BitBase & ~(1 << _BitPos); - return _Res; -} -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_bittestandset(long *_BitBase, long _BitPos) { - unsigned char _Res = (*_BitBase >> _BitPos) & 1; - *_BitBase = *_BitBase | (1 << _BitPos); - return _Res; -} -#if defined(__arm__) || defined(__aarch64__) -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_interlockedbittestandset_acq(long volatile *_BitBase, long _BitPos) { - long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_ACQUIRE); - return (_PrevVal >> _BitPos) & 1; -} -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_interlockedbittestandset_nf(long volatile *_BitBase, long _BitPos) { - long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_RELAXED); - return (_PrevVal >> _BitPos) & 1; -} -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_interlockedbittestandset_rel(long volatile *_BitBase, long _BitPos) { - long _PrevVal = __atomic_fetch_or(_BitBase, 1l << _BitPos, __ATOMIC_RELEASE); - return (_PrevVal >> _BitPos) & 1; -} -#endif -#ifdef __x86_64__ -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_bittest64(__int64 const *_BitBase, __int64 _BitPos) { - return (*_BitBase >> _BitPos) & 1; -} -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_bittestandcomplement64(__int64 *_BitBase, __int64 _BitPos) { - unsigned char _Res = (*_BitBase >> _BitPos) & 1; - *_BitBase = *_BitBase ^ (1ll << _BitPos); - return _Res; -} -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_bittestandreset64(__int64 *_BitBase, __int64 _BitPos) { - unsigned char _Res = (*_BitBase >> _BitPos) & 1; - *_BitBase = *_BitBase & ~(1ll << _BitPos); - return _Res; -} -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_bittestandset64(__int64 *_BitBase, __int64 _BitPos) { - unsigned char _Res = (*_BitBase >> _BitPos) & 1; - *_BitBase = *_BitBase | (1ll << _BitPos); - return _Res; -} -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_interlockedbittestandset64(__int64 volatile *_BitBase, __int64 _BitPos) { - long long _PrevVal = - __atomic_fetch_or(_BitBase, 1ll << _BitPos, __ATOMIC_SEQ_CST); - return (_PrevVal >> _BitPos) & 1; -} -#endif -/*----------------------------------------------------------------------------*\ |* Interlocked Exchange Add \*----------------------------------------------------------------------------*/ #if defined(__arm__) || defined(__aarch64__) @@ -602,6 +514,23 @@ _InterlockedAnd64_rel(__int64 volatile *_Value, __int64 _Mask) { } #endif /*----------------------------------------------------------------------------*\ +|* Bit Counting and Testing +\*----------------------------------------------------------------------------*/ +#if defined(__arm__) || defined(__aarch64__) +unsigned char _interlockedbittestandset_acq(long volatile *_BitBase, + long _BitPos); +unsigned char _interlockedbittestandset_nf(long volatile *_BitBase, + long _BitPos); +unsigned char _interlockedbittestandset_rel(long volatile *_BitBase, + long _BitPos); +unsigned char _interlockedbittestandreset_acq(long volatile *_BitBase, + long _BitPos); +unsigned char _interlockedbittestandreset_nf(long volatile *_BitBase, + long _BitPos); +unsigned char _interlockedbittestandreset_rel(long volatile *_BitBase, + long _BitPos); +#endif +/*----------------------------------------------------------------------------*\ |* Interlocked Or 
\*----------------------------------------------------------------------------*/ #if defined(__arm__) || defined(__aarch64__) @@ -868,33 +797,40 @@ _InterlockedCompareExchange64_rel(__int64 volatile *_Destination, #if defined(__i386__) || defined(__x86_64__) static __inline__ void __DEFAULT_FN_ATTRS __movsb(unsigned char *__dst, unsigned char const *__src, size_t __n) { - __asm__("rep movsb" : : "D"(__dst), "S"(__src), "c"(__n)); + __asm__ __volatile__("rep movsb" : "+D"(__dst), "+S"(__src), "+c"(__n) + : : "memory"); } static __inline__ void __DEFAULT_FN_ATTRS __movsd(unsigned long *__dst, unsigned long const *__src, size_t __n) { - __asm__("rep movsl" : : "D"(__dst), "S"(__src), "c"(__n)); + __asm__ __volatile__("rep movsl" : "+D"(__dst), "+S"(__src), "+c"(__n) + : : "memory"); } static __inline__ void __DEFAULT_FN_ATTRS __movsw(unsigned short *__dst, unsigned short const *__src, size_t __n) { - __asm__("rep movsw" : : "D"(__dst), "S"(__src), "c"(__n)); + __asm__ __volatile__("rep movsw" : "+D"(__dst), "+S"(__src), "+c"(__n) + : : "memory"); } static __inline__ void __DEFAULT_FN_ATTRS __stosd(unsigned long *__dst, unsigned long __x, size_t __n) { - __asm__("rep stosl" : : "D"(__dst), "a"(__x), "c"(__n)); + __asm__ __volatile__("rep stosl" : "+D"(__dst), "+c"(__n) : "a"(__x) + : "memory"); } static __inline__ void __DEFAULT_FN_ATTRS __stosw(unsigned short *__dst, unsigned short __x, size_t __n) { - __asm__("rep stosw" : : "D"(__dst), "a"(__x), "c"(__n)); + __asm__ __volatile__("rep stosw" : "+D"(__dst), "+c"(__n) : "a"(__x) + : "memory"); } #endif #ifdef __x86_64__ static __inline__ void __DEFAULT_FN_ATTRS __movsq(unsigned long long *__dst, unsigned long long const *__src, size_t __n) { - __asm__("rep movsq" : : "D"(__dst), "S"(__src), "c"(__n)); + __asm__ __volatile__("rep movsq" : "+D"(__dst), "+S"(__src), "+c"(__n) + : : "memory"); } static __inline__ void __DEFAULT_FN_ATTRS __stosq(unsigned __int64 *__dst, unsigned __int64 __x, size_t __n) { - __asm__("rep stosq" : : "D"(__dst), "a"(__x), "c"(__n)); + __asm__ __volatile__("rep stosq" : "+D"(__dst), "+c"(__n) : "a"(__x) + : "memory"); } #endif @@ -927,6 +863,20 @@ __nop(void) { __asm__ volatile ("nop"); } #endif +#if defined(__x86_64__) +static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS +__shiftleft128(unsigned __int64 __l, unsigned __int64 __h, unsigned char __d) { + unsigned __int128 __val = ((unsigned __int128)__h << 64) | __l; + unsigned __int128 __res = __val << (__d & 63); + return (unsigned __int64)(__res >> 64); +} +static __inline__ unsigned __int64 __DEFAULT_FN_ATTRS +__shiftright128(unsigned __int64 __l, unsigned __int64 __h, unsigned char __d) { + unsigned __int128 __val = ((unsigned __int128)__h << 64) | __l; + unsigned __int128 __res = __val >> (__d & 63); + return (unsigned __int64)__res; +} +#endif /*----------------------------------------------------------------------------*\ |* Privileged intrinsics diff --git a/c_headers/invpcidintrin.h b/c_headers/invpcidintrin.h new file mode 100644 index 0000000000..c30a19fa3d --- /dev/null +++ b/c_headers/invpcidintrin.h @@ -0,0 +1,37 @@ +/*===------------- invpcidintrin.h - INVPCID intrinsic ---------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to 
permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use <invpcidintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef __INVPCIDINTRIN_H +#define __INVPCIDINTRIN_H + +static __inline__ void + __attribute__((__always_inline__, __nodebug__, __target__("invpcid"))) +_invpcid(unsigned int __type, void *__descriptor) { + __builtin_ia32_invpcid(__type, __descriptor); +} + +#endif /* __INVPCIDINTRIN_H */ diff --git a/c_headers/lwpintrin.h b/c_headers/lwpintrin.h index c95fdd9a20..0b28d73582 100644 --- a/c_headers/lwpintrin.h +++ b/c_headers/lwpintrin.h @@ -31,7 +31,7 @@ /* Define the default attributes for the functions in this file. */ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lwp"))) -/// \brief Parses the LWPCB at the specified address and enables +/// Parses the LWPCB at the specified address and enables /// profiling if valid. /// /// \headerfile <x86intrin.h> @@ -48,7 +48,7 @@ __llwpcb (void *__addr) __builtin_ia32_llwpcb(__addr); } -/// \brief Flushes the LWP state to memory and returns the address of the LWPCB. +/// Flushes the LWP state to memory and returns the address of the LWPCB. /// /// \headerfile <x86intrin.h> /// @@ -58,12 +58,12 @@ __llwpcb (void *__addr) /// Address to the current Lightweight Profiling Control Block (LWPCB). /// If LWP is not currently enabled, returns NULL. static __inline__ void* __DEFAULT_FN_ATTRS -__slwpcb () +__slwpcb (void) { return __builtin_ia32_slwpcb(); } -/// \brief Inserts programmed event record into the LWP event ring buffer +/// Inserts programmed event record into the LWP event ring buffer /// and advances the ring buffer pointer. /// /// \headerfile <x86intrin.h> @@ -84,7 +84,7 @@ __slwpcb () (__builtin_ia32_lwpins32((unsigned int) (DATA2), (unsigned int) (DATA1), \ (unsigned int) (FLAGS))) -/// \brief Decrements the LWP programmed value sample event counter. If the result is +/// Decrements the LWP programmed value sample event counter. If the result is /// negative, inserts an event record into the LWP event ring buffer in memory /// and advances the ring buffer pointer. /// @@ -104,7 +104,7 @@ __slwpcb () #ifdef __x86_64__ -/// \brief Inserts programmed event record into the LWP event ring buffer +/// Inserts programmed event record into the LWP event ring buffer /// and advances the ring buffer pointer. /// /// \headerfile <x86intrin.h> @@ -125,7 +125,7 @@ __slwpcb () (__builtin_ia32_lwpins64((unsigned long long) (DATA2), (unsigned int) (DATA1), \ (unsigned int) (FLAGS))) -/// \brief Decrements the LWP programmed value sample event counter. If the result is +/// Decrements the LWP programmed value sample event counter. 
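Aside (not part of the patch): the __shiftleft128/__shiftright128 helpers added to intrin.h above return the high half of a 128-bit shift, i.e. they behave like SHLD/SHRD with the count masked to 6 bits. A minimal sketch of composing a full 128-bit left shift from them, assuming an x86-64 target where these MS-compatibility intrinsics are usable (e.g. clang-cl):

#include <intrin.h>

/* Shift the 128-bit value hi:lo left by n bits (n in 0..63).
 * __shiftleft128 yields the new high half; the low half is an ordinary shift. */
static void shl128(unsigned long long lo, unsigned long long hi, unsigned char n,
                   unsigned long long *out_lo, unsigned long long *out_hi) {
  *out_hi = __shiftleft128(lo, hi, n);
  *out_lo = lo << n;
}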
If the result is /// negative, inserts an event record into the LWP event ring buffer in memory /// and advances the ring buffer pointer. /// diff --git a/c_headers/lzcntintrin.h b/c_headers/lzcntintrin.h index 3d2769da3b..558f1828f0 100644 --- a/c_headers/lzcntintrin.h +++ b/c_headers/lzcntintrin.h @@ -31,7 +31,7 @@ /* Define the default attributes for the functions in this file. */ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lzcnt"))) -/// \brief Counts the number of leading zero bits in the operand. +/// Counts the number of leading zero bits in the operand. /// /// \headerfile <x86intrin.h> /// @@ -47,7 +47,7 @@ __lzcnt16(unsigned short __X) return __X ? __builtin_clzs(__X) : 16; } -/// \brief Counts the number of leading zero bits in the operand. +/// Counts the number of leading zero bits in the operand. /// /// \headerfile <x86intrin.h> /// @@ -57,13 +57,14 @@ __lzcnt16(unsigned short __X) /// An unsigned 32-bit integer whose leading zeros are to be counted. /// \returns An unsigned 32-bit integer containing the number of leading zero /// bits in the operand. +/// \see _lzcnt_u32 static __inline__ unsigned int __DEFAULT_FN_ATTRS __lzcnt32(unsigned int __X) { return __X ? __builtin_clz(__X) : 32; } -/// \brief Counts the number of leading zero bits in the operand. +/// Counts the number of leading zero bits in the operand. /// /// \headerfile <x86intrin.h> /// @@ -73,6 +74,7 @@ __lzcnt32(unsigned int __X) /// An unsigned 32-bit integer whose leading zeros are to be counted. /// \returns An unsigned 32-bit integer containing the number of leading zero /// bits in the operand. +/// \see __lzcnt32 static __inline__ unsigned int __DEFAULT_FN_ATTRS _lzcnt_u32(unsigned int __X) { @@ -80,7 +82,7 @@ _lzcnt_u32(unsigned int __X) } #ifdef __x86_64__ -/// \brief Counts the number of leading zero bits in the operand. +/// Counts the number of leading zero bits in the operand. /// /// \headerfile <x86intrin.h> /// @@ -90,13 +92,14 @@ _lzcnt_u32(unsigned int __X) /// An unsigned 64-bit integer whose leading zeros are to be counted. /// \returns An unsigned 64-bit integer containing the number of leading zero /// bits in the operand. +/// \see _lzcnt_u64 static __inline__ unsigned long long __DEFAULT_FN_ATTRS __lzcnt64(unsigned long long __X) { return __X ? __builtin_clzll(__X) : 64; } -/// \brief Counts the number of leading zero bits in the operand. +/// Counts the number of leading zero bits in the operand. /// /// \headerfile <x86intrin.h> /// @@ -106,6 +109,7 @@ __lzcnt64(unsigned long long __X) /// An unsigned 64-bit integer whose leading zeros are to be counted. /// \returns An unsigned 64-bit integer containing the number of leading zero /// bits in the operand. +/// \see __lzcnt64 static __inline__ unsigned long long __DEFAULT_FN_ATTRS _lzcnt_u64(unsigned long long __X) { diff --git a/c_headers/mm3dnow.h b/c_headers/mm3dnow.h index 294866c1dc..b0288757a3 100644 --- a/c_headers/mm3dnow.h +++ b/c_headers/mm3dnow.h @@ -30,9 +30,9 @@ typedef float __v2sf __attribute__((__vector_size__(8))); /* Define the default attributes for the functions in this file. 
*/ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnow"))) +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnow"), __min_vector_width__(64))) -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("3dnow"))) _m_femms(void) { __builtin_ia32_femms(); } @@ -134,7 +134,7 @@ _m_pmulhrw(__m64 __m1, __m64 __m2) { /* Handle the 3dnowa instructions here. */ #undef __DEFAULT_FN_ATTRS -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnowa"))) +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnowa"), __min_vector_width__(64))) static __inline__ __m64 __DEFAULT_FN_ATTRS _m_pf2iw(__m64 __m) { diff --git a/c_headers/mmintrin.h b/c_headers/mmintrin.h index 4b38d51713..a73539942a 100644 --- a/c_headers/mmintrin.h +++ b/c_headers/mmintrin.h @@ -32,27 +32,27 @@ typedef short __v4hi __attribute__((__vector_size__(8))); typedef char __v8qi __attribute__((__vector_size__(8))); /* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mmx"))) +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mmx"), __min_vector_width__(64))) -/// \brief Clears the MMX state by setting the state of the x87 stack registers +/// Clears the MMX state by setting the state of the x87 stack registers /// to empty. /// /// \headerfile <x86intrin.h> /// /// This intrinsic corresponds to the <c> EMMS </c> instruction. /// -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("mmx"))) _mm_empty(void) { __builtin_ia32_emms(); } -/// \brief Constructs a 64-bit integer vector, setting the lower 32 bits to the +/// Constructs a 64-bit integer vector, setting the lower 32 bits to the /// value of the 32-bit integer parameter and setting the upper 32 bits to 0. /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. +/// This intrinsic corresponds to the <c> MOVD </c> instruction. /// /// \param __i /// A 32-bit integer value. @@ -64,12 +64,12 @@ _mm_cvtsi32_si64(int __i) return (__m64)__builtin_ia32_vec_init_v2si(__i, 0); } -/// \brief Returns the lower 32 bits of a 64-bit integer vector as a 32-bit +/// Returns the lower 32 bits of a 64-bit integer vector as a 32-bit /// signed integer. /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. +/// This intrinsic corresponds to the <c> MOVD </c> instruction. /// /// \param __m /// A 64-bit integer vector. @@ -81,11 +81,11 @@ _mm_cvtsi64_si32(__m64 __m) return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0); } -/// \brief Casts a 64-bit signed integer value into a 64-bit integer vector. +/// Casts a 64-bit signed integer value into a 64-bit integer vector. /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the <c> VMOVQ / MOVD </c> instruction. +/// This intrinsic corresponds to the <c> MOVQ </c> instruction. /// /// \param __i /// A 64-bit signed integer. @@ -97,11 +97,11 @@ _mm_cvtsi64_m64(long long __i) return (__m64)__i; } -/// \brief Casts a 64-bit integer vector into a 64-bit signed integer value. +/// Casts a 64-bit integer vector into a 64-bit signed integer value. 
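Aside (not part of the patch): the \see cross-references added to lzcntintrin.h above pair __lzcnt32 with _lzcnt_u32, whose guard `__X ? __builtin_clz(__X) : 32` exists because LZCNT defines the zero-input case (it returns the operand width) while __builtin_clz(0) does not. A small illustrative use, assuming the code is built with -mlzcnt or an -march that implies it:

#include <x86intrin.h>

/* Bits needed to represent x; correct even for x == 0 because _lzcnt_u32(0) == 32. */
static unsigned bit_width_u32(unsigned x) {
  return 32 - _lzcnt_u32(x);
}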
/// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the <c> VMOVQ / MOVD </c> instruction. +/// This intrinsic corresponds to the <c> MOVQ </c> instruction. /// /// \param __m /// A 64-bit integer vector. @@ -113,7 +113,7 @@ _mm_cvtm64_si64(__m64 __m) return (long long)__m; } -/// \brief Converts 16-bit signed integers from both 64-bit integer vector +/// Converts 16-bit signed integers from both 64-bit integer vector /// parameters of [4 x i16] into 8-bit signed integer values, and constructs /// a 64-bit integer vector of [8 x i8] as the result. Positive values /// greater than 0x7F are saturated to 0x7F. Negative values less than 0x80 @@ -143,7 +143,7 @@ _mm_packs_pi16(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2); } -/// \brief Converts 32-bit signed integers from both 64-bit integer vector +/// Converts 32-bit signed integers from both 64-bit integer vector /// parameters of [2 x i32] into 16-bit signed integer values, and constructs /// a 64-bit integer vector of [4 x i16] as the result. Positive values /// greater than 0x7FFF are saturated to 0x7FFF. Negative values less than @@ -173,7 +173,7 @@ _mm_packs_pi32(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2); } -/// \brief Converts 16-bit signed integers from both 64-bit integer vector +/// Converts 16-bit signed integers from both 64-bit integer vector /// parameters of [4 x i16] into 8-bit unsigned integer values, and /// constructs a 64-bit integer vector of [8 x i8] as the result. Values /// greater than 0xFF are saturated to 0xFF. Values less than 0 are saturated @@ -203,7 +203,7 @@ _mm_packs_pu16(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2); } -/// \brief Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8] +/// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8] /// and interleaves them into a 64-bit integer vector of [8 x i8]. /// /// \headerfile <x86intrin.h> @@ -230,7 +230,7 @@ _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2); } -/// \brief Unpacks the upper 32 bits from two 64-bit integer vectors of +/// Unpacks the upper 32 bits from two 64-bit integer vectors of /// [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16]. /// /// \headerfile <x86intrin.h> @@ -253,7 +253,7 @@ _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2); } -/// \brief Unpacks the upper 32 bits from two 64-bit integer vectors of +/// Unpacks the upper 32 bits from two 64-bit integer vectors of /// [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32]. /// /// \headerfile <x86intrin.h> @@ -274,7 +274,7 @@ _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2); } -/// \brief Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8] +/// Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8] /// and interleaves them into a 64-bit integer vector of [8 x i8]. /// /// \headerfile <x86intrin.h> @@ -301,7 +301,7 @@ _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2); } -/// \brief Unpacks the lower 32 bits from two 64-bit integer vectors of +/// Unpacks the lower 32 bits from two 64-bit integer vectors of /// [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16]. 
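Aside (not part of the patch): a short sketch of the saturating pack described in the _mm_packs_pi16 comment above -- out-of-range 16-bit lanes clamp to the int8 limits instead of wrapping. The values are arbitrary; MMX code should still call _mm_empty() before returning to x87 floating point.

#include <mmintrin.h>

static void pack_demo(void) {
  __m64 a = _mm_setr_pi16(1, 300, -300, 42);
  __m64 b = _mm_setr_pi16(0, 0, 0, 0);
  /* Low four bytes come from a, high four from b: {1, 127, -128, 42, 0, 0, 0, 0}. */
  __m64 packed = _mm_packs_pi16(a, b);
  (void)packed;
  _mm_empty();
}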
/// /// \headerfile <x86intrin.h> @@ -324,7 +324,7 @@ _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2); } -/// \brief Unpacks the lower 32 bits from two 64-bit integer vectors of +/// Unpacks the lower 32 bits from two 64-bit integer vectors of /// [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32]. /// /// \headerfile <x86intrin.h> @@ -345,7 +345,7 @@ _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2); } -/// \brief Adds each 8-bit integer element of the first 64-bit integer vector +/// Adds each 8-bit integer element of the first 64-bit integer vector /// of [8 x i8] to the corresponding 8-bit integer element of the second /// 64-bit integer vector of [8 x i8]. The lower 8 bits of the results are /// packed into a 64-bit integer vector of [8 x i8]. @@ -366,7 +366,7 @@ _mm_add_pi8(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2); } -/// \brief Adds each 16-bit integer element of the first 64-bit integer vector +/// Adds each 16-bit integer element of the first 64-bit integer vector /// of [4 x i16] to the corresponding 16-bit integer element of the second /// 64-bit integer vector of [4 x i16]. The lower 16 bits of the results are /// packed into a 64-bit integer vector of [4 x i16]. @@ -387,7 +387,7 @@ _mm_add_pi16(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2); } -/// \brief Adds each 32-bit integer element of the first 64-bit integer vector +/// Adds each 32-bit integer element of the first 64-bit integer vector /// of [2 x i32] to the corresponding 32-bit integer element of the second /// 64-bit integer vector of [2 x i32]. The lower 32 bits of the results are /// packed into a 64-bit integer vector of [2 x i32]. @@ -408,7 +408,7 @@ _mm_add_pi32(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2); } -/// \brief Adds each 8-bit signed integer element of the first 64-bit integer +/// Adds each 8-bit signed integer element of the first 64-bit integer /// vector of [8 x i8] to the corresponding 8-bit signed integer element of /// the second 64-bit integer vector of [8 x i8]. Positive sums greater than /// 0x7F are saturated to 0x7F. Negative sums less than 0x80 are saturated to @@ -430,7 +430,7 @@ _mm_adds_pi8(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2); } -/// \brief Adds each 16-bit signed integer element of the first 64-bit integer +/// Adds each 16-bit signed integer element of the first 64-bit integer /// vector of [4 x i16] to the corresponding 16-bit signed integer element of /// the second 64-bit integer vector of [4 x i16]. Positive sums greater than /// 0x7FFF are saturated to 0x7FFF. Negative sums less than 0x8000 are @@ -453,7 +453,7 @@ _mm_adds_pi16(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2); } -/// \brief Adds each 8-bit unsigned integer element of the first 64-bit integer +/// Adds each 8-bit unsigned integer element of the first 64-bit integer /// vector of [8 x i8] to the corresponding 8-bit unsigned integer element of /// the second 64-bit integer vector of [8 x i8]. Sums greater than 0xFF are /// saturated to 0xFF. 
The results are packed into a 64-bit integer vector of @@ -475,7 +475,7 @@ _mm_adds_pu8(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2); } -/// \brief Adds each 16-bit unsigned integer element of the first 64-bit integer +/// Adds each 16-bit unsigned integer element of the first 64-bit integer /// vector of [4 x i16] to the corresponding 16-bit unsigned integer element /// of the second 64-bit integer vector of [4 x i16]. Sums greater than /// 0xFFFF are saturated to 0xFFFF. The results are packed into a 64-bit @@ -497,7 +497,7 @@ _mm_adds_pu16(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2); } -/// \brief Subtracts each 8-bit integer element of the second 64-bit integer +/// Subtracts each 8-bit integer element of the second 64-bit integer /// vector of [8 x i8] from the corresponding 8-bit integer element of the /// first 64-bit integer vector of [8 x i8]. The lower 8 bits of the results /// are packed into a 64-bit integer vector of [8 x i8]. @@ -518,7 +518,7 @@ _mm_sub_pi8(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2); } -/// \brief Subtracts each 16-bit integer element of the second 64-bit integer +/// Subtracts each 16-bit integer element of the second 64-bit integer /// vector of [4 x i16] from the corresponding 16-bit integer element of the /// first 64-bit integer vector of [4 x i16]. The lower 16 bits of the /// results are packed into a 64-bit integer vector of [4 x i16]. @@ -539,7 +539,7 @@ _mm_sub_pi16(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2); } -/// \brief Subtracts each 32-bit integer element of the second 64-bit integer +/// Subtracts each 32-bit integer element of the second 64-bit integer /// vector of [2 x i32] from the corresponding 32-bit integer element of the /// first 64-bit integer vector of [2 x i32]. The lower 32 bits of the /// results are packed into a 64-bit integer vector of [2 x i32]. @@ -560,7 +560,7 @@ _mm_sub_pi32(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2); } -/// \brief Subtracts each 8-bit signed integer element of the second 64-bit +/// Subtracts each 8-bit signed integer element of the second 64-bit /// integer vector of [8 x i8] from the corresponding 8-bit signed integer /// element of the first 64-bit integer vector of [8 x i8]. Positive results /// greater than 0x7F are saturated to 0x7F. Negative results less than 0x80 @@ -583,7 +583,7 @@ _mm_subs_pi8(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2); } -/// \brief Subtracts each 16-bit signed integer element of the second 64-bit +/// Subtracts each 16-bit signed integer element of the second 64-bit /// integer vector of [4 x i16] from the corresponding 16-bit signed integer /// element of the first 64-bit integer vector of [4 x i16]. Positive results /// greater than 0x7FFF are saturated to 0x7FFF. Negative results less than @@ -606,7 +606,7 @@ _mm_subs_pi16(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2); } -/// \brief Subtracts each 8-bit unsigned integer element of the second 64-bit +/// Subtracts each 8-bit unsigned integer element of the second 64-bit /// integer vector of [8 x i8] from the corresponding 8-bit unsigned integer /// element of the first 64-bit integer vector of [8 x i8]. 
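Aside (not part of the patch): the unsigned saturating add documented above is the usual way to brighten 8-bit pixel data without wraparound; a minimal sketch (the pixel interpretation is only an example):

#include <mmintrin.h>

/* 250 + 30 saturates to 255 instead of wrapping to 24. */
static __m64 brighten(__m64 eight_pixels) {
  return _mm_adds_pu8(eight_pixels, _mm_set1_pi8(30));
}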
/// @@ -630,7 +630,7 @@ _mm_subs_pu8(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2); } -/// \brief Subtracts each 16-bit unsigned integer element of the second 64-bit +/// Subtracts each 16-bit unsigned integer element of the second 64-bit /// integer vector of [4 x i16] from the corresponding 16-bit unsigned /// integer element of the first 64-bit integer vector of [4 x i16]. /// @@ -654,7 +654,7 @@ _mm_subs_pu16(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2); } -/// \brief Multiplies each 16-bit signed integer element of the first 64-bit +/// Multiplies each 16-bit signed integer element of the first 64-bit /// integer vector of [4 x i16] by the corresponding 16-bit signed integer /// element of the second 64-bit integer vector of [4 x i16] and get four /// 32-bit products. Adds adjacent pairs of products to get two 32-bit sums. @@ -681,7 +681,7 @@ _mm_madd_pi16(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2); } -/// \brief Multiplies each 16-bit signed integer element of the first 64-bit +/// Multiplies each 16-bit signed integer element of the first 64-bit /// integer vector of [4 x i16] by the corresponding 16-bit signed integer /// element of the second 64-bit integer vector of [4 x i16]. Packs the upper /// 16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16]. @@ -702,7 +702,7 @@ _mm_mulhi_pi16(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2); } -/// \brief Multiplies each 16-bit signed integer element of the first 64-bit +/// Multiplies each 16-bit signed integer element of the first 64-bit /// integer vector of [4 x i16] by the corresponding 16-bit signed integer /// element of the second 64-bit integer vector of [4 x i16]. Packs the lower /// 16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16]. @@ -723,7 +723,7 @@ _mm_mullo_pi16(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2); } -/// \brief Left-shifts each 16-bit signed integer element of the first +/// Left-shifts each 16-bit signed integer element of the first /// parameter, which is a 64-bit integer vector of [4 x i16], by the number /// of bits specified by the second parameter, which is a 64-bit integer. The /// lower 16 bits of the results are packed into a 64-bit integer vector of @@ -746,7 +746,7 @@ _mm_sll_pi16(__m64 __m, __m64 __count) return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count); } -/// \brief Left-shifts each 16-bit signed integer element of a 64-bit integer +/// Left-shifts each 16-bit signed integer element of a 64-bit integer /// vector of [4 x i16] by the number of bits specified by a 32-bit integer. /// The lower 16 bits of the results are packed into a 64-bit integer vector /// of [4 x i16]. @@ -768,7 +768,7 @@ _mm_slli_pi16(__m64 __m, int __count) return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count); } -/// \brief Left-shifts each 32-bit signed integer element of the first +/// Left-shifts each 32-bit signed integer element of the first /// parameter, which is a 64-bit integer vector of [2 x i32], by the number /// of bits specified by the second parameter, which is a 64-bit integer. 
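Aside (not part of the patch): the pairwise multiply-add described for _mm_madd_pi16 above is the building block for 16-bit dot products; a sketch, assuming the two 32-bit partial sums do not overflow when added:

#include <mmintrin.h>

/* Dot product of two [4 x i16] vectors: pmaddwd yields {a0*b0+a1*b1, a2*b2+a3*b3}. */
static int dot4_i16(__m64 a, __m64 b) {
  __m64 prod = _mm_madd_pi16(a, b);
  int sum = _mm_cvtsi64_si32(prod) +
            _mm_cvtsi64_si32(_mm_srli_si64(prod, 32));
  _mm_empty();
  return sum;
}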
The /// lower 32 bits of the results are packed into a 64-bit integer vector of @@ -791,7 +791,7 @@ _mm_sll_pi32(__m64 __m, __m64 __count) return (__m64)__builtin_ia32_pslld((__v2si)__m, __count); } -/// \brief Left-shifts each 32-bit signed integer element of a 64-bit integer +/// Left-shifts each 32-bit signed integer element of a 64-bit integer /// vector of [2 x i32] by the number of bits specified by a 32-bit integer. /// The lower 32 bits of the results are packed into a 64-bit integer vector /// of [2 x i32]. @@ -813,7 +813,7 @@ _mm_slli_pi32(__m64 __m, int __count) return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count); } -/// \brief Left-shifts the first 64-bit integer parameter by the number of bits +/// Left-shifts the first 64-bit integer parameter by the number of bits /// specified by the second 64-bit integer parameter. The lower 64 bits of /// result are returned. /// @@ -833,7 +833,7 @@ _mm_sll_si64(__m64 __m, __m64 __count) return (__m64)__builtin_ia32_psllq((__v1di)__m, __count); } -/// \brief Left-shifts the first parameter, which is a 64-bit integer, by the +/// Left-shifts the first parameter, which is a 64-bit integer, by the /// number of bits specified by the second parameter, which is a 32-bit /// integer. The lower 64 bits of result are returned. /// @@ -853,7 +853,7 @@ _mm_slli_si64(__m64 __m, int __count) return (__m64)__builtin_ia32_psllqi((__v1di)__m, __count); } -/// \brief Right-shifts each 16-bit integer element of the first parameter, +/// Right-shifts each 16-bit integer element of the first parameter, /// which is a 64-bit integer vector of [4 x i16], by the number of bits /// specified by the second parameter, which is a 64-bit integer. /// @@ -877,7 +877,7 @@ _mm_sra_pi16(__m64 __m, __m64 __count) return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count); } -/// \brief Right-shifts each 16-bit integer element of a 64-bit integer vector +/// Right-shifts each 16-bit integer element of a 64-bit integer vector /// of [4 x i16] by the number of bits specified by a 32-bit integer. /// /// High-order bits are filled with the sign bit of the initial value of each @@ -900,7 +900,7 @@ _mm_srai_pi16(__m64 __m, int __count) return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count); } -/// \brief Right-shifts each 32-bit integer element of the first parameter, +/// Right-shifts each 32-bit integer element of the first parameter, /// which is a 64-bit integer vector of [2 x i32], by the number of bits /// specified by the second parameter, which is a 64-bit integer. /// @@ -924,7 +924,7 @@ _mm_sra_pi32(__m64 __m, __m64 __count) return (__m64)__builtin_ia32_psrad((__v2si)__m, __count); } -/// \brief Right-shifts each 32-bit integer element of a 64-bit integer vector +/// Right-shifts each 32-bit integer element of a 64-bit integer vector /// of [2 x i32] by the number of bits specified by a 32-bit integer. /// /// High-order bits are filled with the sign bit of the initial value of each @@ -947,7 +947,7 @@ _mm_srai_pi32(__m64 __m, int __count) return (__m64)__builtin_ia32_psradi((__v2si)__m, __count); } -/// \brief Right-shifts each 16-bit integer element of the first parameter, +/// Right-shifts each 16-bit integer element of the first parameter, /// which is a 64-bit integer vector of [4 x i16], by the number of bits /// specified by the second parameter, which is a 64-bit integer. 
/// @@ -970,7 +970,7 @@ _mm_srl_pi16(__m64 __m, __m64 __count) return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count); } -/// \brief Right-shifts each 16-bit integer element of a 64-bit integer vector +/// Right-shifts each 16-bit integer element of a 64-bit integer vector /// of [4 x i16] by the number of bits specified by a 32-bit integer. /// /// High-order bits are cleared. The 16-bit results are packed into a 64-bit @@ -992,7 +992,7 @@ _mm_srli_pi16(__m64 __m, int __count) return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count); } -/// \brief Right-shifts each 32-bit integer element of the first parameter, +/// Right-shifts each 32-bit integer element of the first parameter, /// which is a 64-bit integer vector of [2 x i32], by the number of bits /// specified by the second parameter, which is a 64-bit integer. /// @@ -1015,7 +1015,7 @@ _mm_srl_pi32(__m64 __m, __m64 __count) return (__m64)__builtin_ia32_psrld((__v2si)__m, __count); } -/// \brief Right-shifts each 32-bit integer element of a 64-bit integer vector +/// Right-shifts each 32-bit integer element of a 64-bit integer vector /// of [2 x i32] by the number of bits specified by a 32-bit integer. /// /// High-order bits are cleared. The 32-bit results are packed into a 64-bit @@ -1037,7 +1037,7 @@ _mm_srli_pi32(__m64 __m, int __count) return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count); } -/// \brief Right-shifts the first 64-bit integer parameter by the number of bits +/// Right-shifts the first 64-bit integer parameter by the number of bits /// specified by the second 64-bit integer parameter. /// /// High-order bits are cleared. @@ -1057,7 +1057,7 @@ _mm_srl_si64(__m64 __m, __m64 __count) return (__m64)__builtin_ia32_psrlq((__v1di)__m, __count); } -/// \brief Right-shifts the first parameter, which is a 64-bit integer, by the +/// Right-shifts the first parameter, which is a 64-bit integer, by the /// number of bits specified by the second parameter, which is a 32-bit /// integer. /// @@ -1078,7 +1078,7 @@ _mm_srli_si64(__m64 __m, int __count) return (__m64)__builtin_ia32_psrlqi((__v1di)__m, __count); } -/// \brief Performs a bitwise AND of two 64-bit integer vectors. +/// Performs a bitwise AND of two 64-bit integer vectors. /// /// \headerfile <x86intrin.h> /// @@ -1096,7 +1096,7 @@ _mm_and_si64(__m64 __m1, __m64 __m2) return __builtin_ia32_pand((__v1di)__m1, (__v1di)__m2); } -/// \brief Performs a bitwise NOT of the first 64-bit integer vector, and then +/// Performs a bitwise NOT of the first 64-bit integer vector, and then /// performs a bitwise AND of the intermediate result and the second 64-bit /// integer vector. /// @@ -1117,7 +1117,7 @@ _mm_andnot_si64(__m64 __m1, __m64 __m2) return __builtin_ia32_pandn((__v1di)__m1, (__v1di)__m2); } -/// \brief Performs a bitwise OR of two 64-bit integer vectors. +/// Performs a bitwise OR of two 64-bit integer vectors. /// /// \headerfile <x86intrin.h> /// @@ -1135,7 +1135,7 @@ _mm_or_si64(__m64 __m1, __m64 __m2) return __builtin_ia32_por((__v1di)__m1, (__v1di)__m2); } -/// \brief Performs a bitwise exclusive OR of two 64-bit integer vectors. +/// Performs a bitwise exclusive OR of two 64-bit integer vectors. 
/// /// \headerfile <x86intrin.h> /// @@ -1153,7 +1153,7 @@ _mm_xor_si64(__m64 __m1, __m64 __m2) return __builtin_ia32_pxor((__v1di)__m1, (__v1di)__m2); } -/// \brief Compares the 8-bit integer elements of two 64-bit integer vectors of +/// Compares the 8-bit integer elements of two 64-bit integer vectors of /// [8 x i8] to determine if the element of the first vector is equal to the /// corresponding element of the second vector. /// @@ -1175,7 +1175,7 @@ _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2); } -/// \brief Compares the 16-bit integer elements of two 64-bit integer vectors of +/// Compares the 16-bit integer elements of two 64-bit integer vectors of /// [4 x i16] to determine if the element of the first vector is equal to the /// corresponding element of the second vector. /// @@ -1197,7 +1197,7 @@ _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2); } -/// \brief Compares the 32-bit integer elements of two 64-bit integer vectors of +/// Compares the 32-bit integer elements of two 64-bit integer vectors of /// [2 x i32] to determine if the element of the first vector is equal to the /// corresponding element of the second vector. /// @@ -1219,7 +1219,7 @@ _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2); } -/// \brief Compares the 8-bit integer elements of two 64-bit integer vectors of +/// Compares the 8-bit integer elements of two 64-bit integer vectors of /// [8 x i8] to determine if the element of the first vector is greater than /// the corresponding element of the second vector. /// @@ -1241,7 +1241,7 @@ _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2); } -/// \brief Compares the 16-bit integer elements of two 64-bit integer vectors of +/// Compares the 16-bit integer elements of two 64-bit integer vectors of /// [4 x i16] to determine if the element of the first vector is greater than /// the corresponding element of the second vector. /// @@ -1263,7 +1263,7 @@ _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2); } -/// \brief Compares the 32-bit integer elements of two 64-bit integer vectors of +/// Compares the 32-bit integer elements of two 64-bit integer vectors of /// [2 x i32] to determine if the element of the first vector is greater than /// the corresponding element of the second vector. /// @@ -1285,20 +1285,20 @@ _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2); } -/// \brief Constructs a 64-bit integer vector initialized to zero. +/// Constructs a 64-bit integer vector initialized to zero. /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction. +/// This intrinsic corresponds to the <c> PXOR </c> instruction. /// /// \returns An initialized 64-bit integer vector with all elements set to zero. static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_setzero_si64(void) { - return (__m64){ 0LL }; + return __extension__ (__m64){ 0LL }; } -/// \brief Constructs a 64-bit integer vector initialized with the specified +/// Constructs a 64-bit integer vector initialized with the specified /// 32-bit integer values. 
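Aside (not part of the patch): the compare intrinsics documented above produce all-ones/all-zero lane masks, which combine with the bitwise operations from the preceding hunk into the classic branchless select; a minimal sketch:

#include <mmintrin.h>

/* Per-lane maximum via compare + select: (mask & a) | (~mask & b). */
static __m64 max_pi16(__m64 a, __m64 b) {
  __m64 mask = _mm_cmpgt_pi16(a, b);            /* 0xFFFF where a > b, else 0 */
  return _mm_or_si64(_mm_and_si64(mask, a),
                     _mm_andnot_si64(mask, b)); /* _mm_andnot_si64 computes ~mask & b */
}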
/// /// \headerfile <x86intrin.h> @@ -1319,7 +1319,7 @@ _mm_set_pi32(int __i1, int __i0) return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1); } -/// \brief Constructs a 64-bit integer vector initialized with the specified +/// Constructs a 64-bit integer vector initialized with the specified /// 16-bit integer values. /// /// \headerfile <x86intrin.h> @@ -1342,7 +1342,7 @@ _mm_set_pi16(short __s3, short __s2, short __s1, short __s0) return (__m64)__builtin_ia32_vec_init_v4hi(__s0, __s1, __s2, __s3); } -/// \brief Constructs a 64-bit integer vector initialized with the specified +/// Constructs a 64-bit integer vector initialized with the specified /// 8-bit integer values. /// /// \headerfile <x86intrin.h> @@ -1375,13 +1375,14 @@ _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, __b4, __b5, __b6, __b7); } -/// \brief Constructs a 64-bit integer vector of [2 x i32], with each of the +/// Constructs a 64-bit integer vector of [2 x i32], with each of the /// 32-bit integer vector elements set to the specified 32-bit integer /// value. /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction. +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. /// /// \param __i /// A 32-bit integer value used to initialize each vector element of the @@ -1393,13 +1394,14 @@ _mm_set1_pi32(int __i) return _mm_set_pi32(__i, __i); } -/// \brief Constructs a 64-bit integer vector of [4 x i16], with each of the +/// Constructs a 64-bit integer vector of [4 x i16], with each of the /// 16-bit integer vector elements set to the specified 16-bit integer /// value. /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction. +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. /// /// \param __w /// A 16-bit integer value used to initialize each vector element of the @@ -1411,13 +1413,13 @@ _mm_set1_pi16(short __w) return _mm_set_pi16(__w, __w, __w, __w); } -/// \brief Constructs a 64-bit integer vector of [8 x i8], with each of the +/// Constructs a 64-bit integer vector of [8 x i8], with each of the /// 8-bit integer vector elements set to the specified 8-bit integer value. /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the <c> VPUNPCKLBW + VPSHUFLW / PUNPCKLBW + -/// PSHUFLW </c> instruction. +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. /// /// \param __b /// An 8-bit integer value used to initialize each vector element of the @@ -1429,7 +1431,7 @@ _mm_set1_pi8(char __b) return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b); } -/// \brief Constructs a 64-bit integer vector, initialized in reverse order with +/// Constructs a 64-bit integer vector, initialized in reverse order with /// the specified 32-bit integer values. /// /// \headerfile <x86intrin.h> @@ -1450,7 +1452,7 @@ _mm_setr_pi32(int __i0, int __i1) return _mm_set_pi32(__i1, __i0); } -/// \brief Constructs a 64-bit integer vector, initialized in reverse order with +/// Constructs a 64-bit integer vector, initialized in reverse order with /// the specified 16-bit integer values. 
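Aside (not part of the patch): the _mm_set_* constructors documented above take elements from the highest lane down, while the _mm_setr_* variants take them from lane 0 up, so the two vectors below are identical:

#include <mmintrin.h>

static void set_order_demo(void) {
  __m64 v1 = _mm_set_pi16(3, 2, 1, 0);   /* lane0=0 ... lane3=3 */
  __m64 v2 = _mm_setr_pi16(0, 1, 2, 3);  /* same layout, arguments reversed */
  (void)v1; (void)v2;
  _mm_empty();
}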
/// /// \headerfile <x86intrin.h> @@ -1473,7 +1475,7 @@ _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) return _mm_set_pi16(__w3, __w2, __w1, __w0); } -/// \brief Constructs a 64-bit integer vector, initialized in reverse order with +/// Constructs a 64-bit integer vector, initialized in reverse order with /// the specified 8-bit integer values. /// /// \headerfile <x86intrin.h> diff --git a/c_headers/module.modulemap b/c_headers/module.modulemap index 95d26cefa6..1d1af57fd0 100644 --- a/c_headers/module.modulemap +++ b/c_headers/module.modulemap @@ -38,6 +38,7 @@ module _Builtin_intrinsics [system] [extern_c] { explicit module neon { requires neon header "arm_neon.h" + header "arm_fp16.h" export * } } @@ -62,6 +63,17 @@ module _Builtin_intrinsics [system] [extern_c] { textual header "fma4intrin.h" textual header "mwaitxintrin.h" textual header "clzerointrin.h" + textual header "wbnoinvdintrin.h" + textual header "cldemoteintrin.h" + textual header "waitpkgintrin.h" + textual header "movdirintrin.h" + textual header "pconfigintrin.h" + textual header "sgxintrin.h" + textual header "ptwriteintrin.h" + textual header "invpcidintrin.h" + + textual header "__wmmintrin_aes.h" + textual header "__wmmintrin_pclmul.h" explicit module mm_malloc { requires !freestanding @@ -128,14 +140,6 @@ module _Builtin_intrinsics [system] [extern_c] { export aes export pclmul } - - explicit module aes { - header "__wmmintrin_aes.h" - } - - explicit module pclmul { - header "__wmmintrin_pclmul.h" - } } explicit module systemz { diff --git a/c_headers/movdirintrin.h b/c_headers/movdirintrin.h new file mode 100644 index 0000000000..ec20c53709 --- /dev/null +++ b/c_headers/movdirintrin.h @@ -0,0 +1,63 @@ +/*===------------------------- movdirintrin.h ------------------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ +#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H +#error "Never use <movdirintrin.h> directly; include <x86intrin.h> instead." 
+#endif + +#ifndef _MOVDIRINTRIN_H +#define _MOVDIRINTRIN_H + +/* Move doubleword as direct store */ +static __inline__ void +__attribute__((__always_inline__, __nodebug__, __target__("movdiri"))) +_directstoreu_u32 (void *__dst, unsigned int __value) +{ + __builtin_ia32_directstore_u32((unsigned int *)__dst, (unsigned int)__value); +} + +#ifdef __x86_64__ + +/* Move quadword as direct store */ +static __inline__ void +__attribute__((__always_inline__, __nodebug__, __target__("movdiri"))) +_directstoreu_u64 (void *__dst, unsigned long __value) +{ + __builtin_ia32_directstore_u64((unsigned long *)__dst, __value); +} + +#endif /* __x86_64__ */ + +/* + * movdir64b - Move 64 bytes as direct store. + * The destination must be 64 byte aligned, and the store is atomic. + * The source address has no alignment requirement, and the load from + * the source address is not atomic. + */ +static __inline__ void +__attribute__((__always_inline__, __nodebug__, __target__("movdir64b"))) +_movdir64b (void *__dst __attribute__((align_value(64))), const void *__src) +{ + __builtin_ia32_movdir64b(__dst, __src); +} + +#endif /* _MOVDIRINTRIN_H */ diff --git a/c_headers/mwaitxintrin.h b/c_headers/mwaitxintrin.h index 635f2ac6ca..2921eadfa5 100644 --- a/c_headers/mwaitxintrin.h +++ b/c_headers/mwaitxintrin.h @@ -25,8 +25,8 @@ #error "Never use <mwaitxintrin.h> directly; include <x86intrin.h> instead." #endif -#ifndef _MWAITXINTRIN_H -#define _MWAITXINTRIN_H +#ifndef __MWAITXINTRIN_H +#define __MWAITXINTRIN_H /* Define the default attributes for the functions in this file. */ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mwaitx"))) @@ -44,4 +44,4 @@ _mm_mwaitx(unsigned __extensions, unsigned __hints, unsigned __clock) #undef __DEFAULT_FN_ATTRS -#endif /* _MWAITXINTRIN_H */ +#endif /* __MWAITXINTRIN_H */ diff --git a/c_headers/nmmintrin.h b/c_headers/nmmintrin.h index 57fec15963..348fb8c7c1 100644 --- a/c_headers/nmmintrin.h +++ b/c_headers/nmmintrin.h @@ -21,10 +21,10 @@ *===-----------------------------------------------------------------------=== */ -#ifndef _NMMINTRIN_H -#define _NMMINTRIN_H +#ifndef __NMMINTRIN_H +#define __NMMINTRIN_H /* To match expectations of gcc we put the sse4.2 definitions into smmintrin.h, just include it now then. */ #include <smmintrin.h> -#endif /* _NMMINTRIN_H */ +#endif /* __NMMINTRIN_H */ diff --git a/c_headers/opencl-c.h b/c_headers/opencl-c.h index ce204b04c0..e481c792df 100644 --- a/c_headers/opencl-c.h +++ b/c_headers/opencl-c.h @@ -11540,7 +11540,7 @@ half16 __ovld __cnfn select(half16 a, half16 b, ushort16 c); * * vstoren write sizeof (gentypen) bytes given by data to address (p + (offset * n)). * - * The address computed as (p + (offset * n)) must be + * The address computed as (p + (offset * n)) must be * 8-bit aligned if gentype is char, uchar; * 16-bit aligned if gentype is short, ushort, half; * 32-bit aligned if gentype is int, uint, float; @@ -12862,7 +12862,7 @@ void __ovld mem_fence(cl_mem_fence_flags flags); * Read memory barrier that orders only * loads. * The flags argument specifies the memory - * address space and can be set to to a + * address space and can be set to a * combination of the following literal * values: * CLK_LOCAL_MEM_FENCE @@ -12874,7 +12874,7 @@ void __ovld read_mem_fence(cl_mem_fence_flags flags); * Write memory barrier that orders only * stores. 
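Aside (not part of the patch): a hypothetical use of the _movdir64b intrinsic introduced in movdirintrin.h above -- posting a 64-byte work descriptor to a device-mapped doorbell. The descriptor layout and `mmio_doorbell` pointer are invented for illustration; the destination must be 64-byte aligned and the code built with -mmovdir64b.

#include <x86intrin.h>
#include <stdint.h>

struct work_desc { uint64_t words[8]; };   /* exactly 64 bytes */

static void post_descriptor(volatile void *mmio_doorbell, const struct work_desc *d) {
  /* Single atomic 64-byte store to the (64-byte aligned) destination. */
  _movdir64b((void *)mmio_doorbell, d);
}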
* The flags argument specifies the memory - * address space and can be set to to a + * address space and can be set to a * combination of the following literal * values: * CLK_LOCAL_MEM_FENCE @@ -12888,7 +12888,7 @@ void __ovld write_mem_fence(cl_mem_fence_flags flags); cl_mem_fence_flags __ovld get_fence(const void *ptr); cl_mem_fence_flags __ovld get_fence(void *ptr); -/** +/** * Builtin functions to_global, to_local, and to_private need to be declared as Clang builtin functions * and checked in Sema since they should be declared as * addr gentype* to_addr (gentype*); @@ -13773,7 +13773,7 @@ ulong __ovld atomic_fetch_max_explicit(volatile atomic_ulong *object, long opera // add/sub: atomic type argument can be uintptr_t/intptr_t, value type argument can be ptrdiff_t. // or/xor/and/min/max: atomic type argument can be intptr_t/uintptr_t, value type argument can be intptr_t/uintptr_t. -#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) +#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) uintptr_t __ovld atomic_fetch_add(volatile atomic_uintptr_t *object, ptrdiff_t operand); uintptr_t __ovld atomic_fetch_add_explicit(volatile atomic_uintptr_t *object, ptrdiff_t operand, memory_order order); uintptr_t __ovld atomic_fetch_add_explicit(volatile atomic_uintptr_t *object, ptrdiff_t operand, memory_order order, memory_scope scope); @@ -14571,7 +14571,7 @@ int printf(__constant const char* st, ...); * only. The filter_mode specified in sampler * must be set to CLK_FILTER_NEAREST; otherwise * the values returned are undefined. - + * The read_image{f|i|ui} calls that take * integer coordinates must use a sampler with * normalized coordinates set to @@ -15421,8 +15421,8 @@ int __ovld __cnfn get_image_channel_data_type(read_write image2d_array_msaa_dept #define CLK_DEPTH_STENCIL 0x10BE #if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 #define CLK_sRGB 0x10BF -#define CLK_sRGBA 0x10C1 #define CLK_sRGBx 0x10C0 +#define CLK_sRGBA 0x10C1 #define CLK_sBGRA 0x10C2 #define CLK_ABGR 0x10C3 #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 diff --git a/c_headers/pconfigintrin.h b/c_headers/pconfigintrin.h new file mode 100644 index 0000000000..fee3cad388 --- /dev/null +++ b/c_headers/pconfigintrin.h @@ -0,0 +1,50 @@ +/*===---- pconfigintrin.h - X86 platform configuration ---------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ * + *===-----------------------------------------------------------------------=== + */ + +#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H +#error "Never use <pconfigintrin.h> directly; include <x86intrin.h> instead." +#endif + +#ifndef __PCONFIGINTRIN_H +#define __PCONFIGINTRIN_H + +#define __PCONFIG_KEY_PROGRAM 0x00000001 + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("pconfig"))) + +static __inline unsigned int __DEFAULT_FN_ATTRS +_pconfig_u32(unsigned int __leaf, __SIZE_TYPE__ __d[]) +{ + unsigned int __result; + __asm__ ("pconfig" + : "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2]) + : "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2]) + : "cc"); + return __result; +} + +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/c_headers/pkuintrin.h b/c_headers/pkuintrin.h index 9e5459450b..6976924d82 100644 --- a/c_headers/pkuintrin.h +++ b/c_headers/pkuintrin.h @@ -1,4 +1,4 @@ -/*===------------- pkuintrin.h - PKU intrinsics ------------------=== +/*===---- pkuintrin.h - PKU intrinsics -------------------------------------=== * * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -40,7 +40,7 @@ _rdpkru_u32(void) static __inline__ void __DEFAULT_FN_ATTRS _wrpkru(unsigned int __val) { - return __builtin_ia32_wrpkru(__val); + __builtin_ia32_wrpkru(__val); } #undef __DEFAULT_FN_ATTRS diff --git a/c_headers/pmmintrin.h b/c_headers/pmmintrin.h index 7ec08a1bcb..7e1a9eae59 100644 --- a/c_headers/pmmintrin.h +++ b/c_headers/pmmintrin.h @@ -28,9 +28,9 @@ /* Define the default attributes for the functions in this file. */ #define __DEFAULT_FN_ATTRS \ - __attribute__((__always_inline__, __nodebug__, __target__("sse3"))) + __attribute__((__always_inline__, __nodebug__, __target__("sse3"), __min_vector_width__(128))) -/// \brief Loads data from an unaligned memory location to elements in a 128-bit +/// Loads data from an unaligned memory location to elements in a 128-bit /// vector. /// /// If the address of the data is not 16-byte aligned, the instruction may @@ -50,7 +50,7 @@ _mm_lddqu_si128(__m128i const *__p) return (__m128i)__builtin_ia32_lddqu((char const *)__p); } -/// \brief Adds the even-indexed values and subtracts the odd-indexed values of +/// Adds the even-indexed values and subtracts the odd-indexed values of /// two 128-bit vectors of [4 x float]. /// /// \headerfile <x86intrin.h> @@ -69,7 +69,7 @@ _mm_addsub_ps(__m128 __a, __m128 __b) return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b); } -/// \brief Horizontally adds the adjacent pairs of values contained in two +/// Horizontally adds the adjacent pairs of values contained in two /// 128-bit vectors of [4 x float]. /// /// \headerfile <x86intrin.h> @@ -92,7 +92,7 @@ _mm_hadd_ps(__m128 __a, __m128 __b) return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b); } -/// \brief Horizontally subtracts the adjacent pairs of values contained in two +/// Horizontally subtracts the adjacent pairs of values contained in two /// 128-bit vectors of [4 x float]. /// /// \headerfile <x86intrin.h> @@ -115,7 +115,7 @@ _mm_hsub_ps(__m128 __a, __m128 __b) return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b); } -/// \brief Moves and duplicates odd-indexed values from a 128-bit vector +/// Moves and duplicates odd-indexed values from a 128-bit vector /// of [4 x float] to float values stored in a 128-bit vector of /// [4 x float]. 
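Aside (not part of the patch): the horizontal add documented for _mm_hadd_ps above reduces a vector in log steps; a minimal sketch of summing the four lanes of an __m128:

#include <pmmintrin.h>

/* {a,b,c,d} -> {a+b, c+d, a+b, c+d} -> {a+b+c+d, ...}; return lane 0. */
static float hsum_ps(__m128 v) {
  __m128 t = _mm_hadd_ps(v, v);
  t = _mm_hadd_ps(t, t);
  return _mm_cvtss_f32(t);
}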
/// @@ -137,7 +137,7 @@ _mm_movehdup_ps(__m128 __a) return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3); } -/// \brief Duplicates even-indexed values from a 128-bit vector of +/// Duplicates even-indexed values from a 128-bit vector of /// [4 x float] to float values stored in a 128-bit vector of [4 x float]. /// /// \headerfile <x86intrin.h> @@ -158,7 +158,7 @@ _mm_moveldup_ps(__m128 __a) return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2); } -/// \brief Adds the even-indexed values and subtracts the odd-indexed values of +/// Adds the even-indexed values and subtracts the odd-indexed values of /// two 128-bit vectors of [2 x double]. /// /// \headerfile <x86intrin.h> @@ -177,7 +177,7 @@ _mm_addsub_pd(__m128d __a, __m128d __b) return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b); } -/// \brief Horizontally adds the pairs of values contained in two 128-bit +/// Horizontally adds the pairs of values contained in two 128-bit /// vectors of [2 x double]. /// /// \headerfile <x86intrin.h> @@ -200,7 +200,7 @@ _mm_hadd_pd(__m128d __a, __m128d __b) return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b); } -/// \brief Horizontally subtracts the pairs of values contained in two 128-bit +/// Horizontally subtracts the pairs of values contained in two 128-bit /// vectors of [2 x double]. /// /// \headerfile <x86intrin.h> @@ -223,13 +223,13 @@ _mm_hsub_pd(__m128d __a, __m128d __b) return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b); } -/// \brief Moves and duplicates one double-precision value to double-precision +/// Moves and duplicates one double-precision value to double-precision /// values stored in a 128-bit vector of [2 x double]. /// /// \headerfile <x86intrin.h> /// /// \code -/// __m128d _mm_loaddup_pd(double const * dp); +/// __m128d _mm_loaddup_pd(double const *dp); /// \endcode /// /// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction. @@ -240,7 +240,7 @@ _mm_hsub_pd(__m128d __a, __m128d __b) /// duplicated values. #define _mm_loaddup_pd(dp) _mm_load1_pd(dp) -/// \brief Moves and duplicates the double-precision value in the lower bits of +/// Moves and duplicates the double-precision value in the lower bits of /// a 128-bit vector of [2 x double] to double-precision values stored in a /// 128-bit vector of [2 x double]. /// @@ -259,7 +259,7 @@ _mm_movedup_pd(__m128d __a) return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0); } -/// \brief Establishes a linear address memory range to be monitored and puts +/// Establishes a linear address memory range to be monitored and puts /// the processor in the monitor event pending state. Data stored in the /// monitored address range causes the processor to exit the pending state. /// @@ -280,7 +280,7 @@ _mm_monitor(void const *__p, unsigned __extensions, unsigned __hints) __builtin_ia32_monitor((void *)__p, __extensions, __hints); } -/// \brief Used with the MONITOR instruction to wait while the processor is in +/// Used with the MONITOR instruction to wait while the processor is in /// the monitor event pending state. Data stored in the monitored address /// range causes the processor to exit the pending state. 
/// diff --git a/c_headers/popcntintrin.h b/c_headers/popcntintrin.h index 0b4793e58b..75ceab9e15 100644 --- a/c_headers/popcntintrin.h +++ b/c_headers/popcntintrin.h @@ -21,13 +21,13 @@ *===-----------------------------------------------------------------------=== */ -#ifndef _POPCNTINTRIN_H -#define _POPCNTINTRIN_H +#ifndef __POPCNTINTRIN_H +#define __POPCNTINTRIN_H /* Define the default attributes for the functions in this file. */ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("popcnt"))) -/// \brief Counts the number of bits in the source operand having a value of 1. +/// Counts the number of bits in the source operand having a value of 1. /// /// \headerfile <x86intrin.h> /// @@ -43,7 +43,7 @@ _mm_popcnt_u32(unsigned int __A) return __builtin_popcount(__A); } -/// \brief Counts the number of bits in the source operand having a value of 1. +/// Counts the number of bits in the source operand having a value of 1. /// /// \headerfile <x86intrin.h> /// @@ -60,7 +60,7 @@ _popcnt32(int __A) } #ifdef __x86_64__ -/// \brief Counts the number of bits in the source operand having a value of 1. +/// Counts the number of bits in the source operand having a value of 1. /// /// \headerfile <x86intrin.h> /// @@ -76,7 +76,7 @@ _mm_popcnt_u64(unsigned long long __A) return __builtin_popcountll(__A); } -/// \brief Counts the number of bits in the source operand having a value of 1. +/// Counts the number of bits in the source operand having a value of 1. /// /// \headerfile <x86intrin.h> /// @@ -95,4 +95,4 @@ _popcnt64(long long __A) #undef __DEFAULT_FN_ATTRS -#endif /* _POPCNTINTRIN_H */ +#endif /* __POPCNTINTRIN_H */ diff --git a/c_headers/prfchwintrin.h b/c_headers/prfchwintrin.h index b52f31da27..70851396f4 100644 --- a/c_headers/prfchwintrin.h +++ b/c_headers/prfchwintrin.h @@ -28,8 +28,7 @@ #ifndef __PRFCHWINTRIN_H #define __PRFCHWINTRIN_H -#if defined(__PRFCHW__) || defined(__3dNOW__) -/// \brief Loads a memory sequence containing the specified memory address into +/// Loads a memory sequence containing the specified memory address into /// all data cache levels. The cache-coherency state is set to exclusive. /// Data can be read from and written to the cache line without additional /// delay. @@ -46,7 +45,7 @@ _m_prefetch(void *__P) __builtin_prefetch (__P, 0, 3 /* _MM_HINT_T0 */); } -/// \brief Loads a memory sequence containing the specified memory address into +/// Loads a memory sequence containing the specified memory address into /// the L1 data cache and sets the cache-coherency to modified. This /// provides a hint to the processor that the cache line will be modified. 
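/* A usage sketch for the popcount wrappers in the popcntintrin.h hunks
 * above; assumes an x86-64 build with -mpopcnt. The helper name
 * count_set_bits is illustrative. */
#include <popcntintrin.h>
#include <stddef.h>
#include <stdint.h>

static uint64_t count_set_bits(const uint64_t *words, size_t n)
{
    uint64_t total = 0;
    for (size_t i = 0; i < n; ++i)
        total += (uint64_t)_mm_popcnt_u64(words[i]); /* bits set per word */
    return total;
}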
/// It is intended for use when the cache line will be written to shortly @@ -66,6 +65,5 @@ _m_prefetchw(void *__P) { __builtin_prefetch (__P, 1, 3 /* _MM_HINT_T0 */); } -#endif #endif /* __PRFCHWINTRIN_H */ diff --git a/c_headers/ptwriteintrin.h b/c_headers/ptwriteintrin.h new file mode 100644 index 0000000000..1bb1df0a2e --- /dev/null +++ b/c_headers/ptwriteintrin.h @@ -0,0 +1,51 @@ +/*===------------ ptwriteintrin.h - PTWRITE intrinsic --------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ + +#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H +#error "Never use <ptwriteintrin.h> directly; include <x86intrin.h> instead." +#endif + +#ifndef __PTWRITEINTRIN_H +#define __PTWRITEINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("ptwrite"))) + +static __inline__ void __DEFAULT_FN_ATTRS +_ptwrite32(unsigned int __value) { + __builtin_ia32_ptwrite32(__value); +} + +#ifdef __x86_64__ + +static __inline__ void __DEFAULT_FN_ATTRS +_ptwrite64(unsigned long long __value) { + __builtin_ia32_ptwrite64(__value); +} + +#endif /* __x86_64__ */ + +#undef __DEFAULT_FN_ATTRS + +#endif /* __PTWRITEINTRIN_H */ diff --git a/c_headers/rdseedintrin.h b/c_headers/rdseedintrin.h index 421f4ea487..419466932c 100644 --- a/c_headers/rdseedintrin.h +++ b/c_headers/rdseedintrin.h @@ -21,7 +21,7 @@ *===-----------------------------------------------------------------------=== */ -#ifndef __X86INTRIN_H +#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H #error "Never use <rdseedintrin.h> directly; include <x86intrin.h> instead." 
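/* An illustrative sketch for the new _ptwrite32 wrapper in ptwriteintrin.h
 * above; assumes a -mptwrite build and that the header is reached through
 * <x86intrin.h>, as its include guard requires. PTWRITE only records the
 * value into the Intel Processor Trace stream, so the call has no visible
 * effect unless tracing with PTW packets is enabled. */
#include <x86intrin.h>

static void trace_marker(unsigned int marker)
{
    _ptwrite32(marker); /* emit 'marker' as a PTW packet */
}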
#endif diff --git a/c_headers/sgxintrin.h b/c_headers/sgxintrin.h new file mode 100644 index 0000000000..20aee76610 --- /dev/null +++ b/c_headers/sgxintrin.h @@ -0,0 +1,70 @@ +/*===---- sgxintrin.h - X86 SGX intrinsics configuration -------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ + +#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H +#error "Never use <sgxintrin.h> directly; include <x86intrin.h> instead." +#endif + +#ifndef __SGXINTRIN_H +#define __SGXINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("sgx"))) + +static __inline unsigned int __DEFAULT_FN_ATTRS +_enclu_u32(unsigned int __leaf, __SIZE_TYPE__ __d[]) +{ + unsigned int __result; + __asm__ ("enclu" + : "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2]) + : "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2]) + : "cc"); + return __result; +} + +static __inline unsigned int __DEFAULT_FN_ATTRS +_encls_u32(unsigned int __leaf, __SIZE_TYPE__ __d[]) +{ + unsigned int __result; + __asm__ ("encls" + : "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2]) + : "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2]) + : "cc"); + return __result; +} + +static __inline unsigned int __DEFAULT_FN_ATTRS +_enclv_u32(unsigned int __leaf, __SIZE_TYPE__ __d[]) +{ + unsigned int __result; + __asm__ ("enclv" + : "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2]) + : "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2]) + : "cc"); + return __result; +} + +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/c_headers/shaintrin.h b/c_headers/shaintrin.h index 9b5d218008..3df4718ced 100644 --- a/c_headers/shaintrin.h +++ b/c_headers/shaintrin.h @@ -29,10 +29,10 @@ #define __SHAINTRIN_H /* Define the default attributes for the functions in this file. 
*/ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sha"))) +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sha"), __min_vector_width__(128))) -#define _mm_sha1rnds4_epu32(V1, V2, M) __extension__ ({ \ - __builtin_ia32_sha1rnds4((__v4si)(__m128i)(V1), (__v4si)(__m128i)(V2), (M)); }) +#define _mm_sha1rnds4_epu32(V1, V2, M) \ + __builtin_ia32_sha1rnds4((__v4si)(__m128i)(V1), (__v4si)(__m128i)(V2), (M)) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sha1nexte_epu32(__m128i __X, __m128i __Y) diff --git a/c_headers/smmintrin.h b/c_headers/smmintrin.h index e02775cea3..4806b3e4e1 100644 --- a/c_headers/smmintrin.h +++ b/c_headers/smmintrin.h @@ -21,13 +21,13 @@ *===-----------------------------------------------------------------------=== */ -#ifndef _SMMINTRIN_H -#define _SMMINTRIN_H +#ifndef __SMMINTRIN_H +#define __SMMINTRIN_H #include <tmmintrin.h> /* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"))) +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"), __min_vector_width__(128))) /* SSE4 Rounding macros. */ #define _MM_FROUND_TO_NEAREST_INT 0x00 @@ -46,7 +46,7 @@ #define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION) #define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION) -/// \brief Rounds up each element of the 128-bit vector of [4 x float] to an +/// Rounds up each element of the 128-bit vector of [4 x float] to an /// integer and returns the rounded values in a 128-bit vector of /// [4 x float]. /// @@ -63,7 +63,7 @@ /// \returns A 128-bit vector of [4 x float] containing the rounded values. #define _mm_ceil_ps(X) _mm_round_ps((X), _MM_FROUND_CEIL) -/// \brief Rounds up each element of the 128-bit vector of [2 x double] to an +/// Rounds up each element of the 128-bit vector of [2 x double] to an /// integer and returns the rounded values in a 128-bit vector of /// [2 x double]. /// @@ -80,7 +80,7 @@ /// \returns A 128-bit vector of [2 x double] containing the rounded values. #define _mm_ceil_pd(X) _mm_round_pd((X), _MM_FROUND_CEIL) -/// \brief Copies three upper elements of the first 128-bit vector operand to +/// Copies three upper elements of the first 128-bit vector operand to /// the corresponding three upper elements of the 128-bit result vector of /// [4 x float]. Rounds up the lowest element of the second 128-bit vector /// operand to an integer and copies it to the lowest element of the 128-bit @@ -105,7 +105,7 @@ /// values. #define _mm_ceil_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_CEIL) -/// \brief Copies the upper element of the first 128-bit vector operand to the +/// Copies the upper element of the first 128-bit vector operand to the /// corresponding upper element of the 128-bit result vector of [2 x double]. /// Rounds up the lower element of the second 128-bit vector operand to an /// integer and copies it to the lower element of the 128-bit result vector @@ -130,7 +130,7 @@ /// values. #define _mm_ceil_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_CEIL) -/// \brief Rounds down each element of the 128-bit vector of [4 x float] to an +/// Rounds down each element of the 128-bit vector of [4 x float] to an /// an integer and returns the rounded values in a 128-bit vector of /// [4 x float]. /// @@ -147,7 +147,7 @@ /// \returns A 128-bit vector of [4 x float] containing the rounded values. 
#define _mm_floor_ps(X) _mm_round_ps((X), _MM_FROUND_FLOOR) -/// \brief Rounds down each element of the 128-bit vector of [2 x double] to an +/// Rounds down each element of the 128-bit vector of [2 x double] to an /// integer and returns the rounded values in a 128-bit vector of /// [2 x double]. /// @@ -164,7 +164,7 @@ /// \returns A 128-bit vector of [2 x double] containing the rounded values. #define _mm_floor_pd(X) _mm_round_pd((X), _MM_FROUND_FLOOR) -/// \brief Copies three upper elements of the first 128-bit vector operand to +/// Copies three upper elements of the first 128-bit vector operand to /// the corresponding three upper elements of the 128-bit result vector of /// [4 x float]. Rounds down the lowest element of the second 128-bit vector /// operand to an integer and copies it to the lowest element of the 128-bit @@ -189,7 +189,7 @@ /// values. #define _mm_floor_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_FLOOR) -/// \brief Copies the upper element of the first 128-bit vector operand to the +/// Copies the upper element of the first 128-bit vector operand to the /// corresponding upper element of the 128-bit result vector of [2 x double]. /// Rounds down the lower element of the second 128-bit vector operand to an /// integer and copies it to the lower element of the 128-bit result vector @@ -214,7 +214,7 @@ /// values. #define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR) -/// \brief Rounds each element of the 128-bit vector of [4 x float] to an +/// Rounds each element of the 128-bit vector of [4 x float] to an /// integer value according to the rounding control specified by the second /// argument and returns the rounded values in a 128-bit vector of /// [4 x float]. @@ -244,10 +244,10 @@ /// 10: Upward (toward positive infinity) \n /// 11: Truncated /// \returns A 128-bit vector of [4 x float] containing the rounded values. -#define _mm_round_ps(X, M) __extension__ ({ \ - (__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)); }) +#define _mm_round_ps(X, M) \ + (__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)) -/// \brief Copies three upper elements of the first 128-bit vector operand to +/// Copies three upper elements of the first 128-bit vector operand to /// the corresponding three upper elements of the 128-bit result vector of /// [4 x float]. Rounds the lowest element of the second 128-bit vector /// operand to an integer value according to the rounding control specified @@ -285,11 +285,11 @@ /// 11: Truncated /// \returns A 128-bit vector of [4 x float] containing the copied and rounded /// values. -#define _mm_round_ss(X, Y, M) __extension__ ({ \ +#define _mm_round_ss(X, Y, M) \ (__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (M)); }) + (__v4sf)(__m128)(Y), (M)) -/// \brief Rounds each element of the 128-bit vector of [2 x double] to an +/// Rounds each element of the 128-bit vector of [2 x double] to an /// integer value according to the rounding control specified by the second /// argument and returns the rounded values in a 128-bit vector of /// [2 x double]. @@ -319,10 +319,10 @@ /// 10: Upward (toward positive infinity) \n /// 11: Truncated /// \returns A 128-bit vector of [2 x double] containing the rounded values. 
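/* A usage sketch for the SSE4.1 rounding macros above (_mm_floor_ps,
 * _mm_ceil_ps, _mm_round_ps); assumes a -msse4.1 build. The rounding-mode
 * immediate must be a compile-time constant. */
#include <smmintrin.h>

static __m128 round_three_ways(__m128 v, __m128 *down, __m128 *up)
{
    *down = _mm_floor_ps(v); /* toward negative infinity */
    *up   = _mm_ceil_ps(v);  /* toward positive infinity */
    /* To nearest integer, without raising precision exceptions. */
    return _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}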
-#define _mm_round_pd(X, M) __extension__ ({ \ - (__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)); }) +#define _mm_round_pd(X, M) \ + (__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)) -/// \brief Copies the upper element of the first 128-bit vector operand to the +/// Copies the upper element of the first 128-bit vector operand to the /// corresponding upper element of the 128-bit result vector of [2 x double]. /// Rounds the lower element of the second 128-bit vector operand to an /// integer value according to the rounding control specified by the third @@ -360,12 +360,12 @@ /// 11: Truncated /// \returns A 128-bit vector of [2 x double] containing the copied and rounded /// values. -#define _mm_round_sd(X, Y, M) __extension__ ({ \ +#define _mm_round_sd(X, Y, M) \ (__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), (M)); }) + (__v2df)(__m128d)(Y), (M)) /* SSE4 Packed Blending Intrinsics. */ -/// \brief Returns a 128-bit vector of [2 x double] where the values are +/// Returns a 128-bit vector of [2 x double] where the values are /// selected from either the first or second operand as specified by the /// third operand, the control mask. /// @@ -389,13 +389,11 @@ /// When a mask bit is 1, the corresponding 64-bit element in operand \a V2 /// is copied to the same position in the result. /// \returns A 128-bit vector of [2 x double] containing the copied values. -#define _mm_blend_pd(V1, V2, M) __extension__ ({ \ - (__m128d)__builtin_shufflevector((__v2df)(__m128d)(V1), \ - (__v2df)(__m128d)(V2), \ - (((M) & 0x01) ? 2 : 0), \ - (((M) & 0x02) ? 3 : 1)); }) +#define _mm_blend_pd(V1, V2, M) \ + (__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(V1), \ + (__v2df)(__m128d)(V2), (int)(M)) -/// \brief Returns a 128-bit vector of [4 x float] where the values are selected +/// Returns a 128-bit vector of [4 x float] where the values are selected /// from either the first or second operand as specified by the third /// operand, the control mask. /// @@ -419,14 +417,11 @@ /// When a mask bit is 1, the corresponding 32-bit element in operand \a V2 /// is copied to the same position in the result. /// \returns A 128-bit vector of [4 x float] containing the copied values. -#define _mm_blend_ps(V1, V2, M) __extension__ ({ \ - (__m128)__builtin_shufflevector((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2), \ - (((M) & 0x01) ? 4 : 0), \ - (((M) & 0x02) ? 5 : 1), \ - (((M) & 0x04) ? 6 : 2), \ - (((M) & 0x08) ? 7 : 3)); }) - -/// \brief Returns a 128-bit vector of [2 x double] where the values are +#define _mm_blend_ps(V1, V2, M) \ + (__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(V1), \ + (__v4sf)(__m128)(V2), (int)(M)) + +/// Returns a 128-bit vector of [2 x double] where the values are /// selected from either the first or second operand as specified by the /// third operand, the control mask. /// @@ -453,7 +448,7 @@ _mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M) (__v2df)__M); } -/// \brief Returns a 128-bit vector of [4 x float] where the values are +/// Returns a 128-bit vector of [4 x float] where the values are /// selected from either the first or second operand as specified by the /// third operand, the control mask. 
/// @@ -480,7 +475,7 @@ _mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M) (__v4sf)__M); } -/// \brief Returns a 128-bit vector of [16 x i8] where the values are selected +/// Returns a 128-bit vector of [16 x i8] where the values are selected /// from either of the first or second operand as specified by the third /// operand, the control mask. /// @@ -493,7 +488,7 @@ _mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M) /// \param __V2 /// A 128-bit vector of [16 x i8]. /// \param __M -/// A 128-bit vector operand, with mask bits 127, 119, 111 ... 7 specifying +/// A 128-bit vector operand, with mask bits 127, 119, 111...7 specifying /// how the values are to be copied. The position of the mask bit corresponds /// to the most significant bit of a copied value. When a mask bit is 0, the /// corresponding 8-bit element in operand \a __V1 is copied to the same @@ -507,7 +502,7 @@ _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M) (__v16qi)__M); } -/// \brief Returns a 128-bit vector of [8 x i16] where the values are selected +/// Returns a 128-bit vector of [8 x i16] where the values are selected /// from either of the first or second operand as specified by the third /// operand, the control mask. /// @@ -531,20 +526,12 @@ _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M) /// When a mask bit is 1, the corresponding 16-bit element in operand \a V2 /// is copied to the same position in the result. /// \returns A 128-bit vector of [8 x i16] containing the copied values. -#define _mm_blend_epi16(V1, V2, M) __extension__ ({ \ - (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(V1), \ - (__v8hi)(__m128i)(V2), \ - (((M) & 0x01) ? 8 : 0), \ - (((M) & 0x02) ? 9 : 1), \ - (((M) & 0x04) ? 10 : 2), \ - (((M) & 0x08) ? 11 : 3), \ - (((M) & 0x10) ? 12 : 4), \ - (((M) & 0x20) ? 13 : 5), \ - (((M) & 0x40) ? 14 : 6), \ - (((M) & 0x80) ? 15 : 7)); }) +#define _mm_blend_epi16(V1, V2, M) \ + (__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(V1), \ + (__v8hi)(__m128i)(V2), (int)(M)) /* SSE4 Dword Multiply Instructions. */ -/// \brief Multiples corresponding elements of two 128-bit vectors of [4 x i32] +/// Multiples corresponding elements of two 128-bit vectors of [4 x i32] /// and returns the lower 32 bits of the each product in a 128-bit vector of /// [4 x i32]. /// @@ -563,7 +550,7 @@ _mm_mullo_epi32 (__m128i __V1, __m128i __V2) return (__m128i) ((__v4su)__V1 * (__v4su)__V2); } -/// \brief Multiplies corresponding even-indexed elements of two 128-bit +/// Multiplies corresponding even-indexed elements of two 128-bit /// vectors of [4 x i32] and returns a 128-bit vector of [2 x i64] /// containing the products. /// @@ -584,7 +571,7 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2) } /* SSE4 Floating Point Dot Product Instructions. */ -/// \brief Computes the dot product of the two 128-bit vectors of [4 x float] +/// Computes the dot product of the two 128-bit vectors of [4 x float] /// and returns it in the elements of the 128-bit result vector of /// [4 x float]. /// @@ -616,11 +603,11 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2) /// each [4 x float] subvector. If a bit is set, the dot product is returned /// in the corresponding element; otherwise that element is set to zero. /// \returns A 128-bit vector of [4 x float] containing the dot product. 
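/* An illustrative sketch for the SSE4.1 blend intrinsics above; assumes a
 * -msse4.1 build. _mm_blend_ps takes a 4-bit immediate (bit i set selects
 * element i from the second operand), while _mm_blendv_ps selects per
 * element from the sign bit of a runtime mask. */
#include <smmintrin.h>

static __m128 mix_low_from_a_high_from_b(__m128 a, __m128 b)
{
    return _mm_blend_ps(a, b, 0xC); /* 0b1100: elements 0,1 from a; 2,3 from b */
}

static __m128 select_by_mask(__m128 a, __m128 b, __m128 mask)
{
    return _mm_blendv_ps(a, b, mask); /* mask sign bit set -> take b */
}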
-#define _mm_dp_ps(X, Y, M) __extension__ ({ \ +#define _mm_dp_ps(X, Y, M) \ (__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (M)); }) + (__v4sf)(__m128)(Y), (M)) -/// \brief Computes the dot product of the two 128-bit vectors of [2 x double] +/// Computes the dot product of the two 128-bit vectors of [2 x double] /// and returns it in the elements of the 128-bit result vector of /// [2 x double]. /// @@ -651,12 +638,12 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2) /// to the lowest element and bit [1] corresponding to the highest element of /// each [2 x double] vector. If a bit is set, the dot product is returned in /// the corresponding element; otherwise that element is set to zero. -#define _mm_dp_pd(X, Y, M) __extension__ ({\ +#define _mm_dp_pd(X, Y, M) \ (__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), (M)); }) + (__v2df)(__m128d)(Y), (M)) /* SSE4 Streaming Load Hint Instruction. */ -/// \brief Loads integer values from a 128-bit aligned memory location to a +/// Loads integer values from a 128-bit aligned memory location to a /// 128-bit integer vector. /// /// \headerfile <x86intrin.h> @@ -675,7 +662,7 @@ _mm_stream_load_si128 (__m128i const *__V) } /* SSE4 Packed Integer Min/Max Instructions. */ -/// \brief Compares the corresponding elements of two 128-bit vectors of +/// Compares the corresponding elements of two 128-bit vectors of /// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser /// of the two values. /// @@ -694,7 +681,7 @@ _mm_min_epi8 (__m128i __V1, __m128i __V2) return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2); } -/// \brief Compares the corresponding elements of two 128-bit vectors of +/// Compares the corresponding elements of two 128-bit vectors of /// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the /// greater value of the two. /// @@ -713,7 +700,7 @@ _mm_max_epi8 (__m128i __V1, __m128i __V2) return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2); } -/// \brief Compares the corresponding elements of two 128-bit vectors of +/// Compares the corresponding elements of two 128-bit vectors of /// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser /// value of the two. /// @@ -732,7 +719,7 @@ _mm_min_epu16 (__m128i __V1, __m128i __V2) return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2); } -/// \brief Compares the corresponding elements of two 128-bit vectors of +/// Compares the corresponding elements of two 128-bit vectors of /// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the /// greater value of the two. /// @@ -751,7 +738,7 @@ _mm_max_epu16 (__m128i __V1, __m128i __V2) return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2); } -/// \brief Compares the corresponding elements of two 128-bit vectors of +/// Compares the corresponding elements of two 128-bit vectors of /// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser /// value of the two. /// @@ -770,7 +757,7 @@ _mm_min_epi32 (__m128i __V1, __m128i __V2) return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2); } -/// \brief Compares the corresponding elements of two 128-bit vectors of +/// Compares the corresponding elements of two 128-bit vectors of /// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the /// greater value of the two. 
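/* A usage sketch for _mm_dp_ps above; assumes a -msse4.1 build. The high
 * nibble of the immediate chooses which element products enter the sum,
 * the low nibble chooses which result elements receive it. */
#include <smmintrin.h>

static float dot4(__m128 a, __m128 b)
{
    /* 0xF1: multiply all four pairs, store the sum in element 0 only. */
    return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xF1));
}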
/// @@ -789,7 +776,7 @@ _mm_max_epi32 (__m128i __V1, __m128i __V2) return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2); } -/// \brief Compares the corresponding elements of two 128-bit vectors of +/// Compares the corresponding elements of two 128-bit vectors of /// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser /// value of the two. /// @@ -808,7 +795,7 @@ _mm_min_epu32 (__m128i __V1, __m128i __V2) return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2); } -/// \brief Compares the corresponding elements of two 128-bit vectors of +/// Compares the corresponding elements of two 128-bit vectors of /// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the /// greater value of the two. /// @@ -828,7 +815,7 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2) } /* SSE4 Insertion and Extraction from XMM Register Instructions. */ -/// \brief Takes the first argument \a X and inserts an element from the second +/// Takes the first argument \a X and inserts an element from the second /// argument \a Y as selected by the third argument \a N. That result then /// has elements zeroed out also as selected by the third argument \a N. The /// resulting 128-bit vector of [4 x float] is then returned. @@ -870,7 +857,7 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2) /// single-precision floating point elements from the operands. #define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N)) -/// \brief Extracts a 32-bit integer from a 128-bit vector of [4 x float] and +/// Extracts a 32-bit integer from a 128-bit vector of [4 x float] and /// returns it, using the immediate value parameter \a N as a selector. /// /// \headerfile <x86intrin.h> @@ -893,15 +880,14 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2) /// 11: Bits [127:96] of parameter \a X are returned. /// \returns A 32-bit integer containing the extracted 32 bits of float data. #define _mm_extract_ps(X, N) (__extension__ \ - ({ union { int __i; float __f; } __t; \ - __v4sf __a = (__v4sf)(__m128)(X); \ - __t.__f = __a[(N) & 3]; \ - __t.__i;})) + ({ union { int __i; float __f; } __t; \ + __t.__f = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); \ + __t.__i;})) /* Miscellaneous insert and extract macros. */ /* Extract a single-precision float from X at index N into D. */ -#define _MM_EXTRACT_FLOAT(D, X, N) (__extension__ ({ __v4sf __a = (__v4sf)(X); \ - (D) = __a[N]; })) +#define _MM_EXTRACT_FLOAT(D, X, N) \ + { (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); } /* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create an index suitable for _mm_insert_ps. */ @@ -912,7 +898,7 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2) _MM_MK_INSERTPS_NDX((N), 0, 0x0e)) /* Insert int into packed integer array at index. */ -/// \brief Constructs a 128-bit vector of [16 x i8] by first making a copy of +/// Constructs a 128-bit vector of [16 x i8] by first making a copy of /// the 128-bit integer vector parameter, and then inserting the lower 8 bits /// of an integer parameter \a I into an offset specified by the immediate /// value parameter \a N. @@ -952,12 +938,11 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2) /// 1110: Bits [119:112] of the result are used for insertion. \n /// 1111: Bits [127:120] of the result are used for insertion. /// \returns A 128-bit integer vector containing the constructed values. 
-#define _mm_insert_epi8(X, I, N) (__extension__ \ - ({ __v16qi __a = (__v16qi)(__m128i)(X); \ - __a[(N) & 15] = (I); \ - (__m128i)__a;})) +#define _mm_insert_epi8(X, I, N) \ + (__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), \ + (int)(I), (int)(N)) -/// \brief Constructs a 128-bit vector of [4 x i32] by first making a copy of +/// Constructs a 128-bit vector of [4 x i32] by first making a copy of /// the 128-bit integer vector parameter, and then inserting the 32-bit /// integer parameter \a I at the offset specified by the immediate value /// parameter \a N. @@ -985,13 +970,12 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2) /// 10: Bits [95:64] of the result are used for insertion. \n /// 11: Bits [127:96] of the result are used for insertion. /// \returns A 128-bit integer vector containing the constructed values. -#define _mm_insert_epi32(X, I, N) (__extension__ \ - ({ __v4si __a = (__v4si)(__m128i)(X); \ - __a[(N) & 3] = (I); \ - (__m128i)__a;})) +#define _mm_insert_epi32(X, I, N) \ + (__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), \ + (int)(I), (int)(N)) #ifdef __x86_64__ -/// \brief Constructs a 128-bit vector of [2 x i64] by first making a copy of +/// Constructs a 128-bit vector of [2 x i64] by first making a copy of /// the 128-bit integer vector parameter, and then inserting the 64-bit /// integer parameter \a I, using the immediate value parameter \a N as an /// insertion location selector. @@ -1017,16 +1001,15 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2) /// 0: Bits [63:0] of the result are used for insertion. \n /// 1: Bits [127:64] of the result are used for insertion. \n /// \returns A 128-bit integer vector containing the constructed values. -#define _mm_insert_epi64(X, I, N) (__extension__ \ - ({ __v2di __a = (__v2di)(__m128i)(X); \ - __a[(N) & 1] = (I); \ - (__m128i)__a;})) +#define _mm_insert_epi64(X, I, N) \ + (__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), \ + (long long)(I), (int)(N)) #endif /* __x86_64__ */ /* Extract int from packed integer array at index. This returns the element * as a zero extended value, so it is unsigned. */ -/// \brief Extracts an 8-bit element from the 128-bit integer vector of +/// Extracts an 8-bit element from the 128-bit integer vector of /// [16 x i8], using the immediate value parameter \a N as a selector. /// /// \headerfile <x86intrin.h> @@ -1061,11 +1044,11 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2) /// \returns An unsigned integer, whose lower 8 bits are selected from the /// 128-bit integer vector parameter and the remaining bits are assigned /// zeros. -#define _mm_extract_epi8(X, N) (__extension__ \ - ({ __v16qi __a = (__v16qi)(__m128i)(X); \ - (int)(unsigned char) __a[(N) & 15];})) +#define _mm_extract_epi8(X, N) \ + (int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \ + (int)(N)) -/// \brief Extracts a 32-bit element from the 128-bit integer vector of +/// Extracts a 32-bit element from the 128-bit integer vector of /// [4 x i32], using the immediate value parameter \a N as a selector. /// /// \headerfile <x86intrin.h> @@ -1087,12 +1070,11 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2) /// 11: Bits [127:96] of the parameter \a X are exracted. /// \returns An integer, whose lower 32 bits are selected from the 128-bit /// integer vector parameter and the remaining bits are assigned zeros. 
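/* A sketch for the SSE4.1 insert/extract intrinsics above; assumes a
 * -msse4.1 build. The lane selectors must be integer constants. */
#include <smmintrin.h>

static int replace_then_read(__m128i v)
{
    __m128i w = _mm_insert_epi32(v, 42, 2); /* overwrite 32-bit lane 2 */
    return _mm_extract_epi32(w, 3);         /* read 32-bit lane 3 */
}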
-#define _mm_extract_epi32(X, N) (__extension__ \ - ({ __v4si __a = (__v4si)(__m128i)(X); \ - (int)__a[(N) & 3];})) +#define _mm_extract_epi32(X, N) \ + (int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N)) #ifdef __x86_64__ -/// \brief Extracts a 64-bit element from the 128-bit integer vector of +/// Extracts a 64-bit element from the 128-bit integer vector of /// [2 x i64], using the immediate value parameter \a N as a selector. /// /// \headerfile <x86intrin.h> @@ -1111,13 +1093,12 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2) /// 0: Bits [63:0] are returned. \n /// 1: Bits [127:64] are returned. \n /// \returns A 64-bit integer. -#define _mm_extract_epi64(X, N) (__extension__ \ - ({ __v2di __a = (__v2di)(__m128i)(X); \ - (long long)__a[(N) & 1];})) +#define _mm_extract_epi64(X, N) \ + (long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N)) #endif /* __x86_64 */ /* SSE4 128-bit Packed Integer Comparisons. */ -/// \brief Tests whether the specified bits in a 128-bit integer vector are all +/// Tests whether the specified bits in a 128-bit integer vector are all /// zeros. /// /// \headerfile <x86intrin.h> @@ -1135,7 +1116,7 @@ _mm_testz_si128(__m128i __M, __m128i __V) return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V); } -/// \brief Tests whether the specified bits in a 128-bit integer vector are all +/// Tests whether the specified bits in a 128-bit integer vector are all /// ones. /// /// \headerfile <x86intrin.h> @@ -1153,7 +1134,7 @@ _mm_testc_si128(__m128i __M, __m128i __V) return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V); } -/// \brief Tests whether the specified bits in a 128-bit integer vector are +/// Tests whether the specified bits in a 128-bit integer vector are /// neither all zeros nor all ones. /// /// \headerfile <x86intrin.h> @@ -1172,7 +1153,7 @@ _mm_testnzc_si128(__m128i __M, __m128i __V) return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V); } -/// \brief Tests whether the specified bits in a 128-bit integer vector are all +/// Tests whether the specified bits in a 128-bit integer vector are all /// ones. /// /// \headerfile <x86intrin.h> @@ -1189,7 +1170,7 @@ _mm_testnzc_si128(__m128i __M, __m128i __V) /// otherwise. #define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V))) -/// \brief Tests whether the specified bits in a 128-bit integer vector are +/// Tests whether the specified bits in a 128-bit integer vector are /// neither all zeros nor all ones. /// /// \headerfile <x86intrin.h> @@ -1208,7 +1189,7 @@ _mm_testnzc_si128(__m128i __M, __m128i __V) /// FALSE otherwise. #define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V)) -/// \brief Tests whether the specified bits in a 128-bit integer vector are all +/// Tests whether the specified bits in a 128-bit integer vector are all /// zeros. /// /// \headerfile <x86intrin.h> @@ -1227,7 +1208,7 @@ _mm_testnzc_si128(__m128i __M, __m128i __V) #define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V)) /* SSE4 64-bit Packed Integer Comparisons. */ -/// \brief Compares each of the corresponding 64-bit values of the 128-bit +/// Compares each of the corresponding 64-bit values of the 128-bit /// integer vectors for equality. /// /// \headerfile <x86intrin.h> @@ -1246,7 +1227,7 @@ _mm_cmpeq_epi64(__m128i __V1, __m128i __V2) } /* SSE4 Packed Integer Sign-Extension. 
*/ -/// \brief Sign-extends each of the lower eight 8-bit integer elements of a +/// Sign-extends each of the lower eight 8-bit integer elements of a /// 128-bit vector of [16 x i8] to 16-bit values and returns them in a /// 128-bit vector of [8 x i16]. The upper eight elements of the input vector /// are unused. @@ -1267,7 +1248,7 @@ _mm_cvtepi8_epi16(__m128i __V) return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi); } -/// \brief Sign-extends each of the lower four 8-bit integer elements of a +/// Sign-extends each of the lower four 8-bit integer elements of a /// 128-bit vector of [16 x i8] to 32-bit values and returns them in a /// 128-bit vector of [4 x i32]. The upper twelve elements of the input /// vector are unused. @@ -1277,8 +1258,8 @@ _mm_cvtepi8_epi16(__m128i __V) /// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction. /// /// \param __V -/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are sign- -/// extended to 32-bit values. +/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are +/// sign-extended to 32-bit values. /// \returns A 128-bit vector of [4 x i32] containing the sign-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi32(__m128i __V) @@ -1288,7 +1269,7 @@ _mm_cvtepi8_epi32(__m128i __V) return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si); } -/// \brief Sign-extends each of the lower two 8-bit integer elements of a +/// Sign-extends each of the lower two 8-bit integer elements of a /// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in /// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input /// vector are unused. @@ -1298,8 +1279,8 @@ _mm_cvtepi8_epi32(__m128i __V) /// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction. /// /// \param __V -/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are sign- -/// extended to 64-bit values. +/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are +/// sign-extended to 64-bit values. /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi64(__m128i __V) @@ -1309,7 +1290,7 @@ _mm_cvtepi8_epi64(__m128i __V) return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di); } -/// \brief Sign-extends each of the lower four 16-bit integer elements of a +/// Sign-extends each of the lower four 16-bit integer elements of a /// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in /// a 128-bit vector of [4 x i32]. The upper four elements of the input /// vector are unused. @@ -1319,8 +1300,8 @@ _mm_cvtepi8_epi64(__m128i __V) /// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction. /// /// \param __V -/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are sign- -/// extended to 32-bit values. +/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are +/// sign-extended to 32-bit values. /// \returns A 128-bit vector of [4 x i32] containing the sign-extended values. 
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi32(__m128i __V) @@ -1328,7 +1309,7 @@ _mm_cvtepi16_epi32(__m128i __V) return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si); } -/// \brief Sign-extends each of the lower two 16-bit integer elements of a +/// Sign-extends each of the lower two 16-bit integer elements of a /// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in /// a 128-bit vector of [2 x i64]. The upper six elements of the input /// vector are unused. @@ -1338,8 +1319,8 @@ _mm_cvtepi16_epi32(__m128i __V) /// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction. /// /// \param __V -/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are sign- -/// extended to 64-bit values. +/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are +/// sign-extended to 64-bit values. /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi64(__m128i __V) @@ -1347,7 +1328,7 @@ _mm_cvtepi16_epi64(__m128i __V) return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di); } -/// \brief Sign-extends each of the lower two 32-bit integer elements of a +/// Sign-extends each of the lower two 32-bit integer elements of a /// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in /// a 128-bit vector of [2 x i64]. The upper two elements of the input vector /// are unused. @@ -1357,8 +1338,8 @@ _mm_cvtepi16_epi64(__m128i __V) /// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction. /// /// \param __V -/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are sign- -/// extended to 64-bit values. +/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are +/// sign-extended to 64-bit values. /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi32_epi64(__m128i __V) @@ -1367,7 +1348,7 @@ _mm_cvtepi32_epi64(__m128i __V) } /* SSE4 Packed Integer Zero-Extension. */ -/// \brief Zero-extends each of the lower eight 8-bit integer elements of a +/// Zero-extends each of the lower eight 8-bit integer elements of a /// 128-bit vector of [16 x i8] to 16-bit values and returns them in a /// 128-bit vector of [8 x i16]. The upper eight elements of the input vector /// are unused. @@ -1377,8 +1358,8 @@ _mm_cvtepi32_epi64(__m128i __V) /// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction. /// /// \param __V -/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are zero- -/// extended to 16-bit values. +/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are +/// zero-extended to 16-bit values. /// \returns A 128-bit vector of [8 x i16] containing the zero-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi16(__m128i __V) @@ -1386,7 +1367,7 @@ _mm_cvtepu8_epi16(__m128i __V) return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi); } -/// \brief Zero-extends each of the lower four 8-bit integer elements of a +/// Zero-extends each of the lower four 8-bit integer elements of a /// 128-bit vector of [16 x i8] to 32-bit values and returns them in a /// 128-bit vector of [4 x i32]. The upper twelve elements of the input /// vector are unused. 
@@ -1396,8 +1377,8 @@ _mm_cvtepu8_epi16(__m128i __V) /// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction. /// /// \param __V -/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are zero- -/// extended to 32-bit values. +/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are +/// zero-extended to 32-bit values. /// \returns A 128-bit vector of [4 x i32] containing the zero-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi32(__m128i __V) @@ -1405,7 +1386,7 @@ _mm_cvtepu8_epi32(__m128i __V) return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si); } -/// \brief Zero-extends each of the lower two 8-bit integer elements of a +/// Zero-extends each of the lower two 8-bit integer elements of a /// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in /// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input /// vector are unused. @@ -1415,8 +1396,8 @@ _mm_cvtepu8_epi32(__m128i __V) /// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction. /// /// \param __V -/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are zero- -/// extended to 64-bit values. +/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are +/// zero-extended to 64-bit values. /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi64(__m128i __V) @@ -1424,7 +1405,7 @@ _mm_cvtepu8_epi64(__m128i __V) return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di); } -/// \brief Zero-extends each of the lower four 16-bit integer elements of a +/// Zero-extends each of the lower four 16-bit integer elements of a /// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in /// a 128-bit vector of [4 x i32]. The upper four elements of the input /// vector are unused. @@ -1434,8 +1415,8 @@ _mm_cvtepu8_epi64(__m128i __V) /// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction. /// /// \param __V -/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are zero- -/// extended to 32-bit values. +/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are +/// zero-extended to 32-bit values. /// \returns A 128-bit vector of [4 x i32] containing the zero-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi32(__m128i __V) @@ -1443,7 +1424,7 @@ _mm_cvtepu16_epi32(__m128i __V) return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si); } -/// \brief Zero-extends each of the lower two 16-bit integer elements of a +/// Zero-extends each of the lower two 16-bit integer elements of a /// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in /// a 128-bit vector of [2 x i64]. The upper six elements of the input vector /// are unused. @@ -1453,8 +1434,8 @@ _mm_cvtepu16_epi32(__m128i __V) /// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction. /// /// \param __V -/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are zero- -/// extended to 64-bit values. +/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are +/// zero-extended to 64-bit values. /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. 
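/* A sketch for the SSE4.1 widening conversions above; assumes a -msse4.1
 * build, and the helper names are illustrative. Each call widens the low
 * lanes of a byte vector to 32-bit lanes in a single instruction. */
#include <smmintrin.h>

static __m128i widen_low_unsigned_bytes(__m128i bytes)
{
    return _mm_cvtepu8_epi32(bytes); /* zero-extend lanes 0..3 to 32 bits */
}

static __m128i widen_low_signed_bytes(__m128i bytes)
{
    return _mm_cvtepi8_epi32(bytes); /* sign-extend lanes 0..3 to 32 bits */
}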
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi64(__m128i __V) @@ -1462,7 +1443,7 @@ _mm_cvtepu16_epi64(__m128i __V) return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di); } -/// \brief Zero-extends each of the lower two 32-bit integer elements of a +/// Zero-extends each of the lower two 32-bit integer elements of a /// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in /// a 128-bit vector of [2 x i64]. The upper two elements of the input vector /// are unused. @@ -1472,8 +1453,8 @@ _mm_cvtepu16_epi64(__m128i __V) /// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction. /// /// \param __V -/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are zero- -/// extended to 64-bit values. +/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are +/// zero-extended to 64-bit values. /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu32_epi64(__m128i __V) @@ -1482,7 +1463,7 @@ _mm_cvtepu32_epi64(__m128i __V) } /* SSE4 Pack with Unsigned Saturation. */ -/// \brief Converts 32-bit signed integers from both 128-bit integer vector +/// Converts 32-bit signed integers from both 128-bit integer vector /// operands into 16-bit unsigned integers, and returns the packed result. /// Values greater than 0xFFFF are saturated to 0xFFFF. Values less than /// 0x0000 are saturated to 0x0000. @@ -1511,7 +1492,7 @@ _mm_packus_epi32(__m128i __V1, __m128i __V2) } /* SSE4 Multiple Packed Sums of Absolute Difference. */ -/// \brief Subtracts 8-bit unsigned integer values and computes the absolute +/// Subtracts 8-bit unsigned integer values and computes the absolute /// values of the differences to the corresponding bits in the destination. /// Then sums of the absolute differences are returned according to the bit /// fields in the immediate operand. @@ -1534,23 +1515,23 @@ _mm_packus_epi32(__m128i __V1, __m128i __V2) /// \code /// // M2 represents bit 2 of the immediate operand /// // M10 represents bits [1:0] of the immediate operand -/// i = M2 * 4 -/// j = M10 * 4 +/// i = M2 * 4; +/// j = M10 * 4; /// for (k = 0; k < 8; k = k + 1) { -/// d0 = abs(X[i + k + 0] - Y[j + 0]) -/// d1 = abs(X[i + k + 1] - Y[j + 1]) -/// d2 = abs(X[i + k + 2] - Y[j + 2]) -/// d3 = abs(X[i + k + 3] - Y[j + 3]) -/// r[k] = d0 + d1 + d2 + d3 +/// d0 = abs(X[i + k + 0] - Y[j + 0]); +/// d1 = abs(X[i + k + 1] - Y[j + 1]); +/// d2 = abs(X[i + k + 2] - Y[j + 2]); +/// d3 = abs(X[i + k + 3] - Y[j + 3]); +/// r[k] = d0 + d1 + d2 + d3; /// } /// \endcode /// \returns A 128-bit integer vector containing the sums of the sets of /// absolute differences between both operands. -#define _mm_mpsadbw_epu8(X, Y, M) __extension__ ({ \ +#define _mm_mpsadbw_epu8(X, Y, M) \ (__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \ - (__v16qi)(__m128i)(Y), (M)); }) + (__v16qi)(__m128i)(Y), (M)) -/// \brief Finds the minimum unsigned 16-bit element in the input 128-bit +/// Finds the minimum unsigned 16-bit element in the input 128-bit /// vector of [8 x u16] and returns it and along with its index. /// /// \headerfile <x86intrin.h> @@ -1604,7 +1585,7 @@ _mm_minpos_epu16(__m128i __V) #define _SIDD_UNIT_MASK 0x40 /* SSE4.2 Packed Comparison Intrinsics. 
*/ -/// \brief Uses the immediate operand \a M to perform a comparison of string +/// Uses the immediate operand \a M to perform a comparison of string /// data with implicitly defined lengths that is contained in source operands /// \a A and \a B. Returns a 128-bit integer vector representing the result /// mask of the comparison. @@ -1660,7 +1641,7 @@ _mm_minpos_epu16(__m128i __V) (__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \ (__v16qi)(__m128i)(B), (int)(M)) -/// \brief Uses the immediate operand \a M to perform a comparison of string +/// Uses the immediate operand \a M to perform a comparison of string /// data with implicitly defined lengths that is contained in source operands /// \a A and \a B. Returns an integer representing the result index of the /// comparison. @@ -1714,7 +1695,7 @@ _mm_minpos_epu16(__m128i __V) (int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \ (__v16qi)(__m128i)(B), (int)(M)) -/// \brief Uses the immediate operand \a M to perform a comparison of string +/// Uses the immediate operand \a M to perform a comparison of string /// data with explicitly defined lengths that is contained in source operands /// \a A and \a B. Returns a 128-bit integer vector representing the result /// mask of the comparison. @@ -1775,7 +1756,7 @@ _mm_minpos_epu16(__m128i __V) (__v16qi)(__m128i)(B), (int)(LB), \ (int)(M)) -/// \brief Uses the immediate operand \a M to perform a comparison of string +/// Uses the immediate operand \a M to perform a comparison of string /// data with explicitly defined lengths that is contained in source operands /// \a A and \a B. Returns an integer representing the result index of the /// comparison. @@ -1835,7 +1816,7 @@ _mm_minpos_epu16(__m128i __V) (int)(M)) /* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */ -/// \brief Uses the immediate operand \a M to perform a comparison of string +/// Uses the immediate operand \a M to perform a comparison of string /// data with implicitly defined lengths that is contained in source operands /// \a A and \a B. Returns 1 if the bit mask is zero and the length of the /// string in \a B is the maximum, otherwise, returns 0. @@ -1885,7 +1866,7 @@ _mm_minpos_epu16(__m128i __V) (int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \ (__v16qi)(__m128i)(B), (int)(M)) -/// \brief Uses the immediate operand \a M to perform a comparison of string +/// Uses the immediate operand \a M to perform a comparison of string /// data with implicitly defined lengths that is contained in source operands /// \a A and \a B. Returns 1 if the bit mask is non-zero, otherwise, returns /// 0. @@ -1934,7 +1915,7 @@ _mm_minpos_epu16(__m128i __V) (int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \ (__v16qi)(__m128i)(B), (int)(M)) -/// \brief Uses the immediate operand \a M to perform a comparison of string +/// Uses the immediate operand \a M to perform a comparison of string /// data with implicitly defined lengths that is contained in source operands /// \a A and \a B. Returns bit 0 of the resulting bit mask. /// @@ -1982,7 +1963,7 @@ _mm_minpos_epu16(__m128i __V) (int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \ (__v16qi)(__m128i)(B), (int)(M)) -/// \brief Uses the immediate operand \a M to perform a comparison of string +/// Uses the immediate operand \a M to perform a comparison of string /// data with implicitly defined lengths that is contained in source operands /// \a A and \a B. Returns 1 if the length of the string in \a A is less than /// the maximum, otherwise, returns 0. 
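/* An illustrative sketch for the implicit-length string comparisons above;
 * assumes a -msse4.2 build, and find_first_vowel is a made-up helper. With
 * _SIDD_CMP_EQUAL_ANY, the first operand is the character set and the
 * result is the index of the first byte of the second operand found in
 * that set (16 when there is no match). */
#include <smmintrin.h>

static int find_first_vowel(__m128i chunk)
{
    const __m128i set = _mm_setr_epi8('a', 'e', 'i', 'o', 'u', 0, 0, 0,
                                      0, 0, 0, 0, 0, 0, 0, 0);
    return _mm_cmpistri(set, chunk,
                        _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY |
                        _SIDD_LEAST_SIGNIFICANT);
}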
@@ -2032,7 +2013,7 @@ _mm_minpos_epu16(__m128i __V) (int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \ (__v16qi)(__m128i)(B), (int)(M)) -/// \brief Uses the immediate operand \a M to perform a comparison of string +/// Uses the immediate operand \a M to perform a comparison of string /// data with implicitly defined lengths that is contained in source operands /// \a A and \a B. Returns 1 if the length of the string in \a B is less than /// the maximum, otherwise, returns 0. @@ -2082,7 +2063,7 @@ _mm_minpos_epu16(__m128i __V) (int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \ (__v16qi)(__m128i)(B), (int)(M)) -/// \brief Uses the immediate operand \a M to perform a comparison of string +/// Uses the immediate operand \a M to perform a comparison of string /// data with explicitly defined lengths that is contained in source operands /// \a A and \a B. Returns 1 if the bit mask is zero and the length of the /// string in \a B is the maximum, otherwise, returns 0. @@ -2137,7 +2118,7 @@ _mm_minpos_epu16(__m128i __V) (__v16qi)(__m128i)(B), (int)(LB), \ (int)(M)) -/// \brief Uses the immediate operand \a M to perform a comparison of string +/// Uses the immediate operand \a M to perform a comparison of string /// data with explicitly defined lengths that is contained in source operands /// \a A and \a B. Returns 1 if the resulting mask is non-zero, otherwise, /// returns 0. @@ -2191,7 +2172,7 @@ _mm_minpos_epu16(__m128i __V) (__v16qi)(__m128i)(B), (int)(LB), \ (int)(M)) -/// \brief Uses the immediate operand \a M to perform a comparison of string +/// Uses the immediate operand \a M to perform a comparison of string /// data with explicitly defined lengths that is contained in source operands /// \a A and \a B. Returns bit 0 of the resulting bit mask. /// @@ -2244,7 +2225,7 @@ _mm_minpos_epu16(__m128i __V) (__v16qi)(__m128i)(B), (int)(LB), \ (int)(M)) -/// \brief Uses the immediate operand \a M to perform a comparison of string +/// Uses the immediate operand \a M to perform a comparison of string /// data with explicitly defined lengths that is contained in source operands /// \a A and \a B. Returns 1 if the length of the string in \a A is less than /// the maximum, otherwise, returns 0. @@ -2299,7 +2280,7 @@ _mm_minpos_epu16(__m128i __V) (__v16qi)(__m128i)(B), (int)(LB), \ (int)(M)) -/// \brief Uses the immediate operand \a M to perform a comparison of string +/// Uses the immediate operand \a M to perform a comparison of string /// data with explicitly defined lengths that is contained in source operands /// \a A and \a B. Returns 1 if the length of the string in \a B is less than /// the maximum, otherwise, returns 0. @@ -2354,7 +2335,7 @@ _mm_minpos_epu16(__m128i __V) (int)(M)) /* SSE4.2 Compare Packed Data -- Greater Than. */ -/// \brief Compares each of the corresponding 64-bit values of the 128-bit +/// Compares each of the corresponding 64-bit values of the 128-bit /// integer vectors to determine if the values in the first operand are /// greater than those in the second operand. /// @@ -2374,7 +2355,7 @@ _mm_cmpgt_epi64(__m128i __V1, __m128i __V2) } /* SSE4.2 Accumulate CRC32. */ -/// \brief Adds the unsigned integer operand to the CRC-32C checksum of the +/// Adds the unsigned integer operand to the CRC-32C checksum of the /// unsigned char operand. 
/// /// \headerfile <x86intrin.h> @@ -2394,7 +2375,7 @@ _mm_crc32_u8(unsigned int __C, unsigned char __D) return __builtin_ia32_crc32qi(__C, __D); } -/// \brief Adds the unsigned integer operand to the CRC-32C checksum of the +/// Adds the unsigned integer operand to the CRC-32C checksum of the /// unsigned short operand. /// /// \headerfile <x86intrin.h> @@ -2414,7 +2395,7 @@ _mm_crc32_u16(unsigned int __C, unsigned short __D) return __builtin_ia32_crc32hi(__C, __D); } -/// \brief Adds the first unsigned integer operand to the CRC-32C checksum of +/// Adds the first unsigned integer operand to the CRC-32C checksum of /// the second unsigned integer operand. /// /// \headerfile <x86intrin.h> @@ -2435,7 +2416,7 @@ _mm_crc32_u32(unsigned int __C, unsigned int __D) } #ifdef __x86_64__ -/// \brief Adds the unsigned integer operand to the CRC-32C checksum of the +/// Adds the unsigned integer operand to the CRC-32C checksum of the /// unsigned 64-bit integer operand. /// /// \headerfile <x86intrin.h> @@ -2458,8 +2439,6 @@ _mm_crc32_u64(unsigned long long __C, unsigned long long __D) #undef __DEFAULT_FN_ATTRS -#ifdef __POPCNT__ #include <popcntintrin.h> -#endif -#endif /* _SMMINTRIN_H */ +#endif /* __SMMINTRIN_H */ diff --git a/c_headers/stdint.h b/c_headers/stdint.h index c48815314b..0afcca3a9d 100644 --- a/c_headers/stdint.h +++ b/c_headers/stdint.h @@ -88,7 +88,7 @@ * * To accommodate targets that are missing types that are exactly 8, 16, 32, or * 64 bits wide, this implementation takes an approach of cascading - * redefintions, redefining __int_leastN_t to successively smaller exact-width + * redefinitions, redefining __int_leastN_t to successively smaller exact-width * types. It is therefore important that the types are defined in order of * descending widths. * @@ -461,7 +461,7 @@ typedef __UINTMAX_TYPE__ uintmax_t; * As in the type definitions, this section takes an approach of * successive-shrinking to determine which limits to use for the standard (8, * 16, 32, 64) bit widths when they don't have exact representations. It is - * therefore important that the defintions be kept in order of decending + * therefore important that the definitions be kept in order of decending * widths. * * Note that C++ should not check __STDC_LIMIT_MACROS here, contrary to the diff --git a/c_headers/tmmintrin.h b/c_headers/tmmintrin.h index 042bfc7e3b..734cd391be 100644 --- a/c_headers/tmmintrin.h +++ b/c_headers/tmmintrin.h @@ -27,9 +27,10 @@ #include <pmmintrin.h> /* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3"))) +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3"), __min_vector_width__(64))) +#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,ssse3"), __min_vector_width__(64))) -/// \brief Computes the absolute value of each of the packed 8-bit signed +/// Computes the absolute value of each of the packed 8-bit signed /// integers in the source operand and stores the 8-bit unsigned integer /// results in the destination. /// @@ -41,13 +42,13 @@ /// A 64-bit vector of [8 x i8]. /// \returns A 64-bit integer vector containing the absolute values of the /// elements in the operand. 
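As a usage sketch for the _mm_crc32_* accumulators documented above: a CRC-32C is typically computed by folding the buffer into a running value one unit at a time. The helper name and the byte-at-a-time loop are illustrative; on x86-64 the 64-bit form can consume eight bytes per step.

#include <nmmintrin.h>   /* SSE4.2 CRC32 intrinsics */
#include <stddef.h>

static unsigned int crc32c_update(unsigned int crc, const unsigned char *p, size_t n)
{
    /* Fold each byte into the running CRC-32C value. */
    for (size_t i = 0; i < n; ++i)
        crc = _mm_crc32_u8(crc, p[i]);
    return crc;
}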
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_abs_pi8(__m64 __a) { return (__m64)__builtin_ia32_pabsb((__v8qi)__a); } -/// \brief Computes the absolute value of each of the packed 8-bit signed +/// Computes the absolute value of each of the packed 8-bit signed /// integers in the source operand and stores the 8-bit unsigned integer /// results in the destination. /// @@ -65,7 +66,7 @@ _mm_abs_epi8(__m128i __a) return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a); } -/// \brief Computes the absolute value of each of the packed 16-bit signed +/// Computes the absolute value of each of the packed 16-bit signed /// integers in the source operand and stores the 16-bit unsigned integer /// results in the destination. /// @@ -77,13 +78,13 @@ _mm_abs_epi8(__m128i __a) /// A 64-bit vector of [4 x i16]. /// \returns A 64-bit integer vector containing the absolute values of the /// elements in the operand. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_abs_pi16(__m64 __a) { return (__m64)__builtin_ia32_pabsw((__v4hi)__a); } -/// \brief Computes the absolute value of each of the packed 16-bit signed +/// Computes the absolute value of each of the packed 16-bit signed /// integers in the source operand and stores the 16-bit unsigned integer /// results in the destination. /// @@ -101,7 +102,7 @@ _mm_abs_epi16(__m128i __a) return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a); } -/// \brief Computes the absolute value of each of the packed 32-bit signed +/// Computes the absolute value of each of the packed 32-bit signed /// integers in the source operand and stores the 32-bit unsigned integer /// results in the destination. /// @@ -113,13 +114,13 @@ _mm_abs_epi16(__m128i __a) /// A 64-bit vector of [2 x i32]. /// \returns A 64-bit integer vector containing the absolute values of the /// elements in the operand. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_abs_pi32(__m64 __a) { return (__m64)__builtin_ia32_pabsd((__v2si)__a); } -/// \brief Computes the absolute value of each of the packed 32-bit signed +/// Computes the absolute value of each of the packed 32-bit signed /// integers in the source operand and stores the 32-bit unsigned integer /// results in the destination. /// @@ -137,7 +138,7 @@ _mm_abs_epi32(__m128i __a) return (__m128i)__builtin_ia32_pabsd128((__v4si)__a); } -/// \brief Concatenates the two 128-bit integer vector operands, and +/// Concatenates the two 128-bit integer vector operands, and /// right-shifts the result by the number of bytes specified in the immediate /// operand. /// @@ -157,11 +158,11 @@ _mm_abs_epi32(__m128i __a) /// An immediate operand specifying how many bytes to right-shift the result. /// \returns A 128-bit integer vector containing the concatenated right-shifted /// value. -#define _mm_alignr_epi8(a, b, n) __extension__ ({ \ +#define _mm_alignr_epi8(a, b, n) \ (__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \ - (__v16qi)(__m128i)(b), (n)); }) + (__v16qi)(__m128i)(b), (n)) -/// \brief Concatenates the two 64-bit integer vector operands, and right-shifts +/// Concatenates the two 64-bit integer vector operands, and right-shifts /// the result by the number of bytes specified in the immediate operand. /// /// \headerfile <x86intrin.h> @@ -180,10 +181,10 @@ _mm_abs_epi32(__m128i __a) /// An immediate operand specifying how many bytes to right-shift the result. 
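A brief sketch of the 128-bit PALIGNR form whose macro definition changes above; the byte shift of 4 is an arbitrary example and must be a compile-time constant.

#include <tmmintrin.h>   /* SSSE3 */

/* View lo:hi as one 32-byte value (lo in bytes 0..15, hi in bytes 16..31)
   and return bytes 4..19 of that concatenation. */
static __m128i take_bytes_4_to_19(__m128i lo, __m128i hi)
{
    return _mm_alignr_epi8(hi, lo, 4);
}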
/// \returns A 64-bit integer vector containing the concatenated right-shifted /// value. -#define _mm_alignr_pi8(a, b, n) __extension__ ({ \ - (__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)); }) +#define _mm_alignr_pi8(a, b, n) \ + (__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)) -/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed +/// Horizontally adds the adjacent pairs of values contained in 2 packed /// 128-bit vectors of [8 x i16]. /// /// \headerfile <x86intrin.h> @@ -206,7 +207,7 @@ _mm_hadd_epi16(__m128i __a, __m128i __b) return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b); } -/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed +/// Horizontally adds the adjacent pairs of values contained in 2 packed /// 128-bit vectors of [4 x i32]. /// /// \headerfile <x86intrin.h> @@ -229,7 +230,7 @@ _mm_hadd_epi32(__m128i __a, __m128i __b) return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b); } -/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed +/// Horizontally adds the adjacent pairs of values contained in 2 packed /// 64-bit vectors of [4 x i16]. /// /// \headerfile <x86intrin.h> @@ -246,13 +247,13 @@ _mm_hadd_epi32(__m128i __a, __m128i __b) /// destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both /// operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_hadd_pi16(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b); } -/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed +/// Horizontally adds the adjacent pairs of values contained in 2 packed /// 64-bit vectors of [2 x i32]. /// /// \headerfile <x86intrin.h> @@ -269,15 +270,16 @@ _mm_hadd_pi16(__m64 __a, __m64 __b) /// destination. /// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both /// operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_hadd_pi32(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b); } -/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed -/// 128-bit vectors of [8 x i16]. Positive sums greater than 7FFFh are -/// saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h. +/// Horizontally adds the adjacent pairs of values contained in 2 packed +/// 128-bit vectors of [8 x i16]. Positive sums greater than 0x7FFF are +/// saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to +/// 0x8000. /// /// \headerfile <x86intrin.h> /// @@ -299,9 +301,10 @@ _mm_hadds_epi16(__m128i __a, __m128i __b) return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b); } -/// \brief Horizontally adds the adjacent pairs of values contained in 2 packed -/// 64-bit vectors of [4 x i16]. Positive sums greater than 7FFFh are -/// saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h. +/// Horizontally adds the adjacent pairs of values contained in 2 packed +/// 64-bit vectors of [4 x i16]. Positive sums greater than 0x7FFF are +/// saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to +/// 0x8000. /// /// \headerfile <x86intrin.h> /// @@ -317,13 +320,13 @@ _mm_hadds_epi16(__m128i __a, __m128i __b) /// destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated /// sums of both operands. 
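One concrete instance of the horizontal-add pattern documented in this stretch of the header; the input values are arbitrary.

#include <tmmintrin.h>

static __m128i demo_hadd_epi32(void)
{
    __m128i a = _mm_setr_epi32(1, 2, 3, 4);
    __m128i b = _mm_setr_epi32(10, 20, 30, 40);
    /* Result lanes: {1+2, 3+4, 10+20, 30+40} = {3, 7, 30, 70}. */
    return _mm_hadd_epi32(a, b);
}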
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_hadds_pi16(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b); } -/// \brief Horizontally subtracts the adjacent pairs of values contained in 2 +/// Horizontally subtracts the adjacent pairs of values contained in 2 /// packed 128-bit vectors of [8 x i16]. /// /// \headerfile <x86intrin.h> @@ -346,7 +349,7 @@ _mm_hsub_epi16(__m128i __a, __m128i __b) return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b); } -/// \brief Horizontally subtracts the adjacent pairs of values contained in 2 +/// Horizontally subtracts the adjacent pairs of values contained in 2 /// packed 128-bit vectors of [4 x i32]. /// /// \headerfile <x86intrin.h> @@ -369,7 +372,7 @@ _mm_hsub_epi32(__m128i __a, __m128i __b) return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b); } -/// \brief Horizontally subtracts the adjacent pairs of values contained in 2 +/// Horizontally subtracts the adjacent pairs of values contained in 2 /// packed 64-bit vectors of [4 x i16]. /// /// \headerfile <x86intrin.h> @@ -386,13 +389,13 @@ _mm_hsub_epi32(__m128i __a, __m128i __b) /// the destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal differences /// of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_hsub_pi16(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b); } -/// \brief Horizontally subtracts the adjacent pairs of values contained in 2 +/// Horizontally subtracts the adjacent pairs of values contained in 2 /// packed 64-bit vectors of [2 x i32]. /// /// \headerfile <x86intrin.h> @@ -409,16 +412,16 @@ _mm_hsub_pi16(__m64 __a, __m64 __b) /// the destination. /// \returns A 64-bit vector of [2 x i32] containing the horizontal differences /// of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_hsub_pi32(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b); } -/// \brief Horizontally subtracts the adjacent pairs of values contained in 2 +/// Horizontally subtracts the adjacent pairs of values contained in 2 /// packed 128-bit vectors of [8 x i16]. Positive differences greater than -/// 7FFFh are saturated to 7FFFh. Negative differences less than 8000h are -/// saturated to 8000h. +/// 0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are +/// saturated to 0x8000. /// /// \headerfile <x86intrin.h> /// @@ -440,10 +443,10 @@ _mm_hsubs_epi16(__m128i __a, __m128i __b) return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b); } -/// \brief Horizontally subtracts the adjacent pairs of values contained in 2 +/// Horizontally subtracts the adjacent pairs of values contained in 2 /// packed 64-bit vectors of [4 x i16]. Positive differences greater than -/// 7FFFh are saturated to 7FFFh. Negative differences less than 8000h are -/// saturated to 8000h. +/// 0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are +/// saturated to 0x8000. /// /// \headerfile <x86intrin.h> /// @@ -459,13 +462,13 @@ _mm_hsubs_epi16(__m128i __a, __m128i __b) /// the destination. /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated /// differences of both operands. 
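A sketch of the 0x7FFF saturation behaviour called out in the reworded comments above; the inputs are chosen to overflow a single lane.

#include <tmmintrin.h>

static __m128i demo_hsubs_saturation(void)
{
    /* Lane 0 computes 32767 - (-1) = 32768, which saturates to 0x7FFF. */
    __m128i a = _mm_setr_epi16(32767, -1, 0, 0, 0, 0, 0, 0);
    return _mm_hsubs_epi16(a, a);
}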
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_hsubs_pi16(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b); } -/// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer +/// Multiplies corresponding pairs of packed 8-bit unsigned integer /// values contained in the first source operand and packed 8-bit signed /// integer values contained in the second source operand, adds pairs of /// contiguous products with signed saturation, and writes the 16-bit sums to @@ -499,7 +502,7 @@ _mm_maddubs_epi16(__m128i __a, __m128i __b) return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b); } -/// \brief Multiplies corresponding pairs of packed 8-bit unsigned integer +/// Multiplies corresponding pairs of packed 8-bit unsigned integer /// values contained in the first source operand and packed 8-bit signed /// integer values contained in the second source operand, adds pairs of /// contiguous products with signed saturation, and writes the 16-bit sums to @@ -523,13 +526,13 @@ _mm_maddubs_epi16(__m128i __a, __m128i __b) /// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n /// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n /// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_maddubs_pi16(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b); } -/// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit +/// Multiplies packed 16-bit signed integer values, truncates the 32-bit /// products to the 18 most significant bits by right-shifting, rounds the /// truncated value by adding 1, and writes bits [16:1] to the destination. /// @@ -549,7 +552,7 @@ _mm_mulhrs_epi16(__m128i __a, __m128i __b) return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b); } -/// \brief Multiplies packed 16-bit signed integer values, truncates the 32-bit +/// Multiplies packed 16-bit signed integer values, truncates the 32-bit /// products to the 18 most significant bits by right-shifting, rounds the /// truncated value by adding 1, and writes bits [16:1] to the destination. /// @@ -563,13 +566,13 @@ _mm_mulhrs_epi16(__m128i __a, __m128i __b) /// A 64-bit vector of [4 x i16] containing one of the source operands. /// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled /// products of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mulhrs_pi16(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b); } -/// \brief Copies the 8-bit integers from a 128-bit integer vector to the +/// Copies the 8-bit integers from a 128-bit integer vector to the /// destination or clears 8-bit values in the destination, as specified by /// the second source operand. /// @@ -595,7 +598,7 @@ _mm_shuffle_epi8(__m128i __a, __m128i __b) return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b); } -/// \brief Copies the 8-bit integers from a 64-bit integer vector to the +/// Copies the 8-bit integers from a 64-bit integer vector to the /// destination or clears 8-bit values in the destination, as specified by /// the second source operand. /// @@ -614,13 +617,13 @@ _mm_shuffle_epi8(__m128i __a, __m128i __b) /// destination. \n /// Bits [3:0] select the source byte to be copied. 
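A minimal sketch of the unsigned-by-signed multiply-add (pmaddubsw) described earlier in this hunk; constant inputs keep the arithmetic easy to follow.

#include <tmmintrin.h>

static __m128i demo_maddubs(void)
{
    __m128i u = _mm_set1_epi8(2);          /* treated as unsigned 8-bit values */
    __m128i s = _mm_set1_epi8(3);          /* treated as signed 8-bit values   */
    /* Each 16-bit result lane holds 2*3 + 2*3 = 12 (signed-saturating add). */
    return _mm_maddubs_epi16(u, s);
}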
/// \returns A 64-bit integer vector containing the copied or cleared values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_shuffle_pi8(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b); } -/// \brief For each 8-bit integer in the first source operand, perform one of +/// For each 8-bit integer in the first source operand, perform one of /// the following actions as specified by the second source operand. /// /// If the byte in the second source is negative, calculate the two's @@ -646,7 +649,7 @@ _mm_sign_epi8(__m128i __a, __m128i __b) return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b); } -/// \brief For each 16-bit integer in the first source operand, perform one of +/// For each 16-bit integer in the first source operand, perform one of /// the following actions as specified by the second source operand. /// /// If the word in the second source is negative, calculate the two's @@ -672,7 +675,7 @@ _mm_sign_epi16(__m128i __a, __m128i __b) return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b); } -/// \brief For each 32-bit integer in the first source operand, perform one of +/// For each 32-bit integer in the first source operand, perform one of /// the following actions as specified by the second source operand. /// /// If the doubleword in the second source is negative, calculate the two's @@ -698,7 +701,7 @@ _mm_sign_epi32(__m128i __a, __m128i __b) return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b); } -/// \brief For each 8-bit integer in the first source operand, perform one of +/// For each 8-bit integer in the first source operand, perform one of /// the following actions as specified by the second source operand. /// /// If the byte in the second source is negative, calculate the two's @@ -718,13 +721,13 @@ _mm_sign_epi32(__m128i __a, __m128i __b) /// A 64-bit integer vector containing control bytes corresponding to /// positions in the destination. /// \returns A 64-bit integer vector containing the resultant values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sign_pi8(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b); } -/// \brief For each 16-bit integer in the first source operand, perform one of +/// For each 16-bit integer in the first source operand, perform one of /// the following actions as specified by the second source operand. /// /// If the word in the second source is negative, calculate the two's @@ -744,13 +747,13 @@ _mm_sign_pi8(__m64 __a, __m64 __b) /// A 64-bit integer vector containing control words corresponding to /// positions in the destination. /// \returns A 64-bit integer vector containing the resultant values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sign_pi16(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b); } -/// \brief For each 32-bit integer in the first source operand, perform one of +/// For each 32-bit integer in the first source operand, perform one of /// the following actions as specified by the second source operand. /// /// If the doubleword in the second source is negative, calculate the two's @@ -770,12 +773,13 @@ _mm_sign_pi16(__m64 __a, __m64 __b) /// A 64-bit integer vector containing two control doublewords corresponding /// to positions in the destination. /// \returns A 64-bit integer vector containing the resultant values. 
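A small sketch of the PSIGN behaviour described above: each lane of the first operand is negated, zeroed, or passed through according to the sign of the corresponding lane of the second operand. The values are illustrative.

#include <tmmintrin.h>

static __m128i demo_sign_epi16(void)
{
    __m128i a = _mm_setr_epi16( 1, 2, 3,  4, 5, 6,  7, 8);
    __m128i s = _mm_setr_epi16(-1, 0, 1, -1, 0, 1, -1, 0);
    /* Result: {-1, 0, 3, -4, 0, 6, -7, 0}. */
    return _mm_sign_epi16(a, s);
}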
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sign_pi32(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b); } #undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS_MMX #endif /* __TMMINTRIN_H */ diff --git a/c_headers/vaesintrin.h b/c_headers/vaesintrin.h index efbb8a5652..e4174bb82f 100644 --- a/c_headers/vaesintrin.h +++ b/c_headers/vaesintrin.h @@ -29,10 +29,10 @@ #define __VAESINTRIN_H /* Default attributes for YMM forms. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("vaes"))) +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("vaes"), __min_vector_width__(256))) /* Default attributes for ZMM forms. */ -#define __DEFAULT_FN_ATTRS_F __attribute__((__always_inline__, __nodebug__, __target__("avx512f,vaes"))) +#define __DEFAULT_FN_ATTRS_F __attribute__((__always_inline__, __nodebug__, __target__("avx512f,vaes"), __min_vector_width__(512))) static __inline__ __m256i __DEFAULT_FN_ATTRS diff --git a/c_headers/vpclmulqdqintrin.h b/c_headers/vpclmulqdqintrin.h index 21cda22210..86174a457e 100644 --- a/c_headers/vpclmulqdqintrin.h +++ b/c_headers/vpclmulqdqintrin.h @@ -28,15 +28,15 @@ #ifndef __VPCLMULQDQINTRIN_H #define __VPCLMULQDQINTRIN_H -#define _mm256_clmulepi64_epi128(A, B, I) __extension__ ({ \ +#define _mm256_clmulepi64_epi128(A, B, I) \ (__m256i)__builtin_ia32_pclmulqdq256((__v4di)(__m256i)(A), \ (__v4di)(__m256i)(B), \ - (char)(I)); }) + (char)(I)) -#define _mm512_clmulepi64_epi128(A, B, I) __extension__ ({ \ +#define _mm512_clmulepi64_epi128(A, B, I) \ (__m512i)__builtin_ia32_pclmulqdq512((__v8di)(__m512i)(A), \ (__v8di)(__m512i)(B), \ - (char)(I)); }) + (char)(I)) -#endif // __VPCLMULQDQINTRIN_H +#endif /* __VPCLMULQDQINTRIN_H */ diff --git a/c_headers/waitpkgintrin.h b/c_headers/waitpkgintrin.h new file mode 100644 index 0000000000..e29d6cfa5a --- /dev/null +++ b/c_headers/waitpkgintrin.h @@ -0,0 +1,56 @@ +/*===----------------------- waitpkgintrin.h - WAITPKG --------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ +#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H +#error "Never use <waitpkgintrin.h> directly; include <x86intrin.h> instead." +#endif + +#ifndef __WAITPKGINTRIN_H +#define __WAITPKGINTRIN_H + +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("waitpkg"))) + +static __inline__ void __DEFAULT_FN_ATTRS +_umonitor (void * __address) +{ + __builtin_ia32_umonitor (__address); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_umwait (unsigned int __control, unsigned long long __counter) +{ + return __builtin_ia32_umwait (__control, + (unsigned int)(__counter >> 32), (unsigned int)__counter); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_tpause (unsigned int __control, unsigned long long __counter) +{ + return __builtin_ia32_tpause (__control, + (unsigned int)(__counter >> 32), (unsigned int)__counter); +} + +#undef __DEFAULT_FN_ATTRS + +#endif /* __WAITPKGINTRIN_H */ diff --git a/c_headers/wbnoinvdintrin.h b/c_headers/wbnoinvdintrin.h new file mode 100644 index 0000000000..cad83368db --- /dev/null +++ b/c_headers/wbnoinvdintrin.h @@ -0,0 +1,38 @@ +/*===-------------- wbnoinvdintrin.h - wbnoinvd intrinsic-------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ + +#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H +#error "Never use <wbnoinvdintrin.h> directly; include <x86intrin.h> instead." 
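As a usage sketch for the new WAITPKG wrappers above (a hypothetical wait loop, not taken from any header): UMONITOR arms address monitoring and UMWAIT sleeps until the monitored line is written or a TSC deadline passes. The flag variable, deadline offset, and re-check are illustrative, and the code assumes __rdtsc() is available and the target has the waitpkg feature enabled.

#include <x86intrin.h>

static void wait_for_flag(volatile unsigned int *flag)
{
    while (*flag == 0) {
        _umonitor((void *)flag);                 /* arm monitoring of this line   */
        if (*flag != 0)                          /* re-check to avoid a lost wake */
            break;
        unsigned long long deadline = __rdtsc() + 100000;
        _umwait(0 /* C0.2 */, deadline);         /* wakes on write or timeout     */
    }
}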
+#endif + +#ifndef __WBNOINVDINTRIN_H +#define __WBNOINVDINTRIN_H + +static __inline__ void + __attribute__((__always_inline__, __nodebug__, __target__("wbnoinvd"))) +_wbnoinvd (void) +{ + __builtin_ia32_wbnoinvd (); +} + +#endif /* __WBNOINVDINTRIN_H */ diff --git a/c_headers/wmmintrin.h b/c_headers/wmmintrin.h index a2d931010a..569a8d838d 100644 --- a/c_headers/wmmintrin.h +++ b/c_headers/wmmintrin.h @@ -21,8 +21,8 @@ *===-----------------------------------------------------------------------=== */ -#ifndef _WMMINTRIN_H -#define _WMMINTRIN_H +#ifndef __WMMINTRIN_H +#define __WMMINTRIN_H #include <emmintrin.h> @@ -30,4 +30,4 @@ #include <__wmmintrin_pclmul.h> -#endif /* _WMMINTRIN_H */ +#endif /* __WMMINTRIN_H */ diff --git a/c_headers/x86intrin.h b/c_headers/x86intrin.h index 31ee7b82dd..728c58c3eb 100644 --- a/c_headers/x86intrin.h +++ b/c_headers/x86intrin.h @@ -32,26 +32,6 @@ #include <mm3dnow.h> #endif -#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI__) -#include <bmiintrin.h> -#endif - -#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI2__) -#include <bmi2intrin.h> -#endif - -#if !defined(_MSC_VER) || __has_feature(modules) || defined(__LZCNT__) -#include <lzcntintrin.h> -#endif - -#if !defined(_MSC_VER) || __has_feature(modules) || defined(__POPCNT__) -#include <popcntintrin.h> -#endif - -#if !defined(_MSC_VER) || __has_feature(modules) || defined(__RDSEED__) -#include <rdseedintrin.h> -#endif - #if !defined(_MSC_VER) || __has_feature(modules) || defined(__PRFCHW__) #include <prfchwintrin.h> #endif @@ -76,10 +56,6 @@ #include <lwpintrin.h> #endif -#if !defined(_MSC_VER) || __has_feature(modules) || defined(__F16C__) -#include <f16cintrin.h> -#endif - #if !defined(_MSC_VER) || __has_feature(modules) || defined(__MWAITX__) #include <mwaitxintrin.h> #endif @@ -88,4 +64,5 @@ #include <clzerointrin.h> #endif + #endif /* __X86INTRIN_H */ diff --git a/c_headers/xmmintrin.h b/c_headers/xmmintrin.h index 279c0275d9..17af17267c 100644 --- a/c_headers/xmmintrin.h +++ b/c_headers/xmmintrin.h @@ -40,9 +40,10 @@ typedef unsigned int __v4su __attribute__((__vector_size__(16))); #endif /* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse"))) +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse"), __min_vector_width__(64))) -/// \brief Adds the 32-bit float values in the low-order bits of the operands. +/// Adds the 32-bit float values in the low-order bits of the operands. /// /// \headerfile <x86intrin.h> /// @@ -64,7 +65,7 @@ _mm_add_ss(__m128 __a, __m128 __b) return __a; } -/// \brief Adds two 128-bit vectors of [4 x float], and returns the results of +/// Adds two 128-bit vectors of [4 x float], and returns the results of /// the addition. /// /// \headerfile <x86intrin.h> @@ -83,7 +84,7 @@ _mm_add_ps(__m128 __a, __m128 __b) return (__m128)((__v4sf)__a + (__v4sf)__b); } -/// \brief Subtracts the 32-bit float value in the low-order bits of the second +/// Subtracts the 32-bit float value in the low-order bits of the second /// operand from the corresponding value in the first operand. 
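The scalar (_ss) versus packed (_ps) distinction that the comments in this header describe repeatedly can be seen in one short sketch; the values are arbitrary.

#include <xmmintrin.h>

static void demo_ss_vs_ps(void)
{
    __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 b = _mm_setr_ps(10.0f, 20.0f, 30.0f, 40.0f);
    __m128 s = _mm_add_ss(a, b);   /* {11, 2, 3, 4}: only lane 0 is added    */
    __m128 p = _mm_add_ps(a, b);   /* {11, 22, 33, 44}: all four lanes added */
    (void)s; (void)p;
}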
/// /// \headerfile <x86intrin.h> @@ -106,7 +107,7 @@ _mm_sub_ss(__m128 __a, __m128 __b) return __a; } -/// \brief Subtracts each of the values of the second operand from the first +/// Subtracts each of the values of the second operand from the first /// operand, both of which are 128-bit vectors of [4 x float] and returns /// the results of the subtraction. /// @@ -126,7 +127,7 @@ _mm_sub_ps(__m128 __a, __m128 __b) return (__m128)((__v4sf)__a - (__v4sf)__b); } -/// \brief Multiplies two 32-bit float values in the low-order bits of the +/// Multiplies two 32-bit float values in the low-order bits of the /// operands. /// /// \headerfile <x86intrin.h> @@ -149,7 +150,7 @@ _mm_mul_ss(__m128 __a, __m128 __b) return __a; } -/// \brief Multiplies two 128-bit vectors of [4 x float] and returns the +/// Multiplies two 128-bit vectors of [4 x float] and returns the /// results of the multiplication. /// /// \headerfile <x86intrin.h> @@ -168,7 +169,7 @@ _mm_mul_ps(__m128 __a, __m128 __b) return (__m128)((__v4sf)__a * (__v4sf)__b); } -/// \brief Divides the value in the low-order 32 bits of the first operand by +/// Divides the value in the low-order 32 bits of the first operand by /// the corresponding value in the second operand. /// /// \headerfile <x86intrin.h> @@ -191,7 +192,7 @@ _mm_div_ss(__m128 __a, __m128 __b) return __a; } -/// \brief Divides two 128-bit vectors of [4 x float]. +/// Divides two 128-bit vectors of [4 x float]. /// /// \headerfile <x86intrin.h> /// @@ -209,7 +210,7 @@ _mm_div_ps(__m128 __a, __m128 __b) return (__m128)((__v4sf)__a / (__v4sf)__b); } -/// \brief Calculates the square root of the value stored in the low-order bits +/// Calculates the square root of the value stored in the low-order bits /// of a 128-bit vector of [4 x float]. /// /// \headerfile <x86intrin.h> @@ -224,11 +225,10 @@ _mm_div_ps(__m128 __a, __m128 __b) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_sqrt_ss(__m128 __a) { - __m128 __c = __builtin_ia32_sqrtss((__v4sf)__a); - return (__m128) { __c[0], __a[1], __a[2], __a[3] }; + return (__m128)__builtin_ia32_sqrtss((__v4sf)__a); } -/// \brief Calculates the square roots of the values stored in a 128-bit vector +/// Calculates the square roots of the values stored in a 128-bit vector /// of [4 x float]. /// /// \headerfile <x86intrin.h> @@ -245,7 +245,7 @@ _mm_sqrt_ps(__m128 __a) return __builtin_ia32_sqrtps((__v4sf)__a); } -/// \brief Calculates the approximate reciprocal of the value stored in the +/// Calculates the approximate reciprocal of the value stored in the /// low-order bits of a 128-bit vector of [4 x float]. /// /// \headerfile <x86intrin.h> @@ -260,11 +260,10 @@ _mm_sqrt_ps(__m128 __a) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rcp_ss(__m128 __a) { - __m128 __c = __builtin_ia32_rcpss((__v4sf)__a); - return (__m128) { __c[0], __a[1], __a[2], __a[3] }; + return (__m128)__builtin_ia32_rcpss((__v4sf)__a); } -/// \brief Calculates the approximate reciprocals of the values stored in a +/// Calculates the approximate reciprocals of the values stored in a /// 128-bit vector of [4 x float]. 
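The approximate-reciprocal intrinsics documented above provide roughly 12 bits of precision; a common follow-up, sketched below, is one Newton-Raphson refinement step (the refinement itself is not part of this header).

#include <xmmintrin.h>

/* Refine the _mm_rcp_ps estimate x0 of 1/a with x1 = x0 * (2 - a*x0). */
static __m128 fast_reciprocal(__m128 a)
{
    __m128 x0 = _mm_rcp_ps(a);
    return _mm_mul_ps(x0, _mm_sub_ps(_mm_set1_ps(2.0f), _mm_mul_ps(a, x0)));
}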
/// /// \headerfile <x86intrin.h> @@ -278,10 +277,10 @@ _mm_rcp_ss(__m128 __a) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rcp_ps(__m128 __a) { - return __builtin_ia32_rcpps((__v4sf)__a); + return (__m128)__builtin_ia32_rcpps((__v4sf)__a); } -/// \brief Calculates the approximate reciprocal of the square root of the value +/// Calculates the approximate reciprocal of the square root of the value /// stored in the low-order bits of a 128-bit vector of [4 x float]. /// /// \headerfile <x86intrin.h> @@ -297,11 +296,10 @@ _mm_rcp_ps(__m128 __a) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_rsqrt_ss(__m128 __a) { - __m128 __c = __builtin_ia32_rsqrtss((__v4sf)__a); - return (__m128) { __c[0], __a[1], __a[2], __a[3] }; + return __builtin_ia32_rsqrtss((__v4sf)__a); } -/// \brief Calculates the approximate reciprocals of the square roots of the +/// Calculates the approximate reciprocals of the square roots of the /// values stored in a 128-bit vector of [4 x float]. /// /// \headerfile <x86intrin.h> @@ -318,7 +316,7 @@ _mm_rsqrt_ps(__m128 __a) return __builtin_ia32_rsqrtps((__v4sf)__a); } -/// \brief Compares two 32-bit float values in the low-order bits of both +/// Compares two 32-bit float values in the low-order bits of both /// operands and returns the lesser value in the low-order bits of the /// vector of [4 x float]. /// @@ -341,7 +339,7 @@ _mm_min_ss(__m128 __a, __m128 __b) return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b); } -/// \brief Compares two 128-bit vectors of [4 x float] and returns the lesser +/// Compares two 128-bit vectors of [4 x float] and returns the lesser /// of each pair of values. /// /// \headerfile <x86intrin.h> @@ -360,7 +358,7 @@ _mm_min_ps(__m128 __a, __m128 __b) return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b); } -/// \brief Compares two 32-bit float values in the low-order bits of both +/// Compares two 32-bit float values in the low-order bits of both /// operands and returns the greater value in the low-order bits of a 128-bit /// vector of [4 x float]. /// @@ -383,7 +381,7 @@ _mm_max_ss(__m128 __a, __m128 __b) return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b); } -/// \brief Compares two 128-bit vectors of [4 x float] and returns the greater +/// Compares two 128-bit vectors of [4 x float] and returns the greater /// of each pair of values. /// /// \headerfile <x86intrin.h> @@ -402,7 +400,7 @@ _mm_max_ps(__m128 __a, __m128 __b) return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b); } -/// \brief Performs a bitwise AND of two 128-bit vectors of [4 x float]. +/// Performs a bitwise AND of two 128-bit vectors of [4 x float]. /// /// \headerfile <x86intrin.h> /// @@ -420,7 +418,7 @@ _mm_and_ps(__m128 __a, __m128 __b) return (__m128)((__v4su)__a & (__v4su)__b); } -/// \brief Performs a bitwise AND of two 128-bit vectors of [4 x float], using +/// Performs a bitwise AND of two 128-bit vectors of [4 x float], using /// the one's complement of the values contained in the first source /// operand. /// @@ -442,7 +440,7 @@ _mm_andnot_ps(__m128 __a, __m128 __b) return (__m128)(~(__v4su)__a & (__v4su)__b); } -/// \brief Performs a bitwise OR of two 128-bit vectors of [4 x float]. +/// Performs a bitwise OR of two 128-bit vectors of [4 x float]. /// /// \headerfile <x86intrin.h> /// @@ -460,7 +458,7 @@ _mm_or_ps(__m128 __a, __m128 __b) return (__m128)((__v4su)__a | (__v4su)__b); } -/// \brief Performs a bitwise exclusive OR of two 128-bit vectors of +/// Performs a bitwise exclusive OR of two 128-bit vectors of /// [4 x float]. 
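A short sketch combining the packed min/max operations documented above into the usual per-lane clamp idiom.

#include <xmmintrin.h>

/* Clamp each lane of v into [lo, hi]. */
static __m128 clamp_ps(__m128 v, __m128 lo, __m128 hi)
{
    return _mm_min_ps(_mm_max_ps(v, lo), hi);
}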
/// /// \headerfile <x86intrin.h> @@ -479,7 +477,7 @@ _mm_xor_ps(__m128 __a, __m128 __b) return (__m128)((__v4su)__a ^ (__v4su)__b); } -/// \brief Compares two 32-bit float values in the low-order bits of both +/// Compares two 32-bit float values in the low-order bits of both /// operands for equality and returns the result of the comparison in the /// low-order bits of a vector [4 x float]. /// @@ -501,7 +499,7 @@ _mm_cmpeq_ss(__m128 __a, __m128 __b) return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b); } -/// \brief Compares each of the corresponding 32-bit float values of the +/// Compares each of the corresponding 32-bit float values of the /// 128-bit vectors of [4 x float] for equality. /// /// \headerfile <x86intrin.h> @@ -519,7 +517,7 @@ _mm_cmpeq_ps(__m128 __a, __m128 __b) return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b); } -/// \brief Compares two 32-bit float values in the low-order bits of both +/// Compares two 32-bit float values in the low-order bits of both /// operands to determine if the value in the first operand is less than the /// corresponding value in the second operand and returns the result of the /// comparison in the low-order bits of a vector of [4 x float]. @@ -542,7 +540,7 @@ _mm_cmplt_ss(__m128 __a, __m128 __b) return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b); } -/// \brief Compares each of the corresponding 32-bit float values of the +/// Compares each of the corresponding 32-bit float values of the /// 128-bit vectors of [4 x float] to determine if the values in the first /// operand are less than those in the second operand. /// @@ -561,7 +559,7 @@ _mm_cmplt_ps(__m128 __a, __m128 __b) return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b); } -/// \brief Compares two 32-bit float values in the low-order bits of both +/// Compares two 32-bit float values in the low-order bits of both /// operands to determine if the value in the first operand is less than or /// equal to the corresponding value in the second operand and returns the /// result of the comparison in the low-order bits of a vector of @@ -585,7 +583,7 @@ _mm_cmple_ss(__m128 __a, __m128 __b) return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b); } -/// \brief Compares each of the corresponding 32-bit float values of the +/// Compares each of the corresponding 32-bit float values of the /// 128-bit vectors of [4 x float] to determine if the values in the first /// operand are less than or equal to those in the second operand. /// @@ -604,7 +602,7 @@ _mm_cmple_ps(__m128 __a, __m128 __b) return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b); } -/// \brief Compares two 32-bit float values in the low-order bits of both +/// Compares two 32-bit float values in the low-order bits of both /// operands to determine if the value in the first operand is greater than /// the corresponding value in the second operand and returns the result of /// the comparison in the low-order bits of a vector of [4 x float]. @@ -629,7 +627,7 @@ _mm_cmpgt_ss(__m128 __a, __m128 __b) 4, 1, 2, 3); } -/// \brief Compares each of the corresponding 32-bit float values of the +/// Compares each of the corresponding 32-bit float values of the /// 128-bit vectors of [4 x float] to determine if the values in the first /// operand are greater than those in the second operand. 
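The packed comparisons described here return all-ones/all-zeros lane masks, which combine with the bitwise AND/ANDNOT/OR operations above into a branchless per-lane select; a sketch:

#include <xmmintrin.h>

static __m128 select_smaller(__m128 a, __m128 b)
{
    __m128 mask = _mm_cmplt_ps(a, b);            /* lanes where a < b  */
    return _mm_or_ps(_mm_and_ps(mask, a),        /* take a where a < b */
                     _mm_andnot_ps(mask, b));    /* otherwise take b   */
}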
/// @@ -648,7 +646,7 @@ _mm_cmpgt_ps(__m128 __a, __m128 __b) return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a); } -/// \brief Compares two 32-bit float values in the low-order bits of both +/// Compares two 32-bit float values in the low-order bits of both /// operands to determine if the value in the first operand is greater than /// or equal to the corresponding value in the second operand and returns /// the result of the comparison in the low-order bits of a vector of @@ -674,7 +672,7 @@ _mm_cmpge_ss(__m128 __a, __m128 __b) 4, 1, 2, 3); } -/// \brief Compares each of the corresponding 32-bit float values of the +/// Compares each of the corresponding 32-bit float values of the /// 128-bit vectors of [4 x float] to determine if the values in the first /// operand are greater than or equal to those in the second operand. /// @@ -693,7 +691,7 @@ _mm_cmpge_ps(__m128 __a, __m128 __b) return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a); } -/// \brief Compares two 32-bit float values in the low-order bits of both +/// Compares two 32-bit float values in the low-order bits of both /// operands for inequality and returns the result of the comparison in the /// low-order bits of a vector of [4 x float]. /// @@ -716,7 +714,7 @@ _mm_cmpneq_ss(__m128 __a, __m128 __b) return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b); } -/// \brief Compares each of the corresponding 32-bit float values of the +/// Compares each of the corresponding 32-bit float values of the /// 128-bit vectors of [4 x float] for inequality. /// /// \headerfile <x86intrin.h> @@ -735,7 +733,7 @@ _mm_cmpneq_ps(__m128 __a, __m128 __b) return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b); } -/// \brief Compares two 32-bit float values in the low-order bits of both +/// Compares two 32-bit float values in the low-order bits of both /// operands to determine if the value in the first operand is not less than /// the corresponding value in the second operand and returns the result of /// the comparison in the low-order bits of a vector of [4 x float]. @@ -759,7 +757,7 @@ _mm_cmpnlt_ss(__m128 __a, __m128 __b) return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b); } -/// \brief Compares each of the corresponding 32-bit float values of the +/// Compares each of the corresponding 32-bit float values of the /// 128-bit vectors of [4 x float] to determine if the values in the first /// operand are not less than those in the second operand. /// @@ -779,7 +777,7 @@ _mm_cmpnlt_ps(__m128 __a, __m128 __b) return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b); } -/// \brief Compares two 32-bit float values in the low-order bits of both +/// Compares two 32-bit float values in the low-order bits of both /// operands to determine if the value in the first operand is not less than /// or equal to the corresponding value in the second operand and returns /// the result of the comparison in the low-order bits of a vector of @@ -804,7 +802,7 @@ _mm_cmpnle_ss(__m128 __a, __m128 __b) return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b); } -/// \brief Compares each of the corresponding 32-bit float values of the +/// Compares each of the corresponding 32-bit float values of the /// 128-bit vectors of [4 x float] to determine if the values in the first /// operand are not less than or equal to those in the second operand. 
/// @@ -824,7 +822,7 @@ _mm_cmpnle_ps(__m128 __a, __m128 __b) return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b); } -/// \brief Compares two 32-bit float values in the low-order bits of both +/// Compares two 32-bit float values in the low-order bits of both /// operands to determine if the value in the first operand is not greater /// than the corresponding value in the second operand and returns the /// result of the comparison in the low-order bits of a vector of @@ -851,7 +849,7 @@ _mm_cmpngt_ss(__m128 __a, __m128 __b) 4, 1, 2, 3); } -/// \brief Compares each of the corresponding 32-bit float values of the +/// Compares each of the corresponding 32-bit float values of the /// 128-bit vectors of [4 x float] to determine if the values in the first /// operand are not greater than those in the second operand. /// @@ -871,7 +869,7 @@ _mm_cmpngt_ps(__m128 __a, __m128 __b) return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a); } -/// \brief Compares two 32-bit float values in the low-order bits of both +/// Compares two 32-bit float values in the low-order bits of both /// operands to determine if the value in the first operand is not greater /// than or equal to the corresponding value in the second operand and /// returns the result of the comparison in the low-order bits of a vector @@ -898,7 +896,7 @@ _mm_cmpnge_ss(__m128 __a, __m128 __b) 4, 1, 2, 3); } -/// \brief Compares each of the corresponding 32-bit float values of the +/// Compares each of the corresponding 32-bit float values of the /// 128-bit vectors of [4 x float] to determine if the values in the first /// operand are not greater than or equal to those in the second operand. /// @@ -918,7 +916,7 @@ _mm_cmpnge_ps(__m128 __a, __m128 __b) return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a); } -/// \brief Compares two 32-bit float values in the low-order bits of both +/// Compares two 32-bit float values in the low-order bits of both /// operands to determine if the value in the first operand is ordered with /// respect to the corresponding value in the second operand and returns the /// result of the comparison in the low-order bits of a vector of @@ -943,7 +941,7 @@ _mm_cmpord_ss(__m128 __a, __m128 __b) return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b); } -/// \brief Compares each of the corresponding 32-bit float values of the +/// Compares each of the corresponding 32-bit float values of the /// 128-bit vectors of [4 x float] to determine if the values in the first /// operand are ordered with respect to those in the second operand. /// @@ -963,7 +961,7 @@ _mm_cmpord_ps(__m128 __a, __m128 __b) return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b); } -/// \brief Compares two 32-bit float values in the low-order bits of both +/// Compares two 32-bit float values in the low-order bits of both /// operands to determine if the value in the first operand is unordered /// with respect to the corresponding value in the second operand and /// returns the result of the comparison in the low-order bits of a vector @@ -988,7 +986,7 @@ _mm_cmpunord_ss(__m128 __a, __m128 __b) return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b); } -/// \brief Compares each of the corresponding 32-bit float values of the +/// Compares each of the corresponding 32-bit float values of the /// 128-bit vectors of [4 x float] to determine if the values in the first /// operand are unordered with respect to those in the second operand. 
/// @@ -1008,9 +1006,11 @@ _mm_cmpunord_ps(__m128 __a, __m128 __b) return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b); } -/// \brief Compares two 32-bit float values in the low-order bits of both +/// Compares two 32-bit float values in the low-order bits of both /// operands for equality and returns the result of the comparison. /// +/// If either of the two lower 32-bit values is NaN, 0 is returned. +/// /// \headerfile <x86intrin.h> /// /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> @@ -1022,17 +1022,20 @@ _mm_cmpunord_ps(__m128 __a, __m128 __b) /// \param __b /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are /// used in the comparison. -/// \returns An integer containing the comparison results. +/// \returns An integer containing the comparison results. If either of the +/// two lower 32-bit values is NaN, 0 is returned. static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_ss(__m128 __a, __m128 __b) { return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b); } -/// \brief Compares two 32-bit float values in the low-order bits of both +/// Compares two 32-bit float values in the low-order bits of both /// operands to determine if the first operand is less than the second /// operand and returns the result of the comparison. /// +/// If either of the two lower 32-bit values is NaN, 0 is returned. +/// /// \headerfile <x86intrin.h> /// /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> @@ -1044,17 +1047,20 @@ _mm_comieq_ss(__m128 __a, __m128 __b) /// \param __b /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are /// used in the comparison. -/// \returns An integer containing the comparison results. +/// \returns An integer containing the comparison results. If either of the two +/// lower 32-bit values is NaN, 0 is returned. static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_ss(__m128 __a, __m128 __b) { return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b); } -/// \brief Compares two 32-bit float values in the low-order bits of both +/// Compares two 32-bit float values in the low-order bits of both /// operands to determine if the first operand is less than or equal to the /// second operand and returns the result of the comparison. /// +/// If either of the two lower 32-bit values is NaN, 0 is returned. +/// /// \headerfile <x86intrin.h> /// /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions. @@ -1065,17 +1071,20 @@ _mm_comilt_ss(__m128 __a, __m128 __b) /// \param __b /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are /// used in the comparison. -/// \returns An integer containing the comparison results. +/// \returns An integer containing the comparison results. If either of the two +/// lower 32-bit values is NaN, 0 is returned. static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_ss(__m128 __a, __m128 __b) { return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b); } -/// \brief Compares two 32-bit float values in the low-order bits of both +/// Compares two 32-bit float values in the low-order bits of both /// operands to determine if the first operand is greater than the second /// operand and returns the result of the comparison. /// +/// If either of the two lower 32-bit values is NaN, 0 is returned. +/// /// \headerfile <x86intrin.h> /// /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions. @@ -1086,17 +1095,20 @@ _mm_comile_ss(__m128 __a, __m128 __b) /// \param __b /// A 128-bit vector of [4 x float]. 
The lower 32 bits of this operand are /// used in the comparison. -/// \returns An integer containing the comparison results. +/// \returns An integer containing the comparison results. If either of the +/// two lower 32-bit values is NaN, 0 is returned. static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_ss(__m128 __a, __m128 __b) { return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b); } -/// \brief Compares two 32-bit float values in the low-order bits of both +/// Compares two 32-bit float values in the low-order bits of both /// operands to determine if the first operand is greater than or equal to /// the second operand and returns the result of the comparison. /// +/// If either of the two lower 32-bit values is NaN, 0 is returned. +/// /// \headerfile <x86intrin.h> /// /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions. @@ -1107,17 +1119,20 @@ _mm_comigt_ss(__m128 __a, __m128 __b) /// \param __b /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are /// used in the comparison. -/// \returns An integer containing the comparison results. +/// \returns An integer containing the comparison results. If either of the two +/// lower 32-bit values is NaN, 0 is returned. static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_ss(__m128 __a, __m128 __b) { return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b); } -/// \brief Compares two 32-bit float values in the low-order bits of both +/// Compares two 32-bit float values in the low-order bits of both /// operands to determine if the first operand is not equal to the second /// operand and returns the result of the comparison. /// +/// If either of the two lower 32-bit values is NaN, 1 is returned. +/// /// \headerfile <x86intrin.h> /// /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions. @@ -1128,17 +1143,20 @@ _mm_comige_ss(__m128 __a, __m128 __b) /// \param __b /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are /// used in the comparison. -/// \returns An integer containing the comparison results. +/// \returns An integer containing the comparison results. If either of the +/// two lower 32-bit values is NaN, 1 is returned. static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_ss(__m128 __a, __m128 __b) { return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b); } -/// \brief Performs an unordered comparison of two 32-bit float values using +/// Performs an unordered comparison of two 32-bit float values using /// the low-order bits of both operands to determine equality and returns /// the result of the comparison. /// +/// If either of the two lower 32-bit values is NaN, 0 is returned. +/// /// \headerfile <x86intrin.h> /// /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions. @@ -1149,17 +1167,20 @@ _mm_comineq_ss(__m128 __a, __m128 __b) /// \param __b /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are /// used in the comparison. -/// \returns An integer containing the comparison results. +/// \returns An integer containing the comparison results. If either of the two +/// lower 32-bit values is NaN, 0 is returned. 
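A sketch of the NaN behaviour that the added comments spell out for the COMISS/UCOMISS predicates; the NaN operand is illustrative.

#include <xmmintrin.h>
#include <math.h>
#include <stdio.h>

int main(void)
{
    __m128 a = _mm_set_ss(NAN);
    __m128 b = _mm_set_ss(1.0f);
    /* Unordered inputs: the equality/ordering predicates return 0,
       and the "not equal" predicate returns 1, as documented above. */
    printf("%d %d\n", _mm_comieq_ss(a, b), _mm_comineq_ss(a, b));   /* 0 1 */
    return 0;
}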
static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_ss(__m128 __a, __m128 __b) { return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b); } -/// \brief Performs an unordered comparison of two 32-bit float values using +/// Performs an unordered comparison of two 32-bit float values using /// the low-order bits of both operands to determine if the first operand is /// less than the second operand and returns the result of the comparison. /// +/// If either of the two lower 32-bit values is NaN, 0 is returned. +/// /// \headerfile <x86intrin.h> /// /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions. @@ -1170,18 +1191,21 @@ _mm_ucomieq_ss(__m128 __a, __m128 __b) /// \param __b /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are /// used in the comparison. -/// \returns An integer containing the comparison results. +/// \returns An integer containing the comparison results. If either of the two +/// lower 32-bit values is NaN, 0 is returned. static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_ss(__m128 __a, __m128 __b) { return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b); } -/// \brief Performs an unordered comparison of two 32-bit float values using +/// Performs an unordered comparison of two 32-bit float values using /// the low-order bits of both operands to determine if the first operand is /// less than or equal to the second operand and returns the result of the /// comparison. /// +/// If either of the two lower 32-bit values is NaN, 0 is returned. +/// /// \headerfile <x86intrin.h> /// /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions. @@ -1192,18 +1216,21 @@ _mm_ucomilt_ss(__m128 __a, __m128 __b) /// \param __b /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are /// used in the comparison. -/// \returns An integer containing the comparison results. +/// \returns An integer containing the comparison results. If either of the two +/// lower 32-bit values is NaN, 0 is returned. static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_ss(__m128 __a, __m128 __b) { return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b); } -/// \brief Performs an unordered comparison of two 32-bit float values using +/// Performs an unordered comparison of two 32-bit float values using /// the low-order bits of both operands to determine if the first operand is /// greater than the second operand and returns the result of the /// comparison. /// +/// If either of the two lower 32-bit values is NaN, 0 is returned. +/// /// \headerfile <x86intrin.h> /// /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions. @@ -1214,18 +1241,21 @@ _mm_ucomile_ss(__m128 __a, __m128 __b) /// \param __b /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are /// used in the comparison. -/// \returns An integer containing the comparison results. +/// \returns An integer containing the comparison results. If either of the two +/// lower 32-bit values is NaN, 0 is returned. static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_ss(__m128 __a, __m128 __b) { return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b); } -/// \brief Performs an unordered comparison of two 32-bit float values using +/// Performs an unordered comparison of two 32-bit float values using /// the low-order bits of both operands to determine if the first operand is /// greater than or equal to the second operand and returns the result of /// the comparison. 
/// +/// If either of the two lower 32-bit values is NaN, 0 is returned. +/// /// \headerfile <x86intrin.h> /// /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions. @@ -1236,17 +1266,20 @@ _mm_ucomigt_ss(__m128 __a, __m128 __b) /// \param __b /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are /// used in the comparison. -/// \returns An integer containing the comparison results. +/// \returns An integer containing the comparison results. If either of the two +/// lower 32-bit values is NaN, 0 is returned. static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_ss(__m128 __a, __m128 __b) { return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b); } -/// \brief Performs an unordered comparison of two 32-bit float values using +/// Performs an unordered comparison of two 32-bit float values using /// the low-order bits of both operands to determine inequality and returns /// the result of the comparison. /// +/// If either of the two lower 32-bit values is NaN, 1 is returned. +/// /// \headerfile <x86intrin.h> /// /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions. @@ -1257,14 +1290,15 @@ _mm_ucomige_ss(__m128 __a, __m128 __b) /// \param __b /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are /// used in the comparison. -/// \returns An integer containing the comparison results. +/// \returns An integer containing the comparison results. If either of the two +/// lower 32-bit values is NaN, 1 is returned. static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_ss(__m128 __a, __m128 __b) { return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b); } -/// \brief Converts a float value contained in the lower 32 bits of a vector of +/// Converts a float value contained in the lower 32 bits of a vector of /// [4 x float] into a 32-bit integer. /// /// \headerfile <x86intrin.h> @@ -1282,7 +1316,7 @@ _mm_cvtss_si32(__m128 __a) return __builtin_ia32_cvtss2si((__v4sf)__a); } -/// \brief Converts a float value contained in the lower 32 bits of a vector of +/// Converts a float value contained in the lower 32 bits of a vector of /// [4 x float] into a 32-bit integer. /// /// \headerfile <x86intrin.h> @@ -1302,7 +1336,7 @@ _mm_cvt_ss2si(__m128 __a) #ifdef __x86_64__ -/// \brief Converts a float value contained in the lower 32 bits of a vector of +/// Converts a float value contained in the lower 32 bits of a vector of /// [4 x float] into a 64-bit integer. /// /// \headerfile <x86intrin.h> @@ -1322,7 +1356,7 @@ _mm_cvtss_si64(__m128 __a) #endif -/// \brief Converts two low-order float values in a 128-bit vector of +/// Converts two low-order float values in a 128-bit vector of /// [4 x float] into a 64-bit vector of [2 x i32]. /// /// \headerfile <x86intrin.h> @@ -1332,13 +1366,13 @@ _mm_cvtss_si64(__m128 __a) /// \param __a /// A 128-bit vector of [4 x float]. /// \returns A 64-bit integer vector containing the converted values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtps_pi32(__m128 __a) { return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a); } -/// \brief Converts two low-order float values in a 128-bit vector of +/// Converts two low-order float values in a 128-bit vector of /// [4 x float] into a 64-bit vector of [2 x i32]. /// /// \headerfile <x86intrin.h> @@ -1348,13 +1382,13 @@ _mm_cvtps_pi32(__m128 __a) /// \param __a /// A 128-bit vector of [4 x float]. /// \returns A 64-bit integer vector containing the converted values. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvt_ps2pi(__m128 __a) { return _mm_cvtps_pi32(__a); } -/// \brief Converts a float value contained in the lower 32 bits of a vector of +/// Converts a float value contained in the lower 32 bits of a vector of /// [4 x float] into a 32-bit integer, truncating the result when it is /// inexact. /// @@ -1373,7 +1407,7 @@ _mm_cvttss_si32(__m128 __a) return __builtin_ia32_cvttss2si((__v4sf)__a); } -/// \brief Converts a float value contained in the lower 32 bits of a vector of +/// Converts a float value contained in the lower 32 bits of a vector of /// [4 x float] into a 32-bit integer, truncating the result when it is /// inexact. /// @@ -1393,7 +1427,7 @@ _mm_cvtt_ss2si(__m128 __a) } #ifdef __x86_64__ -/// \brief Converts a float value contained in the lower 32 bits of a vector of +/// Converts a float value contained in the lower 32 bits of a vector of /// [4 x float] into a 64-bit integer, truncating the result when it is /// inexact. /// @@ -1413,7 +1447,7 @@ _mm_cvttss_si64(__m128 __a) } #endif -/// \brief Converts two low-order float values in a 128-bit vector of +/// Converts two low-order float values in a 128-bit vector of /// [4 x float] into a 64-bit vector of [2 x i32], truncating the result /// when it is inexact. /// @@ -1425,13 +1459,13 @@ _mm_cvttss_si64(__m128 __a) /// \param __a /// A 128-bit vector of [4 x float]. /// \returns A 64-bit integer vector containing the converted values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttps_pi32(__m128 __a) { return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a); } -/// \brief Converts two low-order float values in a 128-bit vector of [4 x +/// Converts two low-order float values in a 128-bit vector of [4 x /// float] into a 64-bit vector of [2 x i32], truncating the result when it /// is inexact. /// @@ -1442,13 +1476,13 @@ _mm_cvttps_pi32(__m128 __a) /// \param __a /// A 128-bit vector of [4 x float]. /// \returns A 64-bit integer vector containing the converted values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtt_ps2pi(__m128 __a) { return _mm_cvttps_pi32(__a); } -/// \brief Converts a 32-bit signed integer value into a floating point value +/// Converts a 32-bit signed integer value into a floating point value /// and writes it to the lower 32 bits of the destination. The remaining /// higher order elements of the destination vector are copied from the /// corresponding elements in the first operand. @@ -1471,7 +1505,7 @@ _mm_cvtsi32_ss(__m128 __a, int __b) return __a; } -/// \brief Converts a 32-bit signed integer value into a floating point value +/// Converts a 32-bit signed integer value into a floating point value /// and writes it to the lower 32 bits of the destination. The remaining /// higher order elements of the destination are copied from the /// corresponding elements in the first operand. @@ -1495,7 +1529,7 @@ _mm_cvt_si2ss(__m128 __a, int __b) #ifdef __x86_64__ -/// \brief Converts a 64-bit signed integer value into a floating point value +/// Converts a 64-bit signed integer value into a floating point value /// and writes it to the lower 32 bits of the destination. The remaining /// higher order elements of the destination are copied from the /// corresponding elements in the first operand. 
@@ -1520,7 +1554,7 @@ _mm_cvtsi64_ss(__m128 __a, long long __b) #endif -/// \brief Converts two elements of a 64-bit vector of [2 x i32] into two +/// Converts two elements of a 64-bit vector of [2 x i32] into two /// floating point values and writes them to the lower 64-bits of the /// destination. The remaining higher order elements of the destination are /// copied from the corresponding elements in the first operand. @@ -1537,13 +1571,13 @@ _mm_cvtsi64_ss(__m128 __a, long long __b) /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the /// converted value of the second operand. The upper 64 bits are copied from /// the upper 64 bits of the first operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_ps(__m128 __a, __m64 __b) { return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b); } -/// \brief Converts two elements of a 64-bit vector of [2 x i32] into two +/// Converts two elements of a 64-bit vector of [2 x i32] into two /// floating point values and writes them to the lower 64-bits of the /// destination. The remaining higher order elements of the destination are /// copied from the corresponding elements in the first operand. @@ -1560,18 +1594,18 @@ _mm_cvtpi32_ps(__m128 __a, __m64 __b) /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the /// converted value from the second operand. The upper 64 bits are copied /// from the upper 64 bits of the first operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX _mm_cvt_pi2ps(__m128 __a, __m64 __b) { return _mm_cvtpi32_ps(__a, __b); } -/// \brief Extracts a float value contained in the lower 32 bits of a vector of +/// Extracts a float value contained in the lower 32 bits of a vector of /// [4 x float]. /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction. +/// This intrinsic has no corresponding instruction. /// /// \param __a /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are @@ -1583,7 +1617,7 @@ _mm_cvtss_f32(__m128 __a) return __a[0]; } -/// \brief Loads two packed float values from the address \a __p into the +/// Loads two packed float values from the address \a __p into the /// high-order bits of a 128-bit vector of [4 x float]. The low-order bits /// are copied from the low-order bits of the first operand. /// @@ -1610,7 +1644,7 @@ _mm_loadh_pi(__m128 __a, const __m64 *__p) return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5); } -/// \brief Loads two packed float values from the address \a __p into the +/// Loads two packed float values from the address \a __p into the /// low-order bits of a 128-bit vector of [4 x float]. The high-order bits /// are copied from the high-order bits of the first operand. /// @@ -1637,7 +1671,7 @@ _mm_loadl_pi(__m128 __a, const __m64 *__p) return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3); } -/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower +/// Constructs a 128-bit floating-point vector of [4 x float]. The lower /// 32 bits of the vector are initialized with the single-precision /// floating-point value loaded from a specified memory location. The upper /// 96 bits are set to zero. 
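
Not part of the patch: a minimal usage sketch, in plain C, of the NaN behaviour that the updated \returns text above documents. The ordered and unordered single-precision comparisons report 0 when either lower element is NaN, except the "not equal" forms, which report 1. The program below is hypothetical and assumes an SSE-enabled x86 target.

#include <xmmintrin.h>
#include <stdio.h>

int main(void) {
  __m128 a = _mm_set_ss(1.0f);
  __m128 b = _mm_set_ss(__builtin_nanf(""));      /* lower element is a quiet NaN */

  printf("comigt:  %d\n", _mm_comigt_ss(a, b));   /* 0: NaN makes the comparison false */
  printf("ucomile: %d\n", _mm_ucomile_ss(a, b));  /* 0 */
  printf("comineq: %d\n", _mm_comineq_ss(a, b));  /* 1: "not equal" is true for NaN */
  return 0;
}
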
@@ -1659,15 +1693,15 @@ _mm_load_ss(const float *__p) float __u; } __attribute__((__packed__, __may_alias__)); float __u = ((struct __mm_load_ss_struct*)__p)->__u; - return (__m128){ __u, 0, 0, 0 }; + return __extension__ (__m128){ __u, 0, 0, 0 }; } -/// \brief Loads a 32-bit float value and duplicates it to all four vector +/// Loads a 32-bit float value and duplicates it to all four vector /// elements of a 128-bit vector of [4 x float]. /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the <c> VMOVSS / MOVSS + shuffling </c> +/// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c> /// instruction. /// /// \param __p @@ -1681,12 +1715,12 @@ _mm_load1_ps(const float *__p) float __u; } __attribute__((__packed__, __may_alias__)); float __u = ((struct __mm_load1_ps_struct*)__p)->__u; - return (__m128){ __u, __u, __u, __u }; + return __extension__ (__m128){ __u, __u, __u, __u }; } #define _mm_load_ps1(p) _mm_load1_ps(p) -/// \brief Loads a 128-bit floating-point vector of [4 x float] from an aligned +/// Loads a 128-bit floating-point vector of [4 x float] from an aligned /// memory location. /// /// \headerfile <x86intrin.h> @@ -1696,14 +1730,14 @@ _mm_load1_ps(const float *__p) /// \param __p /// A pointer to a 128-bit memory location. The address of the memory /// location has to be 128-bit aligned. -/// \returns A 128-bit vector of [4 x float] containing the loaded valus. +/// \returns A 128-bit vector of [4 x float] containing the loaded values. static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_load_ps(const float *__p) { return *(__m128*)__p; } -/// \brief Loads a 128-bit floating-point vector of [4 x float] from an +/// Loads a 128-bit floating-point vector of [4 x float] from an /// unaligned memory location. /// /// \headerfile <x86intrin.h> @@ -1723,7 +1757,7 @@ _mm_loadu_ps(const float *__p) return ((struct __loadu_ps*)__p)->__v; } -/// \brief Loads four packed float values, in reverse order, from an aligned +/// Loads four packed float values, in reverse order, from an aligned /// memory location to 32-bit elements in a 128-bit vector of [4 x float]. /// /// \headerfile <x86intrin.h> @@ -1743,7 +1777,7 @@ _mm_loadr_ps(const float *__p) return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0); } -/// \brief Create a 128-bit vector of [4 x float] with undefined values. +/// Create a 128-bit vector of [4 x float] with undefined values. /// /// \headerfile <x86intrin.h> /// @@ -1756,7 +1790,7 @@ _mm_undefined_ps(void) return (__m128)__builtin_ia32_undef128(); } -/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower +/// Constructs a 128-bit floating-point vector of [4 x float]. The lower /// 32 bits of the vector are initialized with the specified single-precision /// floating-point value. The upper 96 bits are set to zero. /// @@ -1773,10 +1807,10 @@ _mm_undefined_ps(void) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_set_ss(float __w) { - return (__m128){ __w, 0, 0, 0 }; + return __extension__ (__m128){ __w, 0, 0, 0 }; } -/// \brief Constructs a 128-bit floating-point vector of [4 x float], with each +/// Constructs a 128-bit floating-point vector of [4 x float], with each /// of the four single-precision floating-point vector elements set to the /// specified single-precision floating-point value. 
/// @@ -1791,11 +1825,11 @@ _mm_set_ss(float __w) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_set1_ps(float __w) { - return (__m128){ __w, __w, __w, __w }; + return __extension__ (__m128){ __w, __w, __w, __w }; } /* Microsoft specific. */ -/// \brief Constructs a 128-bit floating-point vector of [4 x float], with each +/// Constructs a 128-bit floating-point vector of [4 x float], with each /// of the four single-precision floating-point vector elements set to the /// specified single-precision floating-point value. /// @@ -1813,7 +1847,7 @@ _mm_set_ps1(float __w) return _mm_set1_ps(__w); } -/// \brief Constructs a 128-bit floating-point vector of [4 x float] +/// Constructs a 128-bit floating-point vector of [4 x float] /// initialized with the specified single-precision floating-point values. /// /// \headerfile <x86intrin.h> @@ -1837,10 +1871,10 @@ _mm_set_ps1(float __w) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_set_ps(float __z, float __y, float __x, float __w) { - return (__m128){ __w, __x, __y, __z }; + return __extension__ (__m128){ __w, __x, __y, __z }; } -/// \brief Constructs a 128-bit floating-point vector of [4 x float], +/// Constructs a 128-bit floating-point vector of [4 x float], /// initialized in reverse order with the specified 32-bit single-precision /// float-point values. /// @@ -1865,10 +1899,10 @@ _mm_set_ps(float __z, float __y, float __x, float __w) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_setr_ps(float __z, float __y, float __x, float __w) { - return (__m128){ __z, __y, __x, __w }; + return __extension__ (__m128){ __z, __y, __x, __w }; } -/// \brief Constructs a 128-bit floating-point vector of [4 x float] initialized +/// Constructs a 128-bit floating-point vector of [4 x float] initialized /// to zero. /// /// \headerfile <x86intrin.h> @@ -1880,15 +1914,15 @@ _mm_setr_ps(float __z, float __y, float __x, float __w) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_setzero_ps(void) { - return (__m128){ 0, 0, 0, 0 }; + return __extension__ (__m128){ 0, 0, 0, 0 }; } -/// \brief Stores the upper 64 bits of a 128-bit vector of [4 x float] to a +/// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a /// memory location. /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the <c> VPEXTRQ / MOVQ </c> instruction. +/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction. /// /// \param __p /// A pointer to a 64-bit memory location. @@ -1900,7 +1934,7 @@ _mm_storeh_pi(__m64 *__p, __m128 __a) __builtin_ia32_storehps((__v2si *)__p, (__v4sf)__a); } -/// \brief Stores the lower 64 bits of a 128-bit vector of [4 x float] to a +/// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a /// memory location. /// /// \headerfile <x86intrin.h> @@ -1917,7 +1951,7 @@ _mm_storel_pi(__m64 *__p, __m128 __a) __builtin_ia32_storelps((__v2si *)__p, (__v4sf)__a); } -/// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] to a +/// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a /// memory location. /// /// \headerfile <x86intrin.h> @@ -1937,7 +1971,7 @@ _mm_store_ss(float *__p, __m128 __a) ((struct __mm_store_ss_struct*)__p)->__u = __a[0]; } -/// \brief Stores a 128-bit vector of [4 x float] to an unaligned memory +/// Stores a 128-bit vector of [4 x float] to an unaligned memory /// location. 
/// /// \headerfile <x86intrin.h> @@ -1958,7 +1992,7 @@ _mm_storeu_ps(float *__p, __m128 __a) ((struct __storeu_ps*)__p)->__v = __a; } -/// \brief Stores a 128-bit vector of [4 x float] into an aligned memory +/// Stores a 128-bit vector of [4 x float] into an aligned memory /// location. /// /// \headerfile <x86intrin.h> @@ -1976,7 +2010,7 @@ _mm_store_ps(float *__p, __m128 __a) *(__m128*)__p = __a; } -/// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] into +/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into /// four contiguous elements in an aligned memory location. /// /// \headerfile <x86intrin.h> @@ -1996,7 +2030,7 @@ _mm_store1_ps(float *__p, __m128 __a) _mm_store_ps(__p, __a); } -/// \brief Stores the lower 32 bits of a 128-bit vector of [4 x float] into +/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into /// four contiguous elements in an aligned memory location. /// /// \headerfile <x86intrin.h> @@ -2012,10 +2046,10 @@ _mm_store1_ps(float *__p, __m128 __a) static __inline__ void __DEFAULT_FN_ATTRS _mm_store_ps1(float *__p, __m128 __a) { - return _mm_store1_ps(__p, __a); + _mm_store1_ps(__p, __a); } -/// \brief Stores float values from a 128-bit vector of [4 x float] to an +/// Stores float values from a 128-bit vector of [4 x float] to an /// aligned memory location in reverse order. /// /// \headerfile <x86intrin.h> @@ -2046,7 +2080,7 @@ _mm_storer_ps(float *__p, __m128 __a) /* FIXME: We have to #define this because "sel" must be a constant integer, and Sema doesn't do any form of constant propagation yet. */ -/// \brief Loads one cache line of data from the specified address to a location +/// Loads one cache line of data from the specified address to a location /// closer to the processor. /// /// \headerfile <x86intrin.h> @@ -2074,7 +2108,7 @@ _mm_storer_ps(float *__p, __m128 __a) ((sel) >> 2) & 1, (sel) & 0x3)) #endif -/// \brief Stores a 64-bit integer in the specified aligned memory location. To +/// Stores a 64-bit integer in the specified aligned memory location. To /// minimize caching, the data is flagged as non-temporal (unlikely to be /// used again soon). /// @@ -2086,13 +2120,13 @@ _mm_storer_ps(float *__p, __m128 __a) /// A pointer to an aligned memory location used to store the register value. /// \param __a /// A 64-bit integer containing the value to be stored. -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS_MMX _mm_stream_pi(__m64 *__p, __m64 __a) { __builtin_ia32_movntq(__p, __a); } -/// \brief Moves packed float values from a 128-bit vector of [4 x float] to a +/// Moves packed float values from a 128-bit vector of [4 x float] to a /// 128-bit aligned memory location. To minimize caching, the data is flagged /// as non-temporal (unlikely to be used again soon). /// @@ -2115,7 +2149,7 @@ _mm_stream_ps(float *__p, __m128 __a) extern "C" { #endif -/// \brief Forces strong memory ordering (serialization) between store +/// Forces strong memory ordering (serialization) between store /// instructions preceding this instruction and store instructions following /// this instruction, ensuring the system completes all previous stores /// before executing subsequent stores. @@ -2130,7 +2164,7 @@ void _mm_sfence(void); } // extern "C" #endif -/// \brief Extracts 16-bit element from a 64-bit vector of [4 x i16] and +/// Extracts 16-bit element from a 64-bit vector of [4 x i16] and /// returns it, as specified by the immediate integer operand. 
/// /// \headerfile <x86intrin.h> @@ -2150,10 +2184,10 @@ void _mm_sfence(void); /// 2: Bits [47:32] are copied to the destination. \n /// 3: Bits [63:48] are copied to the destination. /// \returns A 16-bit integer containing the extracted 16 bits of packed data. -#define _mm_extract_pi16(a, n) __extension__ ({ \ - (int)__builtin_ia32_vec_ext_v4hi((__m64)a, (int)n); }) +#define _mm_extract_pi16(a, n) \ + (int)__builtin_ia32_vec_ext_v4hi((__m64)a, (int)n) -/// \brief Copies data from the 64-bit vector of [4 x i16] to the destination, +/// Copies data from the 64-bit vector of [4 x i16] to the destination, /// and inserts the lower 16-bits of an integer operand at the 16-bit offset /// specified by the immediate operand \a n. /// @@ -2163,7 +2197,7 @@ void _mm_sfence(void); /// __m64 _mm_insert_pi16(__m64 a, int d, int n); /// \endcode /// -/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction. +/// This intrinsic corresponds to the <c> PINSRW </c> instruction. /// /// \param a /// A 64-bit vector of [4 x i16]. @@ -2181,10 +2215,10 @@ void _mm_sfence(void); /// bits in operand \a a. /// \returns A 64-bit integer vector containing the copied packed data from the /// operands. -#define _mm_insert_pi16(a, d, n) __extension__ ({ \ - (__m64)__builtin_ia32_vec_set_v4hi((__m64)a, (int)d, (int)n); }) +#define _mm_insert_pi16(a, d, n) \ + (__m64)__builtin_ia32_vec_set_v4hi((__m64)a, (int)d, (int)n) -/// \brief Compares each of the corresponding packed 16-bit integer values of +/// Compares each of the corresponding packed 16-bit integer values of /// the 64-bit integer vectors, and writes the greater value to the /// corresponding bits in the destination. /// @@ -2197,13 +2231,13 @@ void _mm_sfence(void); /// \param __b /// A 64-bit integer vector containing one of the source operands. /// \returns A 64-bit integer vector containing the comparison results. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_max_pi16(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b); } -/// \brief Compares each of the corresponding packed 8-bit unsigned integer +/// Compares each of the corresponding packed 8-bit unsigned integer /// values of the 64-bit integer vectors, and writes the greater value to the /// corresponding bits in the destination. /// @@ -2216,13 +2250,13 @@ _mm_max_pi16(__m64 __a, __m64 __b) /// \param __b /// A 64-bit integer vector containing one of the source operands. /// \returns A 64-bit integer vector containing the comparison results. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_max_pu8(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b); } -/// \brief Compares each of the corresponding packed 16-bit integer values of +/// Compares each of the corresponding packed 16-bit integer values of /// the 64-bit integer vectors, and writes the lesser value to the /// corresponding bits in the destination. /// @@ -2235,13 +2269,13 @@ _mm_max_pu8(__m64 __a, __m64 __b) /// \param __b /// A 64-bit integer vector containing one of the source operands. /// \returns A 64-bit integer vector containing the comparison results. 
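
A usage sketch (not part of the patch) for the _mm_extract_pi16/_mm_insert_pi16 macros shown above, which now expand to plain expressions rather than GNU statement expressions. The helper name is hypothetical; it assumes an MMX/SSE-enabled target, and the lane indices must be integer constants because they become instruction immediates.

#include <xmmintrin.h>

/* Swap the outermost 16-bit lanes of a [4 x i16] vector. */
static __m64 swap_outer_words(__m64 v) {
  int lo = _mm_extract_pi16(v, 0);
  int hi = _mm_extract_pi16(v, 3);
  v = _mm_insert_pi16(v, hi, 0);
  v = _mm_insert_pi16(v, lo, 3);
  return v;
}
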
-static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_min_pi16(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b); } -/// \brief Compares each of the corresponding packed 8-bit unsigned integer +/// Compares each of the corresponding packed 8-bit unsigned integer /// values of the 64-bit integer vectors, and writes the lesser value to the /// corresponding bits in the destination. /// @@ -2254,14 +2288,14 @@ _mm_min_pi16(__m64 __a, __m64 __b) /// \param __b /// A 64-bit integer vector containing one of the source operands. /// \returns A 64-bit integer vector containing the comparison results. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_min_pu8(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b); } -/// \brief Takes the most significant bit from each 8-bit element in a 64-bit -/// integer vector to create a 16-bit mask value. Zero-extends the value to +/// Takes the most significant bit from each 8-bit element in a 64-bit +/// integer vector to create an 8-bit mask value. Zero-extends the value to /// 32-bit integer and writes it to the destination. /// /// \headerfile <x86intrin.h> @@ -2270,15 +2304,15 @@ _mm_min_pu8(__m64 __a, __m64 __b) /// /// \param __a /// A 64-bit integer vector containing the values with bits to be extracted. -/// \returns The most significant bit from each 8-bit element in the operand, -/// written to bits [15:0]. -static __inline__ int __DEFAULT_FN_ATTRS +/// \returns The most significant bit from each 8-bit element in \a __a, +/// written to bits [7:0]. +static __inline__ int __DEFAULT_FN_ATTRS_MMX _mm_movemask_pi8(__m64 __a) { return __builtin_ia32_pmovmskb((__v8qi)__a); } -/// \brief Multiplies packed 16-bit unsigned integer values and writes the +/// Multiplies packed 16-bit unsigned integer values and writes the /// high-order 16 bits of each 32-bit product to the corresponding bits in /// the destination. /// @@ -2291,13 +2325,13 @@ _mm_movemask_pi8(__m64 __a) /// \param __b /// A 64-bit integer vector containing one of the source operands. /// \returns A 64-bit integer vector containing the products of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mulhi_pu16(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b); } -/// \brief Shuffles the 4 16-bit integers from a 64-bit integer vector to the +/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the /// destination, as specified by the immediate value operand. /// /// \headerfile <x86intrin.h> @@ -2328,10 +2362,10 @@ _mm_mulhi_pu16(__m64 __a, __m64 __b) /// 10: assigned from bits [47:32] of \a a. \n /// 11: assigned from bits [63:48] of \a a. /// \returns A 64-bit integer vector containing the shuffled values. -#define _mm_shuffle_pi16(a, n) __extension__ ({ \ - (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)); }) +#define _mm_shuffle_pi16(a, n) \ + (__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)) -/// \brief Conditionally copies the values from each 8-bit element in the first +/// Conditionally copies the values from each 8-bit element in the first /// 64-bit integer vector operand to the specified memory location, as /// specified by the most significant bit in the corresponding element in the /// second 64-bit integer vector operand. 
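
Another hedged sketch, not from the patch, combining the MMX helpers documented above (now carrying the separate __DEFAULT_FN_ATTRS_MMX attribute set). The function is hypothetical and assumes an MMX/SSE target; callers are expected to issue _mm_empty() before mixing in x87 code.

#include <xmmintrin.h>

/* Clamp unsigned bytes to [lo, hi], reverse the four 16-bit lanes, and
   return the 8-bit sign mask of the result (bits [7:0], as the corrected
   _mm_movemask_pi8 documentation states). */
static int clamp_reverse_mask(__m64 v, __m64 lo, __m64 hi) {
  __m64 c = _mm_min_pu8(_mm_max_pu8(v, lo), hi);     /* per-byte clamp */
  c = _mm_shuffle_pi16(c, _MM_SHUFFLE(0, 1, 2, 3));  /* reverse the 16-bit lanes */
  return _mm_movemask_pi8(c);
}
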
@@ -2354,13 +2388,13 @@ _mm_mulhi_pu16(__m64 __a, __m64 __b) /// A pointer to a 64-bit memory location that will receive the conditionally /// copied integer values. The address of the memory location does not have /// to be aligned. -static __inline__ void __DEFAULT_FN_ATTRS +static __inline__ void __DEFAULT_FN_ATTRS_MMX _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p) { __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p); } -/// \brief Computes the rounded averages of the packed unsigned 8-bit integer +/// Computes the rounded averages of the packed unsigned 8-bit integer /// values and writes the averages to the corresponding bits in the /// destination. /// @@ -2373,13 +2407,13 @@ _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p) /// \param __b /// A 64-bit integer vector containing one of the source operands. /// \returns A 64-bit integer vector containing the averages of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_avg_pu8(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b); } -/// \brief Computes the rounded averages of the packed unsigned 16-bit integer +/// Computes the rounded averages of the packed unsigned 16-bit integer /// values and writes the averages to the corresponding bits in the /// destination. /// @@ -2392,13 +2426,13 @@ _mm_avg_pu8(__m64 __a, __m64 __b) /// \param __b /// A 64-bit integer vector containing one of the source operands. /// \returns A 64-bit integer vector containing the averages of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_avg_pu16(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b); } -/// \brief Subtracts the corresponding 8-bit unsigned integer values of the two +/// Subtracts the corresponding 8-bit unsigned integer values of the two /// 64-bit vector operands and computes the absolute value for each of the /// difference. Then sum of the 8 absolute differences is written to the /// bits [15:0] of the destination; the remaining bits [63:16] are cleared. @@ -2414,7 +2448,7 @@ _mm_avg_pu16(__m64 __a, __m64 __b) /// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the /// sets of absolute differences between both operands. The upper bits are /// cleared. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sad_pu8(__m64 __a, __m64 __b) { return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b); @@ -2424,7 +2458,7 @@ _mm_sad_pu8(__m64 __a, __m64 __b) extern "C" { #endif -/// \brief Returns the contents of the MXCSR register as a 32-bit unsigned +/// Returns the contents of the MXCSR register as a 32-bit unsigned /// integer value. /// /// There are several groups of macros associated with this @@ -2444,7 +2478,7 @@ extern "C" { /// <li> /// For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, /// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper -/// _MM_GET_ROUNDING_MODE(x) where x is one of these macros. +/// _MM_GET_ROUNDING_MODE(). /// </li> /// <li> /// For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF. 
@@ -2457,12 +2491,16 @@ extern "C" { /// </li> /// </ul> /// -/// For example, the expression below checks if an overflow exception has +/// For example, the following expression checks if an overflow exception has /// occurred: +/// \code /// ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW ) +/// \endcode /// -/// The following example gets the current rounding mode: +/// The following expression gets the current rounding mode: +/// \code /// _MM_GET_ROUNDING_MODE() +/// \endcode /// /// \headerfile <x86intrin.h> /// @@ -2472,7 +2510,7 @@ extern "C" { /// register. unsigned int _mm_getcsr(void); -/// \brief Sets the MXCSR register with the 32-bit unsigned integer value. +/// Sets the MXCSR register with the 32-bit unsigned integer value. /// /// There are several groups of macros associated with this intrinsic, /// including: @@ -2511,10 +2549,12 @@ unsigned int _mm_getcsr(void); /// _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP) /// /// The following example sets the DAZ and FTZ flags: -/// void setFlags() { -/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON) -/// _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON) -/// } +/// \code +/// void setFlags() { +/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); +/// _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); +/// } +/// \endcode /// /// \headerfile <x86intrin.h> /// @@ -2528,7 +2568,7 @@ void _mm_setcsr(unsigned int __i); } // extern "C" #endif -/// \brief Selects 4 float values from the 128-bit operands of [4 x float], as +/// Selects 4 float values from the 128-bit operands of [4 x float], as /// specified by the immediate value operand. /// /// \headerfile <x86intrin.h> @@ -2564,14 +2604,11 @@ void _mm_setcsr(unsigned int __i); /// 10: Bits [95:64] copied from the specified operand. \n /// 11: Bits [127:96] copied from the specified operand. /// \returns A 128-bit vector of [4 x float] containing the shuffled values. -#define _mm_shuffle_ps(a, b, mask) __extension__ ({ \ - (__m128)__builtin_shufflevector((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \ - 0 + (((mask) >> 0) & 0x3), \ - 0 + (((mask) >> 2) & 0x3), \ - 4 + (((mask) >> 4) & 0x3), \ - 4 + (((mask) >> 6) & 0x3)); }) - -/// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors of +#define _mm_shuffle_ps(a, b, mask) \ + (__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \ + (int)(mask)) + +/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of /// [4 x float] and interleaves them into a 128-bit vector of [4 x float]. /// /// \headerfile <x86intrin.h> @@ -2593,7 +2630,7 @@ _mm_unpackhi_ps(__m128 __a, __m128 __b) return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7); } -/// \brief Unpacks the low-order (index 0,1) values from two 128-bit vectors of +/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of /// [4 x float] and interleaves them into a 128-bit vector of [4 x float]. /// /// \headerfile <x86intrin.h> @@ -2615,13 +2652,14 @@ _mm_unpacklo_ps(__m128 __a, __m128 __b) return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5); } -/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower +/// Constructs a 128-bit floating-point vector of [4 x float]. The lower /// 32 bits are set to the lower 32 bits of the second parameter. The upper /// 96 bits are set to the upper 96 bits of the first parameter. /// /// \headerfile <x86intrin.h> /// -/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction. 
+/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c> +/// instruction. /// /// \param __a /// A 128-bit floating-point vector of [4 x float]. The upper 96 bits are @@ -2633,10 +2671,11 @@ _mm_unpacklo_ps(__m128 __a, __m128 __b) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_move_ss(__m128 __a, __m128 __b) { - return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 4, 1, 2, 3); + __a[0] = __b[0]; + return __a; } -/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower +/// Constructs a 128-bit floating-point vector of [4 x float]. The lower /// 64 bits are set to the upper 64 bits of the second parameter. The upper /// 64 bits are set to the upper 64 bits of the first parameter. /// @@ -2657,7 +2696,7 @@ _mm_movehl_ps(__m128 __a, __m128 __b) return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3); } -/// \brief Constructs a 128-bit floating-point vector of [4 x float]. The lower +/// Constructs a 128-bit floating-point vector of [4 x float]. The lower /// 64 bits are set to the lower 64 bits of the first parameter. The upper /// 64 bits are set to the lower 64 bits of the second parameter. /// @@ -2678,7 +2717,7 @@ _mm_movelh_ps(__m128 __a, __m128 __b) return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5); } -/// \brief Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x +/// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x /// float]. /// /// \headerfile <x86intrin.h> @@ -2690,7 +2729,7 @@ _mm_movelh_ps(__m128 __a, __m128 __b) /// from the corresponding elements in this operand. /// \returns A 128-bit vector of [4 x float] containing the copied and converted /// values from the operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX _mm_cvtpi16_ps(__m64 __a) { __m64 __b, __c; @@ -2708,7 +2747,7 @@ _mm_cvtpi16_ps(__m64 __a) return __r; } -/// \brief Converts a 64-bit vector of 16-bit unsigned integer values into a +/// Converts a 64-bit vector of 16-bit unsigned integer values into a /// 128-bit vector of [4 x float]. /// /// \headerfile <x86intrin.h> @@ -2720,7 +2759,7 @@ _mm_cvtpi16_ps(__m64 __a) /// destination are copied from the corresponding elements in this operand. /// \returns A 128-bit vector of [4 x float] containing the copied and converted /// values from the operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX _mm_cvtpu16_ps(__m64 __a) { __m64 __b, __c; @@ -2737,7 +2776,7 @@ _mm_cvtpu16_ps(__m64 __a) return __r; } -/// \brief Converts the lower four 8-bit values from a 64-bit vector of [8 x i8] +/// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8] /// into a 128-bit vector of [4 x float]. /// /// \headerfile <x86intrin.h> @@ -2749,7 +2788,7 @@ _mm_cvtpu16_ps(__m64 __a) /// from the corresponding lower 4 elements in this operand. /// \returns A 128-bit vector of [4 x float] containing the copied and converted /// values from the operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX _mm_cvtpi8_ps(__m64 __a) { __m64 __b; @@ -2761,7 +2800,7 @@ _mm_cvtpi8_ps(__m64 __a) return _mm_cvtpi16_ps(__b); } -/// \brief Converts the lower four unsigned 8-bit integer values from a 64-bit +/// Converts the lower four unsigned 8-bit integer values from a 64-bit /// vector of [8 x u8] into a 128-bit vector of [4 x float]. /// /// \headerfile <x86intrin.h> @@ -2774,7 +2813,7 @@ _mm_cvtpi8_ps(__m64 __a) /// operand. 
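
A short demonstration (not part of the patch) of the _mm_move_ss semantics described above: the result takes its low element from the second operand and its upper 96 bits from the first, which is exactly what the new element-wise implementation computes. Values and names are illustrative only; assumes an SSE-enabled target.

#include <xmmintrin.h>
#include <stdio.h>

int main(void) {
  __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  /* elements 0..3 = 1, 2, 3, 4 */
  __m128 b = _mm_set_ps(8.0f, 7.0f, 6.0f, 5.0f);  /* elements 0..3 = 5, 6, 7, 8 */
  float r[4];
  _mm_storeu_ps(r, _mm_move_ss(a, b));            /* expected: 5, 2, 3, 4 */
  printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]);
  return 0;
}
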
/// \returns A 128-bit vector of [4 x float] containing the copied and converted /// values from the source operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX _mm_cvtpu8_ps(__m64 __a) { __m64 __b; @@ -2785,7 +2824,7 @@ _mm_cvtpu8_ps(__m64 __a) return _mm_cvtpi16_ps(__b); } -/// \brief Converts the two 32-bit signed integer values from each 64-bit vector +/// Converts the two 32-bit signed integer values from each 64-bit vector /// operand of [2 x i32] into a 128-bit vector of [4 x float]. /// /// \headerfile <x86intrin.h> @@ -2801,7 +2840,7 @@ _mm_cvtpu8_ps(__m64 __a) /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the /// copied and converted values from the first operand. The upper 64 bits /// contain the copied and converted values from the second operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS +static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32x2_ps(__m64 __a, __m64 __b) { __m128 __c; @@ -2813,7 +2852,7 @@ _mm_cvtpi32x2_ps(__m64 __a, __m64 __b) return _mm_cvtpi32_ps(__c, __a); } -/// \brief Converts each single-precision floating-point element of a 128-bit +/// Converts each single-precision floating-point element of a 128-bit /// floating-point vector of [4 x float] into a 16-bit signed integer, and /// packs the results into a 64-bit integer vector of [4 x i16]. /// @@ -2830,7 +2869,7 @@ _mm_cvtpi32x2_ps(__m64 __a, __m64 __b) /// A 128-bit floating-point vector of [4 x float]. /// \returns A 64-bit integer vector of [4 x i16] containing the converted /// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtps_pi16(__m128 __a) { __m64 __b, __c; @@ -2842,7 +2881,7 @@ _mm_cvtps_pi16(__m128 __a) return _mm_packs_pi32(__b, __c); } -/// \brief Converts each single-precision floating-point element of a 128-bit +/// Converts each single-precision floating-point element of a 128-bit /// floating-point vector of [4 x float] into an 8-bit signed integer, and /// packs the results into the lower 32 bits of a 64-bit integer vector of /// [8 x i8]. The upper 32 bits of the vector are set to 0. @@ -2860,7 +2899,7 @@ _mm_cvtps_pi16(__m128 __a) /// 128-bit floating-point vector of [4 x float]. /// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the /// converted values and the uppper 32 bits are set to zero. -static __inline__ __m64 __DEFAULT_FN_ATTRS +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtps_pi8(__m128 __a) { __m64 __b, __c; @@ -2871,7 +2910,7 @@ _mm_cvtps_pi8(__m128 __a) return _mm_packs_pi16(__b, __c); } -/// \brief Extracts the sign bits from each single-precision floating-point +/// Extracts the sign bits from each single-precision floating-point /// element of a 128-bit floating-point vector of [4 x float] and returns the /// sign bits in bits [0:3] of the result. Bits [31:4] of the result are set /// to zero. @@ -2963,6 +3002,7 @@ do { \ #define _m_ _mm_ #undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS_MMX /* Ugly hack for backwards-compatibility (compatible with gcc) */ #if defined(__SSE2__) && !__building_module(_Builtin_intrinsics) diff --git a/c_headers/xopintrin.h b/c_headers/xopintrin.h index 4a34f770d5..9d540a2abd 100644 --- a/c_headers/xopintrin.h +++ b/c_headers/xopintrin.h @@ -31,7 +31,8 @@ #include <fma4intrin.h> /* Define the default attributes for the functions in this file. 
*/ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xop"))) +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xop"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("xop"), __min_vector_width__(256))) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_maccs_epi16(__m128i __A, __m128i __B, __m128i __C) @@ -201,7 +202,7 @@ _mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C) return (__m128i)(((__v2du)__A & (__v2du)__C) | ((__v2du)__B & ~(__v2du)__C)); } -static __inline__ __m256i __DEFAULT_FN_ATTRS +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_cmov_si256(__m256i __A, __m256i __B, __m256i __C) { return (__m256i)(((__v4du)__A & (__v4du)__C) | ((__v4du)__B & ~(__v4du)__C)); @@ -237,17 +238,17 @@ _mm_rot_epi64(__m128i __A, __m128i __B) return (__m128i)__builtin_ia32_vprotq((__v2di)__A, (__v2di)__B); } -#define _mm_roti_epi8(A, N) __extension__ ({ \ - (__m128i)__builtin_ia32_vprotbi((__v16qi)(__m128i)(A), (N)); }) +#define _mm_roti_epi8(A, N) \ + (__m128i)__builtin_ia32_vprotbi((__v16qi)(__m128i)(A), (N)) -#define _mm_roti_epi16(A, N) __extension__ ({ \ - (__m128i)__builtin_ia32_vprotwi((__v8hi)(__m128i)(A), (N)); }) +#define _mm_roti_epi16(A, N) \ + (__m128i)__builtin_ia32_vprotwi((__v8hi)(__m128i)(A), (N)) -#define _mm_roti_epi32(A, N) __extension__ ({ \ - (__m128i)__builtin_ia32_vprotdi((__v4si)(__m128i)(A), (N)); }) +#define _mm_roti_epi32(A, N) \ + (__m128i)__builtin_ia32_vprotdi((__v4si)(__m128i)(A), (N)) -#define _mm_roti_epi64(A, N) __extension__ ({ \ - (__m128i)__builtin_ia32_vprotqi((__v2di)(__m128i)(A), (N)); }) +#define _mm_roti_epi64(A, N) \ + (__m128i)__builtin_ia32_vprotqi((__v2di)(__m128i)(A), (N)) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_shl_epi8(__m128i __A, __m128i __B) @@ -297,37 +298,37 @@ _mm_sha_epi64(__m128i __A, __m128i __B) return (__m128i)__builtin_ia32_vpshaq((__v2di)__A, (__v2di)__B); } -#define _mm_com_epu8(A, B, N) __extension__ ({ \ +#define _mm_com_epu8(A, B, N) \ (__m128i)__builtin_ia32_vpcomub((__v16qi)(__m128i)(A), \ - (__v16qi)(__m128i)(B), (N)); }) + (__v16qi)(__m128i)(B), (N)) -#define _mm_com_epu16(A, B, N) __extension__ ({ \ +#define _mm_com_epu16(A, B, N) \ (__m128i)__builtin_ia32_vpcomuw((__v8hi)(__m128i)(A), \ - (__v8hi)(__m128i)(B), (N)); }) + (__v8hi)(__m128i)(B), (N)) -#define _mm_com_epu32(A, B, N) __extension__ ({ \ +#define _mm_com_epu32(A, B, N) \ (__m128i)__builtin_ia32_vpcomud((__v4si)(__m128i)(A), \ - (__v4si)(__m128i)(B), (N)); }) + (__v4si)(__m128i)(B), (N)) -#define _mm_com_epu64(A, B, N) __extension__ ({ \ +#define _mm_com_epu64(A, B, N) \ (__m128i)__builtin_ia32_vpcomuq((__v2di)(__m128i)(A), \ - (__v2di)(__m128i)(B), (N)); }) + (__v2di)(__m128i)(B), (N)) -#define _mm_com_epi8(A, B, N) __extension__ ({ \ +#define _mm_com_epi8(A, B, N) \ (__m128i)__builtin_ia32_vpcomb((__v16qi)(__m128i)(A), \ - (__v16qi)(__m128i)(B), (N)); }) + (__v16qi)(__m128i)(B), (N)) -#define _mm_com_epi16(A, B, N) __extension__ ({ \ +#define _mm_com_epi16(A, B, N) \ (__m128i)__builtin_ia32_vpcomw((__v8hi)(__m128i)(A), \ - (__v8hi)(__m128i)(B), (N)); }) + (__v8hi)(__m128i)(B), (N)) -#define _mm_com_epi32(A, B, N) __extension__ ({ \ +#define _mm_com_epi32(A, B, N) \ (__m128i)__builtin_ia32_vpcomd((__v4si)(__m128i)(A), \ - (__v4si)(__m128i)(B), (N)); }) + (__v4si)(__m128i)(B), (N)) -#define _mm_com_epi64(A, B, N) __extension__ ({ \ +#define _mm_com_epi64(A, B, N) \ 
(__m128i)__builtin_ia32_vpcomq((__v2di)(__m128i)(A), \ - (__v2di)(__m128i)(B), (N)); }) + (__v2di)(__m128i)(B), (N)) #define _MM_PCOMCTRL_LT 0 #define _MM_PCOMCTRL_LE 1 @@ -722,24 +723,24 @@ _mm_comtrue_epi64(__m128i __A, __m128i __B) return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_TRUE); } -#define _mm_permute2_pd(X, Y, C, I) __extension__ ({ \ +#define _mm_permute2_pd(X, Y, C, I) \ (__m128d)__builtin_ia32_vpermil2pd((__v2df)(__m128d)(X), \ (__v2df)(__m128d)(Y), \ - (__v2di)(__m128i)(C), (I)); }) + (__v2di)(__m128i)(C), (I)) -#define _mm256_permute2_pd(X, Y, C, I) __extension__ ({ \ +#define _mm256_permute2_pd(X, Y, C, I) \ (__m256d)__builtin_ia32_vpermil2pd256((__v4df)(__m256d)(X), \ (__v4df)(__m256d)(Y), \ - (__v4di)(__m256i)(C), (I)); }) + (__v4di)(__m256i)(C), (I)) -#define _mm_permute2_ps(X, Y, C, I) __extension__ ({ \ +#define _mm_permute2_ps(X, Y, C, I) \ (__m128)__builtin_ia32_vpermil2ps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \ - (__v4si)(__m128i)(C), (I)); }) + (__v4si)(__m128i)(C), (I)) -#define _mm256_permute2_ps(X, Y, C, I) __extension__ ({ \ +#define _mm256_permute2_ps(X, Y, C, I) \ (__m256)__builtin_ia32_vpermil2ps256((__v8sf)(__m256)(X), \ (__v8sf)(__m256)(Y), \ - (__v8si)(__m256i)(C), (I)); }) + (__v8si)(__m256i)(C), (I)) static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_frcz_ss(__m128 __A) @@ -765,18 +766,19 @@ _mm_frcz_pd(__m128d __A) return (__m128d)__builtin_ia32_vfrczpd((__v2df)__A); } -static __inline__ __m256 __DEFAULT_FN_ATTRS +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_frcz_ps(__m256 __A) { return (__m256)__builtin_ia32_vfrczps256((__v8sf)__A); } -static __inline__ __m256d __DEFAULT_FN_ATTRS +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_frcz_pd(__m256d __A) { return (__m256d)__builtin_ia32_vfrczpd256((__v4df)__A); } #undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS256 #endif /* __XOPINTRIN_H */ diff --git a/c_headers/xsavecintrin.h b/c_headers/xsavecintrin.h index 598470a682..25577a95fc 100644 --- a/c_headers/xsavecintrin.h +++ b/c_headers/xsavecintrin.h @@ -1,4 +1,4 @@ -/*===---- xsavecintrin.h - XSAVEC intrinsic ------------------------------------=== +/*===---- xsavecintrin.h - XSAVEC intrinsic --------------------------------=== * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/c_headers/xsaveintrin.h b/c_headers/xsaveintrin.h index a2e6b2e742..16f3a78d3f 100644 --- a/c_headers/xsaveintrin.h +++ b/c_headers/xsaveintrin.h @@ -1,4 +1,4 @@ -/*===---- xsaveintrin.h - XSAVE intrinsic ------------------------------------=== +/*===---- xsaveintrin.h - XSAVE intrinsic ----------------------------------=== * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -33,23 +33,23 @@ static __inline__ void __DEFAULT_FN_ATTRS _xsave(void *__p, unsigned long long __m) { - return __builtin_ia32_xsave(__p, __m); + __builtin_ia32_xsave(__p, __m); } static __inline__ void __DEFAULT_FN_ATTRS _xrstor(void *__p, unsigned long long __m) { - return __builtin_ia32_xrstor(__p, __m); + __builtin_ia32_xrstor(__p, __m); } #ifdef __x86_64__ static __inline__ void __DEFAULT_FN_ATTRS _xsave64(void *__p, unsigned long long __m) { - return __builtin_ia32_xsave64(__p, __m); + __builtin_ia32_xsave64(__p, __m); } static __inline__ void __DEFAULT_FN_ATTRS _xrstor64(void *__p, unsigned long long __m) { - return __builtin_ia32_xrstor64(__p, __m); + 
__builtin_ia32_xrstor64(__p, __m); } #endif diff --git a/c_headers/xsaveoptintrin.h b/c_headers/xsaveoptintrin.h index d3faae78be..792cf92d46 100644 --- a/c_headers/xsaveoptintrin.h +++ b/c_headers/xsaveoptintrin.h @@ -1,4 +1,4 @@ -/*===---- xsaveoptintrin.h - XSAVEOPT intrinsic ------------------------------------=== +/*===---- xsaveoptintrin.h - XSAVEOPT intrinsic ----------------------------=== * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -33,13 +33,13 @@ static __inline__ void __DEFAULT_FN_ATTRS _xsaveopt(void *__p, unsigned long long __m) { - return __builtin_ia32_xsaveopt(__p, __m); + __builtin_ia32_xsaveopt(__p, __m); } #ifdef __x86_64__ static __inline__ void __DEFAULT_FN_ATTRS _xsaveopt64(void *__p, unsigned long long __m) { - return __builtin_ia32_xsaveopt64(__p, __m); + __builtin_ia32_xsaveopt64(__p, __m); } #endif diff --git a/c_headers/xsavesintrin.h b/c_headers/xsavesintrin.h index c5e540a86e..fe2bc4b93b 100644 --- a/c_headers/xsavesintrin.h +++ b/c_headers/xsavesintrin.h @@ -1,4 +1,4 @@ -/*===---- xsavesintrin.h - XSAVES intrinsic ------------------------------------=== +/*===---- xsavesintrin.h - XSAVES intrinsic --------------------------------=== * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal diff --git a/c_headers/xtestintrin.h b/c_headers/xtestintrin.h index 9d3378fd1e..924424386b 100644 --- a/c_headers/xtestintrin.h +++ b/c_headers/xtestintrin.h @@ -1,4 +1,4 @@ -/*===---- xtestintrin.h - XTEST intrinsic ---------------------------------=== +/*===---- xtestintrin.h - XTEST intrinsic ----------------------------------=== * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal |

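A closing usage sketch, not taken from any of the headers above, exercising two of the XOP helpers whose macros and attributes change in xopintrin.h. The function is hypothetical and assumes an XOP-enabled build (for example -mxop); _mm_roti_epi32 takes an immediate rotate count, and _mm_cmov_si128 is the bitwise select (A & C) | (B & ~C) shown in the hunk above.

#include <x86intrin.h>

/* Rotate each 32-bit lane of a by an immediate count, then select between the
   rotated value and b according to the bits of mask. */
static __m128i rotate_and_select(__m128i a, __m128i b, __m128i mask) {
  __m128i ra = _mm_roti_epi32(a, 3);   /* immediate rotate of every 32-bit lane */
  return _mm_cmov_si128(ra, b, mask);  /* set mask bits pick ra, clear bits pick b */
}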