Diffstat (limited to 'lib/libc/include/aarch64-macos-gnu/simd')
 lib/libc/include/aarch64-macos-gnu/simd/common.h     | 4458 +
 lib/libc/include/aarch64-macos-gnu/simd/conversion.h | 1966 +
 lib/libc/include/aarch64-macos-gnu/simd/logic.h      | 1315 +
 lib/libc/include/aarch64-macos-gnu/simd/math.h       | 5380 +
 lib/libc/include/aarch64-macos-gnu/simd/packed.h     | 1031 +
 lib/libc/include/aarch64-macos-gnu/simd/quaternion.h | 1194 +
 6 files changed, 15344 insertions(+), 0 deletions(-)
diff --git a/lib/libc/include/aarch64-macos-gnu/simd/common.h b/lib/libc/include/aarch64-macos-gnu/simd/common.h new file mode 100644 index 0000000000..5408c535fd --- /dev/null +++ b/lib/libc/include/aarch64-macos-gnu/simd/common.h @@ -0,0 +1,4458 @@ +/*! @header + * The interfaces declared in this header provide "common" elementwise + * operations that are neither math nor logic functions. These are available + * only for floating-point vectors and scalars, except for min, max, abs, + * clamp, and the reduce operations, which also support integer vectors. + * + * simd_abs(x) Absolute value of x. Also available as fabs + * for floating-point vectors. If x is the + * smallest signed integer, x is returned. + * + * simd_max(x,y) Returns the maximum of x and y. Also available + * as fmax for floating-point vectors. + * + * simd_min(x,y) Returns the minimum of x and y. Also available + * as fmin for floating-point vectors. + * + * simd_clamp(x,min,max) x clamped to the range [min, max]. + * + * simd_sign(x) -1 if x is less than zero, 0 if x is zero or + * NaN, and +1 if x is greater than zero. + * + * simd_mix(x,y,t) If t is not in the range [0,1], the result is + * undefined. Otherwise the result is x+(y-x)*t, + * which linearly interpolates between x and y. + * + * simd_recip(x) An approximation to 1/x. If x is very near the + * limits of representable values, or is infinity + * or NaN, the result is undefined. There are + * two variants of this function: + * + * simd_precise_recip(x) + * + * and + * + * simd_fast_recip(x). + * + * The "precise" variant is accurate to a few ULPs, + * whereas the "fast" variant may have as little + * as 11 bits of accuracy in float and about 22 + * bits in double. + * + * The function simd_recip(x) resolves to + * simd_precise_recip(x) ordinarily, but to + * simd_fast_recip(x) when used in a translation + * unit compiled with -ffast-math (when + * -ffast-math is in effect, you may still use the + * precise version of this function by calling it + * explicitly by name). + * + * simd_rsqrt(x) An approximation to 1/sqrt(x). If x is + * infinity or NaN, the result is undefined. + * There are two variants of this function: + * + * simd_precise_rsqrt(x) + * + * and + * + * simd_fast_rsqrt(x). + * + * The "precise" variant is accurate to a few ULPs, + * whereas the "fast" variant may have as little + * as 11 bits of accuracy in float and about 22 + * bits in double. + * + * The function simd_rsqrt(x) resolves to + * simd_precise_rsqrt(x) ordinarily, but to + * simd_fast_rsqrt(x) when used in a translation + * unit compiled with -ffast-math (when + * -ffast-math is in effect, you may still use the + * precise version of this function by calling it + * explicitly by name). + * + * simd_fract(x) The "fractional part" of x, which lies strictly + * in the range [0, 0x1.fffffep-1]. + * + * simd_step(edge,x) 0 if x < edge, and 1 otherwise. + * + * simd_smoothstep(edge0,edge1,x) 0 if x <= edge0, 1 if x >= edge1, and + * a Hermite interpolation between 0 and 1 if + * edge0 < x < edge1. + * + * simd_reduce_add(x) Sum of the elements of x. + * + * simd_reduce_min(x) Minimum of the elements of x. + * + * simd_reduce_max(x) Maximum of the elements of x. + * + * simd_equal(x,y) True if and only if every lane of x is equal + * to the corresponding lane of y. 
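+ *
+ * As a quick illustration, here is a small usage sketch (not part of
+ * this header; the lane values in the comments are what the
+ * definitions above imply):
+ *
+ * @code
+ *     simd_float4 x = { -2.0f, -0.5f, 0.5f, 2.0f };
+ *     simd_float4 s = simd_sign(x);               // { -1, -1, 1, 1 }
+ *     simd_float4 c = simd_clamp(x, -1.0f, 1.0f); // { -1, -0.5, 0.5, 1 }
+ *     simd_float4 t = { 0.5f, 0.5f, 0.5f, 0.5f };
+ *     simd_float4 m = simd_mix(c, s, t);          // { -1, -0.75, 0.75, 1 }
+ *     float sum = simd_reduce_add(m);             // 0.0f
+ * @endcode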
+ * + * The following common functions are available in the simd:: namespace: + * + * C++ Function Equivalent C Function + * -------------------------------------------------------------------- + * simd::abs(x) simd_abs(x) + * simd::max(x,y) simd_max(x,y) + * simd::min(x,y) simd_min(x,y) + * simd::clamp(x,min,max) simd_clamp(x,min,max) + * simd::sign(x) simd_sign(x) + * simd::mix(x,y,t) simd_mix(x,y,t) + * simd::recip(x) simd_recip(x) + * simd::rsqrt(x) simd_rsqrt(x) + * simd::fract(x) simd_fract(x) + * simd::step(edge,x) simd_step(edge,x) + * simd::smoothstep(e0,e1,x) simd_smoothstep(e0,e1,x) + * simd::reduce_add(x) simd_reduce_add(x) + * simd::reduce_max(x) simd_reduce_max(x) + * simd::reduce_min(x) simd_reduce_min(x) + * simd::equal(x,y) simd_equal(x,y) + * + * simd::precise::recip(x) simd_precise_recip(x) + * simd::precise::rsqrt(x) simd_precise_rsqrt(x) + * + * simd::fast::recip(x) simd_fast_recip(x) + * simd::fast::rsqrt(x) simd_fast_rsqrt(x) + * + * @copyright 2014-2017 Apple, Inc. All rights reserved. + * @unsorted */ + +#ifndef SIMD_COMMON_HEADER +#define SIMD_COMMON_HEADER + +#include <simd/base.h> +#if SIMD_COMPILER_HAS_REQUIRED_FEATURES +#include <simd/vector_make.h> +#include <simd/logic.h> +#include <simd/math.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_char2 simd_abs(simd_char2 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_char3 simd_abs(simd_char3 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_char4 simd_abs(simd_char4 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_char8 simd_abs(simd_char8 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_char16 simd_abs(simd_char16 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_char32 simd_abs(simd_char32 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_char64 simd_abs(simd_char64 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_short2 simd_abs(simd_short2 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_short3 simd_abs(simd_short3 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_short4 simd_abs(simd_short4 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_short8 simd_abs(simd_short8 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_short16 simd_abs(simd_short16 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_short32 simd_abs(simd_short32 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_int2 simd_abs(simd_int2 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_int3 simd_abs(simd_int3 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_int4 simd_abs(simd_int4 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_int8 simd_abs(simd_int8 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_int16 simd_abs(simd_int16 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_float2 simd_abs(simd_float2 x); +/*! 
@abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_float3 simd_abs(simd_float3 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_float4 simd_abs(simd_float4 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_float8 simd_abs(simd_float8 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_float16 simd_abs(simd_float16 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_long2 simd_abs(simd_long2 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_long3 simd_abs(simd_long3 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_long4 simd_abs(simd_long4 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_long8 simd_abs(simd_long8 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_double2 simd_abs(simd_double2 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_double3 simd_abs(simd_double3 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_double4 simd_abs(simd_double4 x); +/*! @abstract The elementwise absolute value of x. */ +static inline SIMD_CFUNC simd_double8 simd_abs(simd_double8 x); +/*! @abstract The elementwise absolute value of x. + * @discussion Deprecated. Use simd_abs(x) instead. */ +#define vector_abs simd_abs + +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_char2 simd_max(simd_char2 x, simd_char2 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_char3 simd_max(simd_char3 x, simd_char3 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_char4 simd_max(simd_char4 x, simd_char4 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_char8 simd_max(simd_char8 x, simd_char8 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_char16 simd_max(simd_char16 x, simd_char16 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_char32 simd_max(simd_char32 x, simd_char32 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_char64 simd_max(simd_char64 x, simd_char64 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_uchar2 simd_max(simd_uchar2 x, simd_uchar2 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_uchar3 simd_max(simd_uchar3 x, simd_uchar3 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_uchar4 simd_max(simd_uchar4 x, simd_uchar4 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_uchar8 simd_max(simd_uchar8 x, simd_uchar8 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_uchar16 simd_max(simd_uchar16 x, simd_uchar16 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_uchar32 simd_max(simd_uchar32 x, simd_uchar32 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_uchar64 simd_max(simd_uchar64 x, simd_uchar64 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_short2 simd_max(simd_short2 x, simd_short2 y); +/*! 
@abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_short3 simd_max(simd_short3 x, simd_short3 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_short4 simd_max(simd_short4 x, simd_short4 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_short8 simd_max(simd_short8 x, simd_short8 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_short16 simd_max(simd_short16 x, simd_short16 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_short32 simd_max(simd_short32 x, simd_short32 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_ushort2 simd_max(simd_ushort2 x, simd_ushort2 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_ushort3 simd_max(simd_ushort3 x, simd_ushort3 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_ushort4 simd_max(simd_ushort4 x, simd_ushort4 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_ushort8 simd_max(simd_ushort8 x, simd_ushort8 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_ushort16 simd_max(simd_ushort16 x, simd_ushort16 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_ushort32 simd_max(simd_ushort32 x, simd_ushort32 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_int2 simd_max(simd_int2 x, simd_int2 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_int3 simd_max(simd_int3 x, simd_int3 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_int4 simd_max(simd_int4 x, simd_int4 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_int8 simd_max(simd_int8 x, simd_int8 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_int16 simd_max(simd_int16 x, simd_int16 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_uint2 simd_max(simd_uint2 x, simd_uint2 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_uint3 simd_max(simd_uint3 x, simd_uint3 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_uint4 simd_max(simd_uint4 x, simd_uint4 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_uint8 simd_max(simd_uint8 x, simd_uint8 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_uint16 simd_max(simd_uint16 x, simd_uint16 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC float simd_max(float x, float y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_float2 simd_max(simd_float2 x, simd_float2 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_float3 simd_max(simd_float3 x, simd_float3 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_float4 simd_max(simd_float4 x, simd_float4 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_float8 simd_max(simd_float8 x, simd_float8 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_float16 simd_max(simd_float16 x, simd_float16 y); +/*! 
@abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_long2 simd_max(simd_long2 x, simd_long2 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_long3 simd_max(simd_long3 x, simd_long3 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_long4 simd_max(simd_long4 x, simd_long4 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_long8 simd_max(simd_long8 x, simd_long8 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_ulong2 simd_max(simd_ulong2 x, simd_ulong2 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_ulong3 simd_max(simd_ulong3 x, simd_ulong3 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_ulong4 simd_max(simd_ulong4 x, simd_ulong4 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_ulong8 simd_max(simd_ulong8 x, simd_ulong8 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC double simd_max(double x, double y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_double2 simd_max(simd_double2 x, simd_double2 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_double3 simd_max(simd_double3 x, simd_double3 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_double4 simd_max(simd_double4 x, simd_double4 y); +/*! @abstract The elementwise maximum of x and y. */ +static inline SIMD_CFUNC simd_double8 simd_max(simd_double8 x, simd_double8 y); +/*! @abstract The elementwise maximum of x and y. + * @discussion Deprecated. Use simd_max(x,y) instead. */ +#define vector_max simd_max + +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_char2 simd_min(simd_char2 x, simd_char2 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_char3 simd_min(simd_char3 x, simd_char3 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_char4 simd_min(simd_char4 x, simd_char4 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_char8 simd_min(simd_char8 x, simd_char8 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_char16 simd_min(simd_char16 x, simd_char16 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_char32 simd_min(simd_char32 x, simd_char32 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_char64 simd_min(simd_char64 x, simd_char64 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_uchar2 simd_min(simd_uchar2 x, simd_uchar2 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_uchar3 simd_min(simd_uchar3 x, simd_uchar3 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_uchar4 simd_min(simd_uchar4 x, simd_uchar4 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_uchar8 simd_min(simd_uchar8 x, simd_uchar8 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_uchar16 simd_min(simd_uchar16 x, simd_uchar16 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_uchar32 simd_min(simd_uchar32 x, simd_uchar32 y); +/*! 
@abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_uchar64 simd_min(simd_uchar64 x, simd_uchar64 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_short2 simd_min(simd_short2 x, simd_short2 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_short3 simd_min(simd_short3 x, simd_short3 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_short4 simd_min(simd_short4 x, simd_short4 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_short8 simd_min(simd_short8 x, simd_short8 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_short16 simd_min(simd_short16 x, simd_short16 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_short32 simd_min(simd_short32 x, simd_short32 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_ushort2 simd_min(simd_ushort2 x, simd_ushort2 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_ushort3 simd_min(simd_ushort3 x, simd_ushort3 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_ushort4 simd_min(simd_ushort4 x, simd_ushort4 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_ushort8 simd_min(simd_ushort8 x, simd_ushort8 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_ushort16 simd_min(simd_ushort16 x, simd_ushort16 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_ushort32 simd_min(simd_ushort32 x, simd_ushort32 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_int2 simd_min(simd_int2 x, simd_int2 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_int3 simd_min(simd_int3 x, simd_int3 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_int4 simd_min(simd_int4 x, simd_int4 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_int8 simd_min(simd_int8 x, simd_int8 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_int16 simd_min(simd_int16 x, simd_int16 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_uint2 simd_min(simd_uint2 x, simd_uint2 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_uint3 simd_min(simd_uint3 x, simd_uint3 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_uint4 simd_min(simd_uint4 x, simd_uint4 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_uint8 simd_min(simd_uint8 x, simd_uint8 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_uint16 simd_min(simd_uint16 x, simd_uint16 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC float simd_min(float x, float y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_float2 simd_min(simd_float2 x, simd_float2 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_float3 simd_min(simd_float3 x, simd_float3 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_float4 simd_min(simd_float4 x, simd_float4 y); +/*! 
@abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_float8 simd_min(simd_float8 x, simd_float8 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_float16 simd_min(simd_float16 x, simd_float16 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_long2 simd_min(simd_long2 x, simd_long2 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_long3 simd_min(simd_long3 x, simd_long3 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_long4 simd_min(simd_long4 x, simd_long4 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_long8 simd_min(simd_long8 x, simd_long8 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_ulong2 simd_min(simd_ulong2 x, simd_ulong2 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_ulong3 simd_min(simd_ulong3 x, simd_ulong3 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_ulong4 simd_min(simd_ulong4 x, simd_ulong4 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_ulong8 simd_min(simd_ulong8 x, simd_ulong8 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC double simd_min(double x, double y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_double2 simd_min(simd_double2 x, simd_double2 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_double3 simd_min(simd_double3 x, simd_double3 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_double4 simd_min(simd_double4 x, simd_double4 y); +/*! @abstract The elementwise minimum of x and y. */ +static inline SIMD_CFUNC simd_double8 simd_min(simd_double8 x, simd_double8 y); +/*! @abstract The elementwise minimum of x and y. + * @discussion Deprecated. Use simd_min(x,y) instead. */ +#define vector_min simd_min + + +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_char2 simd_clamp(simd_char2 x, simd_char2 min, simd_char2 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_char3 simd_clamp(simd_char3 x, simd_char3 min, simd_char3 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_char4 simd_clamp(simd_char4 x, simd_char4 min, simd_char4 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_char8 simd_clamp(simd_char8 x, simd_char8 min, simd_char8 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_char16 simd_clamp(simd_char16 x, simd_char16 min, simd_char16 max); +/*! @abstract x clamped to the range [min, max]. 
+ * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_char32 simd_clamp(simd_char32 x, simd_char32 min, simd_char32 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_char64 simd_clamp(simd_char64 x, simd_char64 min, simd_char64 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_uchar2 simd_clamp(simd_uchar2 x, simd_uchar2 min, simd_uchar2 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_uchar3 simd_clamp(simd_uchar3 x, simd_uchar3 min, simd_uchar3 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_uchar4 simd_clamp(simd_uchar4 x, simd_uchar4 min, simd_uchar4 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_uchar8 simd_clamp(simd_uchar8 x, simd_uchar8 min, simd_uchar8 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_uchar16 simd_clamp(simd_uchar16 x, simd_uchar16 min, simd_uchar16 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_uchar32 simd_clamp(simd_uchar32 x, simd_uchar32 min, simd_uchar32 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_uchar64 simd_clamp(simd_uchar64 x, simd_uchar64 min, simd_uchar64 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_short2 simd_clamp(simd_short2 x, simd_short2 min, simd_short2 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_short3 simd_clamp(simd_short3 x, simd_short3 min, simd_short3 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_short4 simd_clamp(simd_short4 x, simd_short4 min, simd_short4 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_short8 simd_clamp(simd_short8 x, simd_short8 min, simd_short8 max); +/*! 
@abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_short16 simd_clamp(simd_short16 x, simd_short16 min, simd_short16 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_short32 simd_clamp(simd_short32 x, simd_short32 min, simd_short32 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_ushort2 simd_clamp(simd_ushort2 x, simd_ushort2 min, simd_ushort2 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_ushort3 simd_clamp(simd_ushort3 x, simd_ushort3 min, simd_ushort3 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_ushort4 simd_clamp(simd_ushort4 x, simd_ushort4 min, simd_ushort4 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_ushort8 simd_clamp(simd_ushort8 x, simd_ushort8 min, simd_ushort8 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_ushort16 simd_clamp(simd_ushort16 x, simd_ushort16 min, simd_ushort16 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_ushort32 simd_clamp(simd_ushort32 x, simd_ushort32 min, simd_ushort32 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_int2 simd_clamp(simd_int2 x, simd_int2 min, simd_int2 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_int3 simd_clamp(simd_int3 x, simd_int3 min, simd_int3 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_int4 simd_clamp(simd_int4 x, simd_int4 min, simd_int4 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_int8 simd_clamp(simd_int8 x, simd_int8 min, simd_int8 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. 
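+ *
+ * For example (an illustrative sketch, not part of this header,
+ * clamping to the range of a signed byte):
+ * @code
+ *     simd_int4 v = { -300, -5, 40, 300 };
+ *     simd_int4 r = simd_clamp(v, -128, 127); // { -128, -5, 40, 127 }
+ * @endcode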
*/ +static inline SIMD_CFUNC simd_int16 simd_clamp(simd_int16 x, simd_int16 min, simd_int16 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_uint2 simd_clamp(simd_uint2 x, simd_uint2 min, simd_uint2 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_uint3 simd_clamp(simd_uint3 x, simd_uint3 min, simd_uint3 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_uint4 simd_clamp(simd_uint4 x, simd_uint4 min, simd_uint4 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_uint8 simd_clamp(simd_uint8 x, simd_uint8 min, simd_uint8 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_uint16 simd_clamp(simd_uint16 x, simd_uint16 min, simd_uint16 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC float simd_clamp(float x, float min, float max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_float2 simd_clamp(simd_float2 x, simd_float2 min, simd_float2 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_float3 simd_clamp(simd_float3 x, simd_float3 min, simd_float3 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_float4 simd_clamp(simd_float4 x, simd_float4 min, simd_float4 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_float8 simd_clamp(simd_float8 x, simd_float8 min, simd_float8 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_float16 simd_clamp(simd_float16 x, simd_float16 min, simd_float16 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_long2 simd_clamp(simd_long2 x, simd_long2 min, simd_long2 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. 
*/ +static inline SIMD_CFUNC simd_long3 simd_clamp(simd_long3 x, simd_long3 min, simd_long3 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_long4 simd_clamp(simd_long4 x, simd_long4 min, simd_long4 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_long8 simd_clamp(simd_long8 x, simd_long8 min, simd_long8 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_ulong2 simd_clamp(simd_ulong2 x, simd_ulong2 min, simd_ulong2 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_ulong3 simd_clamp(simd_ulong3 x, simd_ulong3 min, simd_ulong3 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_ulong4 simd_clamp(simd_ulong4 x, simd_ulong4 min, simd_ulong4 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_ulong8 simd_clamp(simd_ulong8 x, simd_ulong8 min, simd_ulong8 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC double simd_clamp(double x, double min, double max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_double2 simd_clamp(simd_double2 x, simd_double2 min, simd_double2 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_double3 simd_clamp(simd_double3 x, simd_double3 min, simd_double3 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_double4 simd_clamp(simd_double4 x, simd_double4 min, simd_double4 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Note that if you want to clamp all lanes to the same range, + * you can use a scalar value for min and max. */ +static inline SIMD_CFUNC simd_double8 simd_clamp(simd_double8 x, simd_double8 min, simd_double8 max); +/*! @abstract x clamped to the range [min, max]. + * @discussion Deprecated. Use simd_clamp(x,min,max) instead. */ +#define vector_clamp simd_clamp + +/*! @abstract -1 if x is negative, +1 if x is positive, and 0 otherwise. */ +static inline SIMD_CFUNC float simd_sign(float x); +/*! @abstract -1 if x is negative, +1 if x is positive, and 0 otherwise. */ +static inline SIMD_CFUNC simd_float2 simd_sign(simd_float2 x); +/*! 
@abstract -1 if x is negative, +1 if x is positive, and 0 otherwise. */ +static inline SIMD_CFUNC simd_float3 simd_sign(simd_float3 x); +/*! @abstract -1 if x is negative, +1 if x is positive, and 0 otherwise. */ +static inline SIMD_CFUNC simd_float4 simd_sign(simd_float4 x); +/*! @abstract -1 if x is negative, +1 if x is positive, and 0 otherwise. */ +static inline SIMD_CFUNC simd_float8 simd_sign(simd_float8 x); +/*! @abstract -1 if x is negative, +1 if x is positive, and 0 otherwise. */ +static inline SIMD_CFUNC simd_float16 simd_sign(simd_float16 x); +/*! @abstract -1 if x is negative, +1 if x is positive, and 0 otherwise. */ +static inline SIMD_CFUNC double simd_sign(double x); +/*! @abstract -1 if x is negative, +1 if x is positive, and 0 otherwise. */ +static inline SIMD_CFUNC simd_double2 simd_sign(simd_double2 x); +/*! @abstract -1 if x is negative, +1 if x is positive, and 0 otherwise. */ +static inline SIMD_CFUNC simd_double3 simd_sign(simd_double3 x); +/*! @abstract -1 if x is negative, +1 if x is positive, and 0 otherwise. */ +static inline SIMD_CFUNC simd_double4 simd_sign(simd_double4 x); +/*! @abstract -1 if x is negative, +1 if x is positive, and 0 otherwise. */ +static inline SIMD_CFUNC simd_double8 simd_sign(simd_double8 x); +/*! @abstract -1 if x is negative, +1 if x is positive, and 0 otherwise. + * @discussion Deprecated. Use simd_sign(x) instead. */ +#define vector_sign simd_sign + +/*! @abstract Linearly interpolates between x and y, taking the value x when + * t=0 and y when t=1 */ +static inline SIMD_CFUNC float simd_mix(float x, float y, float t); +/*! @abstract Linearly interpolates between x and y, taking the value x when + * t=0 and y when t=1 */ +static inline SIMD_CFUNC simd_float2 simd_mix(simd_float2 x, simd_float2 y, simd_float2 t); +/*! @abstract Linearly interpolates between x and y, taking the value x when + * t=0 and y when t=1 */ +static inline SIMD_CFUNC simd_float3 simd_mix(simd_float3 x, simd_float3 y, simd_float3 t); +/*! @abstract Linearly interpolates between x and y, taking the value x when + * t=0 and y when t=1 */ +static inline SIMD_CFUNC simd_float4 simd_mix(simd_float4 x, simd_float4 y, simd_float4 t); +/*! @abstract Linearly interpolates between x and y, taking the value x when + * t=0 and y when t=1 */ +static inline SIMD_CFUNC simd_float8 simd_mix(simd_float8 x, simd_float8 y, simd_float8 t); +/*! @abstract Linearly interpolates between x and y, taking the value x when + * t=0 and y when t=1 */ +static inline SIMD_CFUNC simd_float16 simd_mix(simd_float16 x, simd_float16 y, simd_float16 t); +/*! @abstract Linearly interpolates between x and y, taking the value x when + * t=0 and y when t=1 */ +static inline SIMD_CFUNC double simd_mix(double x, double y, double t); +/*! @abstract Linearly interpolates between x and y, taking the value x when + * t=0 and y when t=1 */ +static inline SIMD_CFUNC simd_double2 simd_mix(simd_double2 x, simd_double2 y, simd_double2 t); +/*! @abstract Linearly interpolates between x and y, taking the value x when + * t=0 and y when t=1 */ +static inline SIMD_CFUNC simd_double3 simd_mix(simd_double3 x, simd_double3 y, simd_double3 t); +/*! @abstract Linearly interpolates between x and y, taking the value x when + * t=0 and y when t=1 */ +static inline SIMD_CFUNC simd_double4 simd_mix(simd_double4 x, simd_double4 y, simd_double4 t); +/*! 
@abstract Linearly interpolates between x and y, taking the value x when + * t=0 and y when t=1 */ +static inline SIMD_CFUNC simd_double8 simd_mix(simd_double8 x, simd_double8 y, simd_double8 t); +/*! @abstract Linearly interpolates between x and y, taking the value x when + * t=0 and y when t=1 + * @discussion Deprecated. Use simd_mix(x, y, t) instead. */ +#define vector_mix simd_mix + +/*! @abstract A good approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * a few units in the last place (ULPs). */ +static inline SIMD_CFUNC float simd_precise_recip(float x); +/*! @abstract A good approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * a few units in the last place (ULPs). */ +static inline SIMD_CFUNC simd_float2 simd_precise_recip(simd_float2 x); +/*! @abstract A good approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * a few units in the last place (ULPs). */ +static inline SIMD_CFUNC simd_float3 simd_precise_recip(simd_float3 x); +/*! @abstract A good approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * a few units in the last place (ULPs). */ +static inline SIMD_CFUNC simd_float4 simd_precise_recip(simd_float4 x); +/*! @abstract A good approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * a few units in the last place (ULPs). */ +static inline SIMD_CFUNC simd_float8 simd_precise_recip(simd_float8 x); +/*! @abstract A good approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * a few units in the last place (ULPs). */ +static inline SIMD_CFUNC simd_float16 simd_precise_recip(simd_float16 x); +/*! @abstract A good approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * a few units in the last place (ULPs). */ +static inline SIMD_CFUNC double simd_precise_recip(double x); +/*! @abstract A good approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * a few units in the last place (ULPs). */ +static inline SIMD_CFUNC simd_double2 simd_precise_recip(simd_double2 x); +/*! @abstract A good approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * a few units in the last place (ULPs). */ +static inline SIMD_CFUNC simd_double3 simd_precise_recip(simd_double3 x); +/*! @abstract A good approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * a few units in the last place (ULPs). */ +static inline SIMD_CFUNC simd_double4 simd_precise_recip(simd_double4 x); +/*! @abstract A good approximation to 1/x. 
+ * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * a few units in the last place (ULPs). */ +static inline SIMD_CFUNC simd_double8 simd_precise_recip(simd_double8 x); +/*! @abstract A good approximation to 1/x. + * @discussion Deprecated. Use simd_precise_recip(x) instead. */ +#define vector_precise_recip simd_precise_recip + +/*! @abstract A fast approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * at least 11 bits for float and 22 bits for double. */ +static inline SIMD_CFUNC float simd_fast_recip(float x); +/*! @abstract A fast approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * at least 11 bits for float and 22 bits for double. */ +static inline SIMD_CFUNC simd_float2 simd_fast_recip(simd_float2 x); +/*! @abstract A fast approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * at least 11 bits for float and 22 bits for double. */ +static inline SIMD_CFUNC simd_float3 simd_fast_recip(simd_float3 x); +/*! @abstract A fast approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * at least 11 bits for float and 22 bits for double. */ +static inline SIMD_CFUNC simd_float4 simd_fast_recip(simd_float4 x); +/*! @abstract A fast approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * at least 11 bits for float and 22 bits for double. */ +static inline SIMD_CFUNC simd_float8 simd_fast_recip(simd_float8 x); +/*! @abstract A fast approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * at least 11 bits for float and 22 bits for double. */ +static inline SIMD_CFUNC simd_float16 simd_fast_recip(simd_float16 x); +/*! @abstract A fast approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * at least 11 bits for float and 22 bits for double. */ +static inline SIMD_CFUNC double simd_fast_recip(double x); +/*! @abstract A fast approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * at least 11 bits for float and 22 bits for double. */ +static inline SIMD_CFUNC simd_double2 simd_fast_recip(simd_double2 x); +/*! @abstract A fast approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * at least 11 bits for float and 22 bits for double. */ +static inline SIMD_CFUNC simd_double3 simd_fast_recip(simd_double3 x); +/*! @abstract A fast approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * at least 11 bits for float and 22 bits for double. 
*/ +static inline SIMD_CFUNC simd_double4 simd_fast_recip(simd_double4 x); +/*! @abstract A fast approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow; otherwise this function is accurate to + * at least 11 bits for float and 22 bits for double. */ +static inline SIMD_CFUNC simd_double8 simd_fast_recip(simd_double8 x); +/*! @abstract A fast approximation to 1/x. + * @discussion Deprecated. Use simd_fast_recip(x) instead. */ +#define vector_fast_recip simd_fast_recip + +/*! @abstract An approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow. This function maps to + * simd_fast_recip(x) if -ffast-math is specified, and to + * simd_precise_recip(x) otherwise. */ +static inline SIMD_CFUNC float simd_recip(float x); +/*! @abstract An approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow. This function maps to + * simd_fast_recip(x) if -ffast-math is specified, and to + * simd_precise_recip(x) otherwise. */ +static inline SIMD_CFUNC simd_float2 simd_recip(simd_float2 x); +/*! @abstract An approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow. This function maps to + * simd_fast_recip(x) if -ffast-math is specified, and to + * simd_precise_recip(x) otherwise. */ +static inline SIMD_CFUNC simd_float3 simd_recip(simd_float3 x); +/*! @abstract An approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow. This function maps to + * simd_fast_recip(x) if -ffast-math is specified, and to + * simd_precise_recip(x) otherwise. */ +static inline SIMD_CFUNC simd_float4 simd_recip(simd_float4 x); +/*! @abstract An approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow. This function maps to + * simd_fast_recip(x) if -ffast-math is specified, and to + * simd_precise_recip(x) otherwise. */ +static inline SIMD_CFUNC simd_float8 simd_recip(simd_float8 x); +/*! @abstract An approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow. This function maps to + * simd_fast_recip(x) if -ffast-math is specified, and to + * simd_precise_recip(x) otherwise. */ +static inline SIMD_CFUNC simd_float16 simd_recip(simd_float16 x); +/*! @abstract An approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow. This function maps to + * simd_fast_recip(x) if -ffast-math is specified, and to + * simd_precise_recip(x) otherwise. */ +static inline SIMD_CFUNC double simd_recip(double x); +/*! @abstract An approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow. This function maps to + * simd_fast_recip(x) if -ffast-math is specified, and to + * simd_precise_recip(x) otherwise. */ +static inline SIMD_CFUNC simd_double2 simd_recip(simd_double2 x); +/*! @abstract An approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow. This function maps to + * simd_fast_recip(x) if -ffast-math is specified, and to + * simd_precise_recip(x) otherwise. 
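+ *
+ * For example (an illustrative sketch, not part of this header):
+ * @code
+ *     simd_float4 d = { 1.0f, 2.0f, 4.0f, 8.0f };
+ *     simd_float4 r = simd_recip(d);         // fast variant under -ffast-math
+ *     simd_float4 p = simd_precise_recip(d); // a few ULPs of accuracy either way
+ * @endcode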
*/ +static inline SIMD_CFUNC simd_double3 simd_recip(simd_double3 x); +/*! @abstract An approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow. This function maps to + * simd_fast_recip(x) if -ffast-math is specified, and to + * simd_precise_recip(x) otherwise. */ +static inline SIMD_CFUNC simd_double4 simd_recip(simd_double4 x); +/*! @abstract An approximation to 1/x. + * @discussion If x is very close to the limits of representation, the + * result may overflow or underflow. This function maps to + * simd_fast_recip(x) if -ffast-math is specified, and to + * simd_precise_recip(x) otherwise. */ +static inline SIMD_CFUNC simd_double8 simd_recip(simd_double8 x); +/*! @abstract An approximation to 1/x. + * @discussion Deprecated. Use simd_recip(x) instead. */ +#define vector_recip simd_recip + +/*! @abstract A good approximation to 1/sqrt(x). + * @discussion This function is accurate to a few units in the last place + * (ULPs). */ +static inline SIMD_CFUNC float simd_precise_rsqrt(float x); +/*! @abstract A good approximation to 1/sqrt(x). + * @discussion This function is accurate to a few units in the last place + * (ULPs). */ +static inline SIMD_CFUNC simd_float2 simd_precise_rsqrt(simd_float2 x); +/*! @abstract A good approximation to 1/sqrt(x). + * @discussion This function is accurate to a few units in the last place + * (ULPs). */ +static inline SIMD_CFUNC simd_float3 simd_precise_rsqrt(simd_float3 x); +/*! @abstract A good approximation to 1/sqrt(x). + * @discussion This function is accurate to a few units in the last place + * (ULPs). */ +static inline SIMD_CFUNC simd_float4 simd_precise_rsqrt(simd_float4 x); +/*! @abstract A good approximation to 1/sqrt(x). + * @discussion This function is accurate to a few units in the last place + * (ULPs). */ +static inline SIMD_CFUNC simd_float8 simd_precise_rsqrt(simd_float8 x); +/*! @abstract A good approximation to 1/sqrt(x). + * @discussion This function is accurate to a few units in the last place + * (ULPs). */ +static inline SIMD_CFUNC simd_float16 simd_precise_rsqrt(simd_float16 x); +/*! @abstract A good approximation to 1/sqrt(x). + * @discussion This function is accurate to a few units in the last place + * (ULPs). */ +static inline SIMD_CFUNC double simd_precise_rsqrt(double x); +/*! @abstract A good approximation to 1/sqrt(x). + * @discussion This function is accurate to a few units in the last place + * (ULPs). */ +static inline SIMD_CFUNC simd_double2 simd_precise_rsqrt(simd_double2 x); +/*! @abstract A good approximation to 1/sqrt(x). + * @discussion This function is accurate to a few units in the last place + * (ULPs). */ +static inline SIMD_CFUNC simd_double3 simd_precise_rsqrt(simd_double3 x); +/*! @abstract A good approximation to 1/sqrt(x). + * @discussion This function is accurate to a few units in the last place + * (ULPs). */ +static inline SIMD_CFUNC simd_double4 simd_precise_rsqrt(simd_double4 x); +/*! @abstract A good approximation to 1/sqrt(x). + * @discussion This function is accurate to a few units in the last place + * (ULPs). */ +static inline SIMD_CFUNC simd_double8 simd_precise_rsqrt(simd_double8 x); +/*! @abstract A good approximation to 1/sqrt(x). + * @discussion Deprecated. Use simd_precise_rsqrt(x) instead. */ +#define vector_precise_rsqrt simd_precise_rsqrt + +/*! @abstract A fast approximation to 1/sqrt(x). + * @discussion This function is accurate to at least 11 bits for float and + * 22 bits for double. 
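+ *
+ * A typical use is approximate normalization where full precision is
+ * not required (an illustrative sketch, not part of this header):
+ * @code
+ *     simd_float3 v = { 3.0f, 0.0f, 4.0f };
+ *     simd_float3 n = v * simd_fast_rsqrt(simd_reduce_add(v * v));
+ *     // n is approximately v/5, i.e. { 0.6, 0, 0.8 }
+ * @endcode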
*/ +static inline SIMD_CFUNC float simd_fast_rsqrt(float x); +/*! @abstract A fast approximation to 1/sqrt(x). + * @discussion This function is accurate to at least 11 bits for float and + * 22 bits for double. */ +static inline SIMD_CFUNC simd_float2 simd_fast_rsqrt(simd_float2 x); +/*! @abstract A fast approximation to 1/sqrt(x). + * @discussion This function is accurate to at least 11 bits for float and + * 22 bits for double. */ +static inline SIMD_CFUNC simd_float3 simd_fast_rsqrt(simd_float3 x); +/*! @abstract A fast approximation to 1/sqrt(x). + * @discussion This function is accurate to at least 11 bits for float and + * 22 bits for double. */ +static inline SIMD_CFUNC simd_float4 simd_fast_rsqrt(simd_float4 x); +/*! @abstract A fast approximation to 1/sqrt(x). + * @discussion This function is accurate to at least 11 bits for float and + * 22 bits for double. */ +static inline SIMD_CFUNC simd_float8 simd_fast_rsqrt(simd_float8 x); +/*! @abstract A fast approximation to 1/sqrt(x). + * @discussion This function is accurate to at least 11 bits for float and + * 22 bits for double. */ +static inline SIMD_CFUNC simd_float16 simd_fast_rsqrt(simd_float16 x); +/*! @abstract A fast approximation to 1/sqrt(x). + * @discussion This function is accurate to at least 11 bits for float and + * 22 bits for double. */ +static inline SIMD_CFUNC double simd_fast_rsqrt(double x); +/*! @abstract A fast approximation to 1/sqrt(x). + * @discussion This function is accurate to at least 11 bits for float and + * 22 bits for double. */ +static inline SIMD_CFUNC simd_double2 simd_fast_rsqrt(simd_double2 x); +/*! @abstract A fast approximation to 1/sqrt(x). + * @discussion This function is accurate to at least 11 bits for float and + * 22 bits for double. */ +static inline SIMD_CFUNC simd_double3 simd_fast_rsqrt(simd_double3 x); +/*! @abstract A fast approximation to 1/sqrt(x). + * @discussion This function is accurate to at least 11 bits for float and + * 22 bits for double. */ +static inline SIMD_CFUNC simd_double4 simd_fast_rsqrt(simd_double4 x); +/*! @abstract A fast approximation to 1/sqrt(x). + * @discussion This function is accurate to at least 11 bits for float and + * 22 bits for double. */ +static inline SIMD_CFUNC simd_double8 simd_fast_rsqrt(simd_double8 x); +/*! @abstract A fast approximation to 1/sqrt(x). + * @discussion Deprecated. Use simd_fast_rsqrt(x) instead. */ +#define vector_fast_rsqrt simd_fast_rsqrt + +/*! @abstract An approximation to 1/sqrt(x). + * @discussion This function maps to simd_fast_recip(x) if -ffast-math is + * specified, and to simd_precise_recip(x) otherwise. */ +static inline SIMD_CFUNC float simd_rsqrt(float x); +/*! @abstract An approximation to 1/sqrt(x). + * @discussion This function maps to simd_fast_recip(x) if -ffast-math is + * specified, and to simd_precise_recip(x) otherwise. */ +static inline SIMD_CFUNC simd_float2 simd_rsqrt(simd_float2 x); +/*! @abstract An approximation to 1/sqrt(x). + * @discussion This function maps to simd_fast_recip(x) if -ffast-math is + * specified, and to simd_precise_recip(x) otherwise. */ +static inline SIMD_CFUNC simd_float3 simd_rsqrt(simd_float3 x); +/*! @abstract An approximation to 1/sqrt(x). + * @discussion This function maps to simd_fast_recip(x) if -ffast-math is + * specified, and to simd_precise_recip(x) otherwise. */ +static inline SIMD_CFUNC simd_float4 simd_rsqrt(simd_float4 x); +/*! @abstract An approximation to 1/sqrt(x). 
+ * @discussion This function maps to simd_fast_rsqrt(x) if -ffast-math is
+ * specified, and to simd_precise_rsqrt(x) otherwise. */
+static inline SIMD_CFUNC simd_float8 simd_rsqrt(simd_float8 x);
+/*! @abstract An approximation to 1/sqrt(x).
+ * @discussion This function maps to simd_fast_rsqrt(x) if -ffast-math is
+ * specified, and to simd_precise_rsqrt(x) otherwise. */
+static inline SIMD_CFUNC simd_float16 simd_rsqrt(simd_float16 x);
+/*! @abstract An approximation to 1/sqrt(x).
+ * @discussion This function maps to simd_fast_rsqrt(x) if -ffast-math is
+ * specified, and to simd_precise_rsqrt(x) otherwise. */
+static inline SIMD_CFUNC double simd_rsqrt(double x);
+/*! @abstract An approximation to 1/sqrt(x).
+ * @discussion This function maps to simd_fast_rsqrt(x) if -ffast-math is
+ * specified, and to simd_precise_rsqrt(x) otherwise. */
+static inline SIMD_CFUNC simd_double2 simd_rsqrt(simd_double2 x);
+/*! @abstract An approximation to 1/sqrt(x).
+ * @discussion This function maps to simd_fast_rsqrt(x) if -ffast-math is
+ * specified, and to simd_precise_rsqrt(x) otherwise. */
+static inline SIMD_CFUNC simd_double3 simd_rsqrt(simd_double3 x);
+/*! @abstract An approximation to 1/sqrt(x).
+ * @discussion This function maps to simd_fast_rsqrt(x) if -ffast-math is
+ * specified, and to simd_precise_rsqrt(x) otherwise. */
+static inline SIMD_CFUNC simd_double4 simd_rsqrt(simd_double4 x);
+/*! @abstract An approximation to 1/sqrt(x).
+ * @discussion This function maps to simd_fast_rsqrt(x) if -ffast-math is
+ * specified, and to simd_precise_rsqrt(x) otherwise. */
+static inline SIMD_CFUNC simd_double8 simd_rsqrt(simd_double8 x);
+/*! @abstract An approximation to 1/sqrt(x).
+ * @discussion Deprecated. Use simd_rsqrt(x) instead. */
+#define vector_rsqrt simd_rsqrt
+
+/*! @abstract The "fractional part" of x, lying in the range [0, 1).
+ * @discussion floor(x) + fract(x) is *approximately* equal to x. If x is
+ * positive and finite, then the two values are exactly equal. */
+static inline SIMD_CFUNC float simd_fract(float x);
+/*! @abstract The "fractional part" of x, lying in the range [0, 1).
+ * @discussion floor(x) + fract(x) is *approximately* equal to x. If x is
+ * positive and finite, then the two values are exactly equal. */
+static inline SIMD_CFUNC simd_float2 simd_fract(simd_float2 x);
+/*! @abstract The "fractional part" of x, lying in the range [0, 1).
+ * @discussion floor(x) + fract(x) is *approximately* equal to x. If x is
+ * positive and finite, then the two values are exactly equal. */
+static inline SIMD_CFUNC simd_float3 simd_fract(simd_float3 x);
+/*! @abstract The "fractional part" of x, lying in the range [0, 1).
+ * @discussion floor(x) + fract(x) is *approximately* equal to x. If x is
+ * positive and finite, then the two values are exactly equal. */
+static inline SIMD_CFUNC simd_float4 simd_fract(simd_float4 x);
+/*! @abstract The "fractional part" of x, lying in the range [0, 1).
+ * @discussion floor(x) + fract(x) is *approximately* equal to x. If x is
+ * positive and finite, then the two values are exactly equal. */
+static inline SIMD_CFUNC simd_float8 simd_fract(simd_float8 x);
+/*! @abstract The "fractional part" of x, lying in the range [0, 1).
+ * @discussion floor(x) + fract(x) is *approximately* equal to x. If x is
+ * positive and finite, then the two values are exactly equal. */
+static inline SIMD_CFUNC simd_float16 simd_fract(simd_float16 x);
+/*! @abstract The "fractional part" of x, lying in the range [0, 1).
+ * @discussion floor(x) + fract(x) is *approximately* equal to x. If x is + * positive and finite, then the two values are exactly equal. */ +static inline SIMD_CFUNC double simd_fract(double x); +/*! @abstract The "fractional part" of x, lying in the range [0, 1). + * @discussion floor(x) + fract(x) is *approximately* equal to x. If x is + * positive and finite, then the two values are exactly equal. */ +static inline SIMD_CFUNC simd_double2 simd_fract(simd_double2 x); +/*! @abstract The "fractional part" of x, lying in the range [0, 1). + * @discussion floor(x) + fract(x) is *approximately* equal to x. If x is + * positive and finite, then the two values are exactly equal. */ +static inline SIMD_CFUNC simd_double3 simd_fract(simd_double3 x); +/*! @abstract The "fractional part" of x, lying in the range [0, 1). + * @discussion floor(x) + fract(x) is *approximately* equal to x. If x is + * positive and finite, then the two values are exactly equal. */ +static inline SIMD_CFUNC simd_double4 simd_fract(simd_double4 x); +/*! @abstract The "fractional part" of x, lying in the range [0, 1). + * @discussion floor(x) + fract(x) is *approximately* equal to x. If x is + * positive and finite, then the two values are exactly equal. */ +static inline SIMD_CFUNC simd_double8 simd_fract(simd_double8 x); +/*! @abstract The "fractional part" of x, lying in the range [0, 1). + * @discussion Deprecated. Use simd_fract(x) instead. */ +#define vector_fract simd_fract + +/*! @abstract 0 if x < edge, and 1 otherwise. + * @discussion Use a scalar value for edge if you want to apply the same + * threshold to all lanes. */ +static inline SIMD_CFUNC float simd_step(float edge, float x); +/*! @abstract 0 if x < edge, and 1 otherwise. + * @discussion Use a scalar value for edge if you want to apply the same + * threshold to all lanes. */ +static inline SIMD_CFUNC simd_float2 simd_step(simd_float2 edge, simd_float2 x); +/*! @abstract 0 if x < edge, and 1 otherwise. + * @discussion Use a scalar value for edge if you want to apply the same + * threshold to all lanes. */ +static inline SIMD_CFUNC simd_float3 simd_step(simd_float3 edge, simd_float3 x); +/*! @abstract 0 if x < edge, and 1 otherwise. + * @discussion Use a scalar value for edge if you want to apply the same + * threshold to all lanes. */ +static inline SIMD_CFUNC simd_float4 simd_step(simd_float4 edge, simd_float4 x); +/*! @abstract 0 if x < edge, and 1 otherwise. + * @discussion Use a scalar value for edge if you want to apply the same + * threshold to all lanes. */ +static inline SIMD_CFUNC simd_float8 simd_step(simd_float8 edge, simd_float8 x); +/*! @abstract 0 if x < edge, and 1 otherwise. + * @discussion Use a scalar value for edge if you want to apply the same + * threshold to all lanes. */ +static inline SIMD_CFUNC simd_float16 simd_step(simd_float16 edge, simd_float16 x); +/*! @abstract 0 if x < edge, and 1 otherwise. + * @discussion Use a scalar value for edge if you want to apply the same + * threshold to all lanes. */ +static inline SIMD_CFUNC double simd_step(double edge, double x); +/*! @abstract 0 if x < edge, and 1 otherwise. + * @discussion Use a scalar value for edge if you want to apply the same + * threshold to all lanes. */ +static inline SIMD_CFUNC simd_double2 simd_step(simd_double2 edge, simd_double2 x); +/*! @abstract 0 if x < edge, and 1 otherwise. + * @discussion Use a scalar value for edge if you want to apply the same + * threshold to all lanes. 
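+ *
+ * For example (illustrative), simd_step(0.0, v) is 0.0 in each lane
+ * where the corresponding lane of v is negative, and 1.0 otherwise.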
*/ +static inline SIMD_CFUNC simd_double3 simd_step(simd_double3 edge, simd_double3 x); +/*! @abstract 0 if x < edge, and 1 otherwise. + * @discussion Use a scalar value for edge if you want to apply the same + * threshold to all lanes. */ +static inline SIMD_CFUNC simd_double4 simd_step(simd_double4 edge, simd_double4 x); +/*! @abstract 0 if x < edge, and 1 otherwise. + * @discussion Use a scalar value for edge if you want to apply the same + * threshold to all lanes. */ +static inline SIMD_CFUNC simd_double8 simd_step(simd_double8 edge, simd_double8 x); +/*! @abstract 0 if x < edge, and 1 otherwise. + * @discussion Deprecated. Use simd_step(edge, x) instead. */ +#define vector_step simd_step + +/*! @abstract Interpolates smoothly between 0 at edge0 and 1 at edge1 + * @discussion You can use a scalar value for edge0 and edge1 if you want + * to clamp all lanes at the same points. */ +static inline SIMD_CFUNC float simd_smoothstep(float edge0, float edge1, float x); +/*! @abstract Interpolates smoothly between 0 at edge0 and 1 at edge1 + * @discussion You can use a scalar value for edge0 and edge1 if you want + * to clamp all lanes at the same points. */ +static inline SIMD_CFUNC simd_float2 simd_smoothstep(simd_float2 edge0, simd_float2 edge1, simd_float2 x); +/*! @abstract Interpolates smoothly between 0 at edge0 and 1 at edge1 + * @discussion You can use a scalar value for edge0 and edge1 if you want + * to clamp all lanes at the same points. */ +static inline SIMD_CFUNC simd_float3 simd_smoothstep(simd_float3 edge0, simd_float3 edge1, simd_float3 x); +/*! @abstract Interpolates smoothly between 0 at edge0 and 1 at edge1 + * @discussion You can use a scalar value for edge0 and edge1 if you want + * to clamp all lanes at the same points. */ +static inline SIMD_CFUNC simd_float4 simd_smoothstep(simd_float4 edge0, simd_float4 edge1, simd_float4 x); +/*! @abstract Interpolates smoothly between 0 at edge0 and 1 at edge1 + * @discussion You can use a scalar value for edge0 and edge1 if you want + * to clamp all lanes at the same points. */ +static inline SIMD_CFUNC simd_float8 simd_smoothstep(simd_float8 edge0, simd_float8 edge1, simd_float8 x); +/*! @abstract Interpolates smoothly between 0 at edge0 and 1 at edge1 + * @discussion You can use a scalar value for edge0 and edge1 if you want + * to clamp all lanes at the same points. */ +static inline SIMD_CFUNC simd_float16 simd_smoothstep(simd_float16 edge0, simd_float16 edge1, simd_float16 x); +/*! @abstract Interpolates smoothly between 0 at edge0 and 1 at edge1 + * @discussion You can use a scalar value for edge0 and edge1 if you want + * to clamp all lanes at the same points. */ +static inline SIMD_CFUNC double simd_smoothstep(double edge0, double edge1, double x); +/*! @abstract Interpolates smoothly between 0 at edge0 and 1 at edge1 + * @discussion You can use a scalar value for edge0 and edge1 if you want + * to clamp all lanes at the same points. */ +static inline SIMD_CFUNC simd_double2 simd_smoothstep(simd_double2 edge0, simd_double2 edge1, simd_double2 x); +/*! @abstract Interpolates smoothly between 0 at edge0 and 1 at edge1 + * @discussion You can use a scalar value for edge0 and edge1 if you want + * to clamp all lanes at the same points. */ +static inline SIMD_CFUNC simd_double3 simd_smoothstep(simd_double3 edge0, simd_double3 edge1, simd_double3 x); +/*! 
@abstract Interpolates smoothly between 0 at edge0 and 1 at edge1
+ * @discussion You can use a scalar value for edge0 and edge1 if you want
+ * to clamp all lanes at the same points. */
+static inline SIMD_CFUNC simd_double4 simd_smoothstep(simd_double4 edge0, simd_double4 edge1, simd_double4 x);
+/*! @abstract Interpolates smoothly between 0 at edge0 and 1 at edge1
+ * @discussion You can use a scalar value for edge0 and edge1 if you want
+ * to clamp all lanes at the same points. */
+static inline SIMD_CFUNC simd_double8 simd_smoothstep(simd_double8 edge0, simd_double8 edge1, simd_double8 x);
+/*! @abstract Interpolates smoothly between 0 at edge0 and 1 at edge1
+ * @discussion Deprecated. Use simd_smoothstep(edge0, edge1, x) instead. */
+#define vector_smoothstep simd_smoothstep
+
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC char simd_reduce_add(simd_char2 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC char simd_reduce_add(simd_char3 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC char simd_reduce_add(simd_char4 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC char simd_reduce_add(simd_char8 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC char simd_reduce_add(simd_char16 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC char simd_reduce_add(simd_char32 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC char simd_reduce_add(simd_char64 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar2 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar3 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar4 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar8 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar16 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar32 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar64 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC short simd_reduce_add(simd_short2 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC short simd_reduce_add(simd_short3 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC short simd_reduce_add(simd_short4 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC short simd_reduce_add(simd_short8 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC short simd_reduce_add(simd_short16 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC short simd_reduce_add(simd_short32 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC unsigned short simd_reduce_add(simd_ushort2 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC unsigned short simd_reduce_add(simd_ushort3 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC unsigned short simd_reduce_add(simd_ushort4 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC unsigned short simd_reduce_add(simd_ushort8 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC unsigned short simd_reduce_add(simd_ushort16 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing.
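+ *
+ * For example (an illustrative sketch, assuming the simd_short
+ * conversion from <simd/conversion.h>), a simd_uchar16 v can be
+ * summed without wraparound by widening first:
+ *
+ *     short sum = simd_reduce_add(simd_short(v));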
*/
+static inline SIMD_CFUNC unsigned short simd_reduce_add(simd_ushort32 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC int simd_reduce_add(simd_int2 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC int simd_reduce_add(simd_int3 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC int simd_reduce_add(simd_int4 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC int simd_reduce_add(simd_int8 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC int simd_reduce_add(simd_int16 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC unsigned int simd_reduce_add(simd_uint2 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC unsigned int simd_reduce_add(simd_uint3 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC unsigned int simd_reduce_add(simd_uint4 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC unsigned int simd_reduce_add(simd_uint8 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC unsigned int simd_reduce_add(simd_uint16 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC float simd_reduce_add(simd_float2 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC float simd_reduce_add(simd_float3 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC float simd_reduce_add(simd_float4 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC float simd_reduce_add(simd_float8 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC float simd_reduce_add(simd_float16 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC simd_long1 simd_reduce_add(simd_long2 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC simd_long1 simd_reduce_add(simd_long3 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC simd_long1 simd_reduce_add(simd_long4 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC simd_long1 simd_reduce_add(simd_long8 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC simd_ulong1 simd_reduce_add(simd_ulong2 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC simd_ulong1 simd_reduce_add(simd_ulong3 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC simd_ulong1 simd_reduce_add(simd_ulong4 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC simd_ulong1 simd_reduce_add(simd_ulong8 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC double simd_reduce_add(simd_double2 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC double simd_reduce_add(simd_double3 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC double simd_reduce_add(simd_double4 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types,
+ * you may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC double simd_reduce_add(simd_double8 x);
+/*! @abstract Sum of elements in x.
+ * @discussion Deprecated. Use simd_reduce_add(x) instead. */
+#define vector_reduce_add simd_reduce_add
+
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC char simd_reduce_min(simd_char2 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC char simd_reduce_min(simd_char3 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC char simd_reduce_min(simd_char4 x);
+/*!
@abstract Minimum of elements in x. */ +static inline SIMD_CFUNC char simd_reduce_min(simd_char8 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC char simd_reduce_min(simd_char16 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC char simd_reduce_min(simd_char32 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC char simd_reduce_min(simd_char64 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar2 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar3 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar4 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar8 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar16 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar32 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar64 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC short simd_reduce_min(simd_short2 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC short simd_reduce_min(simd_short3 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC short simd_reduce_min(simd_short4 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC short simd_reduce_min(simd_short8 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC short simd_reduce_min(simd_short16 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC short simd_reduce_min(simd_short32 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC unsigned short simd_reduce_min(simd_ushort2 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC unsigned short simd_reduce_min(simd_ushort3 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC unsigned short simd_reduce_min(simd_ushort4 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC unsigned short simd_reduce_min(simd_ushort8 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC unsigned short simd_reduce_min(simd_ushort16 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC unsigned short simd_reduce_min(simd_ushort32 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC int simd_reduce_min(simd_int2 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC int simd_reduce_min(simd_int3 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC int simd_reduce_min(simd_int4 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC int simd_reduce_min(simd_int8 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC int simd_reduce_min(simd_int16 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC unsigned int simd_reduce_min(simd_uint2 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC unsigned int simd_reduce_min(simd_uint3 x); +/*! @abstract Minimum of elements in x. */ +static inline SIMD_CFUNC unsigned int simd_reduce_min(simd_uint4 x); +/*! @abstract Minimum of elements in x. 
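+ * @discussion For example (illustrative), the reduction of a vector
+ * whose lanes are { 3, 1, 4, 1 } is 1.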
*/
+static inline SIMD_CFUNC unsigned int simd_reduce_min(simd_uint8 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC unsigned int simd_reduce_min(simd_uint16 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC float simd_reduce_min(simd_float2 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC float simd_reduce_min(simd_float3 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC float simd_reduce_min(simd_float4 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC float simd_reduce_min(simd_float8 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC float simd_reduce_min(simd_float16 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC simd_long1 simd_reduce_min(simd_long2 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC simd_long1 simd_reduce_min(simd_long3 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC simd_long1 simd_reduce_min(simd_long4 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC simd_long1 simd_reduce_min(simd_long8 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC simd_ulong1 simd_reduce_min(simd_ulong2 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC simd_ulong1 simd_reduce_min(simd_ulong3 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC simd_ulong1 simd_reduce_min(simd_ulong4 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC simd_ulong1 simd_reduce_min(simd_ulong8 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC double simd_reduce_min(simd_double2 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC double simd_reduce_min(simd_double3 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC double simd_reduce_min(simd_double4 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC double simd_reduce_min(simd_double8 x);
+/*! @abstract Minimum of elements in x.
+ * @discussion Deprecated. Use simd_reduce_min(x) instead. */
+#define vector_reduce_min simd_reduce_min
+
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC char simd_reduce_max(simd_char2 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC char simd_reduce_max(simd_char3 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC char simd_reduce_max(simd_char4 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC char simd_reduce_max(simd_char8 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC char simd_reduce_max(simd_char16 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC char simd_reduce_max(simd_char32 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC char simd_reduce_max(simd_char64 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar2 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar3 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar4 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar8 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar16 x);
+/*!
@abstract Maximum of elements in x. */ +static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar32 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar64 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC short simd_reduce_max(simd_short2 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC short simd_reduce_max(simd_short3 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC short simd_reduce_max(simd_short4 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC short simd_reduce_max(simd_short8 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC short simd_reduce_max(simd_short16 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC short simd_reduce_max(simd_short32 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC unsigned short simd_reduce_max(simd_ushort2 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC unsigned short simd_reduce_max(simd_ushort3 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC unsigned short simd_reduce_max(simd_ushort4 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC unsigned short simd_reduce_max(simd_ushort8 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC unsigned short simd_reduce_max(simd_ushort16 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC unsigned short simd_reduce_max(simd_ushort32 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC int simd_reduce_max(simd_int2 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC int simd_reduce_max(simd_int3 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC int simd_reduce_max(simd_int4 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC int simd_reduce_max(simd_int8 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC int simd_reduce_max(simd_int16 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC unsigned int simd_reduce_max(simd_uint2 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC unsigned int simd_reduce_max(simd_uint3 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC unsigned int simd_reduce_max(simd_uint4 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC unsigned int simd_reduce_max(simd_uint8 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC unsigned int simd_reduce_max(simd_uint16 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC float simd_reduce_max(simd_float2 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC float simd_reduce_max(simd_float3 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC float simd_reduce_max(simd_float4 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC float simd_reduce_max(simd_float8 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC float simd_reduce_max(simd_float16 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC simd_long1 simd_reduce_max(simd_long2 x); +/*! @abstract Maximum of elements in x. */ +static inline SIMD_CFUNC simd_long1 simd_reduce_max(simd_long3 x); +/*! @abstract Maximum of elements in x. 
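+ * @discussion For example (illustrative), simd_reduce_max(simd_abs(v))
+ * is the largest magnitude among the lanes of v.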
*/
+static inline SIMD_CFUNC simd_long1 simd_reduce_max(simd_long4 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC simd_long1 simd_reduce_max(simd_long8 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC simd_ulong1 simd_reduce_max(simd_ulong2 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC simd_ulong1 simd_reduce_max(simd_ulong3 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC simd_ulong1 simd_reduce_max(simd_ulong4 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC simd_ulong1 simd_reduce_max(simd_ulong8 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC double simd_reduce_max(simd_double2 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC double simd_reduce_max(simd_double3 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC double simd_reduce_max(simd_double4 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC double simd_reduce_max(simd_double8 x);
+/*! @abstract Maximum of elements in x.
+ * @discussion Deprecated. Use simd_reduce_max(x) instead. */
+#define vector_reduce_max simd_reduce_max
+
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_char2 x, simd_char2 y) {
+  return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_char3 x, simd_char3 y) {
+  return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_char4 x, simd_char4 y) {
+  return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_char8 x, simd_char8 y) {
+  return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_char16 x, simd_char16 y) {
+  return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_char32 x, simd_char32 y) {
+  return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_char64 x, simd_char64 y) {
+  return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_uchar2 x, simd_uchar2 y) {
+  return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_uchar3 x, simd_uchar3 y) {
+  return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_uchar4 x, simd_uchar4 y) {
+  return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_uchar8 x, simd_uchar8 y) {
+  return simd_all(x == y);
+}
+/*!
@abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_uchar16 x, simd_uchar16 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_uchar32 x, simd_uchar32 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_uchar64 x, simd_uchar64 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_short2 x, simd_short2 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_short3 x, simd_short3 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_short4 x, simd_short4 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_short8 x, simd_short8 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_short16 x, simd_short16 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_short32 x, simd_short32 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_ushort2 x, simd_ushort2 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_ushort3 x, simd_ushort3 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_ushort4 x, simd_ushort4 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_ushort8 x, simd_ushort8 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_ushort16 x, simd_ushort16 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_ushort32 x, simd_ushort32 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_int2 x, simd_int2 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_int3 x, simd_int3 y) { + return simd_all(x == y); +} +/*! 
@abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_int4 x, simd_int4 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_int8 x, simd_int8 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_int16 x, simd_int16 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_uint2 x, simd_uint2 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_uint3 x, simd_uint3 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_uint4 x, simd_uint4 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_uint8 x, simd_uint8 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_uint16 x, simd_uint16 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_float2 x, simd_float2 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_float3 x, simd_float3 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_float4 x, simd_float4 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_float8 x, simd_float8 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_float16 x, simd_float16 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_long2 x, simd_long2 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_long3 x, simd_long3 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_long4 x, simd_long4 y) { + return simd_all(x == y); +} +/*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. */ +static inline SIMD_CFUNC simd_bool simd_equal(simd_long8 x, simd_long8 y) { + return simd_all(x == y); +} +/*! 
@abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_ulong2 x, simd_ulong2 y) {
+  return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_ulong3 x, simd_ulong3 y) {
+  return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_ulong4 x, simd_ulong4 y) {
+  return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_ulong8 x, simd_ulong8 y) {
+  return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_double2 x, simd_double2 y) {
+  return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_double3 x, simd_double3 y) {
+  return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_double4 x, simd_double4 y) {
+  return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_double8 x, simd_double8 y) {
+  return simd_all(x == y);
+}
+
+#ifdef __cplusplus
+} /* extern "C" */
+
+namespace simd {
+  /*! @abstract The lanewise absolute value of x. */
+  template <typename typeN> static SIMD_CPPFUNC typeN abs(const typeN x) { return ::simd_abs(x); }
+  /*! @abstract The lanewise maximum of x and y. */
+  template <typename typeN> static SIMD_CPPFUNC typeN max(const typeN x, const typeN y) { return ::simd_max(x,y); }
+  /*! @abstract The lanewise minimum of x and y. */
+  template <typename typeN> static SIMD_CPPFUNC typeN min(const typeN x, const typeN y) { return ::simd_min(x,y); }
+  /*! @abstract x clamped to the interval [min, max]. */
+  template <typename typeN> static SIMD_CPPFUNC typeN clamp(const typeN x, const typeN min, const typeN max) { return ::simd_clamp(x,min,max); }
+  /*! @abstract -1 if x < 0, +1 if x > 0, and 0 otherwise. */
+  template <typename fptypeN> static SIMD_CPPFUNC fptypeN sign(const fptypeN x) { return ::simd_sign(x); }
+  /*! @abstract Linearly interpolates between x and y, taking the value x when t=0 and y when t=1. */
+  template <typename fptypeN> static SIMD_CPPFUNC fptypeN mix(const fptypeN x, const fptypeN y, const fptypeN t) { return ::simd_mix(x,y,t); }
+  /*! @abstract An approximation to 1/x. */
+  template <typename fptypeN> static SIMD_CPPFUNC fptypeN recip(const fptypeN x) { return ::simd_recip(x); }
+  /*! @abstract An approximation to 1/sqrt(x). */
+  template <typename fptypeN> static SIMD_CPPFUNC fptypeN rsqrt(const fptypeN x) { return ::simd_rsqrt(x); }
+  /*! @abstract The "fractional part" of x, in the range [0,1). */
+  template <typename fptypeN> static SIMD_CPPFUNC fptypeN fract(const fptypeN x) { return ::simd_fract(x); }
+  /*! @abstract 0 if x < edge, 1 otherwise. */
+  template <typename fptypeN> static SIMD_CPPFUNC fptypeN step(const fptypeN edge, const fptypeN x) { return ::simd_step(edge,x); }
+  /*! @abstract Smoothly interpolates from 0 at edge0 to 1 at edge1.
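+   *
+   * Illustrative usage (assuming t is a simd_float4):
+   *
+   *     simd_float4 lo = { 0, 0, 0, 0 }, hi = { 1, 1, 1, 1 };
+   *     simd_float4 w = simd::smoothstep(lo, hi, t);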
*/ + template <typename fptypeN> static SIMD_CPPFUNC fptypeN smoothstep(const fptypeN edge0, const fptypeN edge1, const fptypeN x) { return ::simd_smoothstep(edge0,edge1,x); } + /*! @abstract True if and only if each lane of x is equal to the + * corresponding lane of y. + * + * @discussion This isn't operator== because that's already defined by + * the compiler to return a lane mask. */ + template <typename fptypeN> static SIMD_CPPFUNC simd_bool equal(const fptypeN x, const fptypeN y) { return ::simd_equal(x, y); } +#if __cpp_decltype_auto + /* If you are targeting an earlier version of the C++ standard that lacks + decltype_auto support, you may use the C-style simd_reduce_* functions + instead. */ + /*! @abstract The sum of the elements in x. May overflow. */ + template <typename typeN> static SIMD_CPPFUNC auto reduce_add(typeN x) { return ::simd_reduce_add(x); } + /*! @abstract The least element in x. */ + template <typename typeN> static SIMD_CPPFUNC auto reduce_min(typeN x) { return ::simd_reduce_min(x); } + /*! @abstract The greatest element in x. */ + template <typename typeN> static SIMD_CPPFUNC auto reduce_max(typeN x) { return ::simd_reduce_max(x); } +#endif + namespace precise { + /*! @abstract An approximation to 1/x. */ + template <typename fptypeN> static SIMD_CPPFUNC fptypeN recip(const fptypeN x) { return ::simd_precise_recip(x); } + /*! @abstract An approximation to 1/sqrt(x). */ + template <typename fptypeN> static SIMD_CPPFUNC fptypeN rsqrt(const fptypeN x) { return ::simd_precise_rsqrt(x); } + } + namespace fast { + /*! @abstract An approximation to 1/x. */ + template <typename fptypeN> static SIMD_CPPFUNC fptypeN recip(const fptypeN x) { return ::simd_fast_recip(x); } + /*! @abstract An approximation to 1/sqrt(x). */ + template <typename fptypeN> static SIMD_CPPFUNC fptypeN rsqrt(const fptypeN x) { return ::simd_fast_rsqrt(x); } + } +} + +extern "C" { +#endif /* __cplusplus */ + +#pragma mark - Implementation + +static inline SIMD_CFUNC simd_char2 simd_abs(simd_char2 x) { + return simd_make_char2(simd_abs(simd_make_char8_undef(x))); +} + +static inline SIMD_CFUNC simd_char3 simd_abs(simd_char3 x) { + return simd_make_char3(simd_abs(simd_make_char8_undef(x))); +} + +static inline SIMD_CFUNC simd_char4 simd_abs(simd_char4 x) { + return simd_make_char4(simd_abs(simd_make_char8_undef(x))); +} + +static inline SIMD_CFUNC simd_char8 simd_abs(simd_char8 x) { +#if defined __arm__ || defined __arm64__ + return vabs_s8(x); +#else + return simd_make_char8(simd_abs(simd_make_char16_undef(x))); +#endif +} + +static inline SIMD_CFUNC simd_char16 simd_abs(simd_char16 x) { +#if defined __arm__ || defined __arm64__ + return vabsq_s8(x); +#elif defined __SSE4_1__ + return (simd_char16) _mm_abs_epi8((__m128i)x); +#else + simd_char16 mask = x >> 7; return (x ^ mask) - mask; +#endif +} + +static inline SIMD_CFUNC simd_char32 simd_abs(simd_char32 x) { +#if defined __AVX2__ + return _mm256_abs_epi8(x); +#else + return simd_make_char32(simd_abs(x.lo), simd_abs(x.hi)); +#endif +} + +static inline SIMD_CFUNC simd_char64 simd_abs(simd_char64 x) { +#if defined __AVX512BW__ + return _mm512_abs_epi8(x); +#else + return simd_make_char64(simd_abs(x.lo), simd_abs(x.hi)); +#endif +} + +static inline SIMD_CFUNC simd_short2 simd_abs(simd_short2 x) { + return simd_make_short2(simd_abs(simd_make_short4_undef(x))); +} + +static inline SIMD_CFUNC simd_short3 simd_abs(simd_short3 x) { + return simd_make_short3(simd_abs(simd_make_short4_undef(x))); +} + +static inline SIMD_CFUNC simd_short4 
simd_abs(simd_short4 x) {
+#if defined __arm__ || defined __arm64__
+  return vabs_s16(x);
+#else
+  return simd_make_short4(simd_abs(simd_make_short8_undef(x)));
+#endif
+}
+
+static inline SIMD_CFUNC simd_short8 simd_abs(simd_short8 x) {
+#if defined __arm__ || defined __arm64__
+  return vabsq_s16(x);
+#elif defined __SSE4_1__
+  return (simd_short8) _mm_abs_epi16((__m128i)x);
+#else
+  simd_short8 mask = x >> 15; return (x ^ mask) - mask;
+#endif
+}
+
+static inline SIMD_CFUNC simd_short16 simd_abs(simd_short16 x) {
+#if defined __AVX2__
+  return _mm256_abs_epi16(x);
+#else
+  return simd_make_short16(simd_abs(x.lo), simd_abs(x.hi));
+#endif
+}
+
+static inline SIMD_CFUNC simd_short32 simd_abs(simd_short32 x) {
+#if defined __AVX512BW__
+  return _mm512_abs_epi16(x);
+#else
+  return simd_make_short32(simd_abs(x.lo), simd_abs(x.hi));
+#endif
+}
+
+static inline SIMD_CFUNC simd_int2 simd_abs(simd_int2 x) {
+#if defined __arm__ || defined __arm64__
+  return vabs_s32(x);
+#else
+  return simd_make_int2(simd_abs(simd_make_int4_undef(x)));
+#endif
+}
+
+static inline SIMD_CFUNC simd_int3 simd_abs(simd_int3 x) {
+  return simd_make_int3(simd_abs(simd_make_int4_undef(x)));
+}
+
+static inline SIMD_CFUNC simd_int4 simd_abs(simd_int4 x) {
+#if defined __arm__ || defined __arm64__
+  return vabsq_s32(x);
+#elif defined __SSE4_1__
+  return (simd_int4) _mm_abs_epi32((__m128i)x);
+#else
+  simd_int4 mask = x >> 31; return (x ^ mask) - mask;
+#endif
+}
+
+static inline SIMD_CFUNC simd_int8 simd_abs(simd_int8 x) {
+#if defined __AVX2__
+  return _mm256_abs_epi32(x);
+#else
+  return simd_make_int8(simd_abs(x.lo), simd_abs(x.hi));
+#endif
+}
+
+static inline SIMD_CFUNC simd_int16 simd_abs(simd_int16 x) {
+#if defined __AVX512F__
+  return _mm512_abs_epi32(x);
+#else
+  return simd_make_int16(simd_abs(x.lo), simd_abs(x.hi));
+#endif
+}
+
+static inline SIMD_CFUNC simd_float2 simd_abs(simd_float2 x) {
+  return __tg_fabs(x);
+}
+
+static inline SIMD_CFUNC simd_float3 simd_abs(simd_float3 x) {
+  return __tg_fabs(x);
+}
+
+static inline SIMD_CFUNC simd_float4 simd_abs(simd_float4 x) {
+  return __tg_fabs(x);
+}
+
+static inline SIMD_CFUNC simd_float8 simd_abs(simd_float8 x) {
+  return __tg_fabs(x);
+}
+
+static inline SIMD_CFUNC simd_float16 simd_abs(simd_float16 x) {
+  return __tg_fabs(x);
+}
+
+static inline SIMD_CFUNC simd_long2 simd_abs(simd_long2 x) {
+#if defined __arm64__
+  return vabsq_s64(x);
+#elif defined __AVX512VL__  /* _mm_abs_epi64 requires AVX-512VL */
+  return (simd_long2) _mm_abs_epi64((__m128i)x);
+#else
+  /* Branch-free two's-complement absolute value. */
+  simd_long2 mask = x >> 63; return (x ^ mask) - mask;
+#endif
+}
+
+static inline SIMD_CFUNC simd_long3 simd_abs(simd_long3 x) {
+  return simd_make_long3(simd_abs(simd_make_long4_undef(x)));
+}
+
+static inline SIMD_CFUNC simd_long4 simd_abs(simd_long4 x) {
+#if defined __AVX512VL__  /* _mm256_abs_epi64 requires AVX-512VL */
+  return _mm256_abs_epi64(x);
+#else
+  return simd_make_long4(simd_abs(x.lo), simd_abs(x.hi));
+#endif
+}
+
+static inline SIMD_CFUNC simd_long8 simd_abs(simd_long8 x) {
+#if defined __AVX512F__
+  return _mm512_abs_epi64(x);
+#else
+  return simd_make_long8(simd_abs(x.lo), simd_abs(x.hi));
+#endif
+}
+
+static inline SIMD_CFUNC simd_double2 simd_abs(simd_double2 x) {
+  return __tg_fabs(x);
+}
+
+static inline SIMD_CFUNC simd_double3 simd_abs(simd_double3 x) {
+  return __tg_fabs(x);
+}
+
+static inline SIMD_CFUNC simd_double4 simd_abs(simd_double4 x) {
+  return __tg_fabs(x);
+}
+
+static inline SIMD_CFUNC simd_double8 simd_abs(simd_double8 x) {
+  return __tg_fabs(x);
+}
+
+static inline SIMD_CFUNC simd_char2 simd_min(simd_char2 x, simd_char2 y) {
+  return
simd_make_char2(simd_min(simd_make_char8_undef(x), simd_make_char8_undef(y))); +} + +static inline SIMD_CFUNC simd_char3 simd_min(simd_char3 x, simd_char3 y) { + return simd_make_char3(simd_min(simd_make_char8_undef(x), simd_make_char8_undef(y))); +} + +static inline SIMD_CFUNC simd_char4 simd_min(simd_char4 x, simd_char4 y) { + return simd_make_char4(simd_min(simd_make_char8_undef(x), simd_make_char8_undef(y))); +} + +static inline SIMD_CFUNC simd_char8 simd_min(simd_char8 x, simd_char8 y) { +#if defined __arm__ || defined __arm64__ + return vmin_s8(x, y); +#else + return simd_make_char8(simd_min(simd_make_char16_undef(x), simd_make_char16_undef(y))); +#endif + +} + +static inline SIMD_CFUNC simd_char16 simd_min(simd_char16 x, simd_char16 y) { +#if defined __arm__ || defined __arm64__ + return vminq_s8(x, y); +#elif defined __SSE4_1__ + return (simd_char16) _mm_min_epi8((__m128i)x, (__m128i)y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_char32 simd_min(simd_char32 x, simd_char32 y) { +#if defined __AVX2__ + return _mm256_min_epi8(x, y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_char64 simd_min(simd_char64 x, simd_char64 y) { +#if defined __AVX512BW__ + return _mm512_min_epi8(x, y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_uchar2 simd_min(simd_uchar2 x, simd_uchar2 y) { + return simd_make_uchar2(simd_min(simd_make_uchar8_undef(x), simd_make_uchar8_undef(y))); +} + +static inline SIMD_CFUNC simd_uchar3 simd_min(simd_uchar3 x, simd_uchar3 y) { + return simd_make_uchar3(simd_min(simd_make_uchar8_undef(x), simd_make_uchar8_undef(y))); +} + +static inline SIMD_CFUNC simd_uchar4 simd_min(simd_uchar4 x, simd_uchar4 y) { + return simd_make_uchar4(simd_min(simd_make_uchar8_undef(x), simd_make_uchar8_undef(y))); +} + +static inline SIMD_CFUNC simd_uchar8 simd_min(simd_uchar8 x, simd_uchar8 y) { +#if defined __arm__ || defined __arm64__ + return vmin_u8(x, y); +#else + return simd_make_uchar8(simd_min(simd_make_uchar16_undef(x), simd_make_uchar16_undef(y))); +#endif + +} + +static inline SIMD_CFUNC simd_uchar16 simd_min(simd_uchar16 x, simd_uchar16 y) { +#if defined __arm__ || defined __arm64__ + return vminq_u8(x, y); +#elif defined __SSE4_1__ + return (simd_uchar16) _mm_min_epu8((__m128i)x, (__m128i)y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_uchar32 simd_min(simd_uchar32 x, simd_uchar32 y) { +#if defined __AVX2__ + return _mm256_min_epu8(x, y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_uchar64 simd_min(simd_uchar64 x, simd_uchar64 y) { +#if defined __AVX512BW__ + return _mm512_min_epu8(x, y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_short2 simd_min(simd_short2 x, simd_short2 y) { + return simd_make_short2(simd_min(simd_make_short4_undef(x), simd_make_short4_undef(y))); +} + +static inline SIMD_CFUNC simd_short3 simd_min(simd_short3 x, simd_short3 y) { + return simd_make_short3(simd_min(simd_make_short4_undef(x), simd_make_short4_undef(y))); +} + +static inline SIMD_CFUNC simd_short4 simd_min(simd_short4 x, simd_short4 y) { +#if defined __arm__ || defined __arm64__ + return vmin_s16(x, y); +#else + return simd_make_short4(simd_min(simd_make_short8_undef(x), simd_make_short8_undef(y))); +#endif + +} + +static inline SIMD_CFUNC simd_short8 simd_min(simd_short8 x, simd_short8 y) { +#if defined __arm__ || defined __arm64__ 
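+  // On ARM, NEON's vminq_s16 computes the lanewise signed minimum directly.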
+ return vminq_s16(x, y); +#elif defined __SSE4_1__ + return (simd_short8) _mm_min_epi16((__m128i)x, (__m128i)y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_short16 simd_min(simd_short16 x, simd_short16 y) { +#if defined __AVX2__ + return _mm256_min_epi16(x, y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_short32 simd_min(simd_short32 x, simd_short32 y) { +#if defined __AVX512BW__ + return _mm512_min_epi16(x, y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_ushort2 simd_min(simd_ushort2 x, simd_ushort2 y) { + return simd_make_ushort2(simd_min(simd_make_ushort4_undef(x), simd_make_ushort4_undef(y))); +} + +static inline SIMD_CFUNC simd_ushort3 simd_min(simd_ushort3 x, simd_ushort3 y) { + return simd_make_ushort3(simd_min(simd_make_ushort4_undef(x), simd_make_ushort4_undef(y))); +} + +static inline SIMD_CFUNC simd_ushort4 simd_min(simd_ushort4 x, simd_ushort4 y) { +#if defined __arm__ || defined __arm64__ + return vmin_u16(x, y); +#else + return simd_make_ushort4(simd_min(simd_make_ushort8_undef(x), simd_make_ushort8_undef(y))); +#endif + +} + +static inline SIMD_CFUNC simd_ushort8 simd_min(simd_ushort8 x, simd_ushort8 y) { +#if defined __arm__ || defined __arm64__ + return vminq_u16(x, y); +#elif defined __SSE4_1__ + return (simd_ushort8) _mm_min_epu16((__m128i)x, (__m128i)y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_ushort16 simd_min(simd_ushort16 x, simd_ushort16 y) { +#if defined __AVX2__ + return _mm256_min_epu16(x, y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_ushort32 simd_min(simd_ushort32 x, simd_ushort32 y) { +#if defined __AVX512BW__ + return _mm512_min_epu16(x, y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_int2 simd_min(simd_int2 x, simd_int2 y) { +#if defined __arm__ || defined __arm64__ + return vmin_s32(x, y); +#else + return simd_make_int2(simd_min(simd_make_int4_undef(x), simd_make_int4_undef(y))); +#endif + +} + +static inline SIMD_CFUNC simd_int3 simd_min(simd_int3 x, simd_int3 y) { + return simd_make_int3(simd_min(simd_make_int4_undef(x), simd_make_int4_undef(y))); +} + +static inline SIMD_CFUNC simd_int4 simd_min(simd_int4 x, simd_int4 y) { +#if defined __arm__ || defined __arm64__ + return vminq_s32(x, y); +#elif defined __SSE4_1__ + return (simd_int4) _mm_min_epi32((__m128i)x, (__m128i)y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_int8 simd_min(simd_int8 x, simd_int8 y) { +#if defined __AVX2__ + return _mm256_min_epi32(x, y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_int16 simd_min(simd_int16 x, simd_int16 y) { +#if defined __AVX512F__ + return _mm512_min_epi32(x, y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_uint2 simd_min(simd_uint2 x, simd_uint2 y) { +#if defined __arm__ || defined __arm64__ + return vmin_u32(x, y); +#else + return simd_make_uint2(simd_min(simd_make_uint4_undef(x), simd_make_uint4_undef(y))); +#endif + +} + +static inline SIMD_CFUNC simd_uint3 simd_min(simd_uint3 x, simd_uint3 y) { + return simd_make_uint3(simd_min(simd_make_uint4_undef(x), simd_make_uint4_undef(y))); +} + +static inline SIMD_CFUNC simd_uint4 simd_min(simd_uint4 x, simd_uint4 y) { +#if defined __arm__ || defined __arm64__ + return vminq_u32(x, y); +#elif defined __SSE4_1__ 
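+  /* Editor's note: _mm_min_epu32 (PMINUD) is an SSE4.1 instruction; on
+   * older x86 targets the #else branch below synthesizes the unsigned
+   * minimum from a lanewise compare and simd_bitselect instead. */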
+ return (simd_uint4) _mm_min_epu32((__m128i)x, (__m128i)y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_uint8 simd_min(simd_uint8 x, simd_uint8 y) { +#if defined __AVX2__ + return _mm256_min_epu32(x, y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_uint16 simd_min(simd_uint16 x, simd_uint16 y) { +#if defined __AVX512F__ + return _mm512_min_epu32(x, y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC float simd_min(float x, float y) { + return __tg_fmin(x,y); +} + +static inline SIMD_CFUNC simd_float2 simd_min(simd_float2 x, simd_float2 y) { + return __tg_fmin(x,y); +} + +static inline SIMD_CFUNC simd_float3 simd_min(simd_float3 x, simd_float3 y) { + return __tg_fmin(x,y); +} + +static inline SIMD_CFUNC simd_float4 simd_min(simd_float4 x, simd_float4 y) { + return __tg_fmin(x,y); +} + +static inline SIMD_CFUNC simd_float8 simd_min(simd_float8 x, simd_float8 y) { + return __tg_fmin(x,y); +} + +static inline SIMD_CFUNC simd_float16 simd_min(simd_float16 x, simd_float16 y) { + return __tg_fmin(x,y); +} + +static inline SIMD_CFUNC simd_long2 simd_min(simd_long2 x, simd_long2 y) { +#if defined __AVX512VL__ + return _mm_min_epi64(x, y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_long3 simd_min(simd_long3 x, simd_long3 y) { + return simd_make_long3(simd_min(simd_make_long4_undef(x), simd_make_long4_undef(y))); +} + +static inline SIMD_CFUNC simd_long4 simd_min(simd_long4 x, simd_long4 y) { +#if defined __AVX512VL__ + return _mm256_min_epi64(x, y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_long8 simd_min(simd_long8 x, simd_long8 y) { +#if defined __AVX512F__ + return _mm512_min_epi64(x, y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_ulong2 simd_min(simd_ulong2 x, simd_ulong2 y) { +#if defined __AVX512VL__ + return _mm_min_epu64(x, y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_ulong3 simd_min(simd_ulong3 x, simd_ulong3 y) { + return simd_make_ulong3(simd_min(simd_make_ulong4_undef(x), simd_make_ulong4_undef(y))); +} + +static inline SIMD_CFUNC simd_ulong4 simd_min(simd_ulong4 x, simd_ulong4 y) { +#if defined __AVX512VL__ + return _mm256_min_epu64(x, y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC simd_ulong8 simd_min(simd_ulong8 x, simd_ulong8 y) { +#if defined __AVX512F__ + return _mm512_min_epu64(x, y); +#else + return simd_bitselect(x, y, y < x); +#endif +} + +static inline SIMD_CFUNC double simd_min(double x, double y) { + return __tg_fmin(x,y); +} + +static inline SIMD_CFUNC simd_double2 simd_min(simd_double2 x, simd_double2 y) { + return __tg_fmin(x,y); +} + +static inline SIMD_CFUNC simd_double3 simd_min(simd_double3 x, simd_double3 y) { + return __tg_fmin(x,y); +} + +static inline SIMD_CFUNC simd_double4 simd_min(simd_double4 x, simd_double4 y) { + return __tg_fmin(x,y); +} + +static inline SIMD_CFUNC simd_double8 simd_min(simd_double8 x, simd_double8 y) { + return __tg_fmin(x,y); +} + +static inline SIMD_CFUNC simd_char2 simd_max(simd_char2 x, simd_char2 y) { + return simd_make_char2(simd_max(simd_make_char8_undef(x), simd_make_char8_undef(y))); +} + +static inline SIMD_CFUNC simd_char3 simd_max(simd_char3 x, simd_char3 y) { + return simd_make_char3(simd_max(simd_make_char8_undef(x), simd_make_char8_undef(y))); +} + +static inline SIMD_CFUNC 
simd_char4 simd_max(simd_char4 x, simd_char4 y) { + return simd_make_char4(simd_max(simd_make_char8_undef(x), simd_make_char8_undef(y))); +} + +static inline SIMD_CFUNC simd_char8 simd_max(simd_char8 x, simd_char8 y) { +#if defined __arm__ || defined __arm64__ + return vmax_s8(x, y); +#else + return simd_make_char8(simd_max(simd_make_char16_undef(x), simd_make_char16_undef(y))); +#endif + +} + +static inline SIMD_CFUNC simd_char16 simd_max(simd_char16 x, simd_char16 y) { +#if defined __arm__ || defined __arm64__ + return vmaxq_s8(x, y); +#elif defined __SSE4_1__ + return (simd_char16) _mm_max_epi8((__m128i)x, (__m128i)y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_char32 simd_max(simd_char32 x, simd_char32 y) { +#if defined __AVX2__ + return _mm256_max_epi8(x, y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_char64 simd_max(simd_char64 x, simd_char64 y) { +#if defined __AVX512BW__ + return _mm512_max_epi8(x, y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_uchar2 simd_max(simd_uchar2 x, simd_uchar2 y) { + return simd_make_uchar2(simd_max(simd_make_uchar8_undef(x), simd_make_uchar8_undef(y))); +} + +static inline SIMD_CFUNC simd_uchar3 simd_max(simd_uchar3 x, simd_uchar3 y) { + return simd_make_uchar3(simd_max(simd_make_uchar8_undef(x), simd_make_uchar8_undef(y))); +} + +static inline SIMD_CFUNC simd_uchar4 simd_max(simd_uchar4 x, simd_uchar4 y) { + return simd_make_uchar4(simd_max(simd_make_uchar8_undef(x), simd_make_uchar8_undef(y))); +} + +static inline SIMD_CFUNC simd_uchar8 simd_max(simd_uchar8 x, simd_uchar8 y) { +#if defined __arm__ || defined __arm64__ + return vmax_u8(x, y); +#else + return simd_make_uchar8(simd_max(simd_make_uchar16_undef(x), simd_make_uchar16_undef(y))); +#endif + +} + +static inline SIMD_CFUNC simd_uchar16 simd_max(simd_uchar16 x, simd_uchar16 y) { +#if defined __arm__ || defined __arm64__ + return vmaxq_u8(x, y); +#elif defined __SSE4_1__ + return (simd_uchar16) _mm_max_epu8((__m128i)x, (__m128i)y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_uchar32 simd_max(simd_uchar32 x, simd_uchar32 y) { +#if defined __AVX2__ + return _mm256_max_epu8(x, y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_uchar64 simd_max(simd_uchar64 x, simd_uchar64 y) { +#if defined __AVX512BW__ + return _mm512_max_epu8(x, y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_short2 simd_max(simd_short2 x, simd_short2 y) { + return simd_make_short2(simd_max(simd_make_short4_undef(x), simd_make_short4_undef(y))); +} + +static inline SIMD_CFUNC simd_short3 simd_max(simd_short3 x, simd_short3 y) { + return simd_make_short3(simd_max(simd_make_short4_undef(x), simd_make_short4_undef(y))); +} + +static inline SIMD_CFUNC simd_short4 simd_max(simd_short4 x, simd_short4 y) { +#if defined __arm__ || defined __arm64__ + return vmax_s16(x, y); +#else + return simd_make_short4(simd_max(simd_make_short8_undef(x), simd_make_short8_undef(y))); +#endif + +} + +static inline SIMD_CFUNC simd_short8 simd_max(simd_short8 x, simd_short8 y) { +#if defined __arm__ || defined __arm64__ + return vmaxq_s16(x, y); +#elif defined __SSE4_1__ + return (simd_short8) _mm_max_epi16((__m128i)x, (__m128i)y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_short16 simd_max(simd_short16 x, simd_short16 y) { +#if defined __AVX2__ + 
return _mm256_max_epi16(x, y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_short32 simd_max(simd_short32 x, simd_short32 y) { +#if defined __AVX512BW__ + return _mm512_max_epi16(x, y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_ushort2 simd_max(simd_ushort2 x, simd_ushort2 y) { + return simd_make_ushort2(simd_max(simd_make_ushort4_undef(x), simd_make_ushort4_undef(y))); +} + +static inline SIMD_CFUNC simd_ushort3 simd_max(simd_ushort3 x, simd_ushort3 y) { + return simd_make_ushort3(simd_max(simd_make_ushort4_undef(x), simd_make_ushort4_undef(y))); +} + +static inline SIMD_CFUNC simd_ushort4 simd_max(simd_ushort4 x, simd_ushort4 y) { +#if defined __arm__ || defined __arm64__ + return vmax_u16(x, y); +#else + return simd_make_ushort4(simd_max(simd_make_ushort8_undef(x), simd_make_ushort8_undef(y))); +#endif + +} + +static inline SIMD_CFUNC simd_ushort8 simd_max(simd_ushort8 x, simd_ushort8 y) { +#if defined __arm__ || defined __arm64__ + return vmaxq_u16(x, y); +#elif defined __SSE4_1__ + return (simd_ushort8) _mm_max_epu16((__m128i)x, (__m128i)y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_ushort16 simd_max(simd_ushort16 x, simd_ushort16 y) { +#if defined __AVX2__ + return _mm256_max_epu16(x, y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_ushort32 simd_max(simd_ushort32 x, simd_ushort32 y) { +#if defined __AVX512BW__ + return _mm512_max_epu16(x, y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_int2 simd_max(simd_int2 x, simd_int2 y) { +#if defined __arm__ || defined __arm64__ + return vmax_s32(x, y); +#else + return simd_make_int2(simd_max(simd_make_int4_undef(x), simd_make_int4_undef(y))); +#endif + +} + +static inline SIMD_CFUNC simd_int3 simd_max(simd_int3 x, simd_int3 y) { + return simd_make_int3(simd_max(simd_make_int4_undef(x), simd_make_int4_undef(y))); +} + +static inline SIMD_CFUNC simd_int4 simd_max(simd_int4 x, simd_int4 y) { +#if defined __arm__ || defined __arm64__ + return vmaxq_s32(x, y); +#elif defined __SSE4_1__ + return (simd_int4) _mm_max_epi32((__m128i)x, (__m128i)y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_int8 simd_max(simd_int8 x, simd_int8 y) { +#if defined __AVX2__ + return _mm256_max_epi32(x, y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_int16 simd_max(simd_int16 x, simd_int16 y) { +#if defined __AVX512F__ + return _mm512_max_epi32(x, y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_uint2 simd_max(simd_uint2 x, simd_uint2 y) { +#if defined __arm__ || defined __arm64__ + return vmax_u32(x, y); +#else + return simd_make_uint2(simd_max(simd_make_uint4_undef(x), simd_make_uint4_undef(y))); +#endif + +} + +static inline SIMD_CFUNC simd_uint3 simd_max(simd_uint3 x, simd_uint3 y) { + return simd_make_uint3(simd_max(simd_make_uint4_undef(x), simd_make_uint4_undef(y))); +} + +static inline SIMD_CFUNC simd_uint4 simd_max(simd_uint4 x, simd_uint4 y) { +#if defined __arm__ || defined __arm64__ + return vmaxq_u32(x, y); +#elif defined __SSE4_1__ + return (simd_uint4) _mm_max_epu32((__m128i)x, (__m128i)y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_uint8 simd_max(simd_uint8 x, simd_uint8 y) { +#if defined __AVX2__ + return _mm256_max_epu32(x, y); +#else + return 
simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_uint16 simd_max(simd_uint16 x, simd_uint16 y) { +#if defined __AVX512F__ + return _mm512_max_epu32(x, y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC float simd_max(float x, float y) { + return __tg_fmax(x,y); +} + +static inline SIMD_CFUNC simd_float2 simd_max(simd_float2 x, simd_float2 y) { + return __tg_fmax(x,y); +} + +static inline SIMD_CFUNC simd_float3 simd_max(simd_float3 x, simd_float3 y) { + return __tg_fmax(x,y); +} + +static inline SIMD_CFUNC simd_float4 simd_max(simd_float4 x, simd_float4 y) { + return __tg_fmax(x,y); +} + +static inline SIMD_CFUNC simd_float8 simd_max(simd_float8 x, simd_float8 y) { + return __tg_fmax(x,y); +} + +static inline SIMD_CFUNC simd_float16 simd_max(simd_float16 x, simd_float16 y) { + return __tg_fmax(x,y); +} + +static inline SIMD_CFUNC simd_long2 simd_max(simd_long2 x, simd_long2 y) { +#if defined __AVX512VL__ + return _mm_max_epi64(x, y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_long3 simd_max(simd_long3 x, simd_long3 y) { + return simd_make_long3(simd_max(simd_make_long4_undef(x), simd_make_long4_undef(y))); +} + +static inline SIMD_CFUNC simd_long4 simd_max(simd_long4 x, simd_long4 y) { +#if defined __AVX512VL__ + return _mm256_max_epi64(x, y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_long8 simd_max(simd_long8 x, simd_long8 y) { +#if defined __AVX512F__ + return _mm512_max_epi64(x, y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_ulong2 simd_max(simd_ulong2 x, simd_ulong2 y) { +#if defined __AVX512VL__ + return _mm_max_epu64(x, y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_ulong3 simd_max(simd_ulong3 x, simd_ulong3 y) { + return simd_make_ulong3(simd_max(simd_make_ulong4_undef(x), simd_make_ulong4_undef(y))); +} + +static inline SIMD_CFUNC simd_ulong4 simd_max(simd_ulong4 x, simd_ulong4 y) { +#if defined __AVX512VL__ + return _mm256_max_epu64(x, y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC simd_ulong8 simd_max(simd_ulong8 x, simd_ulong8 y) { +#if defined __AVX512F__ + return _mm512_max_epu64(x, y); +#else + return simd_bitselect(x, y, x < y); +#endif +} + +static inline SIMD_CFUNC double simd_max(double x, double y) { + return __tg_fmax(x,y); +} + +static inline SIMD_CFUNC simd_double2 simd_max(simd_double2 x, simd_double2 y) { + return __tg_fmax(x,y); +} + +static inline SIMD_CFUNC simd_double3 simd_max(simd_double3 x, simd_double3 y) { + return __tg_fmax(x,y); +} + +static inline SIMD_CFUNC simd_double4 simd_max(simd_double4 x, simd_double4 y) { + return __tg_fmax(x,y); +} + +static inline SIMD_CFUNC simd_double8 simd_max(simd_double8 x, simd_double8 y) { + return __tg_fmax(x,y); +} + +static inline SIMD_CFUNC simd_char2 simd_clamp(simd_char2 x, simd_char2 min, simd_char2 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_char3 simd_clamp(simd_char3 x, simd_char3 min, simd_char3 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_char4 simd_clamp(simd_char4 x, simd_char4 min, simd_char4 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_char8 simd_clamp(simd_char8 x, simd_char8 min, simd_char8 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_char16 simd_clamp(simd_char16 
x, simd_char16 min, simd_char16 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_char32 simd_clamp(simd_char32 x, simd_char32 min, simd_char32 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_char64 simd_clamp(simd_char64 x, simd_char64 min, simd_char64 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_uchar2 simd_clamp(simd_uchar2 x, simd_uchar2 min, simd_uchar2 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_uchar3 simd_clamp(simd_uchar3 x, simd_uchar3 min, simd_uchar3 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_uchar4 simd_clamp(simd_uchar4 x, simd_uchar4 min, simd_uchar4 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_uchar8 simd_clamp(simd_uchar8 x, simd_uchar8 min, simd_uchar8 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_uchar16 simd_clamp(simd_uchar16 x, simd_uchar16 min, simd_uchar16 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_uchar32 simd_clamp(simd_uchar32 x, simd_uchar32 min, simd_uchar32 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_uchar64 simd_clamp(simd_uchar64 x, simd_uchar64 min, simd_uchar64 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_short2 simd_clamp(simd_short2 x, simd_short2 min, simd_short2 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_short3 simd_clamp(simd_short3 x, simd_short3 min, simd_short3 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_short4 simd_clamp(simd_short4 x, simd_short4 min, simd_short4 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_short8 simd_clamp(simd_short8 x, simd_short8 min, simd_short8 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_short16 simd_clamp(simd_short16 x, simd_short16 min, simd_short16 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_short32 simd_clamp(simd_short32 x, simd_short32 min, simd_short32 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_ushort2 simd_clamp(simd_ushort2 x, simd_ushort2 min, simd_ushort2 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_ushort3 simd_clamp(simd_ushort3 x, simd_ushort3 min, simd_ushort3 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_ushort4 simd_clamp(simd_ushort4 x, simd_ushort4 min, simd_ushort4 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_ushort8 simd_clamp(simd_ushort8 x, simd_ushort8 min, simd_ushort8 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_ushort16 simd_clamp(simd_ushort16 x, simd_ushort16 min, simd_ushort16 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_ushort32 simd_clamp(simd_ushort32 x, simd_ushort32 min, simd_ushort32 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_int2 simd_clamp(simd_int2 x, simd_int2 min, simd_int2 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_int3 simd_clamp(simd_int3 x, simd_int3 min, simd_int3 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_int4 
simd_clamp(simd_int4 x, simd_int4 min, simd_int4 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_int8 simd_clamp(simd_int8 x, simd_int8 min, simd_int8 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_int16 simd_clamp(simd_int16 x, simd_int16 min, simd_int16 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_uint2 simd_clamp(simd_uint2 x, simd_uint2 min, simd_uint2 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_uint3 simd_clamp(simd_uint3 x, simd_uint3 min, simd_uint3 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_uint4 simd_clamp(simd_uint4 x, simd_uint4 min, simd_uint4 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_uint8 simd_clamp(simd_uint8 x, simd_uint8 min, simd_uint8 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_uint16 simd_clamp(simd_uint16 x, simd_uint16 min, simd_uint16 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC float simd_clamp(float x, float min, float max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_float2 simd_clamp(simd_float2 x, simd_float2 min, simd_float2 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_float3 simd_clamp(simd_float3 x, simd_float3 min, simd_float3 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_float4 simd_clamp(simd_float4 x, simd_float4 min, simd_float4 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_float8 simd_clamp(simd_float8 x, simd_float8 min, simd_float8 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_float16 simd_clamp(simd_float16 x, simd_float16 min, simd_float16 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_long2 simd_clamp(simd_long2 x, simd_long2 min, simd_long2 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_long3 simd_clamp(simd_long3 x, simd_long3 min, simd_long3 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_long4 simd_clamp(simd_long4 x, simd_long4 min, simd_long4 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_long8 simd_clamp(simd_long8 x, simd_long8 min, simd_long8 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_ulong2 simd_clamp(simd_ulong2 x, simd_ulong2 min, simd_ulong2 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_ulong3 simd_clamp(simd_ulong3 x, simd_ulong3 min, simd_ulong3 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_ulong4 simd_clamp(simd_ulong4 x, simd_ulong4 min, simd_ulong4 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_ulong8 simd_clamp(simd_ulong8 x, simd_ulong8 min, simd_ulong8 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC double simd_clamp(double x, double min, double max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_double2 simd_clamp(simd_double2 x, simd_double2 min, simd_double2 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_double3 simd_clamp(simd_double3 x, simd_double3 min, simd_double3 max) { + return simd_min(simd_max(x, min), max); +} + 
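+/* Editor's note: every simd_clamp overload in this header is the same
+ * composition, simd_min(simd_max(x, min), max), so if min > max the max
+ * bound wins. A minimal usage sketch (the values are illustrative and not
+ * part of this header):
+ *
+ *   simd_float4 v  = simd_make_float4(-2.0f, 0.5f, 3.0f, 1.0f);
+ *   simd_float4 lo = 0.0f;                  // scalar splats to all lanes
+ *   simd_float4 hi = 1.0f;
+ *   simd_float4 c  = simd_clamp(v, lo, hi); // yields (0.0, 0.5, 1.0, 1.0)
+ */
+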
+static inline SIMD_CFUNC simd_double4 simd_clamp(simd_double4 x, simd_double4 min, simd_double4 max) { + return simd_min(simd_max(x, min), max); +} + +static inline SIMD_CFUNC simd_double8 simd_clamp(simd_double8 x, simd_double8 min, simd_double8 max) { + return simd_min(simd_max(x, min), max); +} + + +static inline SIMD_CFUNC float simd_sign(float x) { + return (x == 0 | x != x) ? 0 : copysign(1,x); +} + +static inline SIMD_CFUNC simd_float2 simd_sign(simd_float2 x) { + return simd_bitselect(__tg_copysign(1,x), 0, x == 0 | x != x); +} + +static inline SIMD_CFUNC simd_float3 simd_sign(simd_float3 x) { + return simd_bitselect(__tg_copysign(1,x), 0, x == 0 | x != x); +} + +static inline SIMD_CFUNC simd_float4 simd_sign(simd_float4 x) { + return simd_bitselect(__tg_copysign(1,x), 0, x == 0 | x != x); +} + +static inline SIMD_CFUNC simd_float8 simd_sign(simd_float8 x) { + return simd_bitselect(__tg_copysign(1,x), 0, x == 0 | x != x); +} + +static inline SIMD_CFUNC simd_float16 simd_sign(simd_float16 x) { + return simd_bitselect(__tg_copysign(1,x), 0, x == 0 | x != x); +} + +static inline SIMD_CFUNC double simd_sign(double x) { + return (x == 0 | x != x) ? 0 : copysign(1,x); +} + +static inline SIMD_CFUNC simd_double2 simd_sign(simd_double2 x) { + return simd_bitselect(__tg_copysign(1,x), 0, x == 0 | x != x); +} + +static inline SIMD_CFUNC simd_double3 simd_sign(simd_double3 x) { + return simd_bitselect(__tg_copysign(1,x), 0, x == 0 | x != x); +} + +static inline SIMD_CFUNC simd_double4 simd_sign(simd_double4 x) { + return simd_bitselect(__tg_copysign(1,x), 0, x == 0 | x != x); +} + +static inline SIMD_CFUNC simd_double8 simd_sign(simd_double8 x) { + return simd_bitselect(__tg_copysign(1,x), 0, x == 0 | x != x); +} + +static inline SIMD_CFUNC float simd_mix(float x, float y, float t) { + return x + t*(y - x); +} + +static inline SIMD_CFUNC simd_float2 simd_mix(simd_float2 x, simd_float2 y, simd_float2 t) { + return x + t*(y - x); +} + +static inline SIMD_CFUNC simd_float3 simd_mix(simd_float3 x, simd_float3 y, simd_float3 t) { + return x + t*(y - x); +} + +static inline SIMD_CFUNC simd_float4 simd_mix(simd_float4 x, simd_float4 y, simd_float4 t) { + return x + t*(y - x); +} + +static inline SIMD_CFUNC simd_float8 simd_mix(simd_float8 x, simd_float8 y, simd_float8 t) { + return x + t*(y - x); +} + +static inline SIMD_CFUNC simd_float16 simd_mix(simd_float16 x, simd_float16 y, simd_float16 t) { + return x + t*(y - x); +} + +static inline SIMD_CFUNC double simd_mix(double x, double y, double t) { + return x + t*(y - x); +} + +static inline SIMD_CFUNC simd_double2 simd_mix(simd_double2 x, simd_double2 y, simd_double2 t) { + return x + t*(y - x); +} + +static inline SIMD_CFUNC simd_double3 simd_mix(simd_double3 x, simd_double3 y, simd_double3 t) { + return x + t*(y - x); +} + +static inline SIMD_CFUNC simd_double4 simd_mix(simd_double4 x, simd_double4 y, simd_double4 t) { + return x + t*(y - x); +} + +static inline SIMD_CFUNC simd_double8 simd_mix(simd_double8 x, simd_double8 y, simd_double8 t) { + return x + t*(y - x); +} + +static inline SIMD_CFUNC float simd_recip(float x) { +#if __FAST_MATH__ + return simd_fast_recip(x); +#else + return simd_precise_recip(x); +#endif +} + +static inline SIMD_CFUNC simd_float2 simd_recip(simd_float2 x) { +#if __FAST_MATH__ + return simd_fast_recip(x); +#else + return simd_precise_recip(x); +#endif +} + +static inline SIMD_CFUNC simd_float3 simd_recip(simd_float3 x) { +#if __FAST_MATH__ + return simd_fast_recip(x); +#else + return simd_precise_recip(x); +#endif 
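+  /* Editor's note: each simd_recip overload in this run dispatches at
+   * compile time: under -ffast-math (__FAST_MATH__) it resolves to the
+   * estimate-based simd_fast_recip defined below, otherwise to the
+   * Newton-refined simd_precise_recip. */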
+} + +static inline SIMD_CFUNC simd_float4 simd_recip(simd_float4 x) { +#if __FAST_MATH__ + return simd_fast_recip(x); +#else + return simd_precise_recip(x); +#endif +} + +static inline SIMD_CFUNC simd_float8 simd_recip(simd_float8 x) { +#if __FAST_MATH__ + return simd_fast_recip(x); +#else + return simd_precise_recip(x); +#endif +} + +static inline SIMD_CFUNC simd_float16 simd_recip(simd_float16 x) { +#if __FAST_MATH__ + return simd_fast_recip(x); +#else + return simd_precise_recip(x); +#endif +} + +static inline SIMD_CFUNC double simd_recip(double x) { +#if __FAST_MATH__ + return simd_fast_recip(x); +#else + return simd_precise_recip(x); +#endif +} + +static inline SIMD_CFUNC simd_double2 simd_recip(simd_double2 x) { +#if __FAST_MATH__ + return simd_fast_recip(x); +#else + return simd_precise_recip(x); +#endif +} + +static inline SIMD_CFUNC simd_double3 simd_recip(simd_double3 x) { +#if __FAST_MATH__ + return simd_fast_recip(x); +#else + return simd_precise_recip(x); +#endif +} + +static inline SIMD_CFUNC simd_double4 simd_recip(simd_double4 x) { +#if __FAST_MATH__ + return simd_fast_recip(x); +#else + return simd_precise_recip(x); +#endif +} + +static inline SIMD_CFUNC simd_double8 simd_recip(simd_double8 x) { +#if __FAST_MATH__ + return simd_fast_recip(x); +#else + return simd_precise_recip(x); +#endif +} + +static inline SIMD_CFUNC float simd_fast_recip(float x) { +#if defined __AVX512VL__ + simd_float4 x4 = simd_make_float4(x); + return ((simd_float4)_mm_rcp14_ss(x4, x4)).x; +#elif defined __SSE__ + return ((simd_float4)_mm_rcp_ss(simd_make_float4(x))).x; +#elif defined __ARM_NEON__ + return simd_fast_recip(simd_make_float2_undef(x)).x; +#else + return simd_precise_recip(x); +#endif +} + +static inline SIMD_CFUNC simd_float2 simd_fast_recip(simd_float2 x) { +#if defined __SSE__ + return simd_make_float2(simd_fast_recip(simd_make_float4_undef(x))); +#elif defined __ARM_NEON__ + simd_float2 r = vrecpe_f32(x); + return r * vrecps_f32(x, r); +#else + return simd_precise_recip(x); +#endif +} + +static inline SIMD_CFUNC simd_float3 simd_fast_recip(simd_float3 x) { + return simd_make_float3(simd_fast_recip(simd_make_float4_undef(x))); +} + +static inline SIMD_CFUNC simd_float4 simd_fast_recip(simd_float4 x) { +#if defined __AVX512VL__ + return _mm_rcp14_ps(x); +#elif defined __SSE__ + return _mm_rcp_ps(x); +#elif defined __ARM_NEON__ + simd_float4 r = vrecpeq_f32(x); + return r * vrecpsq_f32(x, r); +#else + return simd_precise_recip(x); +#endif +} + +static inline SIMD_CFUNC simd_float8 simd_fast_recip(simd_float8 x) { +#if defined __AVX512VL__ + return _mm256_rcp14_ps(x); +#elif defined __AVX__ + return _mm256_rcp_ps(x); +#else + return simd_make_float8(simd_fast_recip(x.lo), simd_fast_recip(x.hi)); +#endif +} + +static inline SIMD_CFUNC simd_float16 simd_fast_recip(simd_float16 x) { +#if defined __AVX512F__ + return _mm512_rcp14_ps(x); +#else + return simd_make_float16(simd_fast_recip(x.lo), simd_fast_recip(x.hi)); +#endif +} + +static inline SIMD_CFUNC double simd_fast_recip(double x) { + return simd_precise_recip(x); +} + +static inline SIMD_CFUNC simd_double2 simd_fast_recip(simd_double2 x) { + return simd_precise_recip(x); +} + +static inline SIMD_CFUNC simd_double3 simd_fast_recip(simd_double3 x) { + return simd_precise_recip(x); +} + +static inline SIMD_CFUNC simd_double4 simd_fast_recip(simd_double4 x) { + return simd_precise_recip(x); +} + +static inline SIMD_CFUNC simd_double8 simd_fast_recip(simd_double8 x) { + return simd_precise_recip(x); +} + +static inline SIMD_CFUNC float 
simd_precise_recip(float x) { +#if defined __SSE__ + float r = simd_fast_recip(x); + return r*(2 - (x == 0 ? -INFINITY : x)*r); +#elif defined __ARM_NEON__ + return simd_precise_recip(simd_make_float2_undef(x)).x; +#else + return 1/x; +#endif +} + +static inline SIMD_CFUNC simd_float2 simd_precise_recip(simd_float2 x) { +#if defined __SSE__ + return simd_make_float2(simd_precise_recip(simd_make_float4_undef(x))); +#elif defined __ARM_NEON__ + simd_float2 r = simd_fast_recip(x); + return r*vrecps_f32(x, r); +#else + return 1/x; +#endif +} + +static inline SIMD_CFUNC simd_float3 simd_precise_recip(simd_float3 x) { + return simd_make_float3(simd_precise_recip(simd_make_float4_undef(x))); +} + +static inline SIMD_CFUNC simd_float4 simd_precise_recip(simd_float4 x) { +#if defined __SSE__ + simd_float4 r = simd_fast_recip(x); + return r*(2 - simd_bitselect(x, -INFINITY, x == 0)*r); +#elif defined __ARM_NEON__ + simd_float4 r = simd_fast_recip(x); + return r*vrecpsq_f32(x, r); +#else + return 1/x; +#endif +} + +static inline SIMD_CFUNC simd_float8 simd_precise_recip(simd_float8 x) { +#if defined __AVX__ + simd_float8 r = simd_fast_recip(x); + return r*(2 - simd_bitselect(x, -INFINITY, x == 0)*r); +#else + return simd_make_float8(simd_precise_recip(x.lo), simd_precise_recip(x.hi)); +#endif +} + +static inline SIMD_CFUNC simd_float16 simd_precise_recip(simd_float16 x) { +#if defined __AVX512F__ + simd_float16 r = simd_fast_recip(x); + return r*(2 - simd_bitselect(x, -INFINITY, x == 0)*r); +#else + return simd_make_float16(simd_precise_recip(x.lo), simd_precise_recip(x.hi)); +#endif +} + +static inline SIMD_CFUNC double simd_precise_recip(double x) { + return 1/x; +} + +static inline SIMD_CFUNC simd_double2 simd_precise_recip(simd_double2 x) { + return 1/x; +} + +static inline SIMD_CFUNC simd_double3 simd_precise_recip(simd_double3 x) { + return 1/x; +} + +static inline SIMD_CFUNC simd_double4 simd_precise_recip(simd_double4 x) { + return 1/x; +} + +static inline SIMD_CFUNC simd_double8 simd_precise_recip(simd_double8 x) { + return 1/x; +} + +static inline SIMD_CFUNC float simd_rsqrt(float x) { +#if __FAST_MATH__ + return simd_fast_rsqrt(x); +#else + return simd_precise_rsqrt(x); +#endif +} + +static inline SIMD_CFUNC simd_float2 simd_rsqrt(simd_float2 x) { +#if __FAST_MATH__ + return simd_fast_rsqrt(x); +#else + return simd_precise_rsqrt(x); +#endif +} + +static inline SIMD_CFUNC simd_float3 simd_rsqrt(simd_float3 x) { +#if __FAST_MATH__ + return simd_fast_rsqrt(x); +#else + return simd_precise_rsqrt(x); +#endif +} + +static inline SIMD_CFUNC simd_float4 simd_rsqrt(simd_float4 x) { +#if __FAST_MATH__ + return simd_fast_rsqrt(x); +#else + return simd_precise_rsqrt(x); +#endif +} + +static inline SIMD_CFUNC simd_float8 simd_rsqrt(simd_float8 x) { +#if __FAST_MATH__ + return simd_fast_rsqrt(x); +#else + return simd_precise_rsqrt(x); +#endif +} + +static inline SIMD_CFUNC simd_float16 simd_rsqrt(simd_float16 x) { +#if __FAST_MATH__ + return simd_fast_rsqrt(x); +#else + return simd_precise_rsqrt(x); +#endif +} + +static inline SIMD_CFUNC double simd_rsqrt(double x) { +#if __FAST_MATH__ + return simd_fast_rsqrt(x); +#else + return simd_precise_rsqrt(x); +#endif +} + +static inline SIMD_CFUNC simd_double2 simd_rsqrt(simd_double2 x) { +#if __FAST_MATH__ + return simd_fast_rsqrt(x); +#else + return simd_precise_rsqrt(x); +#endif +} + +static inline SIMD_CFUNC simd_double3 simd_rsqrt(simd_double3 x) { +#if __FAST_MATH__ + return simd_fast_rsqrt(x); +#else + return simd_precise_rsqrt(x); +#endif +} + +static 
inline SIMD_CFUNC simd_double4 simd_rsqrt(simd_double4 x) { +#if __FAST_MATH__ + return simd_fast_rsqrt(x); +#else + return simd_precise_rsqrt(x); +#endif +} + +static inline SIMD_CFUNC simd_double8 simd_rsqrt(simd_double8 x) { +#if __FAST_MATH__ + return simd_fast_rsqrt(x); +#else + return simd_precise_rsqrt(x); +#endif +} + +static inline SIMD_CFUNC float simd_fast_rsqrt(float x) { +#if defined __AVX512VL__ + simd_float4 x4 = simd_make_float4(x); + return ((simd_float4)_mm_rsqrt14_ss(x4, x4)).x; +#elif defined __SSE__ + return ((simd_float4)_mm_rsqrt_ss(simd_make_float4(x))).x; +#elif defined __ARM_NEON__ + return simd_fast_rsqrt(simd_make_float2_undef(x)).x; +#else + return simd_precise_rsqrt(x); +#endif +} + +static inline SIMD_CFUNC simd_float2 simd_fast_rsqrt(simd_float2 x) { +#if defined __SSE__ + return simd_make_float2(simd_fast_rsqrt(simd_make_float4_undef(x))); +#elif defined __ARM_NEON__ + simd_float2 r = vrsqrte_f32(x); + return r * vrsqrts_f32(x, r*r); +#else + return simd_precise_rsqrt(x); +#endif +} + +static inline SIMD_CFUNC simd_float3 simd_fast_rsqrt(simd_float3 x) { + return simd_make_float3(simd_fast_rsqrt(simd_make_float4_undef(x))); +} + +static inline SIMD_CFUNC simd_float4 simd_fast_rsqrt(simd_float4 x) { +#if defined __AVX512VL__ + return _mm_rsqrt14_ps(x); +#elif defined __SSE__ + return _mm_rsqrt_ps(x); +#elif defined __ARM_NEON__ + simd_float4 r = vrsqrteq_f32(x); + return r * vrsqrtsq_f32(x, r*r); +#else + return simd_precise_rsqrt(x); +#endif +} + +static inline SIMD_CFUNC simd_float8 simd_fast_rsqrt(simd_float8 x) { +#if defined __AVX512VL__ + return _mm256_rsqrt14_ps(x); +#elif defined __AVX__ + return _mm256_rsqrt_ps(x); +#else + return simd_make_float8(simd_fast_rsqrt(x.lo), simd_fast_rsqrt(x.hi)); +#endif +} + +static inline SIMD_CFUNC simd_float16 simd_fast_rsqrt(simd_float16 x) { +#if defined __AVX512F__ + return _mm512_rsqrt14_ps(x); +#else + return simd_make_float16(simd_fast_rsqrt(x.lo), simd_fast_rsqrt(x.hi)); +#endif +} + +static inline SIMD_CFUNC double simd_fast_rsqrt(double x) { + return simd_precise_rsqrt(x); +} + +static inline SIMD_CFUNC simd_double2 simd_fast_rsqrt(simd_double2 x) { + return simd_precise_rsqrt(x); +} + +static inline SIMD_CFUNC simd_double3 simd_fast_rsqrt(simd_double3 x) { + return simd_precise_rsqrt(x); +} + +static inline SIMD_CFUNC simd_double4 simd_fast_rsqrt(simd_double4 x) { + return simd_precise_rsqrt(x); +} + +static inline SIMD_CFUNC simd_double8 simd_fast_rsqrt(simd_double8 x) { + return simd_precise_rsqrt(x); +} + +static inline SIMD_CFUNC float simd_precise_rsqrt(float x) { +#if defined __SSE__ + float r = simd_fast_rsqrt(x); + return r*(1.5f - 0.5f*(r == INFINITY ? 
-INFINITY : x)*r*r); +#elif defined __ARM_NEON__ + return simd_precise_rsqrt(simd_make_float2_undef(x)).x; +#else + return 1/sqrt(x); +#endif +} + +static inline SIMD_CFUNC simd_float2 simd_precise_rsqrt(simd_float2 x) { +#if defined __SSE__ + return simd_make_float2(simd_precise_rsqrt(simd_make_float4_undef(x))); +#elif defined __ARM_NEON__ + simd_float2 r = simd_fast_rsqrt(x); + return r*vrsqrts_f32(x, r*r); +#else + return 1/__tg_sqrt(x); +#endif +} + +static inline SIMD_CFUNC simd_float3 simd_precise_rsqrt(simd_float3 x) { + return simd_make_float3(simd_precise_rsqrt(simd_make_float4_undef(x))); +} + +static inline SIMD_CFUNC simd_float4 simd_precise_rsqrt(simd_float4 x) { +#if defined __SSE__ + simd_float4 r = simd_fast_rsqrt(x); + return r*(1.5 - 0.5*simd_bitselect(x, -INFINITY, r == INFINITY)*r*r); +#elif defined __ARM_NEON__ + simd_float4 r = simd_fast_rsqrt(x); + return r*vrsqrtsq_f32(x, r*r); +#else + return 1/__tg_sqrt(x); +#endif +} + +static inline SIMD_CFUNC simd_float8 simd_precise_rsqrt(simd_float8 x) { +#if defined __AVX__ + simd_float8 r = simd_fast_rsqrt(x); + return r*(1.5 - 0.5*simd_bitselect(x, -INFINITY, r == INFINITY)*r*r); +#else + return simd_make_float8(simd_precise_rsqrt(x.lo), simd_precise_rsqrt(x.hi)); +#endif +} + +static inline SIMD_CFUNC simd_float16 simd_precise_rsqrt(simd_float16 x) { +#if defined __AVX512F__ + simd_float16 r = simd_fast_rsqrt(x); + return r*(1.5 - 0.5*simd_bitselect(x, -INFINITY, r == INFINITY)*r*r); +#else + return simd_make_float16(simd_precise_rsqrt(x.lo), simd_precise_rsqrt(x.hi)); +#endif +} + +static inline SIMD_CFUNC double simd_precise_rsqrt(double x) { + return 1/sqrt(x); +} + +static inline SIMD_CFUNC simd_double2 simd_precise_rsqrt(simd_double2 x) { + return 1/__tg_sqrt(x); +} + +static inline SIMD_CFUNC simd_double3 simd_precise_rsqrt(simd_double3 x) { + return 1/__tg_sqrt(x); +} + +static inline SIMD_CFUNC simd_double4 simd_precise_rsqrt(simd_double4 x) { + return 1/__tg_sqrt(x); +} + +static inline SIMD_CFUNC simd_double8 simd_precise_rsqrt(simd_double8 x) { + return 1/__tg_sqrt(x); +} + +static inline SIMD_CFUNC float simd_fract(float x) { + return fmin(x - floor(x), 0x1.fffffep-1f); +} + +static inline SIMD_CFUNC simd_float2 simd_fract(simd_float2 x) { + return __tg_fmin(x - __tg_floor(x), 0x1.fffffep-1f); +} + +static inline SIMD_CFUNC simd_float3 simd_fract(simd_float3 x) { + return __tg_fmin(x - __tg_floor(x), 0x1.fffffep-1f); +} + +static inline SIMD_CFUNC simd_float4 simd_fract(simd_float4 x) { + return __tg_fmin(x - __tg_floor(x), 0x1.fffffep-1f); +} + +static inline SIMD_CFUNC simd_float8 simd_fract(simd_float8 x) { + return __tg_fmin(x - __tg_floor(x), 0x1.fffffep-1f); +} + +static inline SIMD_CFUNC simd_float16 simd_fract(simd_float16 x) { + return __tg_fmin(x - __tg_floor(x), 0x1.fffffep-1f); +} + +static inline SIMD_CFUNC double simd_fract(double x) { + return fmin(x - floor(x), 0x1.fffffffffffffp-1); +} + +static inline SIMD_CFUNC simd_double2 simd_fract(simd_double2 x) { + return __tg_fmin(x - __tg_floor(x), 0x1.fffffffffffffp-1); +} + +static inline SIMD_CFUNC simd_double3 simd_fract(simd_double3 x) { + return __tg_fmin(x - __tg_floor(x), 0x1.fffffffffffffp-1); +} + +static inline SIMD_CFUNC simd_double4 simd_fract(simd_double4 x) { + return __tg_fmin(x - __tg_floor(x), 0x1.fffffffffffffp-1); +} + +static inline SIMD_CFUNC simd_double8 simd_fract(simd_double8 x) { + return __tg_fmin(x - __tg_floor(x), 0x1.fffffffffffffp-1); +} + +static inline SIMD_CFUNC float simd_step(float edge, float x) { + return !(x < 
edge); +} + +static inline SIMD_CFUNC simd_float2 simd_step(simd_float2 edge, simd_float2 x) { + return simd_bitselect((simd_float2)1, 0, x < edge); +} + +static inline SIMD_CFUNC simd_float3 simd_step(simd_float3 edge, simd_float3 x) { + return simd_bitselect((simd_float3)1, 0, x < edge); +} + +static inline SIMD_CFUNC simd_float4 simd_step(simd_float4 edge, simd_float4 x) { + return simd_bitselect((simd_float4)1, 0, x < edge); +} + +static inline SIMD_CFUNC simd_float8 simd_step(simd_float8 edge, simd_float8 x) { + return simd_bitselect((simd_float8)1, 0, x < edge); +} + +static inline SIMD_CFUNC simd_float16 simd_step(simd_float16 edge, simd_float16 x) { + return simd_bitselect((simd_float16)1, 0, x < edge); +} + +static inline SIMD_CFUNC double simd_step(double edge, double x) { + return !(x < edge); +} + +static inline SIMD_CFUNC simd_double2 simd_step(simd_double2 edge, simd_double2 x) { + return simd_bitselect((simd_double2)1, 0, x < edge); +} + +static inline SIMD_CFUNC simd_double3 simd_step(simd_double3 edge, simd_double3 x) { + return simd_bitselect((simd_double3)1, 0, x < edge); +} + +static inline SIMD_CFUNC simd_double4 simd_step(simd_double4 edge, simd_double4 x) { + return simd_bitselect((simd_double4)1, 0, x < edge); +} + +static inline SIMD_CFUNC simd_double8 simd_step(simd_double8 edge, simd_double8 x) { + return simd_bitselect((simd_double8)1, 0, x < edge); +} + +static inline SIMD_CFUNC float simd_smoothstep(float edge0, float edge1, float x) { + float t = simd_clamp((x - edge0)/(edge1 - edge0), 0, 1); + return t*t*(3 - 2*t); +} + +static inline SIMD_CFUNC simd_float2 simd_smoothstep(simd_float2 edge0, simd_float2 edge1, simd_float2 x) { + simd_float2 t = simd_clamp((x - edge0)/(edge1 - edge0), 0, 1); + return t*t*(3 - 2*t); +} + +static inline SIMD_CFUNC simd_float3 simd_smoothstep(simd_float3 edge0, simd_float3 edge1, simd_float3 x) { + simd_float3 t = simd_clamp((x - edge0)/(edge1 - edge0), 0, 1); + return t*t*(3 - 2*t); +} + +static inline SIMD_CFUNC simd_float4 simd_smoothstep(simd_float4 edge0, simd_float4 edge1, simd_float4 x) { + simd_float4 t = simd_clamp((x - edge0)/(edge1 - edge0), 0, 1); + return t*t*(3 - 2*t); +} + +static inline SIMD_CFUNC simd_float8 simd_smoothstep(simd_float8 edge0, simd_float8 edge1, simd_float8 x) { + simd_float8 t = simd_clamp((x - edge0)/(edge1 - edge0), 0, 1); + return t*t*(3 - 2*t); +} + +static inline SIMD_CFUNC simd_float16 simd_smoothstep(simd_float16 edge0, simd_float16 edge1, simd_float16 x) { + simd_float16 t = simd_clamp((x - edge0)/(edge1 - edge0), 0, 1); + return t*t*(3 - 2*t); +} + +static inline SIMD_CFUNC double simd_smoothstep(double edge0, double edge1, double x) { + double t = simd_clamp((x - edge0)/(edge1 - edge0), 0, 1); + return t*t*(3 - 2*t); +} + +static inline SIMD_CFUNC simd_double2 simd_smoothstep(simd_double2 edge0, simd_double2 edge1, simd_double2 x) { + simd_double2 t = simd_clamp((x - edge0)/(edge1 - edge0), 0, 1); + return t*t*(3 - 2*t); +} + +static inline SIMD_CFUNC simd_double3 simd_smoothstep(simd_double3 edge0, simd_double3 edge1, simd_double3 x) { + simd_double3 t = simd_clamp((x - edge0)/(edge1 - edge0), 0, 1); + return t*t*(3 - 2*t); +} + +static inline SIMD_CFUNC simd_double4 simd_smoothstep(simd_double4 edge0, simd_double4 edge1, simd_double4 x) { + simd_double4 t = simd_clamp((x - edge0)/(edge1 - edge0), 0, 1); + return t*t*(3 - 2*t); +} + +static inline SIMD_CFUNC simd_double8 simd_smoothstep(simd_double8 edge0, simd_double8 edge1, simd_double8 x) { + simd_double8 t = simd_clamp((x - 
edge0)/(edge1 - edge0), 0, 1); + return t*t*(3 - 2*t); +} + +static inline SIMD_CFUNC char simd_reduce_add(simd_char2 x) { + return x.x + x.y; +} + +static inline SIMD_CFUNC char simd_reduce_add(simd_char3 x) { + return x.x + x.y + x.z; +} + +static inline SIMD_CFUNC char simd_reduce_add(simd_char4 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC char simd_reduce_add(simd_char8 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC char simd_reduce_add(simd_char16 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC char simd_reduce_add(simd_char32 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC char simd_reduce_add(simd_char64 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar2 x) { + return x.x + x.y; +} + +static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar3 x) { + return x.x + x.y + x.z; +} + +static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar4 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar8 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar16 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar32 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar64 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC short simd_reduce_add(simd_short2 x) { + return x.x + x.y; +} + +static inline SIMD_CFUNC short simd_reduce_add(simd_short3 x) { + return x.x + x.y + x.z; +} + +static inline SIMD_CFUNC short simd_reduce_add(simd_short4 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC short simd_reduce_add(simd_short8 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC short simd_reduce_add(simd_short16 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC short simd_reduce_add(simd_short32 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC unsigned short simd_reduce_add(simd_ushort2 x) { + return x.x + x.y; +} + +static inline SIMD_CFUNC unsigned short simd_reduce_add(simd_ushort3 x) { + return x.x + x.y + x.z; +} + +static inline SIMD_CFUNC unsigned short simd_reduce_add(simd_ushort4 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC unsigned short simd_reduce_add(simd_ushort8 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC unsigned short simd_reduce_add(simd_ushort16 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC unsigned short simd_reduce_add(simd_ushort32 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC int simd_reduce_add(simd_int2 x) { + return x.x + x.y; +} + +static inline SIMD_CFUNC int simd_reduce_add(simd_int3 x) { + return x.x + x.y + x.z; +} + +static inline SIMD_CFUNC int simd_reduce_add(simd_int4 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC int simd_reduce_add(simd_int8 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC int simd_reduce_add(simd_int16 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC unsigned int simd_reduce_add(simd_uint2 x) { + return x.x + x.y; +} + +static inline SIMD_CFUNC unsigned int simd_reduce_add(simd_uint3 x) { + return x.x + 
x.y + x.z; +} + +static inline SIMD_CFUNC unsigned int simd_reduce_add(simd_uint4 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC unsigned int simd_reduce_add(simd_uint8 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC unsigned int simd_reduce_add(simd_uint16 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC float simd_reduce_add(simd_float2 x) { + return x.x + x.y; +} + +static inline SIMD_CFUNC float simd_reduce_add(simd_float3 x) { + return x.x + x.y + x.z; +} + +static inline SIMD_CFUNC float simd_reduce_add(simd_float4 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC float simd_reduce_add(simd_float8 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC float simd_reduce_add(simd_float16 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC simd_long1 simd_reduce_add(simd_long2 x) { + return x.x + x.y; +} + +static inline SIMD_CFUNC simd_long1 simd_reduce_add(simd_long3 x) { + return x.x + x.y + x.z; +} + +static inline SIMD_CFUNC simd_long1 simd_reduce_add(simd_long4 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC simd_long1 simd_reduce_add(simd_long8 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC simd_ulong1 simd_reduce_add(simd_ulong2 x) { + return x.x + x.y; +} + +static inline SIMD_CFUNC simd_ulong1 simd_reduce_add(simd_ulong3 x) { + return x.x + x.y + x.z; +} + +static inline SIMD_CFUNC simd_ulong1 simd_reduce_add(simd_ulong4 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC simd_ulong1 simd_reduce_add(simd_ulong8 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC double simd_reduce_add(simd_double2 x) { + return x.x + x.y; +} + +static inline SIMD_CFUNC double simd_reduce_add(simd_double3 x) { + return x.x + x.y + x.z; +} + +static inline SIMD_CFUNC double simd_reduce_add(simd_double4 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC double simd_reduce_add(simd_double8 x) { + return simd_reduce_add(x.lo + x.hi); +} + +static inline SIMD_CFUNC char simd_reduce_min(simd_char2 x) { + return x.y < x.x ? x.y : x.x; +} + +static inline SIMD_CFUNC char simd_reduce_min(simd_char3 x) { + char t = x.z < x.x ? x.z : x.x; + return x.y < t ? x.y : t; +} + +static inline SIMD_CFUNC char simd_reduce_min(simd_char4 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC char simd_reduce_min(simd_char8 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC char simd_reduce_min(simd_char16 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC char simd_reduce_min(simd_char32 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC char simd_reduce_min(simd_char64 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar2 x) { + return x.y < x.x ? x.y : x.x; +} + +static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar3 x) { + unsigned char t = x.z < x.x ? x.z : x.x; + return x.y < t ? 
x.y : t; +} + +static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar4 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar8 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar16 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar32 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar64 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC short simd_reduce_min(simd_short2 x) { + return x.y < x.x ? x.y : x.x; +} + +static inline SIMD_CFUNC short simd_reduce_min(simd_short3 x) { + short t = x.z < x.x ? x.z : x.x; + return x.y < t ? x.y : t; +} + +static inline SIMD_CFUNC short simd_reduce_min(simd_short4 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC short simd_reduce_min(simd_short8 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC short simd_reduce_min(simd_short16 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC short simd_reduce_min(simd_short32 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned short simd_reduce_min(simd_ushort2 x) { + return x.y < x.x ? x.y : x.x; +} + +static inline SIMD_CFUNC unsigned short simd_reduce_min(simd_ushort3 x) { + unsigned short t = x.z < x.x ? x.z : x.x; + return x.y < t ? x.y : t; +} + +static inline SIMD_CFUNC unsigned short simd_reduce_min(simd_ushort4 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned short simd_reduce_min(simd_ushort8 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned short simd_reduce_min(simd_ushort16 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned short simd_reduce_min(simd_ushort32 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC int simd_reduce_min(simd_int2 x) { + return x.y < x.x ? x.y : x.x; +} + +static inline SIMD_CFUNC int simd_reduce_min(simd_int3 x) { + int t = x.z < x.x ? x.z : x.x; + return x.y < t ? x.y : t; +} + +static inline SIMD_CFUNC int simd_reduce_min(simd_int4 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC int simd_reduce_min(simd_int8 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC int simd_reduce_min(simd_int16 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned int simd_reduce_min(simd_uint2 x) { + return x.y < x.x ? x.y : x.x; +} + +static inline SIMD_CFUNC unsigned int simd_reduce_min(simd_uint3 x) { + unsigned int t = x.z < x.x ? x.z : x.x; + return x.y < t ? 
x.y : t; +} + +static inline SIMD_CFUNC unsigned int simd_reduce_min(simd_uint4 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned int simd_reduce_min(simd_uint8 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned int simd_reduce_min(simd_uint16 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC float simd_reduce_min(simd_float2 x) { + return fmin(x.x, x.y); +} + +static inline SIMD_CFUNC float simd_reduce_min(simd_float3 x) { + return fmin(fmin(x.x, x.z), x.y); +} + +static inline SIMD_CFUNC float simd_reduce_min(simd_float4 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC float simd_reduce_min(simd_float8 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC float simd_reduce_min(simd_float16 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC simd_long1 simd_reduce_min(simd_long2 x) { + return x.y < x.x ? x.y : x.x; +} + +static inline SIMD_CFUNC simd_long1 simd_reduce_min(simd_long3 x) { + simd_long1 t = x.z < x.x ? x.z : x.x; + return x.y < t ? x.y : t; +} + +static inline SIMD_CFUNC simd_long1 simd_reduce_min(simd_long4 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC simd_long1 simd_reduce_min(simd_long8 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC simd_ulong1 simd_reduce_min(simd_ulong2 x) { + return x.y < x.x ? x.y : x.x; +} + +static inline SIMD_CFUNC simd_ulong1 simd_reduce_min(simd_ulong3 x) { + simd_ulong1 t = x.z < x.x ? x.z : x.x; + return x.y < t ? x.y : t; +} + +static inline SIMD_CFUNC simd_ulong1 simd_reduce_min(simd_ulong4 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC simd_ulong1 simd_reduce_min(simd_ulong8 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC double simd_reduce_min(simd_double2 x) { + return fmin(x.x, x.y); +} + +static inline SIMD_CFUNC double simd_reduce_min(simd_double3 x) { + return fmin(fmin(x.x, x.z), x.y); +} + +static inline SIMD_CFUNC double simd_reduce_min(simd_double4 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC double simd_reduce_min(simd_double8 x) { + return simd_reduce_min(simd_min(x.lo, x.hi)); +} + +static inline SIMD_CFUNC char simd_reduce_max(simd_char2 x) { + return x.y > x.x ? x.y : x.x; +} + +static inline SIMD_CFUNC char simd_reduce_max(simd_char3 x) { + char t = x.z > x.x ? x.z : x.x; + return x.y > t ? x.y : t; +} + +static inline SIMD_CFUNC char simd_reduce_max(simd_char4 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC char simd_reduce_max(simd_char8 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC char simd_reduce_max(simd_char16 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC char simd_reduce_max(simd_char32 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC char simd_reduce_max(simd_char64 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar2 x) { + return x.y > x.x ? x.y : x.x; +} + +static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar3 x) { + unsigned char t = x.z > x.x ? x.z : x.x; + return x.y > t ? 
x.y : t; +} + +static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar4 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar8 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar16 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar32 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar64 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC short simd_reduce_max(simd_short2 x) { + return x.y > x.x ? x.y : x.x; +} + +static inline SIMD_CFUNC short simd_reduce_max(simd_short3 x) { + short t = x.z > x.x ? x.z : x.x; + return x.y > t ? x.y : t; +} + +static inline SIMD_CFUNC short simd_reduce_max(simd_short4 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC short simd_reduce_max(simd_short8 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC short simd_reduce_max(simd_short16 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC short simd_reduce_max(simd_short32 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned short simd_reduce_max(simd_ushort2 x) { + return x.y > x.x ? x.y : x.x; +} + +static inline SIMD_CFUNC unsigned short simd_reduce_max(simd_ushort3 x) { + unsigned short t = x.z > x.x ? x.z : x.x; + return x.y > t ? x.y : t; +} + +static inline SIMD_CFUNC unsigned short simd_reduce_max(simd_ushort4 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned short simd_reduce_max(simd_ushort8 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned short simd_reduce_max(simd_ushort16 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned short simd_reduce_max(simd_ushort32 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC int simd_reduce_max(simd_int2 x) { + return x.y > x.x ? x.y : x.x; +} + +static inline SIMD_CFUNC int simd_reduce_max(simd_int3 x) { + int t = x.z > x.x ? x.z : x.x; + return x.y > t ? x.y : t; +} + +static inline SIMD_CFUNC int simd_reduce_max(simd_int4 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC int simd_reduce_max(simd_int8 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC int simd_reduce_max(simd_int16 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned int simd_reduce_max(simd_uint2 x) { + return x.y > x.x ? x.y : x.x; +} + +static inline SIMD_CFUNC unsigned int simd_reduce_max(simd_uint3 x) { + unsigned int t = x.z > x.x ? x.z : x.x; + return x.y > t ? 
x.y : t; +} + +static inline SIMD_CFUNC unsigned int simd_reduce_max(simd_uint4 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned int simd_reduce_max(simd_uint8 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC unsigned int simd_reduce_max(simd_uint16 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC float simd_reduce_max(simd_float2 x) { + return fmax(x.x, x.y); +} + +static inline SIMD_CFUNC float simd_reduce_max(simd_float3 x) { + return fmax(fmax(x.x, x.z), x.y); +} + +static inline SIMD_CFUNC float simd_reduce_max(simd_float4 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC float simd_reduce_max(simd_float8 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC float simd_reduce_max(simd_float16 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC simd_long1 simd_reduce_max(simd_long2 x) { + return x.y > x.x ? x.y : x.x; +} + +static inline SIMD_CFUNC simd_long1 simd_reduce_max(simd_long3 x) { + simd_long1 t = x.z > x.x ? x.z : x.x; + return x.y > t ? x.y : t; +} + +static inline SIMD_CFUNC simd_long1 simd_reduce_max(simd_long4 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC simd_long1 simd_reduce_max(simd_long8 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC simd_ulong1 simd_reduce_max(simd_ulong2 x) { + return x.y > x.x ? x.y : x.x; +} + +static inline SIMD_CFUNC simd_ulong1 simd_reduce_max(simd_ulong3 x) { + simd_ulong1 t = x.z > x.x ? x.z : x.x; + return x.y > t ? x.y : t; +} + +static inline SIMD_CFUNC simd_ulong1 simd_reduce_max(simd_ulong4 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC simd_ulong1 simd_reduce_max(simd_ulong8 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC double simd_reduce_max(simd_double2 x) { + return fmax(x.x, x.y); +} + +static inline SIMD_CFUNC double simd_reduce_max(simd_double3 x) { + return fmax(fmax(x.x, x.z), x.y); +} + +static inline SIMD_CFUNC double simd_reduce_max(simd_double4 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +static inline SIMD_CFUNC double simd_reduce_max(simd_double8 x) { + return simd_reduce_max(simd_max(x.lo, x.hi)); +} + +#ifdef __cplusplus +} +#endif +#endif /* SIMD_COMPILER_HAS_REQUIRED_FEATURES */ +#endif /* SIMD_COMMON_HEADER */
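All of the simd_reduce_min/simd_reduce_max overloads above share one shape: 2-lane vectors compare their two lanes directly (fmin/fmax for floating-point, ?: for integers), 3-lane vectors fold the odd lane in first, and every wider vector recurses on simd_min(x.lo, x.hi) or simd_max(x.lo, x.hi) until only two lanes remain. A minimal usage sketch, assuming an Apple SDK where <simd/simd.h> and the simd_make_* constructors are available (values are illustrative):

```c
#include <simd/simd.h>
#include <stdio.h>

int main(void) {
    simd_float4 v = simd_make_float4(3.0f, -1.0f, 4.0f, 1.5f);
    /* simd_reduce_min halves the vector with simd_min(v.lo, v.hi) until two
     * lanes remain, then finishes with fmin; simd_reduce_max is symmetric. */
    printf("min = %g, max = %g\n", simd_reduce_min(v), simd_reduce_max(v));

    /* Integer vectors reduce the same way, bottoming out in the ?: compares. */
    simd_int4 w = simd_make_int4(7, 2, 9, -3);
    printf("imin = %d\n", simd_reduce_min(w));   /* prints -3 */
    return 0;
}
```

The lo/hi halving means an N-lane reduction costs about log2(N) vector min/max operations plus one scalar comparison at the bottom.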
\ No newline at end of file diff --git a/lib/libc/include/aarch64-macos-gnu/simd/conversion.h b/lib/libc/include/aarch64-macos-gnu/simd/conversion.h new file mode 100644 index 0000000000..6379afde05 --- /dev/null +++ b/lib/libc/include/aarch64-macos-gnu/simd/conversion.h @@ -0,0 +1,1966 @@ +/* Copyright (c) 2014-2017 Apple, Inc. All rights reserved. + * + * The interfaces declared in this header provide conversions between vector + * types. The following functions are available: + * + * simd_char(x) simd_uchar(x) + * simd_short(x) simd_ushort(x) + * simd_int(x) simd_uint(x) + * simd_long(x) simd_ulong(x) + * simd_float(x) + * simd_double(x) + * + * Each of these functions converts x to a vector whose elements have the + * type named by the function, with the same number of elements as x. Unlike + * a vector cast, these functions convert the elements to the new element + * type. These conversions behave exactly as C scalar conversions, except + * that conversions from integer vector types to signed integer vector types + * are guaranteed to wrap modulo 2^N (where N is the number of bits in an + * element of the result type). + * + * For integer vector types, saturating conversions are also available: + * + * simd_char_sat(x) simd_uchar_sat(x) + * simd_short_sat(x) simd_ushort_sat(x) + * simd_int_sat(x) simd_uint_sat(x) + * simd_long_sat(x) simd_ulong_sat(x) + * + * These conversions clamp x to the representable range of the result type + * before converting. + * + * Unlike most vector operations in <simd/>, there are no abbreviated C++ + * names for these functions in the simd:: namespace. + */ + +#ifndef __SIMD_CONVERSION_HEADER__ +#define __SIMD_CONVERSION_HEADER__ + +#include <simd/base.h> +#if SIMD_COMPILER_HAS_REQUIRED_FEATURES +#include <simd/vector_types.h> +#include <simd/common.h> +#include <simd/logic.h> + +#ifdef __cplusplus +extern "C" { +#endif + +static simd_char2 SIMD_CFUNC simd_char(simd_char2 __x); +static simd_char3 SIMD_CFUNC simd_char(simd_char3 __x); +static simd_char4 SIMD_CFUNC simd_char(simd_char4 __x); +static simd_char8 SIMD_CFUNC simd_char(simd_char8 __x); +static simd_char16 SIMD_CFUNC simd_char(simd_char16 __x); +static simd_char32 SIMD_CFUNC simd_char(simd_char32 __x); +static simd_char2 SIMD_CFUNC simd_char(simd_uchar2 __x); +static simd_char3 SIMD_CFUNC simd_char(simd_uchar3 __x); +static simd_char4 SIMD_CFUNC simd_char(simd_uchar4 __x); +static simd_char8 SIMD_CFUNC simd_char(simd_uchar8 __x); +static simd_char16 SIMD_CFUNC simd_char(simd_uchar16 __x); +static simd_char32 SIMD_CFUNC simd_char(simd_uchar32 __x); +static simd_char2 SIMD_CFUNC simd_char(simd_short2 __x); +static simd_char3 SIMD_CFUNC simd_char(simd_short3 __x); +static simd_char4 SIMD_CFUNC simd_char(simd_short4 __x); +static simd_char8 SIMD_CFUNC simd_char(simd_short8 __x); +static simd_char16 SIMD_CFUNC simd_char(simd_short16 __x); +static simd_char32 SIMD_CFUNC simd_char(simd_short32 __x); +static simd_char2 SIMD_CFUNC simd_char(simd_ushort2 __x); +static simd_char3 SIMD_CFUNC simd_char(simd_ushort3 __x); +static simd_char4 SIMD_CFUNC simd_char(simd_ushort4 __x); +static simd_char8 SIMD_CFUNC simd_char(simd_ushort8 __x); +static simd_char16 SIMD_CFUNC simd_char(simd_ushort16 __x); +static simd_char32 SIMD_CFUNC simd_char(simd_ushort32 __x); +static simd_char2 SIMD_CFUNC simd_char(simd_int2 __x); +static simd_char3 SIMD_CFUNC simd_char(simd_int3 __x); +static simd_char4 SIMD_CFUNC simd_char(simd_int4 __x); +static simd_char8 SIMD_CFUNC simd_char(simd_int8 __x); +static simd_char16 SIMD_CFUNC 
simd_char(simd_int16 __x); +static simd_char2 SIMD_CFUNC simd_char(simd_uint2 __x); +static simd_char3 SIMD_CFUNC simd_char(simd_uint3 __x); +static simd_char4 SIMD_CFUNC simd_char(simd_uint4 __x); +static simd_char8 SIMD_CFUNC simd_char(simd_uint8 __x); +static simd_char16 SIMD_CFUNC simd_char(simd_uint16 __x); +static simd_char2 SIMD_CFUNC simd_char(simd_float2 __x); +static simd_char3 SIMD_CFUNC simd_char(simd_float3 __x); +static simd_char4 SIMD_CFUNC simd_char(simd_float4 __x); +static simd_char8 SIMD_CFUNC simd_char(simd_float8 __x); +static simd_char16 SIMD_CFUNC simd_char(simd_float16 __x); +static simd_char2 SIMD_CFUNC simd_char(simd_long2 __x); +static simd_char3 SIMD_CFUNC simd_char(simd_long3 __x); +static simd_char4 SIMD_CFUNC simd_char(simd_long4 __x); +static simd_char8 SIMD_CFUNC simd_char(simd_long8 __x); +static simd_char2 SIMD_CFUNC simd_char(simd_ulong2 __x); +static simd_char3 SIMD_CFUNC simd_char(simd_ulong3 __x); +static simd_char4 SIMD_CFUNC simd_char(simd_ulong4 __x); +static simd_char8 SIMD_CFUNC simd_char(simd_ulong8 __x); +static simd_char2 SIMD_CFUNC simd_char(simd_double2 __x); +static simd_char3 SIMD_CFUNC simd_char(simd_double3 __x); +static simd_char4 SIMD_CFUNC simd_char(simd_double4 __x); +static simd_char8 SIMD_CFUNC simd_char(simd_double8 __x); +static simd_char2 SIMD_CFUNC simd_char_sat(simd_char2 __x); +static simd_char3 SIMD_CFUNC simd_char_sat(simd_char3 __x); +static simd_char4 SIMD_CFUNC simd_char_sat(simd_char4 __x); +static simd_char8 SIMD_CFUNC simd_char_sat(simd_char8 __x); +static simd_char16 SIMD_CFUNC simd_char_sat(simd_char16 __x); +static simd_char32 SIMD_CFUNC simd_char_sat(simd_char32 __x); +static simd_char2 SIMD_CFUNC simd_char_sat(simd_short2 __x); +static simd_char3 SIMD_CFUNC simd_char_sat(simd_short3 __x); +static simd_char4 SIMD_CFUNC simd_char_sat(simd_short4 __x); +static simd_char8 SIMD_CFUNC simd_char_sat(simd_short8 __x); +static simd_char16 SIMD_CFUNC simd_char_sat(simd_short16 __x); +static simd_char32 SIMD_CFUNC simd_char_sat(simd_short32 __x); +static simd_char2 SIMD_CFUNC simd_char_sat(simd_int2 __x); +static simd_char3 SIMD_CFUNC simd_char_sat(simd_int3 __x); +static simd_char4 SIMD_CFUNC simd_char_sat(simd_int4 __x); +static simd_char8 SIMD_CFUNC simd_char_sat(simd_int8 __x); +static simd_char16 SIMD_CFUNC simd_char_sat(simd_int16 __x); +static simd_char2 SIMD_CFUNC simd_char_sat(simd_float2 __x); +static simd_char3 SIMD_CFUNC simd_char_sat(simd_float3 __x); +static simd_char4 SIMD_CFUNC simd_char_sat(simd_float4 __x); +static simd_char8 SIMD_CFUNC simd_char_sat(simd_float8 __x); +static simd_char16 SIMD_CFUNC simd_char_sat(simd_float16 __x); +static simd_char2 SIMD_CFUNC simd_char_sat(simd_long2 __x); +static simd_char3 SIMD_CFUNC simd_char_sat(simd_long3 __x); +static simd_char4 SIMD_CFUNC simd_char_sat(simd_long4 __x); +static simd_char8 SIMD_CFUNC simd_char_sat(simd_long8 __x); +static simd_char2 SIMD_CFUNC simd_char_sat(simd_double2 __x); +static simd_char3 SIMD_CFUNC simd_char_sat(simd_double3 __x); +static simd_char4 SIMD_CFUNC simd_char_sat(simd_double4 __x); +static simd_char8 SIMD_CFUNC simd_char_sat(simd_double8 __x); +static simd_char2 SIMD_CFUNC simd_char_sat(simd_uchar2 __x); +static simd_char3 SIMD_CFUNC simd_char_sat(simd_uchar3 __x); +static simd_char4 SIMD_CFUNC simd_char_sat(simd_uchar4 __x); +static simd_char8 SIMD_CFUNC simd_char_sat(simd_uchar8 __x); +static simd_char16 SIMD_CFUNC simd_char_sat(simd_uchar16 __x); +static simd_char32 SIMD_CFUNC simd_char_sat(simd_uchar32 __x); +static simd_char2 
SIMD_CFUNC simd_char_sat(simd_ushort2 __x); +static simd_char3 SIMD_CFUNC simd_char_sat(simd_ushort3 __x); +static simd_char4 SIMD_CFUNC simd_char_sat(simd_ushort4 __x); +static simd_char8 SIMD_CFUNC simd_char_sat(simd_ushort8 __x); +static simd_char16 SIMD_CFUNC simd_char_sat(simd_ushort16 __x); +static simd_char32 SIMD_CFUNC simd_char_sat(simd_ushort32 __x); +static simd_char2 SIMD_CFUNC simd_char_sat(simd_uint2 __x); +static simd_char3 SIMD_CFUNC simd_char_sat(simd_uint3 __x); +static simd_char4 SIMD_CFUNC simd_char_sat(simd_uint4 __x); +static simd_char8 SIMD_CFUNC simd_char_sat(simd_uint8 __x); +static simd_char16 SIMD_CFUNC simd_char_sat(simd_uint16 __x); +static simd_char2 SIMD_CFUNC simd_char_sat(simd_ulong2 __x); +static simd_char3 SIMD_CFUNC simd_char_sat(simd_ulong3 __x); +static simd_char4 SIMD_CFUNC simd_char_sat(simd_ulong4 __x); +static simd_char8 SIMD_CFUNC simd_char_sat(simd_ulong8 __x); +#define vector_char simd_char +#define vector_char_sat simd_char_sat + +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_char2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_char3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_char4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_char8 __x); +static simd_uchar16 SIMD_CFUNC simd_uchar(simd_char16 __x); +static simd_uchar32 SIMD_CFUNC simd_uchar(simd_char32 __x); +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_uchar2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_uchar3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_uchar4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_uchar8 __x); +static simd_uchar16 SIMD_CFUNC simd_uchar(simd_uchar16 __x); +static simd_uchar32 SIMD_CFUNC simd_uchar(simd_uchar32 __x); +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_short2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_short3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_short4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_short8 __x); +static simd_uchar16 SIMD_CFUNC simd_uchar(simd_short16 __x); +static simd_uchar32 SIMD_CFUNC simd_uchar(simd_short32 __x); +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_ushort2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_ushort3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_ushort4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_ushort8 __x); +static simd_uchar16 SIMD_CFUNC simd_uchar(simd_ushort16 __x); +static simd_uchar32 SIMD_CFUNC simd_uchar(simd_ushort32 __x); +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_int2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_int3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_int4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_int8 __x); +static simd_uchar16 SIMD_CFUNC simd_uchar(simd_int16 __x); +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_uint2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_uint3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_uint4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_uint8 __x); +static simd_uchar16 SIMD_CFUNC simd_uchar(simd_uint16 __x); +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_float2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_float3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_float4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_float8 __x); +static simd_uchar16 SIMD_CFUNC simd_uchar(simd_float16 __x); +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_long2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_long3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_long4 __x); +static simd_uchar8 SIMD_CFUNC 
simd_uchar(simd_long8 __x); +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_ulong2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_ulong3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_ulong4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_ulong8 __x); +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_double2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_double3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_double4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_double8 __x); +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_char2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_char3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_char4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_char8 __x); +static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_char16 __x); +static simd_uchar32 SIMD_CFUNC simd_uchar_sat(simd_char32 __x); +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_short2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_short3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_short4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_short8 __x); +static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_short16 __x); +static simd_uchar32 SIMD_CFUNC simd_uchar_sat(simd_short32 __x); +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_int2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_int3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_int4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_int8 __x); +static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_int16 __x); +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_float2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_float3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_float4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_float8 __x); +static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_float16 __x); +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_long2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_long3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_long4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_long8 __x); +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_double2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_double3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_double4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_double8 __x); +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_uchar2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_uchar3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_uchar4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_uchar8 __x); +static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_uchar16 __x); +static simd_uchar32 SIMD_CFUNC simd_uchar_sat(simd_uchar32 __x); +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_ushort2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_ushort3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_ushort4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_ushort8 __x); +static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_ushort16 __x); +static simd_uchar32 SIMD_CFUNC simd_uchar_sat(simd_ushort32 __x); +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_uint2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_uint3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_uint4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_uint8 __x); +static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_uint16 __x); +static simd_uchar2 
SIMD_CFUNC simd_uchar_sat(simd_ulong2 __x); +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_ulong3 __x); +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_ulong4 __x); +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_ulong8 __x); +#define vector_uchar simd_uchar +#define vector_uchar_sat simd_uchar_sat + +static simd_short2 SIMD_CFUNC simd_short(simd_char2 __x); +static simd_short3 SIMD_CFUNC simd_short(simd_char3 __x); +static simd_short4 SIMD_CFUNC simd_short(simd_char4 __x); +static simd_short8 SIMD_CFUNC simd_short(simd_char8 __x); +static simd_short16 SIMD_CFUNC simd_short(simd_char16 __x); +static simd_short32 SIMD_CFUNC simd_short(simd_char32 __x); +static simd_short2 SIMD_CFUNC simd_short(simd_uchar2 __x); +static simd_short3 SIMD_CFUNC simd_short(simd_uchar3 __x); +static simd_short4 SIMD_CFUNC simd_short(simd_uchar4 __x); +static simd_short8 SIMD_CFUNC simd_short(simd_uchar8 __x); +static simd_short16 SIMD_CFUNC simd_short(simd_uchar16 __x); +static simd_short32 SIMD_CFUNC simd_short(simd_uchar32 __x); +static simd_short2 SIMD_CFUNC simd_short(simd_short2 __x); +static simd_short3 SIMD_CFUNC simd_short(simd_short3 __x); +static simd_short4 SIMD_CFUNC simd_short(simd_short4 __x); +static simd_short8 SIMD_CFUNC simd_short(simd_short8 __x); +static simd_short16 SIMD_CFUNC simd_short(simd_short16 __x); +static simd_short32 SIMD_CFUNC simd_short(simd_short32 __x); +static simd_short2 SIMD_CFUNC simd_short(simd_ushort2 __x); +static simd_short3 SIMD_CFUNC simd_short(simd_ushort3 __x); +static simd_short4 SIMD_CFUNC simd_short(simd_ushort4 __x); +static simd_short8 SIMD_CFUNC simd_short(simd_ushort8 __x); +static simd_short16 SIMD_CFUNC simd_short(simd_ushort16 __x); +static simd_short32 SIMD_CFUNC simd_short(simd_ushort32 __x); +static simd_short2 SIMD_CFUNC simd_short(simd_int2 __x); +static simd_short3 SIMD_CFUNC simd_short(simd_int3 __x); +static simd_short4 SIMD_CFUNC simd_short(simd_int4 __x); +static simd_short8 SIMD_CFUNC simd_short(simd_int8 __x); +static simd_short16 SIMD_CFUNC simd_short(simd_int16 __x); +static simd_short2 SIMD_CFUNC simd_short(simd_uint2 __x); +static simd_short3 SIMD_CFUNC simd_short(simd_uint3 __x); +static simd_short4 SIMD_CFUNC simd_short(simd_uint4 __x); +static simd_short8 SIMD_CFUNC simd_short(simd_uint8 __x); +static simd_short16 SIMD_CFUNC simd_short(simd_uint16 __x); +static simd_short2 SIMD_CFUNC simd_short(simd_float2 __x); +static simd_short3 SIMD_CFUNC simd_short(simd_float3 __x); +static simd_short4 SIMD_CFUNC simd_short(simd_float4 __x); +static simd_short8 SIMD_CFUNC simd_short(simd_float8 __x); +static simd_short16 SIMD_CFUNC simd_short(simd_float16 __x); +static simd_short2 SIMD_CFUNC simd_short(simd_long2 __x); +static simd_short3 SIMD_CFUNC simd_short(simd_long3 __x); +static simd_short4 SIMD_CFUNC simd_short(simd_long4 __x); +static simd_short8 SIMD_CFUNC simd_short(simd_long8 __x); +static simd_short2 SIMD_CFUNC simd_short(simd_ulong2 __x); +static simd_short3 SIMD_CFUNC simd_short(simd_ulong3 __x); +static simd_short4 SIMD_CFUNC simd_short(simd_ulong4 __x); +static simd_short8 SIMD_CFUNC simd_short(simd_ulong8 __x); +static simd_short2 SIMD_CFUNC simd_short(simd_double2 __x); +static simd_short3 SIMD_CFUNC simd_short(simd_double3 __x); +static simd_short4 SIMD_CFUNC simd_short(simd_double4 __x); +static simd_short8 SIMD_CFUNC simd_short(simd_double8 __x); +static simd_short2 SIMD_CFUNC simd_short_sat(simd_char2 __x); +static simd_short3 SIMD_CFUNC simd_short_sat(simd_char3 __x); +static simd_short4 SIMD_CFUNC 
simd_short_sat(simd_char4 __x); +static simd_short8 SIMD_CFUNC simd_short_sat(simd_char8 __x); +static simd_short16 SIMD_CFUNC simd_short_sat(simd_char16 __x); +static simd_short32 SIMD_CFUNC simd_short_sat(simd_char32 __x); +static simd_short2 SIMD_CFUNC simd_short_sat(simd_short2 __x); +static simd_short3 SIMD_CFUNC simd_short_sat(simd_short3 __x); +static simd_short4 SIMD_CFUNC simd_short_sat(simd_short4 __x); +static simd_short8 SIMD_CFUNC simd_short_sat(simd_short8 __x); +static simd_short16 SIMD_CFUNC simd_short_sat(simd_short16 __x); +static simd_short32 SIMD_CFUNC simd_short_sat(simd_short32 __x); +static simd_short2 SIMD_CFUNC simd_short_sat(simd_int2 __x); +static simd_short3 SIMD_CFUNC simd_short_sat(simd_int3 __x); +static simd_short4 SIMD_CFUNC simd_short_sat(simd_int4 __x); +static simd_short8 SIMD_CFUNC simd_short_sat(simd_int8 __x); +static simd_short16 SIMD_CFUNC simd_short_sat(simd_int16 __x); +static simd_short2 SIMD_CFUNC simd_short_sat(simd_float2 __x); +static simd_short3 SIMD_CFUNC simd_short_sat(simd_float3 __x); +static simd_short4 SIMD_CFUNC simd_short_sat(simd_float4 __x); +static simd_short8 SIMD_CFUNC simd_short_sat(simd_float8 __x); +static simd_short16 SIMD_CFUNC simd_short_sat(simd_float16 __x); +static simd_short2 SIMD_CFUNC simd_short_sat(simd_long2 __x); +static simd_short3 SIMD_CFUNC simd_short_sat(simd_long3 __x); +static simd_short4 SIMD_CFUNC simd_short_sat(simd_long4 __x); +static simd_short8 SIMD_CFUNC simd_short_sat(simd_long8 __x); +static simd_short2 SIMD_CFUNC simd_short_sat(simd_double2 __x); +static simd_short3 SIMD_CFUNC simd_short_sat(simd_double3 __x); +static simd_short4 SIMD_CFUNC simd_short_sat(simd_double4 __x); +static simd_short8 SIMD_CFUNC simd_short_sat(simd_double8 __x); +static simd_short2 SIMD_CFUNC simd_short_sat(simd_uchar2 __x); +static simd_short3 SIMD_CFUNC simd_short_sat(simd_uchar3 __x); +static simd_short4 SIMD_CFUNC simd_short_sat(simd_uchar4 __x); +static simd_short8 SIMD_CFUNC simd_short_sat(simd_uchar8 __x); +static simd_short16 SIMD_CFUNC simd_short_sat(simd_uchar16 __x); +static simd_short32 SIMD_CFUNC simd_short_sat(simd_uchar32 __x); +static simd_short2 SIMD_CFUNC simd_short_sat(simd_ushort2 __x); +static simd_short3 SIMD_CFUNC simd_short_sat(simd_ushort3 __x); +static simd_short4 SIMD_CFUNC simd_short_sat(simd_ushort4 __x); +static simd_short8 SIMD_CFUNC simd_short_sat(simd_ushort8 __x); +static simd_short16 SIMD_CFUNC simd_short_sat(simd_ushort16 __x); +static simd_short32 SIMD_CFUNC simd_short_sat(simd_ushort32 __x); +static simd_short2 SIMD_CFUNC simd_short_sat(simd_uint2 __x); +static simd_short3 SIMD_CFUNC simd_short_sat(simd_uint3 __x); +static simd_short4 SIMD_CFUNC simd_short_sat(simd_uint4 __x); +static simd_short8 SIMD_CFUNC simd_short_sat(simd_uint8 __x); +static simd_short16 SIMD_CFUNC simd_short_sat(simd_uint16 __x); +static simd_short2 SIMD_CFUNC simd_short_sat(simd_ulong2 __x); +static simd_short3 SIMD_CFUNC simd_short_sat(simd_ulong3 __x); +static simd_short4 SIMD_CFUNC simd_short_sat(simd_ulong4 __x); +static simd_short8 SIMD_CFUNC simd_short_sat(simd_ulong8 __x); +#define vector_short simd_short +#define vector_short_sat simd_short_sat + +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_char2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_char3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_char4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_char8 __x); +static simd_ushort16 SIMD_CFUNC simd_ushort(simd_char16 __x); +static simd_ushort32 SIMD_CFUNC simd_ushort(simd_char32 
__x); +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_uchar2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_uchar3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_uchar4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_uchar8 __x); +static simd_ushort16 SIMD_CFUNC simd_ushort(simd_uchar16 __x); +static simd_ushort32 SIMD_CFUNC simd_ushort(simd_uchar32 __x); +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_short2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_short3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_short4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_short8 __x); +static simd_ushort16 SIMD_CFUNC simd_ushort(simd_short16 __x); +static simd_ushort32 SIMD_CFUNC simd_ushort(simd_short32 __x); +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_ushort2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_ushort3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_ushort4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_ushort8 __x); +static simd_ushort16 SIMD_CFUNC simd_ushort(simd_ushort16 __x); +static simd_ushort32 SIMD_CFUNC simd_ushort(simd_ushort32 __x); +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_int2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_int3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_int4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_int8 __x); +static simd_ushort16 SIMD_CFUNC simd_ushort(simd_int16 __x); +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_uint2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_uint3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_uint4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_uint8 __x); +static simd_ushort16 SIMD_CFUNC simd_ushort(simd_uint16 __x); +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_float2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_float3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_float4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_float8 __x); +static simd_ushort16 SIMD_CFUNC simd_ushort(simd_float16 __x); +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_long2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_long3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_long4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_long8 __x); +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_ulong2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_ulong3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_ulong4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_ulong8 __x); +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_double2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_double3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_double4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_double8 __x); +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_char2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_char3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_char4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_char8 __x); +static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_char16 __x); +static simd_ushort32 SIMD_CFUNC simd_ushort_sat(simd_char32 __x); +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_short2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_short3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_short4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_short8 __x); +static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_short16 __x); +static simd_ushort32 
SIMD_CFUNC simd_ushort_sat(simd_short32 __x); +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_int2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_int3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_int4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_int8 __x); +static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_int16 __x); +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_float2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_float3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_float4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_float8 __x); +static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_float16 __x); +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_long2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_long3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_long4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_long8 __x); +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_double2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_double3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_double4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_double8 __x); +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_uchar2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_uchar3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_uchar4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_uchar8 __x); +static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_uchar16 __x); +static simd_ushort32 SIMD_CFUNC simd_ushort_sat(simd_uchar32 __x); +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_ushort2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_ushort3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_ushort4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_ushort8 __x); +static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_ushort16 __x); +static simd_ushort32 SIMD_CFUNC simd_ushort_sat(simd_ushort32 __x); +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_uint2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_uint3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_uint4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_uint8 __x); +static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_uint16 __x); +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_ulong2 __x); +static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_ulong3 __x); +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_ulong4 __x); +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_ulong8 __x); +#define vector_ushort simd_ushort +#define vector_ushort_sat simd_ushort_sat + +static simd_int2 SIMD_CFUNC simd_int(simd_char2 __x); +static simd_int3 SIMD_CFUNC simd_int(simd_char3 __x); +static simd_int4 SIMD_CFUNC simd_int(simd_char4 __x); +static simd_int8 SIMD_CFUNC simd_int(simd_char8 __x); +static simd_int16 SIMD_CFUNC simd_int(simd_char16 __x); +static simd_int2 SIMD_CFUNC simd_int(simd_uchar2 __x); +static simd_int3 SIMD_CFUNC simd_int(simd_uchar3 __x); +static simd_int4 SIMD_CFUNC simd_int(simd_uchar4 __x); +static simd_int8 SIMD_CFUNC simd_int(simd_uchar8 __x); +static simd_int16 SIMD_CFUNC simd_int(simd_uchar16 __x); +static simd_int2 SIMD_CFUNC simd_int(simd_short2 __x); +static simd_int3 SIMD_CFUNC simd_int(simd_short3 __x); +static simd_int4 SIMD_CFUNC simd_int(simd_short4 __x); +static simd_int8 SIMD_CFUNC simd_int(simd_short8 __x); +static simd_int16 SIMD_CFUNC simd_int(simd_short16 __x); +static 
simd_int2 SIMD_CFUNC simd_int(simd_ushort2 __x); +static simd_int3 SIMD_CFUNC simd_int(simd_ushort3 __x); +static simd_int4 SIMD_CFUNC simd_int(simd_ushort4 __x); +static simd_int8 SIMD_CFUNC simd_int(simd_ushort8 __x); +static simd_int16 SIMD_CFUNC simd_int(simd_ushort16 __x); +static simd_int2 SIMD_CFUNC simd_int(simd_int2 __x); +static simd_int3 SIMD_CFUNC simd_int(simd_int3 __x); +static simd_int4 SIMD_CFUNC simd_int(simd_int4 __x); +static simd_int8 SIMD_CFUNC simd_int(simd_int8 __x); +static simd_int16 SIMD_CFUNC simd_int(simd_int16 __x); +static simd_int2 SIMD_CFUNC simd_int(simd_uint2 __x); +static simd_int3 SIMD_CFUNC simd_int(simd_uint3 __x); +static simd_int4 SIMD_CFUNC simd_int(simd_uint4 __x); +static simd_int8 SIMD_CFUNC simd_int(simd_uint8 __x); +static simd_int16 SIMD_CFUNC simd_int(simd_uint16 __x); +static simd_int2 SIMD_CFUNC simd_int(simd_float2 __x); +static simd_int3 SIMD_CFUNC simd_int(simd_float3 __x); +static simd_int4 SIMD_CFUNC simd_int(simd_float4 __x); +static simd_int8 SIMD_CFUNC simd_int(simd_float8 __x); +static simd_int16 SIMD_CFUNC simd_int(simd_float16 __x); +static simd_int2 SIMD_CFUNC simd_int(simd_long2 __x); +static simd_int3 SIMD_CFUNC simd_int(simd_long3 __x); +static simd_int4 SIMD_CFUNC simd_int(simd_long4 __x); +static simd_int8 SIMD_CFUNC simd_int(simd_long8 __x); +static simd_int2 SIMD_CFUNC simd_int(simd_ulong2 __x); +static simd_int3 SIMD_CFUNC simd_int(simd_ulong3 __x); +static simd_int4 SIMD_CFUNC simd_int(simd_ulong4 __x); +static simd_int8 SIMD_CFUNC simd_int(simd_ulong8 __x); +static simd_int2 SIMD_CFUNC simd_int(simd_double2 __x); +static simd_int3 SIMD_CFUNC simd_int(simd_double3 __x); +static simd_int4 SIMD_CFUNC simd_int(simd_double4 __x); +static simd_int8 SIMD_CFUNC simd_int(simd_double8 __x); +static simd_int2 SIMD_CFUNC simd_int_sat(simd_char2 __x); +static simd_int3 SIMD_CFUNC simd_int_sat(simd_char3 __x); +static simd_int4 SIMD_CFUNC simd_int_sat(simd_char4 __x); +static simd_int8 SIMD_CFUNC simd_int_sat(simd_char8 __x); +static simd_int16 SIMD_CFUNC simd_int_sat(simd_char16 __x); +static simd_int2 SIMD_CFUNC simd_int_sat(simd_short2 __x); +static simd_int3 SIMD_CFUNC simd_int_sat(simd_short3 __x); +static simd_int4 SIMD_CFUNC simd_int_sat(simd_short4 __x); +static simd_int8 SIMD_CFUNC simd_int_sat(simd_short8 __x); +static simd_int16 SIMD_CFUNC simd_int_sat(simd_short16 __x); +static simd_int2 SIMD_CFUNC simd_int_sat(simd_int2 __x); +static simd_int3 SIMD_CFUNC simd_int_sat(simd_int3 __x); +static simd_int4 SIMD_CFUNC simd_int_sat(simd_int4 __x); +static simd_int8 SIMD_CFUNC simd_int_sat(simd_int8 __x); +static simd_int16 SIMD_CFUNC simd_int_sat(simd_int16 __x); +static simd_int2 SIMD_CFUNC simd_int_sat(simd_float2 __x); +static simd_int3 SIMD_CFUNC simd_int_sat(simd_float3 __x); +static simd_int4 SIMD_CFUNC simd_int_sat(simd_float4 __x); +static simd_int8 SIMD_CFUNC simd_int_sat(simd_float8 __x); +static simd_int16 SIMD_CFUNC simd_int_sat(simd_float16 __x); +static simd_int2 SIMD_CFUNC simd_int_sat(simd_long2 __x); +static simd_int3 SIMD_CFUNC simd_int_sat(simd_long3 __x); +static simd_int4 SIMD_CFUNC simd_int_sat(simd_long4 __x); +static simd_int8 SIMD_CFUNC simd_int_sat(simd_long8 __x); +static simd_int2 SIMD_CFUNC simd_int_sat(simd_double2 __x); +static simd_int3 SIMD_CFUNC simd_int_sat(simd_double3 __x); +static simd_int4 SIMD_CFUNC simd_int_sat(simd_double4 __x); +static simd_int8 SIMD_CFUNC simd_int_sat(simd_double8 __x); +static simd_int2 SIMD_CFUNC simd_int_sat(simd_uchar2 __x); +static simd_int3 SIMD_CFUNC 
simd_int_sat(simd_uchar3 __x); +static simd_int4 SIMD_CFUNC simd_int_sat(simd_uchar4 __x); +static simd_int8 SIMD_CFUNC simd_int_sat(simd_uchar8 __x); +static simd_int16 SIMD_CFUNC simd_int_sat(simd_uchar16 __x); +static simd_int2 SIMD_CFUNC simd_int_sat(simd_ushort2 __x); +static simd_int3 SIMD_CFUNC simd_int_sat(simd_ushort3 __x); +static simd_int4 SIMD_CFUNC simd_int_sat(simd_ushort4 __x); +static simd_int8 SIMD_CFUNC simd_int_sat(simd_ushort8 __x); +static simd_int16 SIMD_CFUNC simd_int_sat(simd_ushort16 __x); +static simd_int2 SIMD_CFUNC simd_int_sat(simd_uint2 __x); +static simd_int3 SIMD_CFUNC simd_int_sat(simd_uint3 __x); +static simd_int4 SIMD_CFUNC simd_int_sat(simd_uint4 __x); +static simd_int8 SIMD_CFUNC simd_int_sat(simd_uint8 __x); +static simd_int16 SIMD_CFUNC simd_int_sat(simd_uint16 __x); +static simd_int2 SIMD_CFUNC simd_int_sat(simd_ulong2 __x); +static simd_int3 SIMD_CFUNC simd_int_sat(simd_ulong3 __x); +static simd_int4 SIMD_CFUNC simd_int_sat(simd_ulong4 __x); +static simd_int8 SIMD_CFUNC simd_int_sat(simd_ulong8 __x); +static simd_int2 SIMD_CFUNC simd_int_rte(simd_float2 __x); +static simd_int3 SIMD_CFUNC simd_int_rte(simd_float3 __x); +static simd_int4 SIMD_CFUNC simd_int_rte(simd_float4 __x); +static simd_int8 SIMD_CFUNC simd_int_rte(simd_float8 __x); +static simd_int16 SIMD_CFUNC simd_int_rte(simd_float16 __x); +#define vector_int simd_int +#define vector_int_sat simd_int_sat + +static simd_uint2 SIMD_CFUNC simd_uint(simd_char2 __x); +static simd_uint3 SIMD_CFUNC simd_uint(simd_char3 __x); +static simd_uint4 SIMD_CFUNC simd_uint(simd_char4 __x); +static simd_uint8 SIMD_CFUNC simd_uint(simd_char8 __x); +static simd_uint16 SIMD_CFUNC simd_uint(simd_char16 __x); +static simd_uint2 SIMD_CFUNC simd_uint(simd_uchar2 __x); +static simd_uint3 SIMD_CFUNC simd_uint(simd_uchar3 __x); +static simd_uint4 SIMD_CFUNC simd_uint(simd_uchar4 __x); +static simd_uint8 SIMD_CFUNC simd_uint(simd_uchar8 __x); +static simd_uint16 SIMD_CFUNC simd_uint(simd_uchar16 __x); +static simd_uint2 SIMD_CFUNC simd_uint(simd_short2 __x); +static simd_uint3 SIMD_CFUNC simd_uint(simd_short3 __x); +static simd_uint4 SIMD_CFUNC simd_uint(simd_short4 __x); +static simd_uint8 SIMD_CFUNC simd_uint(simd_short8 __x); +static simd_uint16 SIMD_CFUNC simd_uint(simd_short16 __x); +static simd_uint2 SIMD_CFUNC simd_uint(simd_ushort2 __x); +static simd_uint3 SIMD_CFUNC simd_uint(simd_ushort3 __x); +static simd_uint4 SIMD_CFUNC simd_uint(simd_ushort4 __x); +static simd_uint8 SIMD_CFUNC simd_uint(simd_ushort8 __x); +static simd_uint16 SIMD_CFUNC simd_uint(simd_ushort16 __x); +static simd_uint2 SIMD_CFUNC simd_uint(simd_int2 __x); +static simd_uint3 SIMD_CFUNC simd_uint(simd_int3 __x); +static simd_uint4 SIMD_CFUNC simd_uint(simd_int4 __x); +static simd_uint8 SIMD_CFUNC simd_uint(simd_int8 __x); +static simd_uint16 SIMD_CFUNC simd_uint(simd_int16 __x); +static simd_uint2 SIMD_CFUNC simd_uint(simd_uint2 __x); +static simd_uint3 SIMD_CFUNC simd_uint(simd_uint3 __x); +static simd_uint4 SIMD_CFUNC simd_uint(simd_uint4 __x); +static simd_uint8 SIMD_CFUNC simd_uint(simd_uint8 __x); +static simd_uint16 SIMD_CFUNC simd_uint(simd_uint16 __x); +static simd_uint2 SIMD_CFUNC simd_uint(simd_float2 __x); +static simd_uint3 SIMD_CFUNC simd_uint(simd_float3 __x); +static simd_uint4 SIMD_CFUNC simd_uint(simd_float4 __x); +static simd_uint8 SIMD_CFUNC simd_uint(simd_float8 __x); +static simd_uint16 SIMD_CFUNC simd_uint(simd_float16 __x); +static simd_uint2 SIMD_CFUNC simd_uint(simd_long2 __x); +static simd_uint3 SIMD_CFUNC 
simd_uint(simd_long3 __x); +static simd_uint4 SIMD_CFUNC simd_uint(simd_long4 __x); +static simd_uint8 SIMD_CFUNC simd_uint(simd_long8 __x); +static simd_uint2 SIMD_CFUNC simd_uint(simd_ulong2 __x); +static simd_uint3 SIMD_CFUNC simd_uint(simd_ulong3 __x); +static simd_uint4 SIMD_CFUNC simd_uint(simd_ulong4 __x); +static simd_uint8 SIMD_CFUNC simd_uint(simd_ulong8 __x); +static simd_uint2 SIMD_CFUNC simd_uint(simd_double2 __x); +static simd_uint3 SIMD_CFUNC simd_uint(simd_double3 __x); +static simd_uint4 SIMD_CFUNC simd_uint(simd_double4 __x); +static simd_uint8 SIMD_CFUNC simd_uint(simd_double8 __x); +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_char2 __x); +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_char3 __x); +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_char4 __x); +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_char8 __x); +static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_char16 __x); +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_short2 __x); +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_short3 __x); +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_short4 __x); +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_short8 __x); +static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_short16 __x); +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_int2 __x); +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_int3 __x); +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_int4 __x); +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_int8 __x); +static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_int16 __x); +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_float2 __x); +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_float3 __x); +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_float4 __x); +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_float8 __x); +static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_float16 __x); +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_long2 __x); +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_long3 __x); +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_long4 __x); +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_long8 __x); +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_double2 __x); +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_double3 __x); +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_double4 __x); +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_double8 __x); +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_uchar2 __x); +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_uchar3 __x); +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_uchar4 __x); +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_uchar8 __x); +static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_uchar16 __x); +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_ushort2 __x); +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_ushort3 __x); +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_ushort4 __x); +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_ushort8 __x); +static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_ushort16 __x); +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_uint2 __x); +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_uint3 __x); +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_uint4 __x); +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_uint8 __x); +static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_uint16 __x); +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_ulong2 __x); +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_ulong3 __x); +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_ulong4 __x); +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_ulong8 __x); +#define vector_uint simd_uint 
+#define vector_uint_sat simd_uint_sat + +static simd_float2 SIMD_CFUNC simd_float(simd_char2 __x); +static simd_float3 SIMD_CFUNC simd_float(simd_char3 __x); +static simd_float4 SIMD_CFUNC simd_float(simd_char4 __x); +static simd_float8 SIMD_CFUNC simd_float(simd_char8 __x); +static simd_float16 SIMD_CFUNC simd_float(simd_char16 __x); +static simd_float2 SIMD_CFUNC simd_float(simd_uchar2 __x); +static simd_float3 SIMD_CFUNC simd_float(simd_uchar3 __x); +static simd_float4 SIMD_CFUNC simd_float(simd_uchar4 __x); +static simd_float8 SIMD_CFUNC simd_float(simd_uchar8 __x); +static simd_float16 SIMD_CFUNC simd_float(simd_uchar16 __x); +static simd_float2 SIMD_CFUNC simd_float(simd_short2 __x); +static simd_float3 SIMD_CFUNC simd_float(simd_short3 __x); +static simd_float4 SIMD_CFUNC simd_float(simd_short4 __x); +static simd_float8 SIMD_CFUNC simd_float(simd_short8 __x); +static simd_float16 SIMD_CFUNC simd_float(simd_short16 __x); +static simd_float2 SIMD_CFUNC simd_float(simd_ushort2 __x); +static simd_float3 SIMD_CFUNC simd_float(simd_ushort3 __x); +static simd_float4 SIMD_CFUNC simd_float(simd_ushort4 __x); +static simd_float8 SIMD_CFUNC simd_float(simd_ushort8 __x); +static simd_float16 SIMD_CFUNC simd_float(simd_ushort16 __x); +static simd_float2 SIMD_CFUNC simd_float(simd_int2 __x); +static simd_float3 SIMD_CFUNC simd_float(simd_int3 __x); +static simd_float4 SIMD_CFUNC simd_float(simd_int4 __x); +static simd_float8 SIMD_CFUNC simd_float(simd_int8 __x); +static simd_float16 SIMD_CFUNC simd_float(simd_int16 __x); +static simd_float2 SIMD_CFUNC simd_float(simd_uint2 __x); +static simd_float3 SIMD_CFUNC simd_float(simd_uint3 __x); +static simd_float4 SIMD_CFUNC simd_float(simd_uint4 __x); +static simd_float8 SIMD_CFUNC simd_float(simd_uint8 __x); +static simd_float16 SIMD_CFUNC simd_float(simd_uint16 __x); +static simd_float2 SIMD_CFUNC simd_float(simd_float2 __x); +static simd_float3 SIMD_CFUNC simd_float(simd_float3 __x); +static simd_float4 SIMD_CFUNC simd_float(simd_float4 __x); +static simd_float8 SIMD_CFUNC simd_float(simd_float8 __x); +static simd_float16 SIMD_CFUNC simd_float(simd_float16 __x); +static simd_float2 SIMD_CFUNC simd_float(simd_long2 __x); +static simd_float3 SIMD_CFUNC simd_float(simd_long3 __x); +static simd_float4 SIMD_CFUNC simd_float(simd_long4 __x); +static simd_float8 SIMD_CFUNC simd_float(simd_long8 __x); +static simd_float2 SIMD_CFUNC simd_float(simd_ulong2 __x); +static simd_float3 SIMD_CFUNC simd_float(simd_ulong3 __x); +static simd_float4 SIMD_CFUNC simd_float(simd_ulong4 __x); +static simd_float8 SIMD_CFUNC simd_float(simd_ulong8 __x); +static simd_float2 SIMD_CFUNC simd_float(simd_double2 __x); +static simd_float3 SIMD_CFUNC simd_float(simd_double3 __x); +static simd_float4 SIMD_CFUNC simd_float(simd_double4 __x); +static simd_float8 SIMD_CFUNC simd_float(simd_double8 __x); +#define vector_float simd_float + +static simd_long2 SIMD_CFUNC simd_long(simd_char2 __x); +static simd_long3 SIMD_CFUNC simd_long(simd_char3 __x); +static simd_long4 SIMD_CFUNC simd_long(simd_char4 __x); +static simd_long8 SIMD_CFUNC simd_long(simd_char8 __x); +static simd_long2 SIMD_CFUNC simd_long(simd_uchar2 __x); +static simd_long3 SIMD_CFUNC simd_long(simd_uchar3 __x); +static simd_long4 SIMD_CFUNC simd_long(simd_uchar4 __x); +static simd_long8 SIMD_CFUNC simd_long(simd_uchar8 __x); +static simd_long2 SIMD_CFUNC simd_long(simd_short2 __x); +static simd_long3 SIMD_CFUNC simd_long(simd_short3 __x); +static simd_long4 SIMD_CFUNC simd_long(simd_short4 __x); +static simd_long8 
SIMD_CFUNC simd_long(simd_short8 __x); +static simd_long2 SIMD_CFUNC simd_long(simd_ushort2 __x); +static simd_long3 SIMD_CFUNC simd_long(simd_ushort3 __x); +static simd_long4 SIMD_CFUNC simd_long(simd_ushort4 __x); +static simd_long8 SIMD_CFUNC simd_long(simd_ushort8 __x); +static simd_long2 SIMD_CFUNC simd_long(simd_int2 __x); +static simd_long3 SIMD_CFUNC simd_long(simd_int3 __x); +static simd_long4 SIMD_CFUNC simd_long(simd_int4 __x); +static simd_long8 SIMD_CFUNC simd_long(simd_int8 __x); +static simd_long2 SIMD_CFUNC simd_long(simd_uint2 __x); +static simd_long3 SIMD_CFUNC simd_long(simd_uint3 __x); +static simd_long4 SIMD_CFUNC simd_long(simd_uint4 __x); +static simd_long8 SIMD_CFUNC simd_long(simd_uint8 __x); +static simd_long2 SIMD_CFUNC simd_long(simd_float2 __x); +static simd_long3 SIMD_CFUNC simd_long(simd_float3 __x); +static simd_long4 SIMD_CFUNC simd_long(simd_float4 __x); +static simd_long8 SIMD_CFUNC simd_long(simd_float8 __x); +static simd_long2 SIMD_CFUNC simd_long(simd_long2 __x); +static simd_long3 SIMD_CFUNC simd_long(simd_long3 __x); +static simd_long4 SIMD_CFUNC simd_long(simd_long4 __x); +static simd_long8 SIMD_CFUNC simd_long(simd_long8 __x); +static simd_long2 SIMD_CFUNC simd_long(simd_ulong2 __x); +static simd_long3 SIMD_CFUNC simd_long(simd_ulong3 __x); +static simd_long4 SIMD_CFUNC simd_long(simd_ulong4 __x); +static simd_long8 SIMD_CFUNC simd_long(simd_ulong8 __x); +static simd_long2 SIMD_CFUNC simd_long(simd_double2 __x); +static simd_long3 SIMD_CFUNC simd_long(simd_double3 __x); +static simd_long4 SIMD_CFUNC simd_long(simd_double4 __x); +static simd_long8 SIMD_CFUNC simd_long(simd_double8 __x); +static simd_long2 SIMD_CFUNC simd_long_sat(simd_char2 __x); +static simd_long3 SIMD_CFUNC simd_long_sat(simd_char3 __x); +static simd_long4 SIMD_CFUNC simd_long_sat(simd_char4 __x); +static simd_long8 SIMD_CFUNC simd_long_sat(simd_char8 __x); +static simd_long2 SIMD_CFUNC simd_long_sat(simd_short2 __x); +static simd_long3 SIMD_CFUNC simd_long_sat(simd_short3 __x); +static simd_long4 SIMD_CFUNC simd_long_sat(simd_short4 __x); +static simd_long8 SIMD_CFUNC simd_long_sat(simd_short8 __x); +static simd_long2 SIMD_CFUNC simd_long_sat(simd_int2 __x); +static simd_long3 SIMD_CFUNC simd_long_sat(simd_int3 __x); +static simd_long4 SIMD_CFUNC simd_long_sat(simd_int4 __x); +static simd_long8 SIMD_CFUNC simd_long_sat(simd_int8 __x); +static simd_long2 SIMD_CFUNC simd_long_sat(simd_float2 __x); +static simd_long3 SIMD_CFUNC simd_long_sat(simd_float3 __x); +static simd_long4 SIMD_CFUNC simd_long_sat(simd_float4 __x); +static simd_long8 SIMD_CFUNC simd_long_sat(simd_float8 __x); +static simd_long2 SIMD_CFUNC simd_long_sat(simd_long2 __x); +static simd_long3 SIMD_CFUNC simd_long_sat(simd_long3 __x); +static simd_long4 SIMD_CFUNC simd_long_sat(simd_long4 __x); +static simd_long8 SIMD_CFUNC simd_long_sat(simd_long8 __x); +static simd_long2 SIMD_CFUNC simd_long_sat(simd_double2 __x); +static simd_long3 SIMD_CFUNC simd_long_sat(simd_double3 __x); +static simd_long4 SIMD_CFUNC simd_long_sat(simd_double4 __x); +static simd_long8 SIMD_CFUNC simd_long_sat(simd_double8 __x); +static simd_long2 SIMD_CFUNC simd_long_sat(simd_uchar2 __x); +static simd_long3 SIMD_CFUNC simd_long_sat(simd_uchar3 __x); +static simd_long4 SIMD_CFUNC simd_long_sat(simd_uchar4 __x); +static simd_long8 SIMD_CFUNC simd_long_sat(simd_uchar8 __x); +static simd_long2 SIMD_CFUNC simd_long_sat(simd_ushort2 __x); +static simd_long3 SIMD_CFUNC simd_long_sat(simd_ushort3 __x); +static simd_long4 SIMD_CFUNC 
simd_long_sat(simd_ushort4 __x); +static simd_long8 SIMD_CFUNC simd_long_sat(simd_ushort8 __x); +static simd_long2 SIMD_CFUNC simd_long_sat(simd_uint2 __x); +static simd_long3 SIMD_CFUNC simd_long_sat(simd_uint3 __x); +static simd_long4 SIMD_CFUNC simd_long_sat(simd_uint4 __x); +static simd_long8 SIMD_CFUNC simd_long_sat(simd_uint8 __x); +static simd_long2 SIMD_CFUNC simd_long_sat(simd_ulong2 __x); +static simd_long3 SIMD_CFUNC simd_long_sat(simd_ulong3 __x); +static simd_long4 SIMD_CFUNC simd_long_sat(simd_ulong4 __x); +static simd_long8 SIMD_CFUNC simd_long_sat(simd_ulong8 __x); +static simd_long2 SIMD_CFUNC simd_long_rte(simd_double2 __x); +static simd_long3 SIMD_CFUNC simd_long_rte(simd_double3 __x); +static simd_long4 SIMD_CFUNC simd_long_rte(simd_double4 __x); +static simd_long8 SIMD_CFUNC simd_long_rte(simd_double8 __x); +#define vector_long simd_long +#define vector_long_sat simd_long_sat + +static simd_ulong2 SIMD_CFUNC simd_ulong(simd_char2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong(simd_char3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong(simd_char4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_char8 __x); +static simd_ulong2 SIMD_CFUNC simd_ulong(simd_uchar2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong(simd_uchar3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong(simd_uchar4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_uchar8 __x); +static simd_ulong2 SIMD_CFUNC simd_ulong(simd_short2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong(simd_short3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong(simd_short4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_short8 __x); +static simd_ulong2 SIMD_CFUNC simd_ulong(simd_ushort2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong(simd_ushort3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong(simd_ushort4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_ushort8 __x); +static simd_ulong2 SIMD_CFUNC simd_ulong(simd_int2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong(simd_int3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong(simd_int4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_int8 __x); +static simd_ulong2 SIMD_CFUNC simd_ulong(simd_uint2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong(simd_uint3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong(simd_uint4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_uint8 __x); +static simd_ulong2 SIMD_CFUNC simd_ulong(simd_float2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong(simd_float3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong(simd_float4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_float8 __x); +static simd_ulong2 SIMD_CFUNC simd_ulong(simd_long2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong(simd_long3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong(simd_long4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_long8 __x); +static simd_ulong2 SIMD_CFUNC simd_ulong(simd_ulong2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong(simd_ulong3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong(simd_ulong4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_ulong8 __x); +static simd_ulong2 SIMD_CFUNC simd_ulong(simd_double2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong(simd_double3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong(simd_double4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_double8 __x); +static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_char2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_char3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_char4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_char8 __x); 
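As the comment at the top of conversion.h states, simd_TYPE(x) converts elementwise like a C scalar conversion (with out-of-range integer results guaranteed to wrap modulo 2^N), while simd_TYPE_sat(x) clamps to the destination's representable range before converting. A small sketch contrasting the two for a char destination, again assuming <simd/simd.h> is available (values are illustrative):

```c
#include <simd/simd.h>
#include <stdio.h>

int main(void) {
    simd_int2 big = simd_make_int2(300, -300);  /* outside char's [-128, 127] */
    simd_char2 wrapped = simd_char(big);        /* wraps modulo 2^8:  44, -44 */
    simd_char2 clamped = simd_char_sat(big);    /* clamps:           127, -128 */
    printf("wrap: %d %d\n", wrapped.x, wrapped.y);
    printf("sat:  %d %d\n", clamped.x, clamped.y);
    return 0;
}
```

The saturating forms are generally the safer choice when narrowing, e.g. when packing intermediate simd_int arithmetic back into a smaller element type.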
+static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_short2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_short3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_short4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_short8 __x); +static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_int2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_int3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_int4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_int8 __x); +static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_float2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_float3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_float4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_float8 __x); +static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_long2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_long3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_long4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_long8 __x); +static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_double2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_double3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_double4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_double8 __x); +static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_uchar2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_uchar3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_uchar4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_uchar8 __x); +static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_ushort2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_ushort3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_ushort4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_ushort8 __x); +static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_uint2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_uint3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_uint4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_uint8 __x); +static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_ulong2 __x); +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_ulong3 __x); +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_ulong4 __x); +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_ulong8 __x); +#define vector_ulong simd_ulong +#define vector_ulong_sat simd_ulong_sat + +static simd_double2 SIMD_CFUNC simd_double(simd_char2 __x); +static simd_double3 SIMD_CFUNC simd_double(simd_char3 __x); +static simd_double4 SIMD_CFUNC simd_double(simd_char4 __x); +static simd_double8 SIMD_CFUNC simd_double(simd_char8 __x); +static simd_double2 SIMD_CFUNC simd_double(simd_uchar2 __x); +static simd_double3 SIMD_CFUNC simd_double(simd_uchar3 __x); +static simd_double4 SIMD_CFUNC simd_double(simd_uchar4 __x); +static simd_double8 SIMD_CFUNC simd_double(simd_uchar8 __x); +static simd_double2 SIMD_CFUNC simd_double(simd_short2 __x); +static simd_double3 SIMD_CFUNC simd_double(simd_short3 __x); +static simd_double4 SIMD_CFUNC simd_double(simd_short4 __x); +static simd_double8 SIMD_CFUNC simd_double(simd_short8 __x); +static simd_double2 SIMD_CFUNC simd_double(simd_ushort2 __x); +static simd_double3 SIMD_CFUNC simd_double(simd_ushort3 __x); +static simd_double4 SIMD_CFUNC simd_double(simd_ushort4 __x); +static simd_double8 SIMD_CFUNC simd_double(simd_ushort8 __x); +static simd_double2 SIMD_CFUNC simd_double(simd_int2 __x); +static simd_double3 SIMD_CFUNC simd_double(simd_int3 __x); +static simd_double4 SIMD_CFUNC simd_double(simd_int4 __x); +static 
simd_double8 SIMD_CFUNC simd_double(simd_int8 __x); +static simd_double2 SIMD_CFUNC simd_double(simd_uint2 __x); +static simd_double3 SIMD_CFUNC simd_double(simd_uint3 __x); +static simd_double4 SIMD_CFUNC simd_double(simd_uint4 __x); +static simd_double8 SIMD_CFUNC simd_double(simd_uint8 __x); +static simd_double2 SIMD_CFUNC simd_double(simd_float2 __x); +static simd_double3 SIMD_CFUNC simd_double(simd_float3 __x); +static simd_double4 SIMD_CFUNC simd_double(simd_float4 __x); +static simd_double8 SIMD_CFUNC simd_double(simd_float8 __x); +static simd_double2 SIMD_CFUNC simd_double(simd_long2 __x); +static simd_double3 SIMD_CFUNC simd_double(simd_long3 __x); +static simd_double4 SIMD_CFUNC simd_double(simd_long4 __x); +static simd_double8 SIMD_CFUNC simd_double(simd_long8 __x); +static simd_double2 SIMD_CFUNC simd_double(simd_ulong2 __x); +static simd_double3 SIMD_CFUNC simd_double(simd_ulong3 __x); +static simd_double4 SIMD_CFUNC simd_double(simd_ulong4 __x); +static simd_double8 SIMD_CFUNC simd_double(simd_ulong8 __x); +static simd_double2 SIMD_CFUNC simd_double(simd_double2 __x); +static simd_double3 SIMD_CFUNC simd_double(simd_double3 __x); +static simd_double4 SIMD_CFUNC simd_double(simd_double4 __x); +static simd_double8 SIMD_CFUNC simd_double(simd_double8 __x); +#define vector_double simd_double + +static simd_char2 SIMD_CFUNC vector2(char __x, char __y) { return ( simd_char2){__x, __y}; } +static simd_uchar2 SIMD_CFUNC vector2(unsigned char __x, unsigned char __y) { return ( simd_uchar2){__x, __y}; } +static simd_short2 SIMD_CFUNC vector2(short __x, short __y) { return ( simd_short2){__x, __y}; } +static simd_ushort2 SIMD_CFUNC vector2(unsigned short __x, unsigned short __y) { return (simd_ushort2){__x, __y}; } +static simd_int2 SIMD_CFUNC vector2(int __x, int __y) { return ( simd_int2){__x, __y}; } +static simd_uint2 SIMD_CFUNC vector2(unsigned int __x, unsigned int __y) { return ( simd_uint2){__x, __y}; } +static simd_float2 SIMD_CFUNC vector2(float __x, float __y) { return ( simd_float2){__x, __y}; } +static simd_long2 SIMD_CFUNC vector2(simd_long1 __x, simd_long1 __y) { return ( simd_long2){__x, __y}; } +static simd_ulong2 SIMD_CFUNC vector2(simd_ulong1 __x, simd_ulong1 __y) { return ( simd_ulong2){__x, __y}; } +static simd_double2 SIMD_CFUNC vector2(double __x, double __y) { return (simd_double2){__x, __y}; } + +static simd_char3 SIMD_CFUNC vector3(char __x, char __y, char __z) { return ( simd_char3){__x, __y, __z}; } +static simd_uchar3 SIMD_CFUNC vector3(unsigned char __x, unsigned char __y, unsigned char __z) { return ( simd_uchar3){__x, __y, __z}; } +static simd_short3 SIMD_CFUNC vector3(short __x, short __y, short __z) { return ( simd_short3){__x, __y, __z}; } +static simd_ushort3 SIMD_CFUNC vector3(unsigned short __x, unsigned short __y, unsigned short __z) { return (simd_ushort3){__x, __y, __z}; } +static simd_int3 SIMD_CFUNC vector3(int __x, int __y, int __z) { return ( simd_int3){__x, __y, __z}; } +static simd_uint3 SIMD_CFUNC vector3(unsigned int __x, unsigned int __y, unsigned int __z) { return ( simd_uint3){__x, __y, __z}; } +static simd_float3 SIMD_CFUNC vector3(float __x, float __y, float __z) { return ( simd_float3){__x, __y, __z}; } +static simd_long3 SIMD_CFUNC vector3(simd_long1 __x, simd_long1 __y, simd_long1 __z) { return ( simd_long3){__x, __y, __z}; } +static simd_ulong3 SIMD_CFUNC vector3(simd_ulong1 __x, simd_ulong1 __y, simd_ulong1 __z) { return ( simd_ulong3){__x, __y, __z}; } +static simd_double3 SIMD_CFUNC vector3(double __x, double __y, double __z) 
{ return (simd_double3){__x, __y, __z}; } + +static simd_char3 SIMD_CFUNC vector3(simd_char2 __xy, char __z) { simd_char3 __r; __r.xy = __xy; __r.z = __z; return __r; } +static simd_uchar3 SIMD_CFUNC vector3(simd_uchar2 __xy, unsigned char __z) { simd_uchar3 __r; __r.xy = __xy; __r.z = __z; return __r; } +static simd_short3 SIMD_CFUNC vector3(simd_short2 __xy, short __z) { simd_short3 __r; __r.xy = __xy; __r.z = __z; return __r; } +static simd_ushort3 SIMD_CFUNC vector3(simd_ushort2 __xy, unsigned short __z) { simd_ushort3 __r; __r.xy = __xy; __r.z = __z; return __r; } +static simd_int3 SIMD_CFUNC vector3(simd_int2 __xy, int __z) { simd_int3 __r; __r.xy = __xy; __r.z = __z; return __r; } +static simd_uint3 SIMD_CFUNC vector3(simd_uint2 __xy, unsigned int __z) { simd_uint3 __r; __r.xy = __xy; __r.z = __z; return __r; } +static simd_float3 SIMD_CFUNC vector3(simd_float2 __xy, float __z) { simd_float3 __r; __r.xy = __xy; __r.z = __z; return __r; } +static simd_long3 SIMD_CFUNC vector3(simd_long2 __xy, simd_long1 __z) { simd_long3 __r; __r.xy = __xy; __r.z = __z; return __r; } +static simd_ulong3 SIMD_CFUNC vector3(simd_ulong2 __xy, simd_ulong1 __z) { simd_ulong3 __r; __r.xy = __xy; __r.z = __z; return __r; } +static simd_double3 SIMD_CFUNC vector3(simd_double2 __xy, double __z) { simd_double3 __r; __r.xy = __xy; __r.z = __z; return __r; } + +static simd_char4 SIMD_CFUNC vector4(char __x, char __y, char __z, char __w) { return ( simd_char4){__x, __y, __z, __w}; } +static simd_uchar4 SIMD_CFUNC vector4(unsigned char __x, unsigned char __y, unsigned char __z, unsigned char __w) { return ( simd_uchar4){__x, __y, __z, __w}; } +static simd_short4 SIMD_CFUNC vector4(short __x, short __y, short __z, short __w) { return ( simd_short4){__x, __y, __z, __w}; } +static simd_ushort4 SIMD_CFUNC vector4(unsigned short __x, unsigned short __y, unsigned short __z, unsigned short __w) { return (simd_ushort4){__x, __y, __z, __w}; } +static simd_int4 SIMD_CFUNC vector4(int __x, int __y, int __z, int __w) { return ( simd_int4){__x, __y, __z, __w}; } +static simd_uint4 SIMD_CFUNC vector4(unsigned int __x, unsigned int __y, unsigned int __z, unsigned int __w) { return ( simd_uint4){__x, __y, __z, __w}; } +static simd_float4 SIMD_CFUNC vector4(float __x, float __y, float __z, float __w) { return ( simd_float4){__x, __y, __z, __w}; } +static simd_long4 SIMD_CFUNC vector4(simd_long1 __x, simd_long1 __y, simd_long1 __z, simd_long1 __w) { return ( simd_long4){__x, __y, __z, __w}; } +static simd_ulong4 SIMD_CFUNC vector4(simd_ulong1 __x, simd_ulong1 __y, simd_ulong1 __z, simd_ulong1 __w) { return ( simd_ulong4){__x, __y, __z, __w}; } +static simd_double4 SIMD_CFUNC vector4(double __x, double __y, double __z, double __w) { return (simd_double4){__x, __y, __z, __w}; } + +static simd_char4 SIMD_CFUNC vector4(simd_char2 __xy, simd_char2 __zw) { simd_char4 __r; __r.xy = __xy; __r.zw = __zw; return __r; } +static simd_uchar4 SIMD_CFUNC vector4(simd_uchar2 __xy, simd_uchar2 __zw) { simd_uchar4 __r; __r.xy = __xy; __r.zw = __zw; return __r; } +static simd_short4 SIMD_CFUNC vector4(simd_short2 __xy, simd_short2 __zw) { simd_short4 __r; __r.xy = __xy; __r.zw = __zw; return __r; } +static simd_ushort4 SIMD_CFUNC vector4(simd_ushort2 __xy, simd_ushort2 __zw) { simd_ushort4 __r; __r.xy = __xy; __r.zw = __zw; return __r; } +static simd_int4 SIMD_CFUNC vector4(simd_int2 __xy, simd_int2 __zw) { simd_int4 __r; __r.xy = __xy; __r.zw = __zw; return __r; } +static simd_uint4 SIMD_CFUNC vector4(simd_uint2 __xy, simd_uint2 __zw) { simd_uint4 
__r; __r.xy = __xy; __r.zw = __zw; return __r; } +static simd_float4 SIMD_CFUNC vector4(simd_float2 __xy, simd_float2 __zw) { simd_float4 __r; __r.xy = __xy; __r.zw = __zw; return __r; } +static simd_long4 SIMD_CFUNC vector4(simd_long2 __xy, simd_long2 __zw) { simd_long4 __r; __r.xy = __xy; __r.zw = __zw; return __r; } +static simd_ulong4 SIMD_CFUNC vector4(simd_ulong2 __xy, simd_ulong2 __zw) { simd_ulong4 __r; __r.xy = __xy; __r.zw = __zw; return __r; } +static simd_double4 SIMD_CFUNC vector4(simd_double2 __xy, simd_double2 __zw) { simd_double4 __r; __r.xy = __xy; __r.zw = __zw; return __r; } + +static simd_char4 SIMD_CFUNC vector4(simd_char3 __xyz, char __w) { simd_char4 __r; __r.xyz = __xyz; __r.w = __w; return __r; } +static simd_uchar4 SIMD_CFUNC vector4(simd_uchar3 __xyz, unsigned char __w) { simd_uchar4 __r; __r.xyz = __xyz; __r.w = __w; return __r; } +static simd_short4 SIMD_CFUNC vector4(simd_short3 __xyz, short __w) { simd_short4 __r; __r.xyz = __xyz; __r.w = __w; return __r; } +static simd_ushort4 SIMD_CFUNC vector4(simd_ushort3 __xyz, unsigned short __w) { simd_ushort4 __r; __r.xyz = __xyz; __r.w = __w; return __r; } +static simd_int4 SIMD_CFUNC vector4(simd_int3 __xyz, int __w) { simd_int4 __r; __r.xyz = __xyz; __r.w = __w; return __r; } +static simd_uint4 SIMD_CFUNC vector4(simd_uint3 __xyz, unsigned int __w) { simd_uint4 __r; __r.xyz = __xyz; __r.w = __w; return __r; } +static simd_float4 SIMD_CFUNC vector4(simd_float3 __xyz, float __w) { simd_float4 __r; __r.xyz = __xyz; __r.w = __w; return __r; } +static simd_long4 SIMD_CFUNC vector4(simd_long3 __xyz, simd_long1 __w) { simd_long4 __r; __r.xyz = __xyz; __r.w = __w; return __r; } +static simd_ulong4 SIMD_CFUNC vector4(simd_ulong3 __xyz, simd_ulong1 __w) { simd_ulong4 __r; __r.xyz = __xyz; __r.w = __w; return __r; } +static simd_double4 SIMD_CFUNC vector4(simd_double3 __xyz, double __w) { simd_double4 __r; __r.xyz = __xyz; __r.w = __w; return __r; } + +static simd_char8 SIMD_CFUNC vector8(simd_char4 __lo, simd_char4 __hi) { simd_char8 __r; __r.lo = __lo; __r.hi = __hi; return __r; } +static simd_uchar8 SIMD_CFUNC vector8(simd_uchar4 __lo, simd_uchar4 __hi) { simd_uchar8 __r; __r.lo = __lo; __r.hi = __hi; return __r; } +static simd_short8 SIMD_CFUNC vector8(simd_short4 __lo, simd_short4 __hi) { simd_short8 __r; __r.lo = __lo; __r.hi = __hi; return __r; } +static simd_ushort8 SIMD_CFUNC vector8(simd_ushort4 __lo, simd_ushort4 __hi) { simd_ushort8 __r; __r.lo = __lo; __r.hi = __hi; return __r; } +static simd_int8 SIMD_CFUNC vector8(simd_int4 __lo, simd_int4 __hi) { simd_int8 __r; __r.lo = __lo; __r.hi = __hi; return __r; } +static simd_uint8 SIMD_CFUNC vector8(simd_uint4 __lo, simd_uint4 __hi) { simd_uint8 __r; __r.lo = __lo; __r.hi = __hi; return __r; } +static simd_float8 SIMD_CFUNC vector8(simd_float4 __lo, simd_float4 __hi) { simd_float8 __r; __r.lo = __lo; __r.hi = __hi; return __r; } +static simd_long8 SIMD_CFUNC vector8(simd_long4 __lo, simd_long4 __hi) { simd_long8 __r; __r.lo = __lo; __r.hi = __hi; return __r; } +static simd_ulong8 SIMD_CFUNC vector8(simd_ulong4 __lo, simd_ulong4 __hi) { simd_ulong8 __r; __r.lo = __lo; __r.hi = __hi; return __r; } +static simd_double8 SIMD_CFUNC vector8(simd_double4 __lo, simd_double4 __hi) { simd_double8 __r; __r.lo = __lo; __r.hi = __hi; return __r; } + +static simd_char16 SIMD_CFUNC vector16(simd_char8 __lo, simd_char8 __hi) { simd_char16 __r; __r.lo = __lo; __r.hi = __hi; return __r; } +static simd_uchar16 SIMD_CFUNC vector16(simd_uchar8 __lo, simd_uchar8 __hi) { simd_uchar16 __r; 
__r.lo = __lo; __r.hi = __hi; return __r; } +static simd_short16 SIMD_CFUNC vector16(simd_short8 __lo, simd_short8 __hi) { simd_short16 __r; __r.lo = __lo; __r.hi = __hi; return __r; } +static simd_ushort16 SIMD_CFUNC vector16(simd_ushort8 __lo, simd_ushort8 __hi) { simd_ushort16 __r; __r.lo = __lo; __r.hi = __hi; return __r; } +static simd_int16 SIMD_CFUNC vector16(simd_int8 __lo, simd_int8 __hi) { simd_int16 __r; __r.lo = __lo; __r.hi = __hi; return __r; } +static simd_uint16 SIMD_CFUNC vector16(simd_uint8 __lo, simd_uint8 __hi) { simd_uint16 __r; __r.lo = __lo; __r.hi = __hi; return __r; } +static simd_float16 SIMD_CFUNC vector16(simd_float8 __lo, simd_float8 __hi) { simd_float16 __r; __r.lo = __lo; __r.hi = __hi; return __r; } + +static simd_char32 SIMD_CFUNC vector32(simd_char16 __lo, simd_char16 __hi) { simd_char32 __r; __r.lo = __lo; __r.hi = __hi; return __r; } +static simd_uchar32 SIMD_CFUNC vector32(simd_uchar16 __lo, simd_uchar16 __hi) { simd_uchar32 __r; __r.lo = __lo; __r.hi = __hi; return __r; } +static simd_short32 SIMD_CFUNC vector32(simd_short16 __lo, simd_short16 __hi) { simd_short32 __r; __r.lo = __lo; __r.hi = __hi; return __r; } +static simd_ushort32 SIMD_CFUNC vector32(simd_ushort16 __lo, simd_ushort16 __hi) { simd_ushort32 __r; __r.lo = __lo; __r.hi = __hi; return __r; } + +#pragma mark - Implementation + +static simd_char2 SIMD_CFUNC simd_char(simd_char2 __x) { return __x; } +static simd_char3 SIMD_CFUNC simd_char(simd_char3 __x) { return __x; } +static simd_char4 SIMD_CFUNC simd_char(simd_char4 __x) { return __x; } +static simd_char8 SIMD_CFUNC simd_char(simd_char8 __x) { return __x; } +static simd_char16 SIMD_CFUNC simd_char(simd_char16 __x) { return __x; } +static simd_char32 SIMD_CFUNC simd_char(simd_char32 __x) { return __x; } +static simd_char2 SIMD_CFUNC simd_char(simd_uchar2 __x) { return (simd_char2)__x; } +static simd_char3 SIMD_CFUNC simd_char(simd_uchar3 __x) { return (simd_char3)__x; } +static simd_char4 SIMD_CFUNC simd_char(simd_uchar4 __x) { return (simd_char4)__x; } +static simd_char8 SIMD_CFUNC simd_char(simd_uchar8 __x) { return (simd_char8)__x; } +static simd_char16 SIMD_CFUNC simd_char(simd_uchar16 __x) { return (simd_char16)__x; } +static simd_char32 SIMD_CFUNC simd_char(simd_uchar32 __x) { return (simd_char32)__x; } +static simd_char2 SIMD_CFUNC simd_char(simd_short2 __x) { return __builtin_convertvector(__x & 0xff, simd_char2); } +static simd_char3 SIMD_CFUNC simd_char(simd_short3 __x) { return __builtin_convertvector(__x & 0xff, simd_char3); } +static simd_char4 SIMD_CFUNC simd_char(simd_short4 __x) { return __builtin_convertvector(__x & 0xff, simd_char4); } +static simd_char8 SIMD_CFUNC simd_char(simd_short8 __x) { return __builtin_convertvector(__x & 0xff, simd_char8); } +static simd_char16 SIMD_CFUNC simd_char(simd_short16 __x) { return __builtin_convertvector(__x & 0xff, simd_char16); } +static simd_char32 SIMD_CFUNC simd_char(simd_short32 __x) { return __builtin_convertvector(__x & 0xff, simd_char32); } +static simd_char2 SIMD_CFUNC simd_char(simd_ushort2 __x) { return simd_char(simd_short(__x)); } +static simd_char3 SIMD_CFUNC simd_char(simd_ushort3 __x) { return simd_char(simd_short(__x)); } +static simd_char4 SIMD_CFUNC simd_char(simd_ushort4 __x) { return simd_char(simd_short(__x)); } +static simd_char8 SIMD_CFUNC simd_char(simd_ushort8 __x) { return simd_char(simd_short(__x)); } +static simd_char16 SIMD_CFUNC simd_char(simd_ushort16 __x) { return simd_char(simd_short(__x)); } +static simd_char32 SIMD_CFUNC simd_char(simd_ushort32 
__x) { return simd_char(simd_short(__x)); } +static simd_char2 SIMD_CFUNC simd_char(simd_int2 __x) { return simd_char(simd_short(__x)); } +static simd_char3 SIMD_CFUNC simd_char(simd_int3 __x) { return simd_char(simd_short(__x)); } +static simd_char4 SIMD_CFUNC simd_char(simd_int4 __x) { return simd_char(simd_short(__x)); } +static simd_char8 SIMD_CFUNC simd_char(simd_int8 __x) { return simd_char(simd_short(__x)); } +static simd_char16 SIMD_CFUNC simd_char(simd_int16 __x) { return simd_char(simd_short(__x)); } +static simd_char2 SIMD_CFUNC simd_char(simd_uint2 __x) { return simd_char(simd_short(__x)); } +static simd_char3 SIMD_CFUNC simd_char(simd_uint3 __x) { return simd_char(simd_short(__x)); } +static simd_char4 SIMD_CFUNC simd_char(simd_uint4 __x) { return simd_char(simd_short(__x)); } +static simd_char8 SIMD_CFUNC simd_char(simd_uint8 __x) { return simd_char(simd_short(__x)); } +static simd_char16 SIMD_CFUNC simd_char(simd_uint16 __x) { return simd_char(simd_short(__x)); } +static simd_char2 SIMD_CFUNC simd_char(simd_float2 __x) { return simd_char(simd_short(__x)); } +static simd_char3 SIMD_CFUNC simd_char(simd_float3 __x) { return simd_char(simd_short(__x)); } +static simd_char4 SIMD_CFUNC simd_char(simd_float4 __x) { return simd_char(simd_short(__x)); } +static simd_char8 SIMD_CFUNC simd_char(simd_float8 __x) { return simd_char(simd_short(__x)); } +static simd_char16 SIMD_CFUNC simd_char(simd_float16 __x) { return simd_char(simd_short(__x)); } +static simd_char2 SIMD_CFUNC simd_char(simd_long2 __x) { return simd_char(simd_short(__x)); } +static simd_char3 SIMD_CFUNC simd_char(simd_long3 __x) { return simd_char(simd_short(__x)); } +static simd_char4 SIMD_CFUNC simd_char(simd_long4 __x) { return simd_char(simd_short(__x)); } +static simd_char8 SIMD_CFUNC simd_char(simd_long8 __x) { return simd_char(simd_short(__x)); } +static simd_char2 SIMD_CFUNC simd_char(simd_ulong2 __x) { return simd_char(simd_short(__x)); } +static simd_char3 SIMD_CFUNC simd_char(simd_ulong3 __x) { return simd_char(simd_short(__x)); } +static simd_char4 SIMD_CFUNC simd_char(simd_ulong4 __x) { return simd_char(simd_short(__x)); } +static simd_char8 SIMD_CFUNC simd_char(simd_ulong8 __x) { return simd_char(simd_short(__x)); } +static simd_char2 SIMD_CFUNC simd_char(simd_double2 __x) { return simd_char(simd_short(__x)); } +static simd_char3 SIMD_CFUNC simd_char(simd_double3 __x) { return simd_char(simd_short(__x)); } +static simd_char4 SIMD_CFUNC simd_char(simd_double4 __x) { return simd_char(simd_short(__x)); } +static simd_char8 SIMD_CFUNC simd_char(simd_double8 __x) { return simd_char(simd_short(__x)); } + +static simd_char2 SIMD_CFUNC simd_char_sat(simd_char2 __x) { return __x; } +static simd_char3 SIMD_CFUNC simd_char_sat(simd_char3 __x) { return __x; } +static simd_char4 SIMD_CFUNC simd_char_sat(simd_char4 __x) { return __x; } +static simd_char8 SIMD_CFUNC simd_char_sat(simd_char8 __x) { return __x; } +static simd_char16 SIMD_CFUNC simd_char_sat(simd_char16 __x) { return __x; } +static simd_char32 SIMD_CFUNC simd_char_sat(simd_char32 __x) { return __x; } +static simd_char2 SIMD_CFUNC simd_char_sat(simd_short2 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char3 SIMD_CFUNC simd_char_sat(simd_short3 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char4 SIMD_CFUNC simd_char_sat(simd_short4 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char8 SIMD_CFUNC simd_char_sat(simd_short8 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char16 
SIMD_CFUNC simd_char_sat(simd_short16 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char32 SIMD_CFUNC simd_char_sat(simd_short32 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char2 SIMD_CFUNC simd_char_sat(simd_int2 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char3 SIMD_CFUNC simd_char_sat(simd_int3 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char4 SIMD_CFUNC simd_char_sat(simd_int4 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char8 SIMD_CFUNC simd_char_sat(simd_int8 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char16 SIMD_CFUNC simd_char_sat(simd_int16 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char2 SIMD_CFUNC simd_char_sat(simd_float2 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char3 SIMD_CFUNC simd_char_sat(simd_float3 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char4 SIMD_CFUNC simd_char_sat(simd_float4 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char8 SIMD_CFUNC simd_char_sat(simd_float8 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char16 SIMD_CFUNC simd_char_sat(simd_float16 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char2 SIMD_CFUNC simd_char_sat(simd_long2 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char3 SIMD_CFUNC simd_char_sat(simd_long3 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char4 SIMD_CFUNC simd_char_sat(simd_long4 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char8 SIMD_CFUNC simd_char_sat(simd_long8 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char2 SIMD_CFUNC simd_char_sat(simd_double2 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char3 SIMD_CFUNC simd_char_sat(simd_double3 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char4 SIMD_CFUNC simd_char_sat(simd_double4 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char8 SIMD_CFUNC simd_char_sat(simd_double8 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); } +static simd_char2 SIMD_CFUNC simd_char_sat(simd_uchar2 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char3 SIMD_CFUNC simd_char_sat(simd_uchar3 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char4 SIMD_CFUNC simd_char_sat(simd_uchar4 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char8 SIMD_CFUNC simd_char_sat(simd_uchar8 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char16 SIMD_CFUNC simd_char_sat(simd_uchar16 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char32 SIMD_CFUNC simd_char_sat(simd_uchar32 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char2 SIMD_CFUNC simd_char_sat(simd_ushort2 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char3 SIMD_CFUNC simd_char_sat(simd_ushort3 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char4 SIMD_CFUNC simd_char_sat(simd_ushort4 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char8 SIMD_CFUNC simd_char_sat(simd_ushort8 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char16 SIMD_CFUNC simd_char_sat(simd_ushort16 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char32 SIMD_CFUNC simd_char_sat(simd_ushort32 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char2 SIMD_CFUNC simd_char_sat(simd_uint2 __x) { return simd_char(simd_min(__x,0x7f)); } 
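+/* Annotation, not in the upstream header: the clamp bounds above give the
+ * usual saturating narrowing, e.g.
+ *
+ *   simd_char_sat((simd_int2){ 300, -300 })  ==  (simd_char2){ 127, -128 }
+ *
+ * whereas the unsaturated simd_char keeps only the low 8 bits of each lane,
+ * so simd_char((simd_int2){ 300, -300 })  ==  (simd_char2){ 44, -44 }.
+ */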
+static simd_char3 SIMD_CFUNC simd_char_sat(simd_uint3 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char4 SIMD_CFUNC simd_char_sat(simd_uint4 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char8 SIMD_CFUNC simd_char_sat(simd_uint8 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char16 SIMD_CFUNC simd_char_sat(simd_uint16 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char2 SIMD_CFUNC simd_char_sat(simd_ulong2 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char3 SIMD_CFUNC simd_char_sat(simd_ulong3 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char4 SIMD_CFUNC simd_char_sat(simd_ulong4 __x) { return simd_char(simd_min(__x,0x7f)); } +static simd_char8 SIMD_CFUNC simd_char_sat(simd_ulong8 __x) { return simd_char(simd_min(__x,0x7f)); } + + +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_char2 __x) { return (simd_uchar2)__x; } +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_char3 __x) { return (simd_uchar3)__x; } +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_char4 __x) { return (simd_uchar4)__x; } +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_char8 __x) { return (simd_uchar8)__x; } +static simd_uchar16 SIMD_CFUNC simd_uchar(simd_char16 __x) { return (simd_uchar16)__x; } +static simd_uchar32 SIMD_CFUNC simd_uchar(simd_char32 __x) { return (simd_uchar32)__x; } +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_uchar2 __x) { return __x; } +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_uchar3 __x) { return __x; } +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_uchar4 __x) { return __x; } +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_uchar8 __x) { return __x; } +static simd_uchar16 SIMD_CFUNC simd_uchar(simd_uchar16 __x) { return __x; } +static simd_uchar32 SIMD_CFUNC simd_uchar(simd_uchar32 __x) { return __x; } +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_short2 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_short3 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_short4 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_short8 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar16 SIMD_CFUNC simd_uchar(simd_short16 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar32 SIMD_CFUNC simd_uchar(simd_short32 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_ushort2 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_ushort3 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_ushort4 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_ushort8 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar16 SIMD_CFUNC simd_uchar(simd_ushort16 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar32 SIMD_CFUNC simd_uchar(simd_ushort32 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_int2 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_int3 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_int4 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_int8 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar16 SIMD_CFUNC simd_uchar(simd_int16 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_uint2 __x) { return 
simd_uchar(simd_char(__x)); } +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_uint3 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_uint4 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_uint8 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar16 SIMD_CFUNC simd_uchar(simd_uint16 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_float2 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_float3 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_float4 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_float8 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar16 SIMD_CFUNC simd_uchar(simd_float16 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_long2 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_long3 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_long4 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_long8 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_ulong2 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_ulong3 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_ulong4 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_ulong8 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar2 SIMD_CFUNC simd_uchar(simd_double2 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar3 SIMD_CFUNC simd_uchar(simd_double3 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar4 SIMD_CFUNC simd_uchar(simd_double4 __x) { return simd_uchar(simd_char(__x)); } +static simd_uchar8 SIMD_CFUNC simd_uchar(simd_double8 __x) { return simd_uchar(simd_char(__x)); } + +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_char2 __x) { return simd_uchar(simd_max(0,__x)); } +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_char3 __x) { return simd_uchar(simd_max(0,__x)); } +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_char4 __x) { return simd_uchar(simd_max(0,__x)); } +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_char8 __x) { return simd_uchar(simd_max(0,__x)); } +static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_char16 __x) { return simd_uchar(simd_max(0,__x)); } +static simd_uchar32 SIMD_CFUNC simd_uchar_sat(simd_char32 __x) { return simd_uchar(simd_max(0,__x)); } +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_short2 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_short3 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_short4 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_short8 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_short16 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar32 SIMD_CFUNC simd_uchar_sat(simd_short32 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_int2 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_int3 __x) { return 
simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_int4 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_int8 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_int16 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_float2 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_float3 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_float4 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_float8 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_float16 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_long2 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_long3 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_long4 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_long8 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_double2 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_double3 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_double4 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_double8 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); } +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_uchar2 __x) { return __x; } +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_uchar3 __x) { return __x; } +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_uchar4 __x) { return __x; } +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_uchar8 __x) { return __x; } +static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_uchar16 __x) { return __x; } +static simd_uchar32 SIMD_CFUNC simd_uchar_sat(simd_uchar32 __x) { return __x; } +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_ushort2 __x) { return simd_uchar(simd_min(__x,0xff)); } +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_ushort3 __x) { return simd_uchar(simd_min(__x,0xff)); } +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_ushort4 __x) { return simd_uchar(simd_min(__x,0xff)); } +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_ushort8 __x) { return simd_uchar(simd_min(__x,0xff)); } +static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_ushort16 __x) { return simd_uchar(simd_min(__x,0xff)); } +static simd_uchar32 SIMD_CFUNC simd_uchar_sat(simd_ushort32 __x) { return simd_uchar(simd_min(__x,0xff)); } +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_uint2 __x) { return simd_uchar(simd_min(__x,0xff)); } +static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_uint3 __x) { return simd_uchar(simd_min(__x,0xff)); } +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_uint4 __x) { return simd_uchar(simd_min(__x,0xff)); } +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_uint8 __x) { return simd_uchar(simd_min(__x,0xff)); } +static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_uint16 __x) { return simd_uchar(simd_min(__x,0xff)); } +static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_ulong2 __x) { return simd_uchar(simd_min(__x,0xff)); } 
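+/* Annotation, not in the upstream header: for an unsigned destination the
+ * clamp range is [0, 0xff], e.g.
+ *
+ *   simd_uchar_sat((simd_short2){ 300, -5 })  ==  (simd_uchar2){ 255, 0 }
+ */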
+static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_ulong3 __x) { return simd_uchar(simd_min(__x,0xff)); } +static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_ulong4 __x) { return simd_uchar(simd_min(__x,0xff)); } +static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_ulong8 __x) { return simd_uchar(simd_min(__x,0xff)); } + + +static simd_short2 SIMD_CFUNC simd_short(simd_char2 __x) { return __builtin_convertvector(__x, simd_short2); } +static simd_short3 SIMD_CFUNC simd_short(simd_char3 __x) { return __builtin_convertvector(__x, simd_short3); } +static simd_short4 SIMD_CFUNC simd_short(simd_char4 __x) { return __builtin_convertvector(__x, simd_short4); } +static simd_short8 SIMD_CFUNC simd_short(simd_char8 __x) { return __builtin_convertvector(__x, simd_short8); } +static simd_short16 SIMD_CFUNC simd_short(simd_char16 __x) { return __builtin_convertvector(__x, simd_short16); } +static simd_short32 SIMD_CFUNC simd_short(simd_char32 __x) { return __builtin_convertvector(__x, simd_short32); } +static simd_short2 SIMD_CFUNC simd_short(simd_uchar2 __x) { return __builtin_convertvector(__x, simd_short2); } +static simd_short3 SIMD_CFUNC simd_short(simd_uchar3 __x) { return __builtin_convertvector(__x, simd_short3); } +static simd_short4 SIMD_CFUNC simd_short(simd_uchar4 __x) { return __builtin_convertvector(__x, simd_short4); } +static simd_short8 SIMD_CFUNC simd_short(simd_uchar8 __x) { return __builtin_convertvector(__x, simd_short8); } +static simd_short16 SIMD_CFUNC simd_short(simd_uchar16 __x) { return __builtin_convertvector(__x, simd_short16); } +static simd_short32 SIMD_CFUNC simd_short(simd_uchar32 __x) { return __builtin_convertvector(__x, simd_short32); } +static simd_short2 SIMD_CFUNC simd_short(simd_short2 __x) { return __x; } +static simd_short3 SIMD_CFUNC simd_short(simd_short3 __x) { return __x; } +static simd_short4 SIMD_CFUNC simd_short(simd_short4 __x) { return __x; } +static simd_short8 SIMD_CFUNC simd_short(simd_short8 __x) { return __x; } +static simd_short16 SIMD_CFUNC simd_short(simd_short16 __x) { return __x; } +static simd_short32 SIMD_CFUNC simd_short(simd_short32 __x) { return __x; } +static simd_short2 SIMD_CFUNC simd_short(simd_ushort2 __x) { return (simd_short2)__x; } +static simd_short3 SIMD_CFUNC simd_short(simd_ushort3 __x) { return (simd_short3)__x; } +static simd_short4 SIMD_CFUNC simd_short(simd_ushort4 __x) { return (simd_short4)__x; } +static simd_short8 SIMD_CFUNC simd_short(simd_ushort8 __x) { return (simd_short8)__x; } +static simd_short16 SIMD_CFUNC simd_short(simd_ushort16 __x) { return (simd_short16)__x; } +static simd_short32 SIMD_CFUNC simd_short(simd_ushort32 __x) { return (simd_short32)__x; } +static simd_short2 SIMD_CFUNC simd_short(simd_int2 __x) { return __builtin_convertvector(__x & 0xffff, simd_short2); } +static simd_short3 SIMD_CFUNC simd_short(simd_int3 __x) { return __builtin_convertvector(__x & 0xffff, simd_short3); } +static simd_short4 SIMD_CFUNC simd_short(simd_int4 __x) { return __builtin_convertvector(__x & 0xffff, simd_short4); } +static simd_short8 SIMD_CFUNC simd_short(simd_int8 __x) { return __builtin_convertvector(__x & 0xffff, simd_short8); } +static simd_short16 SIMD_CFUNC simd_short(simd_int16 __x) { return __builtin_convertvector(__x & 0xffff, simd_short16); } +static simd_short2 SIMD_CFUNC simd_short(simd_uint2 __x) { return simd_short(simd_int(__x)); } +static simd_short3 SIMD_CFUNC simd_short(simd_uint3 __x) { return simd_short(simd_int(__x)); } +static simd_short4 SIMD_CFUNC simd_short(simd_uint4 __x) { return 
simd_short(simd_int(__x)); } +static simd_short8 SIMD_CFUNC simd_short(simd_uint8 __x) { return simd_short(simd_int(__x)); } +static simd_short16 SIMD_CFUNC simd_short(simd_uint16 __x) { return simd_short(simd_int(__x)); } +static simd_short2 SIMD_CFUNC simd_short(simd_float2 __x) { return simd_short(simd_int(__x)); } +static simd_short3 SIMD_CFUNC simd_short(simd_float3 __x) { return simd_short(simd_int(__x)); } +static simd_short4 SIMD_CFUNC simd_short(simd_float4 __x) { return simd_short(simd_int(__x)); } +static simd_short8 SIMD_CFUNC simd_short(simd_float8 __x) { return simd_short(simd_int(__x)); } +static simd_short16 SIMD_CFUNC simd_short(simd_float16 __x) { return simd_short(simd_int(__x)); } +static simd_short2 SIMD_CFUNC simd_short(simd_long2 __x) { return simd_short(simd_int(__x)); } +static simd_short3 SIMD_CFUNC simd_short(simd_long3 __x) { return simd_short(simd_int(__x)); } +static simd_short4 SIMD_CFUNC simd_short(simd_long4 __x) { return simd_short(simd_int(__x)); } +static simd_short8 SIMD_CFUNC simd_short(simd_long8 __x) { return simd_short(simd_int(__x)); } +static simd_short2 SIMD_CFUNC simd_short(simd_ulong2 __x) { return simd_short(simd_int(__x)); } +static simd_short3 SIMD_CFUNC simd_short(simd_ulong3 __x) { return simd_short(simd_int(__x)); } +static simd_short4 SIMD_CFUNC simd_short(simd_ulong4 __x) { return simd_short(simd_int(__x)); } +static simd_short8 SIMD_CFUNC simd_short(simd_ulong8 __x) { return simd_short(simd_int(__x)); } +static simd_short2 SIMD_CFUNC simd_short(simd_double2 __x) { return simd_short(simd_int(__x)); } +static simd_short3 SIMD_CFUNC simd_short(simd_double3 __x) { return simd_short(simd_int(__x)); } +static simd_short4 SIMD_CFUNC simd_short(simd_double4 __x) { return simd_short(simd_int(__x)); } +static simd_short8 SIMD_CFUNC simd_short(simd_double8 __x) { return simd_short(simd_int(__x)); } + +static simd_short2 SIMD_CFUNC simd_short_sat(simd_char2 __x) { return simd_short(__x); } +static simd_short3 SIMD_CFUNC simd_short_sat(simd_char3 __x) { return simd_short(__x); } +static simd_short4 SIMD_CFUNC simd_short_sat(simd_char4 __x) { return simd_short(__x); } +static simd_short8 SIMD_CFUNC simd_short_sat(simd_char8 __x) { return simd_short(__x); } +static simd_short16 SIMD_CFUNC simd_short_sat(simd_char16 __x) { return simd_short(__x); } +static simd_short32 SIMD_CFUNC simd_short_sat(simd_char32 __x) { return simd_short(__x); } +static simd_short2 SIMD_CFUNC simd_short_sat(simd_short2 __x) { return __x; } +static simd_short3 SIMD_CFUNC simd_short_sat(simd_short3 __x) { return __x; } +static simd_short4 SIMD_CFUNC simd_short_sat(simd_short4 __x) { return __x; } +static simd_short8 SIMD_CFUNC simd_short_sat(simd_short8 __x) { return __x; } +static simd_short16 SIMD_CFUNC simd_short_sat(simd_short16 __x) { return __x; } +static simd_short32 SIMD_CFUNC simd_short_sat(simd_short32 __x) { return __x; } +static simd_short2 SIMD_CFUNC simd_short_sat(simd_int2 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); } +static simd_short3 SIMD_CFUNC simd_short_sat(simd_int3 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); } +static simd_short4 SIMD_CFUNC simd_short_sat(simd_int4 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); } +static simd_short8 SIMD_CFUNC simd_short_sat(simd_int8 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); } +static simd_short16 SIMD_CFUNC simd_short_sat(simd_int16 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); } +static simd_short2 SIMD_CFUNC simd_short_sat(simd_float2 __x) { return 
simd_short(simd_clamp(__x,-0x8000,0x7fff)); } +static simd_short3 SIMD_CFUNC simd_short_sat(simd_float3 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); } +static simd_short4 SIMD_CFUNC simd_short_sat(simd_float4 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); } +static simd_short8 SIMD_CFUNC simd_short_sat(simd_float8 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); } +static simd_short16 SIMD_CFUNC simd_short_sat(simd_float16 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); } +static simd_short2 SIMD_CFUNC simd_short_sat(simd_long2 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); } +static simd_short3 SIMD_CFUNC simd_short_sat(simd_long3 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); } +static simd_short4 SIMD_CFUNC simd_short_sat(simd_long4 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); } +static simd_short8 SIMD_CFUNC simd_short_sat(simd_long8 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); } +static simd_short2 SIMD_CFUNC simd_short_sat(simd_double2 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); } +static simd_short3 SIMD_CFUNC simd_short_sat(simd_double3 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); } +static simd_short4 SIMD_CFUNC simd_short_sat(simd_double4 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); } +static simd_short8 SIMD_CFUNC simd_short_sat(simd_double8 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); } +static simd_short2 SIMD_CFUNC simd_short_sat(simd_uchar2 __x) { return simd_short(__x); } +static simd_short3 SIMD_CFUNC simd_short_sat(simd_uchar3 __x) { return simd_short(__x); } +static simd_short4 SIMD_CFUNC simd_short_sat(simd_uchar4 __x) { return simd_short(__x); } +static simd_short8 SIMD_CFUNC simd_short_sat(simd_uchar8 __x) { return simd_short(__x); } +static simd_short16 SIMD_CFUNC simd_short_sat(simd_uchar16 __x) { return simd_short(__x); } +static simd_short32 SIMD_CFUNC simd_short_sat(simd_uchar32 __x) { return simd_short(__x); } +static simd_short2 SIMD_CFUNC simd_short_sat(simd_ushort2 __x) { return simd_short(simd_min(__x,0x7fff)); } +static simd_short3 SIMD_CFUNC simd_short_sat(simd_ushort3 __x) { return simd_short(simd_min(__x,0x7fff)); } +static simd_short4 SIMD_CFUNC simd_short_sat(simd_ushort4 __x) { return simd_short(simd_min(__x,0x7fff)); } +static simd_short8 SIMD_CFUNC simd_short_sat(simd_ushort8 __x) { return simd_short(simd_min(__x,0x7fff)); } +static simd_short16 SIMD_CFUNC simd_short_sat(simd_ushort16 __x) { return simd_short(simd_min(__x,0x7fff)); } +static simd_short32 SIMD_CFUNC simd_short_sat(simd_ushort32 __x) { return simd_short(simd_min(__x,0x7fff)); } +static simd_short2 SIMD_CFUNC simd_short_sat(simd_uint2 __x) { return simd_short(simd_min(__x,0x7fff)); } +static simd_short3 SIMD_CFUNC simd_short_sat(simd_uint3 __x) { return simd_short(simd_min(__x,0x7fff)); } +static simd_short4 SIMD_CFUNC simd_short_sat(simd_uint4 __x) { return simd_short(simd_min(__x,0x7fff)); } +static simd_short8 SIMD_CFUNC simd_short_sat(simd_uint8 __x) { return simd_short(simd_min(__x,0x7fff)); } +static simd_short16 SIMD_CFUNC simd_short_sat(simd_uint16 __x) { return simd_short(simd_min(__x,0x7fff)); } +static simd_short2 SIMD_CFUNC simd_short_sat(simd_ulong2 __x) { return simd_short(simd_min(__x,0x7fff)); } +static simd_short3 SIMD_CFUNC simd_short_sat(simd_ulong3 __x) { return simd_short(simd_min(__x,0x7fff)); } +static simd_short4 SIMD_CFUNC simd_short_sat(simd_ulong4 __x) { return simd_short(simd_min(__x,0x7fff)); } +static simd_short8 
SIMD_CFUNC simd_short_sat(simd_ulong8 __x) { return simd_short(simd_min(__x,0x7fff)); } + + +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_char2 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_char3 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_char4 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_char8 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort16 SIMD_CFUNC simd_ushort(simd_char16 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort32 SIMD_CFUNC simd_ushort(simd_char32 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_uchar2 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_uchar3 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_uchar4 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_uchar8 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort16 SIMD_CFUNC simd_ushort(simd_uchar16 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort32 SIMD_CFUNC simd_ushort(simd_uchar32 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_short2 __x) { return (simd_ushort2)__x; } +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_short3 __x) { return (simd_ushort3)__x; } +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_short4 __x) { return (simd_ushort4)__x; } +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_short8 __x) { return (simd_ushort8)__x; } +static simd_ushort16 SIMD_CFUNC simd_ushort(simd_short16 __x) { return (simd_ushort16)__x; } +static simd_ushort32 SIMD_CFUNC simd_ushort(simd_short32 __x) { return (simd_ushort32)__x; } +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_ushort2 __x) { return __x; } +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_ushort3 __x) { return __x; } +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_ushort4 __x) { return __x; } +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_ushort8 __x) { return __x; } +static simd_ushort16 SIMD_CFUNC simd_ushort(simd_ushort16 __x) { return __x; } +static simd_ushort32 SIMD_CFUNC simd_ushort(simd_ushort32 __x) { return __x; } +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_int2 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_int3 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_int4 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_int8 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort16 SIMD_CFUNC simd_ushort(simd_int16 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_uint2 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_uint3 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_uint4 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_uint8 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort16 SIMD_CFUNC simd_ushort(simd_uint16 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_float2 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_float3 __x) { return simd_ushort(simd_short(__x)); } 
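+/* Annotation, not in the upstream header: the unsaturated simd_ushort
+ * narrows modulo 2^16 (it routes through simd_short, which masks with
+ * 0xffff), e.g.
+ *
+ *   simd_ushort((simd_int2){ 70000, -1 })  ==  (simd_ushort2){ 4464, 0xffff }
+ */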
+static simd_ushort4 SIMD_CFUNC simd_ushort(simd_float4 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_float8 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort16 SIMD_CFUNC simd_ushort(simd_float16 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_long2 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_long3 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_long4 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_long8 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_ulong2 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_ulong3 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_ulong4 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_ulong8 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort2 SIMD_CFUNC simd_ushort(simd_double2 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort3 SIMD_CFUNC simd_ushort(simd_double3 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort4 SIMD_CFUNC simd_ushort(simd_double4 __x) { return simd_ushort(simd_short(__x)); } +static simd_ushort8 SIMD_CFUNC simd_ushort(simd_double8 __x) { return simd_ushort(simd_short(__x)); } + +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_char2 __x) { return simd_ushort(simd_max(__x, 0)); } +static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_char3 __x) { return simd_ushort(simd_max(__x, 0)); } +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_char4 __x) { return simd_ushort(simd_max(__x, 0)); } +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_char8 __x) { return simd_ushort(simd_max(__x, 0)); } +static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_char16 __x) { return simd_ushort(simd_max(__x, 0)); } +static simd_ushort32 SIMD_CFUNC simd_ushort_sat(simd_char32 __x) { return simd_ushort(simd_max(__x, 0)); } +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_short2 __x) { return simd_ushort(simd_max(__x, 0)); } +static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_short3 __x) { return simd_ushort(simd_max(__x, 0)); } +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_short4 __x) { return simd_ushort(simd_max(__x, 0)); } +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_short8 __x) { return simd_ushort(simd_max(__x, 0)); } +static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_short16 __x) { return simd_ushort(simd_max(__x, 0)); } +static simd_ushort32 SIMD_CFUNC simd_ushort_sat(simd_short32 __x) { return simd_ushort(simd_max(__x, 0)); } +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_int2 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); } +static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_int3 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); } +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_int4 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); } +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_int8 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); } +static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_int16 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); } +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_float2 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); } +static simd_ushort3 SIMD_CFUNC 
simd_ushort_sat(simd_float3 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); } +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_float4 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); } +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_float8 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); } +static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_float16 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); } +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_long2 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); } +static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_long3 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); } +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_long4 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); } +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_long8 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); } +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_double2 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); } +static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_double3 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); } +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_double4 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); } +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_double8 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); } +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_uchar2 __x) { return simd_ushort(__x); } +static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_uchar3 __x) { return simd_ushort(__x); } +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_uchar4 __x) { return simd_ushort(__x); } +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_uchar8 __x) { return simd_ushort(__x); } +static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_uchar16 __x) { return simd_ushort(__x); } +static simd_ushort32 SIMD_CFUNC simd_ushort_sat(simd_uchar32 __x) { return simd_ushort(__x); } +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_ushort2 __x) { return __x; } +static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_ushort3 __x) { return __x; } +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_ushort4 __x) { return __x; } +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_ushort8 __x) { return __x; } +static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_ushort16 __x) { return __x; } +static simd_ushort32 SIMD_CFUNC simd_ushort_sat(simd_ushort32 __x) { return __x; } +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_uint2 __x) { return simd_ushort(simd_min(__x, 0xffff)); } +static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_uint3 __x) { return simd_ushort(simd_min(__x, 0xffff)); } +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_uint4 __x) { return simd_ushort(simd_min(__x, 0xffff)); } +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_uint8 __x) { return simd_ushort(simd_min(__x, 0xffff)); } +static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_uint16 __x) { return simd_ushort(simd_min(__x, 0xffff)); } +static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_ulong2 __x) { return simd_ushort(simd_min(__x, 0xffff)); } +static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_ulong3 __x) { return simd_ushort(simd_min(__x, 0xffff)); } +static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_ulong4 __x) { return simd_ushort(simd_min(__x, 0xffff)); } +static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_ulong8 __x) { return simd_ushort(simd_min(__x, 0xffff)); } + + +static simd_int2 SIMD_CFUNC simd_int(simd_char2 __x) { return __builtin_convertvector(__x, simd_int2); } 
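+/* Annotation, not in the upstream header: widening integer conversions like
+ * this one preserve values (sign- or zero-extending as appropriate), and the
+ * float overloads below truncate toward zero; round-to-nearest-even is the
+ * separate simd_int_rte family. For example:
+ *
+ *   simd_int((simd_char2){ -1, 2 })         ==  (simd_int2){ -1, 2 }
+ *   simd_int((simd_float2){ 2.7f, -2.7f })  ==  (simd_int2){ 2, -2 }
+ */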
+static simd_int3 SIMD_CFUNC simd_int(simd_char3 __x) { return __builtin_convertvector(__x, simd_int3); } +static simd_int4 SIMD_CFUNC simd_int(simd_char4 __x) { return __builtin_convertvector(__x, simd_int4); } +static simd_int8 SIMD_CFUNC simd_int(simd_char8 __x) { return __builtin_convertvector(__x, simd_int8); } +static simd_int16 SIMD_CFUNC simd_int(simd_char16 __x) { return __builtin_convertvector(__x, simd_int16); } +static simd_int2 SIMD_CFUNC simd_int(simd_uchar2 __x) { return __builtin_convertvector(__x, simd_int2); } +static simd_int3 SIMD_CFUNC simd_int(simd_uchar3 __x) { return __builtin_convertvector(__x, simd_int3); } +static simd_int4 SIMD_CFUNC simd_int(simd_uchar4 __x) { return __builtin_convertvector(__x, simd_int4); } +static simd_int8 SIMD_CFUNC simd_int(simd_uchar8 __x) { return __builtin_convertvector(__x, simd_int8); } +static simd_int16 SIMD_CFUNC simd_int(simd_uchar16 __x) { return __builtin_convertvector(__x, simd_int16); } +static simd_int2 SIMD_CFUNC simd_int(simd_short2 __x) { return __builtin_convertvector(__x, simd_int2); } +static simd_int3 SIMD_CFUNC simd_int(simd_short3 __x) { return __builtin_convertvector(__x, simd_int3); } +static simd_int4 SIMD_CFUNC simd_int(simd_short4 __x) { return __builtin_convertvector(__x, simd_int4); } +static simd_int8 SIMD_CFUNC simd_int(simd_short8 __x) { return __builtin_convertvector(__x, simd_int8); } +static simd_int16 SIMD_CFUNC simd_int(simd_short16 __x) { return __builtin_convertvector(__x, simd_int16); } +static simd_int2 SIMD_CFUNC simd_int(simd_ushort2 __x) { return __builtin_convertvector(__x, simd_int2); } +static simd_int3 SIMD_CFUNC simd_int(simd_ushort3 __x) { return __builtin_convertvector(__x, simd_int3); } +static simd_int4 SIMD_CFUNC simd_int(simd_ushort4 __x) { return __builtin_convertvector(__x, simd_int4); } +static simd_int8 SIMD_CFUNC simd_int(simd_ushort8 __x) { return __builtin_convertvector(__x, simd_int8); } +static simd_int16 SIMD_CFUNC simd_int(simd_ushort16 __x) { return __builtin_convertvector(__x, simd_int16); } +static simd_int2 SIMD_CFUNC simd_int(simd_int2 __x) { return __x; } +static simd_int3 SIMD_CFUNC simd_int(simd_int3 __x) { return __x; } +static simd_int4 SIMD_CFUNC simd_int(simd_int4 __x) { return __x; } +static simd_int8 SIMD_CFUNC simd_int(simd_int8 __x) { return __x; } +static simd_int16 SIMD_CFUNC simd_int(simd_int16 __x) { return __x; } +static simd_int2 SIMD_CFUNC simd_int(simd_uint2 __x) { return (simd_int2)__x; } +static simd_int3 SIMD_CFUNC simd_int(simd_uint3 __x) { return (simd_int3)__x; } +static simd_int4 SIMD_CFUNC simd_int(simd_uint4 __x) { return (simd_int4)__x; } +static simd_int8 SIMD_CFUNC simd_int(simd_uint8 __x) { return (simd_int8)__x; } +static simd_int16 SIMD_CFUNC simd_int(simd_uint16 __x) { return (simd_int16)__x; } +static simd_int2 SIMD_CFUNC simd_int(simd_float2 __x) { return __builtin_convertvector(__x, simd_int2); } +static simd_int3 SIMD_CFUNC simd_int(simd_float3 __x) { return __builtin_convertvector(__x, simd_int3); } +static simd_int4 SIMD_CFUNC simd_int(simd_float4 __x) { return __builtin_convertvector(__x, simd_int4); } +static simd_int8 SIMD_CFUNC simd_int(simd_float8 __x) { return __builtin_convertvector(__x, simd_int8); } +static simd_int16 SIMD_CFUNC simd_int(simd_float16 __x) { return __builtin_convertvector(__x, simd_int16); } +static simd_int2 SIMD_CFUNC simd_int(simd_long2 __x) { return __builtin_convertvector(__x & 0xffffffff, simd_int2); } +static simd_int3 SIMD_CFUNC simd_int(simd_long3 __x) { return __builtin_convertvector(__x & 
0xffffffff, simd_int3); } +static simd_int4 SIMD_CFUNC simd_int(simd_long4 __x) { return __builtin_convertvector(__x & 0xffffffff, simd_int4); } +static simd_int8 SIMD_CFUNC simd_int(simd_long8 __x) { return __builtin_convertvector(__x & 0xffffffff, simd_int8); } +static simd_int2 SIMD_CFUNC simd_int(simd_ulong2 __x) { return simd_int(simd_long(__x)); } +static simd_int3 SIMD_CFUNC simd_int(simd_ulong3 __x) { return simd_int(simd_long(__x)); } +static simd_int4 SIMD_CFUNC simd_int(simd_ulong4 __x) { return simd_int(simd_long(__x)); } +static simd_int8 SIMD_CFUNC simd_int(simd_ulong8 __x) { return simd_int(simd_long(__x)); } +static simd_int2 SIMD_CFUNC simd_int(simd_double2 __x) { return __builtin_convertvector(__x, simd_int2); } +static simd_int3 SIMD_CFUNC simd_int(simd_double3 __x) { return __builtin_convertvector(__x, simd_int3); } +static simd_int4 SIMD_CFUNC simd_int(simd_double4 __x) { return __builtin_convertvector(__x, simd_int4); } +static simd_int8 SIMD_CFUNC simd_int(simd_double8 __x) { return __builtin_convertvector(__x, simd_int8); } + +static simd_int2 SIMD_CFUNC simd_int_sat(simd_char2 __x) { return simd_int(__x); } +static simd_int3 SIMD_CFUNC simd_int_sat(simd_char3 __x) { return simd_int(__x); } +static simd_int4 SIMD_CFUNC simd_int_sat(simd_char4 __x) { return simd_int(__x); } +static simd_int8 SIMD_CFUNC simd_int_sat(simd_char8 __x) { return simd_int(__x); } +static simd_int16 SIMD_CFUNC simd_int_sat(simd_char16 __x) { return simd_int(__x); } +static simd_int2 SIMD_CFUNC simd_int_sat(simd_short2 __x) { return simd_int(__x); } +static simd_int3 SIMD_CFUNC simd_int_sat(simd_short3 __x) { return simd_int(__x); } +static simd_int4 SIMD_CFUNC simd_int_sat(simd_short4 __x) { return simd_int(__x); } +static simd_int8 SIMD_CFUNC simd_int_sat(simd_short8 __x) { return simd_int(__x); } +static simd_int16 SIMD_CFUNC simd_int_sat(simd_short16 __x) { return simd_int(__x); } +static simd_int2 SIMD_CFUNC simd_int_sat(simd_int2 __x) { return __x; } +static simd_int3 SIMD_CFUNC simd_int_sat(simd_int3 __x) { return __x; } +static simd_int4 SIMD_CFUNC simd_int_sat(simd_int4 __x) { return __x; } +static simd_int8 SIMD_CFUNC simd_int_sat(simd_int8 __x) { return __x; } +static simd_int16 SIMD_CFUNC simd_int_sat(simd_int16 __x) { return __x; } +static simd_int2 SIMD_CFUNC simd_int_sat(simd_float2 __x) { return simd_bitselect(simd_int(simd_max(__x,-0x1.0p31f)), 0x7fffffff, __x >= 0x1.0p31f); } +static simd_int3 SIMD_CFUNC simd_int_sat(simd_float3 __x) { return simd_bitselect(simd_int(simd_max(__x,-0x1.0p31f)), 0x7fffffff, __x >= 0x1.0p31f); } +static simd_int4 SIMD_CFUNC simd_int_sat(simd_float4 __x) { return simd_bitselect(simd_int(simd_max(__x,-0x1.0p31f)), 0x7fffffff, __x >= 0x1.0p31f); } +static simd_int8 SIMD_CFUNC simd_int_sat(simd_float8 __x) { return simd_bitselect(simd_int(simd_max(__x,-0x1.0p31f)), 0x7fffffff, __x >= 0x1.0p31f); } +static simd_int16 SIMD_CFUNC simd_int_sat(simd_float16 __x) { return simd_bitselect(simd_int(simd_max(__x,-0x1.0p31f)), 0x7fffffff, __x >= 0x1.0p31f); } +static simd_int2 SIMD_CFUNC simd_int_sat(simd_long2 __x) { return simd_int(simd_clamp(__x,-0x80000000LL,0x7fffffffLL)); } +static simd_int3 SIMD_CFUNC simd_int_sat(simd_long3 __x) { return simd_int(simd_clamp(__x,-0x80000000LL,0x7fffffffLL)); } +static simd_int4 SIMD_CFUNC simd_int_sat(simd_long4 __x) { return simd_int(simd_clamp(__x,-0x80000000LL,0x7fffffffLL)); } +static simd_int8 SIMD_CFUNC simd_int_sat(simd_long8 __x) { return simd_int(simd_clamp(__x,-0x80000000LL,0x7fffffffLL)); } +static simd_int2 
SIMD_CFUNC simd_int_sat(simd_double2 __x) { return simd_int(simd_clamp(__x,-0x1.0p31,0x1.fffffffcp30)); } +static simd_int3 SIMD_CFUNC simd_int_sat(simd_double3 __x) { return simd_int(simd_clamp(__x,-0x1.0p31,0x1.fffffffcp30)); } +static simd_int4 SIMD_CFUNC simd_int_sat(simd_double4 __x) { return simd_int(simd_clamp(__x,-0x1.0p31,0x1.fffffffcp30)); } +static simd_int8 SIMD_CFUNC simd_int_sat(simd_double8 __x) { return simd_int(simd_clamp(__x,-0x1.0p31,0x1.fffffffcp30)); } +static simd_int2 SIMD_CFUNC simd_int_sat(simd_uchar2 __x) { return simd_int(__x); } +static simd_int3 SIMD_CFUNC simd_int_sat(simd_uchar3 __x) { return simd_int(__x); } +static simd_int4 SIMD_CFUNC simd_int_sat(simd_uchar4 __x) { return simd_int(__x); } +static simd_int8 SIMD_CFUNC simd_int_sat(simd_uchar8 __x) { return simd_int(__x); } +static simd_int16 SIMD_CFUNC simd_int_sat(simd_uchar16 __x) { return simd_int(__x); } +static simd_int2 SIMD_CFUNC simd_int_sat(simd_ushort2 __x) { return simd_int(__x); } +static simd_int3 SIMD_CFUNC simd_int_sat(simd_ushort3 __x) { return simd_int(__x); } +static simd_int4 SIMD_CFUNC simd_int_sat(simd_ushort4 __x) { return simd_int(__x); } +static simd_int8 SIMD_CFUNC simd_int_sat(simd_ushort8 __x) { return simd_int(__x); } +static simd_int16 SIMD_CFUNC simd_int_sat(simd_ushort16 __x) { return simd_int(__x); } +static simd_int2 SIMD_CFUNC simd_int_sat(simd_uint2 __x) { return simd_int(simd_min(__x,0x7fffffff)); } +static simd_int3 SIMD_CFUNC simd_int_sat(simd_uint3 __x) { return simd_int(simd_min(__x,0x7fffffff)); } +static simd_int4 SIMD_CFUNC simd_int_sat(simd_uint4 __x) { return simd_int(simd_min(__x,0x7fffffff)); } +static simd_int8 SIMD_CFUNC simd_int_sat(simd_uint8 __x) { return simd_int(simd_min(__x,0x7fffffff)); } +static simd_int16 SIMD_CFUNC simd_int_sat(simd_uint16 __x) { return simd_int(simd_min(__x,0x7fffffff)); } +static simd_int2 SIMD_CFUNC simd_int_sat(simd_ulong2 __x) { return simd_int(simd_min(__x,0x7fffffff)); } +static simd_int3 SIMD_CFUNC simd_int_sat(simd_ulong3 __x) { return simd_int(simd_min(__x,0x7fffffff)); } +static simd_int4 SIMD_CFUNC simd_int_sat(simd_ulong4 __x) { return simd_int(simd_min(__x,0x7fffffff)); } +static simd_int8 SIMD_CFUNC simd_int_sat(simd_ulong8 __x) { return simd_int(simd_min(__x,0x7fffffff)); } + +static simd_int2 SIMD_CFUNC simd_int_rte(simd_float2 __x) { +#if defined __arm64__ + return vcvtn_s32_f32(__x); +#else + return simd_make_int2(simd_int_rte(simd_make_float4_undef(__x))); +#endif +} + +static simd_int3 SIMD_CFUNC simd_int_rte(simd_float3 __x) { + return simd_make_int3(simd_int_rte(simd_make_float4_undef(__x))); +} + +static simd_int4 SIMD_CFUNC simd_int_rte(simd_float4 __x) { +#if defined __SSE2__ + return _mm_cvtps_epi32(__x); +#elif defined __arm64__ + return vcvtnq_s32_f32(__x); +#else + simd_float4 magic = __tg_copysign(0x1.0p23, __x); + simd_int4 x_is_small = __tg_fabs(__x) < 0x1.0p23; + return __builtin_convertvector(simd_bitselect(__x, (__x + magic) - magic, x_is_small & 0x7fffffff), simd_int4); +#endif +} + +static simd_int8 SIMD_CFUNC simd_int_rte(simd_float8 __x) { +#if defined __AVX__ + return _mm256_cvtps_epi32(__x); +#else + return simd_make_int8(simd_int_rte(__x.lo), simd_int_rte(__x.hi)); +#endif +} + +static simd_int16 SIMD_CFUNC simd_int_rte(simd_float16 __x) { +#if defined __AVX512F__ + return _mm512_cvt_roundps_epi32(__x, _MM_FROUND_RINT); +#else + return simd_make_int16(simd_int_rte(__x.lo), simd_int_rte(__x.hi)); +#endif +} + +static simd_uint2 SIMD_CFUNC simd_uint(simd_char2 __x) { return 
simd_uint(simd_int(__x)); } +static simd_uint3 SIMD_CFUNC simd_uint(simd_char3 __x) { return simd_uint(simd_int(__x)); } +static simd_uint4 SIMD_CFUNC simd_uint(simd_char4 __x) { return simd_uint(simd_int(__x)); } +static simd_uint8 SIMD_CFUNC simd_uint(simd_char8 __x) { return simd_uint(simd_int(__x)); } +static simd_uint16 SIMD_CFUNC simd_uint(simd_char16 __x) { return simd_uint(simd_int(__x)); } +static simd_uint2 SIMD_CFUNC simd_uint(simd_uchar2 __x) { return simd_uint(simd_int(__x)); } +static simd_uint3 SIMD_CFUNC simd_uint(simd_uchar3 __x) { return simd_uint(simd_int(__x)); } +static simd_uint4 SIMD_CFUNC simd_uint(simd_uchar4 __x) { return simd_uint(simd_int(__x)); } +static simd_uint8 SIMD_CFUNC simd_uint(simd_uchar8 __x) { return simd_uint(simd_int(__x)); } +static simd_uint16 SIMD_CFUNC simd_uint(simd_uchar16 __x) { return simd_uint(simd_int(__x)); } +static simd_uint2 SIMD_CFUNC simd_uint(simd_short2 __x) { return simd_uint(simd_int(__x)); } +static simd_uint3 SIMD_CFUNC simd_uint(simd_short3 __x) { return simd_uint(simd_int(__x)); } +static simd_uint4 SIMD_CFUNC simd_uint(simd_short4 __x) { return simd_uint(simd_int(__x)); } +static simd_uint8 SIMD_CFUNC simd_uint(simd_short8 __x) { return simd_uint(simd_int(__x)); } +static simd_uint16 SIMD_CFUNC simd_uint(simd_short16 __x) { return simd_uint(simd_int(__x)); } +static simd_uint2 SIMD_CFUNC simd_uint(simd_ushort2 __x) { return simd_uint(simd_int(__x)); } +static simd_uint3 SIMD_CFUNC simd_uint(simd_ushort3 __x) { return simd_uint(simd_int(__x)); } +static simd_uint4 SIMD_CFUNC simd_uint(simd_ushort4 __x) { return simd_uint(simd_int(__x)); } +static simd_uint8 SIMD_CFUNC simd_uint(simd_ushort8 __x) { return simd_uint(simd_int(__x)); } +static simd_uint16 SIMD_CFUNC simd_uint(simd_ushort16 __x) { return simd_uint(simd_int(__x)); } +static simd_uint2 SIMD_CFUNC simd_uint(simd_int2 __x) { return (simd_uint2)__x; } +static simd_uint3 SIMD_CFUNC simd_uint(simd_int3 __x) { return (simd_uint3)__x; } +static simd_uint4 SIMD_CFUNC simd_uint(simd_int4 __x) { return (simd_uint4)__x; } +static simd_uint8 SIMD_CFUNC simd_uint(simd_int8 __x) { return (simd_uint8)__x; } +static simd_uint16 SIMD_CFUNC simd_uint(simd_int16 __x) { return (simd_uint16)__x; } +static simd_uint2 SIMD_CFUNC simd_uint(simd_uint2 __x) { return __x; } +static simd_uint3 SIMD_CFUNC simd_uint(simd_uint3 __x) { return __x; } +static simd_uint4 SIMD_CFUNC simd_uint(simd_uint4 __x) { return __x; } +static simd_uint8 SIMD_CFUNC simd_uint(simd_uint8 __x) { return __x; } +static simd_uint16 SIMD_CFUNC simd_uint(simd_uint16 __x) { return __x; } +static simd_uint2 SIMD_CFUNC simd_uint(simd_float2 __x) { simd_int2 __big = __x > 0x1.0p31f; return simd_uint(simd_int(__x - simd_bitselect((simd_float2)0,0x1.0p31f,__big))) + simd_bitselect((simd_uint2)0,0x80000000,__big); } +static simd_uint3 SIMD_CFUNC simd_uint(simd_float3 __x) { simd_int3 __big = __x > 0x1.0p31f; return simd_uint(simd_int(__x - simd_bitselect((simd_float3)0,0x1.0p31f,__big))) + simd_bitselect((simd_uint3)0,0x80000000,__big); } +static simd_uint4 SIMD_CFUNC simd_uint(simd_float4 __x) { simd_int4 __big = __x > 0x1.0p31f; return simd_uint(simd_int(__x - simd_bitselect((simd_float4)0,0x1.0p31f,__big))) + simd_bitselect((simd_uint4)0,0x80000000,__big); } +static simd_uint8 SIMD_CFUNC simd_uint(simd_float8 __x) { simd_int8 __big = __x > 0x1.0p31f; return simd_uint(simd_int(__x - simd_bitselect((simd_float8)0,0x1.0p31f,__big))) + simd_bitselect((simd_uint8)0,0x80000000,__big); } +static simd_uint16 SIMD_CFUNC 
simd_uint(simd_float16 __x) { simd_int16 __big = __x > 0x1.0p31f; return simd_uint(simd_int(__x - simd_bitselect((simd_float16)0,0x1.0p31f,__big))) + simd_bitselect((simd_uint16)0,0x80000000,__big); } +static simd_uint2 SIMD_CFUNC simd_uint(simd_long2 __x) { return simd_uint(simd_int(__x)); } +static simd_uint3 SIMD_CFUNC simd_uint(simd_long3 __x) { return simd_uint(simd_int(__x)); } +static simd_uint4 SIMD_CFUNC simd_uint(simd_long4 __x) { return simd_uint(simd_int(__x)); } +static simd_uint8 SIMD_CFUNC simd_uint(simd_long8 __x) { return simd_uint(simd_int(__x)); } +static simd_uint2 SIMD_CFUNC simd_uint(simd_ulong2 __x) { return simd_uint(simd_int(__x)); } +static simd_uint3 SIMD_CFUNC simd_uint(simd_ulong3 __x) { return simd_uint(simd_int(__x)); } +static simd_uint4 SIMD_CFUNC simd_uint(simd_ulong4 __x) { return simd_uint(simd_int(__x)); } +static simd_uint8 SIMD_CFUNC simd_uint(simd_ulong8 __x) { return simd_uint(simd_int(__x)); } +static simd_uint2 SIMD_CFUNC simd_uint(simd_double2 __x) { simd_long2 __big = __x > 0x1.fffffffcp30; return simd_uint(simd_int(__x - simd_bitselect((simd_double2)0,0x1.0p31,__big))) + simd_bitselect((simd_uint2)0,0x80000000,simd_int(__big)); } +static simd_uint3 SIMD_CFUNC simd_uint(simd_double3 __x) { simd_long3 __big = __x > 0x1.fffffffcp30; return simd_uint(simd_int(__x - simd_bitselect((simd_double3)0,0x1.0p31,__big))) + simd_bitselect((simd_uint3)0,0x80000000,simd_int(__big)); } +static simd_uint4 SIMD_CFUNC simd_uint(simd_double4 __x) { simd_long4 __big = __x > 0x1.fffffffcp30; return simd_uint(simd_int(__x - simd_bitselect((simd_double4)0,0x1.0p31,__big))) + simd_bitselect((simd_uint4)0,0x80000000,simd_int(__big)); } +static simd_uint8 SIMD_CFUNC simd_uint(simd_double8 __x) { simd_long8 __big = __x > 0x1.fffffffcp30; return simd_uint(simd_int(__x - simd_bitselect((simd_double8)0,0x1.0p31,__big))) + simd_bitselect((simd_uint8)0,0x80000000,simd_int(__big)); } + +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_char2 __x) { return simd_uint(simd_max(__x,0)); } +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_char3 __x) { return simd_uint(simd_max(__x,0)); } +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_char4 __x) { return simd_uint(simd_max(__x,0)); } +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_char8 __x) { return simd_uint(simd_max(__x,0)); } +static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_char16 __x) { return simd_uint(simd_max(__x,0)); } +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_short2 __x) { return simd_uint(simd_max(__x,0)); } +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_short3 __x) { return simd_uint(simd_max(__x,0)); } +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_short4 __x) { return simd_uint(simd_max(__x,0)); } +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_short8 __x) { return simd_uint(simd_max(__x,0)); } +static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_short16 __x) { return simd_uint(simd_max(__x,0)); } +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_int2 __x) { return simd_uint(simd_max(__x,0)); } +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_int3 __x) { return simd_uint(simd_max(__x,0)); } +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_int4 __x) { return simd_uint(simd_max(__x,0)); } +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_int8 __x) { return simd_uint(simd_max(__x,0)); } +static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_int16 __x) { return simd_uint(simd_max(__x,0)); } +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_float2 __x) { return simd_bitselect(simd_uint(simd_max(__x,0)), 0xffffffff, __x >= 
0x1.0p32f); } +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_float3 __x) { return simd_bitselect(simd_uint(simd_max(__x,0)), 0xffffffff, __x >= 0x1.0p32f); } +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_float4 __x) { return simd_bitselect(simd_uint(simd_max(__x,0)), 0xffffffff, __x >= 0x1.0p32f); } +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_float8 __x) { return simd_bitselect(simd_uint(simd_max(__x,0)), 0xffffffff, __x >= 0x1.0p32f); } +static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_float16 __x) { return simd_bitselect(simd_uint(simd_max(__x,0)), 0xffffffff, __x >= 0x1.0p32f); } +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_long2 __x) { return simd_uint(simd_clamp(__x,0,0xffffffff)); } +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_long3 __x) { return simd_uint(simd_clamp(__x,0,0xffffffff)); } +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_long4 __x) { return simd_uint(simd_clamp(__x,0,0xffffffff)); } +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_long8 __x) { return simd_uint(simd_clamp(__x,0,0xffffffff)); } +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_double2 __x) { return simd_uint(simd_clamp(__x,0,0xffffffff)); } +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_double3 __x) { return simd_uint(simd_clamp(__x,0,0xffffffff)); } +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_double4 __x) { return simd_uint(simd_clamp(__x,0,0xffffffff)); } +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_double8 __x) { return simd_uint(simd_clamp(__x,0,0xffffffff)); } +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_uchar2 __x) { return simd_uint(__x); } +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_uchar3 __x) { return simd_uint(__x); } +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_uchar4 __x) { return simd_uint(__x); } +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_uchar8 __x) { return simd_uint(__x); } +static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_uchar16 __x) { return simd_uint(__x); } +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_ushort2 __x) { return simd_uint(__x); } +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_ushort3 __x) { return simd_uint(__x); } +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_ushort4 __x) { return simd_uint(__x); } +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_ushort8 __x) { return simd_uint(__x); } +static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_ushort16 __x) { return simd_uint(__x); } +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_uint2 __x) { return __x; } +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_uint3 __x) { return __x; } +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_uint4 __x) { return __x; } +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_uint8 __x) { return __x; } +static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_uint16 __x) { return __x; } +static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_ulong2 __x) { return simd_uint(simd_clamp(__x,0,0xffffffff)); } +static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_ulong3 __x) { return simd_uint(simd_clamp(__x,0,0xffffffff)); } +static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_ulong4 __x) { return simd_uint(simd_clamp(__x,0,0xffffffff)); } +static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_ulong8 __x) { return simd_uint(simd_clamp(__x,0,0xffffffff)); } + + +static simd_float2 SIMD_CFUNC simd_float(simd_char2 __x) { return (simd_float2)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float3 SIMD_CFUNC simd_float(simd_char3 __x) { return (simd_float3)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float4 SIMD_CFUNC simd_float(simd_char4 __x) { return 
(simd_float4)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float8 SIMD_CFUNC simd_float(simd_char8 __x) { return (simd_float8)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float16 SIMD_CFUNC simd_float(simd_char16 __x) { return (simd_float16)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float2 SIMD_CFUNC simd_float(simd_uchar2 __x) { return (simd_float2)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float3 SIMD_CFUNC simd_float(simd_uchar3 __x) { return (simd_float3)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float4 SIMD_CFUNC simd_float(simd_uchar4 __x) { return (simd_float4)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float8 SIMD_CFUNC simd_float(simd_uchar8 __x) { return (simd_float8)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float16 SIMD_CFUNC simd_float(simd_uchar16 __x) { return (simd_float16)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float2 SIMD_CFUNC simd_float(simd_short2 __x) { return (simd_float2)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float3 SIMD_CFUNC simd_float(simd_short3 __x) { return (simd_float3)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float4 SIMD_CFUNC simd_float(simd_short4 __x) { return (simd_float4)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float8 SIMD_CFUNC simd_float(simd_short8 __x) { return (simd_float8)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float16 SIMD_CFUNC simd_float(simd_short16 __x) { return (simd_float16)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float2 SIMD_CFUNC simd_float(simd_ushort2 __x) { return (simd_float2)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float3 SIMD_CFUNC simd_float(simd_ushort3 __x) { return (simd_float3)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float4 SIMD_CFUNC simd_float(simd_ushort4 __x) { return (simd_float4)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float8 SIMD_CFUNC simd_float(simd_ushort8 __x) { return (simd_float8)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float16 SIMD_CFUNC simd_float(simd_ushort16 __x) { return (simd_float16)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; } +static simd_float2 SIMD_CFUNC simd_float(simd_int2 __x) { return __builtin_convertvector(__x,simd_float2); } +static simd_float3 SIMD_CFUNC simd_float(simd_int3 __x) { return __builtin_convertvector(__x,simd_float3); } +static simd_float4 SIMD_CFUNC simd_float(simd_int4 __x) { return __builtin_convertvector(__x,simd_float4); } +static simd_float8 SIMD_CFUNC simd_float(simd_int8 __x) { return __builtin_convertvector(__x,simd_float8); } +static simd_float16 SIMD_CFUNC simd_float(simd_int16 __x) { return __builtin_convertvector(__x,simd_float16); } +static simd_float2 SIMD_CFUNC simd_float(simd_uint2 __x) { return __builtin_convertvector(__x,simd_float2); } +static simd_float3 SIMD_CFUNC simd_float(simd_uint3 __x) { return __builtin_convertvector(__x,simd_float3); } +static simd_float4 SIMD_CFUNC simd_float(simd_uint4 __x) { return __builtin_convertvector(__x,simd_float4); } +static simd_float8 SIMD_CFUNC simd_float(simd_uint8 __x) { return __builtin_convertvector(__x,simd_float8); } +static simd_float16 SIMD_CFUNC simd_float(simd_uint16 __x) { return __builtin_convertvector(__x,simd_float16); } +static simd_float2 SIMD_CFUNC simd_float(simd_float2 __x) { return __x; } +static simd_float3 SIMD_CFUNC simd_float(simd_float3 __x) { return __x; } +static simd_float4 SIMD_CFUNC simd_float(simd_float4 __x) { return __x; } +static simd_float8 
SIMD_CFUNC simd_float(simd_float8 __x) { return __x; } +static simd_float16 SIMD_CFUNC simd_float(simd_float16 __x) { return __x; } +static simd_float2 SIMD_CFUNC simd_float(simd_long2 __x) { return __builtin_convertvector(__x,simd_float2); } +static simd_float3 SIMD_CFUNC simd_float(simd_long3 __x) { return __builtin_convertvector(__x,simd_float3); } +static simd_float4 SIMD_CFUNC simd_float(simd_long4 __x) { return __builtin_convertvector(__x,simd_float4); } +static simd_float8 SIMD_CFUNC simd_float(simd_long8 __x) { return __builtin_convertvector(__x,simd_float8); } +static simd_float2 SIMD_CFUNC simd_float(simd_ulong2 __x) { return __builtin_convertvector(__x,simd_float2); } +static simd_float3 SIMD_CFUNC simd_float(simd_ulong3 __x) { return __builtin_convertvector(__x,simd_float3); } +static simd_float4 SIMD_CFUNC simd_float(simd_ulong4 __x) { return __builtin_convertvector(__x,simd_float4); } +static simd_float8 SIMD_CFUNC simd_float(simd_ulong8 __x) { return __builtin_convertvector(__x,simd_float8); } +static simd_float2 SIMD_CFUNC simd_float(simd_double2 __x) { return __builtin_convertvector(__x,simd_float2); } +static simd_float3 SIMD_CFUNC simd_float(simd_double3 __x) { return __builtin_convertvector(__x,simd_float3); } +static simd_float4 SIMD_CFUNC simd_float(simd_double4 __x) { return __builtin_convertvector(__x,simd_float4); } +static simd_float8 SIMD_CFUNC simd_float(simd_double8 __x) { return __builtin_convertvector(__x,simd_float8); } + + +static simd_long2 SIMD_CFUNC simd_long(simd_char2 __x) { return __builtin_convertvector(__x,simd_long2); } +static simd_long3 SIMD_CFUNC simd_long(simd_char3 __x) { return __builtin_convertvector(__x,simd_long3); } +static simd_long4 SIMD_CFUNC simd_long(simd_char4 __x) { return __builtin_convertvector(__x,simd_long4); } +static simd_long8 SIMD_CFUNC simd_long(simd_char8 __x) { return __builtin_convertvector(__x,simd_long8); } +static simd_long2 SIMD_CFUNC simd_long(simd_uchar2 __x) { return __builtin_convertvector(__x,simd_long2); } +static simd_long3 SIMD_CFUNC simd_long(simd_uchar3 __x) { return __builtin_convertvector(__x,simd_long3); } +static simd_long4 SIMD_CFUNC simd_long(simd_uchar4 __x) { return __builtin_convertvector(__x,simd_long4); } +static simd_long8 SIMD_CFUNC simd_long(simd_uchar8 __x) { return __builtin_convertvector(__x,simd_long8); } +static simd_long2 SIMD_CFUNC simd_long(simd_short2 __x) { return __builtin_convertvector(__x,simd_long2); } +static simd_long3 SIMD_CFUNC simd_long(simd_short3 __x) { return __builtin_convertvector(__x,simd_long3); } +static simd_long4 SIMD_CFUNC simd_long(simd_short4 __x) { return __builtin_convertvector(__x,simd_long4); } +static simd_long8 SIMD_CFUNC simd_long(simd_short8 __x) { return __builtin_convertvector(__x,simd_long8); } +static simd_long2 SIMD_CFUNC simd_long(simd_ushort2 __x) { return __builtin_convertvector(__x,simd_long2); } +static simd_long3 SIMD_CFUNC simd_long(simd_ushort3 __x) { return __builtin_convertvector(__x,simd_long3); } +static simd_long4 SIMD_CFUNC simd_long(simd_ushort4 __x) { return __builtin_convertvector(__x,simd_long4); } +static simd_long8 SIMD_CFUNC simd_long(simd_ushort8 __x) { return __builtin_convertvector(__x,simd_long8); } +static simd_long2 SIMD_CFUNC simd_long(simd_int2 __x) { return __builtin_convertvector(__x,simd_long2); } +static simd_long3 SIMD_CFUNC simd_long(simd_int3 __x) { return __builtin_convertvector(__x,simd_long3); } +static simd_long4 SIMD_CFUNC simd_long(simd_int4 __x) { return __builtin_convertvector(__x,simd_long4); } +static 
simd_long8 SIMD_CFUNC simd_long(simd_int8 __x) { return __builtin_convertvector(__x,simd_long8); } +static simd_long2 SIMD_CFUNC simd_long(simd_uint2 __x) { return __builtin_convertvector(__x,simd_long2); } +static simd_long3 SIMD_CFUNC simd_long(simd_uint3 __x) { return __builtin_convertvector(__x,simd_long3); } +static simd_long4 SIMD_CFUNC simd_long(simd_uint4 __x) { return __builtin_convertvector(__x,simd_long4); } +static simd_long8 SIMD_CFUNC simd_long(simd_uint8 __x) { return __builtin_convertvector(__x,simd_long8); } +static simd_long2 SIMD_CFUNC simd_long(simd_float2 __x) { return __builtin_convertvector(__x,simd_long2); } +static simd_long3 SIMD_CFUNC simd_long(simd_float3 __x) { return __builtin_convertvector(__x,simd_long3); } +static simd_long4 SIMD_CFUNC simd_long(simd_float4 __x) { return __builtin_convertvector(__x,simd_long4); } +static simd_long8 SIMD_CFUNC simd_long(simd_float8 __x) { return __builtin_convertvector(__x,simd_long8); } +static simd_long2 SIMD_CFUNC simd_long(simd_long2 __x) { return __x; } +static simd_long3 SIMD_CFUNC simd_long(simd_long3 __x) { return __x; } +static simd_long4 SIMD_CFUNC simd_long(simd_long4 __x) { return __x; } +static simd_long8 SIMD_CFUNC simd_long(simd_long8 __x) { return __x; } +static simd_long2 SIMD_CFUNC simd_long(simd_ulong2 __x) { return (simd_long2)__x; } +static simd_long3 SIMD_CFUNC simd_long(simd_ulong3 __x) { return (simd_long3)__x; } +static simd_long4 SIMD_CFUNC simd_long(simd_ulong4 __x) { return (simd_long4)__x; } +static simd_long8 SIMD_CFUNC simd_long(simd_ulong8 __x) { return (simd_long8)__x; } +static simd_long2 SIMD_CFUNC simd_long(simd_double2 __x) { return __builtin_convertvector(__x,simd_long2); } +static simd_long3 SIMD_CFUNC simd_long(simd_double3 __x) { return __builtin_convertvector(__x,simd_long3); } +static simd_long4 SIMD_CFUNC simd_long(simd_double4 __x) { return __builtin_convertvector(__x,simd_long4); } +static simd_long8 SIMD_CFUNC simd_long(simd_double8 __x) { return __builtin_convertvector(__x,simd_long8); } + +static simd_long2 SIMD_CFUNC simd_long_sat(simd_char2 __x) { return simd_long(__x); } +static simd_long3 SIMD_CFUNC simd_long_sat(simd_char3 __x) { return simd_long(__x); } +static simd_long4 SIMD_CFUNC simd_long_sat(simd_char4 __x) { return simd_long(__x); } +static simd_long8 SIMD_CFUNC simd_long_sat(simd_char8 __x) { return simd_long(__x); } +static simd_long2 SIMD_CFUNC simd_long_sat(simd_short2 __x) { return simd_long(__x); } +static simd_long3 SIMD_CFUNC simd_long_sat(simd_short3 __x) { return simd_long(__x); } +static simd_long4 SIMD_CFUNC simd_long_sat(simd_short4 __x) { return simd_long(__x); } +static simd_long8 SIMD_CFUNC simd_long_sat(simd_short8 __x) { return simd_long(__x); } +static simd_long2 SIMD_CFUNC simd_long_sat(simd_int2 __x) { return simd_long(__x); } +static simd_long3 SIMD_CFUNC simd_long_sat(simd_int3 __x) { return simd_long(__x); } +static simd_long4 SIMD_CFUNC simd_long_sat(simd_int4 __x) { return simd_long(__x); } +static simd_long8 SIMD_CFUNC simd_long_sat(simd_int8 __x) { return simd_long(__x); } +static simd_long2 SIMD_CFUNC simd_long_sat(simd_float2 __x) { return simd_bitselect(simd_long(simd_max(__x,-0x1.0p63f)), 0x7fffffffffffffff, simd_long(__x >= 0x1.0p63f)); } +static simd_long3 SIMD_CFUNC simd_long_sat(simd_float3 __x) { return simd_bitselect(simd_long(simd_max(__x,-0x1.0p63f)), 0x7fffffffffffffff, simd_long(__x >= 0x1.0p63f)); } +static simd_long4 SIMD_CFUNC simd_long_sat(simd_float4 __x) { return simd_bitselect(simd_long(simd_max(__x,-0x1.0p63f)), 
0x7fffffffffffffff, simd_long(__x >= 0x1.0p63f)); } +static simd_long8 SIMD_CFUNC simd_long_sat(simd_float8 __x) { return simd_bitselect(simd_long(simd_max(__x,-0x1.0p63f)), 0x7fffffffffffffff, simd_long(__x >= 0x1.0p63f)); } +static simd_long2 SIMD_CFUNC simd_long_sat(simd_long2 __x) { return __x; } +static simd_long3 SIMD_CFUNC simd_long_sat(simd_long3 __x) { return __x; } +static simd_long4 SIMD_CFUNC simd_long_sat(simd_long4 __x) { return __x; } +static simd_long8 SIMD_CFUNC simd_long_sat(simd_long8 __x) { return __x; } +static simd_long2 SIMD_CFUNC simd_long_sat(simd_double2 __x) { return simd_bitselect(simd_long(simd_max(__x,-0x1.0p63)), 0x7fffffffffffffff, __x >= 0x1.0p63); } +static simd_long3 SIMD_CFUNC simd_long_sat(simd_double3 __x) { return simd_bitselect(simd_long(simd_max(__x,-0x1.0p63)), 0x7fffffffffffffff, __x >= 0x1.0p63); } +static simd_long4 SIMD_CFUNC simd_long_sat(simd_double4 __x) { return simd_bitselect(simd_long(simd_max(__x,-0x1.0p63)), 0x7fffffffffffffff, __x >= 0x1.0p63); } +static simd_long8 SIMD_CFUNC simd_long_sat(simd_double8 __x) { return simd_bitselect(simd_long(simd_max(__x,-0x1.0p63)), 0x7fffffffffffffff, __x >= 0x1.0p63); } +static simd_long2 SIMD_CFUNC simd_long_sat(simd_uchar2 __x) { return simd_long(__x); } +static simd_long3 SIMD_CFUNC simd_long_sat(simd_uchar3 __x) { return simd_long(__x); } +static simd_long4 SIMD_CFUNC simd_long_sat(simd_uchar4 __x) { return simd_long(__x); } +static simd_long8 SIMD_CFUNC simd_long_sat(simd_uchar8 __x) { return simd_long(__x); } +static simd_long2 SIMD_CFUNC simd_long_sat(simd_ushort2 __x) { return simd_long(__x); } +static simd_long3 SIMD_CFUNC simd_long_sat(simd_ushort3 __x) { return simd_long(__x); } +static simd_long4 SIMD_CFUNC simd_long_sat(simd_ushort4 __x) { return simd_long(__x); } +static simd_long8 SIMD_CFUNC simd_long_sat(simd_ushort8 __x) { return simd_long(__x); } +static simd_long2 SIMD_CFUNC simd_long_sat(simd_uint2 __x) { return simd_long(__x); } +static simd_long3 SIMD_CFUNC simd_long_sat(simd_uint3 __x) { return simd_long(__x); } +static simd_long4 SIMD_CFUNC simd_long_sat(simd_uint4 __x) { return simd_long(__x); } +static simd_long8 SIMD_CFUNC simd_long_sat(simd_uint8 __x) { return simd_long(__x); } +static simd_long2 SIMD_CFUNC simd_long_sat(simd_ulong2 __x) { return simd_long(simd_min(__x,0x7fffffffffffffff)); } +static simd_long3 SIMD_CFUNC simd_long_sat(simd_ulong3 __x) { return simd_long(simd_min(__x,0x7fffffffffffffff)); } +static simd_long4 SIMD_CFUNC simd_long_sat(simd_ulong4 __x) { return simd_long(simd_min(__x,0x7fffffffffffffff)); } +static simd_long8 SIMD_CFUNC simd_long_sat(simd_ulong8 __x) { return simd_long(simd_min(__x,0x7fffffffffffffff)); } + +static simd_long2 SIMD_CFUNC simd_long_rte(simd_double2 __x) { +#if defined __AVX512F__ + return _mm_cvtpd_epi64(__x); +#elif defined __arm64__ + return vcvtnq_s64_f64(__x); +#else + simd_double2 magic = __tg_copysign(0x1.0p52, __x); + simd_long2 x_is_small = __tg_fabs(__x) < 0x1.0p52; + return __builtin_convertvector(simd_bitselect(__x, (__x + magic) - magic, x_is_small & 0x7fffffffffffffff), simd_long2); +#endif +} + +static simd_long3 SIMD_CFUNC simd_long_rte(simd_double3 __x) { + return simd_make_long3(simd_long_rte(simd_make_double4_undef(__x))); +} + +static simd_long4 SIMD_CFUNC simd_long_rte(simd_double4 __x) { +#if defined __AVX512F__ + return _mm256_cvtpd_epi64(__x); +#else + return simd_make_long4(simd_long_rte(__x.lo), simd_long_rte(__x.hi)); +#endif +} + +static simd_long8 SIMD_CFUNC simd_long_rte(simd_double8 __x) { +#if 
defined __AVX512F__ + return _mm512_cvt_roundpd_epi64(__x, _MM_FROUND_RINT); +#else + return simd_make_long8(simd_long_rte(__x.lo), simd_long_rte(__x.hi)); +#endif +} + + +static simd_ulong2 SIMD_CFUNC simd_ulong(simd_char2 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong3 SIMD_CFUNC simd_ulong(simd_char3 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong4 SIMD_CFUNC simd_ulong(simd_char4 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_char8 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong2 SIMD_CFUNC simd_ulong(simd_uchar2 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong3 SIMD_CFUNC simd_ulong(simd_uchar3 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong4 SIMD_CFUNC simd_ulong(simd_uchar4 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_uchar8 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong2 SIMD_CFUNC simd_ulong(simd_short2 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong3 SIMD_CFUNC simd_ulong(simd_short3 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong4 SIMD_CFUNC simd_ulong(simd_short4 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_short8 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong2 SIMD_CFUNC simd_ulong(simd_ushort2 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong3 SIMD_CFUNC simd_ulong(simd_ushort3 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong4 SIMD_CFUNC simd_ulong(simd_ushort4 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_ushort8 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong2 SIMD_CFUNC simd_ulong(simd_int2 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong3 SIMD_CFUNC simd_ulong(simd_int3 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong4 SIMD_CFUNC simd_ulong(simd_int4 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_int8 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong2 SIMD_CFUNC simd_ulong(simd_uint2 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong3 SIMD_CFUNC simd_ulong(simd_uint3 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong4 SIMD_CFUNC simd_ulong(simd_uint4 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_uint8 __x) { return simd_ulong(simd_long(__x)); } +static simd_ulong2 SIMD_CFUNC simd_ulong(simd_float2 __x) { simd_int2 __big = __x >= 0x1.0p63f; return simd_ulong(simd_long(__x - simd_bitselect((simd_float2)0,0x1.0p63f,__big))) + simd_bitselect((simd_ulong2)0,0x8000000000000000,simd_long(__big)); } +static simd_ulong3 SIMD_CFUNC simd_ulong(simd_float3 __x) { simd_int3 __big = __x >= 0x1.0p63f; return simd_ulong(simd_long(__x - simd_bitselect((simd_float3)0,0x1.0p63f,__big))) + simd_bitselect((simd_ulong3)0,0x8000000000000000,simd_long(__big)); } +static simd_ulong4 SIMD_CFUNC simd_ulong(simd_float4 __x) { simd_int4 __big = __x >= 0x1.0p63f; return simd_ulong(simd_long(__x - simd_bitselect((simd_float4)0,0x1.0p63f,__big))) + simd_bitselect((simd_ulong4)0,0x8000000000000000,simd_long(__big)); } +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_float8 __x) { simd_int8 __big = __x >= 0x1.0p63f; return simd_ulong(simd_long(__x - simd_bitselect((simd_float8)0,0x1.0p63f,__big))) + simd_bitselect((simd_ulong8)0,0x8000000000000000,simd_long(__big)); } +static simd_ulong2 SIMD_CFUNC 
simd_ulong(simd_long2 __x) { return (simd_ulong2)__x; } +static simd_ulong3 SIMD_CFUNC simd_ulong(simd_long3 __x) { return (simd_ulong3)__x; } +static simd_ulong4 SIMD_CFUNC simd_ulong(simd_long4 __x) { return (simd_ulong4)__x; } +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_long8 __x) { return (simd_ulong8)__x; } +static simd_ulong2 SIMD_CFUNC simd_ulong(simd_ulong2 __x) { return __x; } +static simd_ulong3 SIMD_CFUNC simd_ulong(simd_ulong3 __x) { return __x; } +static simd_ulong4 SIMD_CFUNC simd_ulong(simd_ulong4 __x) { return __x; } +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_ulong8 __x) { return __x; } +static simd_ulong2 SIMD_CFUNC simd_ulong(simd_double2 __x) { simd_long2 __big = __x >= 0x1.0p63; return simd_ulong(simd_long(__x - simd_bitselect((simd_double2)0,0x1.0p63,__big))) + simd_bitselect((simd_ulong2)0,0x8000000000000000,__big); } +static simd_ulong3 SIMD_CFUNC simd_ulong(simd_double3 __x) { simd_long3 __big = __x >= 0x1.0p63; return simd_ulong(simd_long(__x - simd_bitselect((simd_double3)0,0x1.0p63,__big))) + simd_bitselect((simd_ulong3)0,0x8000000000000000,__big); } +static simd_ulong4 SIMD_CFUNC simd_ulong(simd_double4 __x) { simd_long4 __big = __x >= 0x1.0p63; return simd_ulong(simd_long(__x - simd_bitselect((simd_double4)0,0x1.0p63,__big))) + simd_bitselect((simd_ulong4)0,0x8000000000000000,__big); } +static simd_ulong8 SIMD_CFUNC simd_ulong(simd_double8 __x) { simd_long8 __big = __x >= 0x1.0p63; return simd_ulong(simd_long(__x - simd_bitselect((simd_double8)0,0x1.0p63,__big))) + simd_bitselect((simd_ulong8)0,0x8000000000000000,__big); } + +static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_char2 __x) { return simd_ulong(simd_max(__x,0)); } +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_char3 __x) { return simd_ulong(simd_max(__x,0)); } +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_char4 __x) { return simd_ulong(simd_max(__x,0)); } +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_char8 __x) { return simd_ulong(simd_max(__x,0)); } +static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_short2 __x) { return simd_ulong(simd_max(__x,0)); } +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_short3 __x) { return simd_ulong(simd_max(__x,0)); } +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_short4 __x) { return simd_ulong(simd_max(__x,0)); } +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_short8 __x) { return simd_ulong(simd_max(__x,0)); } +static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_int2 __x) { return simd_ulong(simd_max(__x,0)); } +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_int3 __x) { return simd_ulong(simd_max(__x,0)); } +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_int4 __x) { return simd_ulong(simd_max(__x,0)); } +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_int8 __x) { return simd_ulong(simd_max(__x,0)); } +static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_float2 __x) { return simd_bitselect(simd_ulong(simd_max(__x,0.f)), 0xffffffffffffffff, simd_long(__x >= 0x1.0p64f)); } +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_float3 __x) { return simd_bitselect(simd_ulong(simd_max(__x,0.f)), 0xffffffffffffffff, simd_long(__x >= 0x1.0p64f)); } +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_float4 __x) { return simd_bitselect(simd_ulong(simd_max(__x,0.f)), 0xffffffffffffffff, simd_long(__x >= 0x1.0p64f)); } +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_float8 __x) { return simd_bitselect(simd_ulong(simd_max(__x,0.f)), 0xffffffffffffffff, simd_long(__x >= 0x1.0p64f)); } +static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_long2 __x) { 
return simd_ulong(simd_max(__x,0)); } +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_long3 __x) { return simd_ulong(simd_max(__x,0)); } +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_long4 __x) { return simd_ulong(simd_max(__x,0)); } +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_long8 __x) { return simd_ulong(simd_max(__x,0)); } +static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_double2 __x) { return simd_bitselect(simd_ulong(simd_max(__x,0.0)), 0xffffffffffffffff, __x >= 0x1.0p64); } +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_double3 __x) { return simd_bitselect(simd_ulong(simd_max(__x,0.0)), 0xffffffffffffffff, __x >= 0x1.0p64); } +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_double4 __x) { return simd_bitselect(simd_ulong(simd_max(__x,0.0)), 0xffffffffffffffff, __x >= 0x1.0p64); } +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_double8 __x) { return simd_bitselect(simd_ulong(simd_max(__x,0.0)), 0xffffffffffffffff, __x >= 0x1.0p64); } +static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_uchar2 __x) { return simd_ulong(__x); } +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_uchar3 __x) { return simd_ulong(__x); } +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_uchar4 __x) { return simd_ulong(__x); } +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_uchar8 __x) { return simd_ulong(__x); } +static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_ushort2 __x) { return simd_ulong(__x); } +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_ushort3 __x) { return simd_ulong(__x); } +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_ushort4 __x) { return simd_ulong(__x); } +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_ushort8 __x) { return simd_ulong(__x); } +static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_uint2 __x) { return simd_ulong(__x); } +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_uint3 __x) { return simd_ulong(__x); } +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_uint4 __x) { return simd_ulong(__x); } +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_uint8 __x) { return simd_ulong(__x); } +static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_ulong2 __x) { return __x; } +static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_ulong3 __x) { return __x; } +static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_ulong4 __x) { return __x; } +static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_ulong8 __x) { return __x; } + + +static simd_double2 SIMD_CFUNC simd_double(simd_char2 __x) { return simd_double(simd_int(__x)); } +static simd_double3 SIMD_CFUNC simd_double(simd_char3 __x) { return simd_double(simd_int(__x)); } +static simd_double4 SIMD_CFUNC simd_double(simd_char4 __x) { return simd_double(simd_int(__x)); } +static simd_double8 SIMD_CFUNC simd_double(simd_char8 __x) { return simd_double(simd_int(__x)); } +static simd_double2 SIMD_CFUNC simd_double(simd_uchar2 __x) { return simd_double(simd_int(__x)); } +static simd_double3 SIMD_CFUNC simd_double(simd_uchar3 __x) { return simd_double(simd_int(__x)); } +static simd_double4 SIMD_CFUNC simd_double(simd_uchar4 __x) { return simd_double(simd_int(__x)); } +static simd_double8 SIMD_CFUNC simd_double(simd_uchar8 __x) { return simd_double(simd_int(__x)); } +static simd_double2 SIMD_CFUNC simd_double(simd_short2 __x) { return simd_double(simd_int(__x)); } +static simd_double3 SIMD_CFUNC simd_double(simd_short3 __x) { return simd_double(simd_int(__x)); } +static simd_double4 SIMD_CFUNC simd_double(simd_short4 __x) { return simd_double(simd_int(__x)); } +static simd_double8 SIMD_CFUNC simd_double(simd_short8 __x) { 
return simd_double(simd_int(__x)); } +static simd_double2 SIMD_CFUNC simd_double(simd_ushort2 __x) { return simd_double(simd_int(__x)); } +static simd_double3 SIMD_CFUNC simd_double(simd_ushort3 __x) { return simd_double(simd_int(__x)); } +static simd_double4 SIMD_CFUNC simd_double(simd_ushort4 __x) { return simd_double(simd_int(__x)); } +static simd_double8 SIMD_CFUNC simd_double(simd_ushort8 __x) { return simd_double(simd_int(__x)); } +static simd_double2 SIMD_CFUNC simd_double(simd_int2 __x) { return __builtin_convertvector(__x, simd_double2); } +static simd_double3 SIMD_CFUNC simd_double(simd_int3 __x) { return __builtin_convertvector(__x, simd_double3); } +static simd_double4 SIMD_CFUNC simd_double(simd_int4 __x) { return __builtin_convertvector(__x, simd_double4); } +static simd_double8 SIMD_CFUNC simd_double(simd_int8 __x) { return __builtin_convertvector(__x, simd_double8); } +static simd_double2 SIMD_CFUNC simd_double(simd_uint2 __x) { return __builtin_convertvector(__x, simd_double2); } +static simd_double3 SIMD_CFUNC simd_double(simd_uint3 __x) { return __builtin_convertvector(__x, simd_double3); } +static simd_double4 SIMD_CFUNC simd_double(simd_uint4 __x) { return __builtin_convertvector(__x, simd_double4); } +static simd_double8 SIMD_CFUNC simd_double(simd_uint8 __x) { return __builtin_convertvector(__x, simd_double8); } +static simd_double2 SIMD_CFUNC simd_double(simd_float2 __x) { return __builtin_convertvector(__x, simd_double2); } +static simd_double3 SIMD_CFUNC simd_double(simd_float3 __x) { return __builtin_convertvector(__x, simd_double3); } +static simd_double4 SIMD_CFUNC simd_double(simd_float4 __x) { return __builtin_convertvector(__x, simd_double4); } +static simd_double8 SIMD_CFUNC simd_double(simd_float8 __x) { return __builtin_convertvector(__x, simd_double8); } +static simd_double2 SIMD_CFUNC simd_double(simd_long2 __x) { return __builtin_convertvector(__x, simd_double2); } +static simd_double3 SIMD_CFUNC simd_double(simd_long3 __x) { return __builtin_convertvector(__x, simd_double3); } +static simd_double4 SIMD_CFUNC simd_double(simd_long4 __x) { return __builtin_convertvector(__x, simd_double4); } +static simd_double8 SIMD_CFUNC simd_double(simd_long8 __x) { return __builtin_convertvector(__x, simd_double8); } +static simd_double2 SIMD_CFUNC simd_double(simd_ulong2 __x) { return __builtin_convertvector(__x, simd_double2); } +static simd_double3 SIMD_CFUNC simd_double(simd_ulong3 __x) { return __builtin_convertvector(__x, simd_double3); } +static simd_double4 SIMD_CFUNC simd_double(simd_ulong4 __x) { return __builtin_convertvector(__x, simd_double4); } +static simd_double8 SIMD_CFUNC simd_double(simd_ulong8 __x) { return __builtin_convertvector(__x, simd_double8); } +static simd_double2 SIMD_CFUNC simd_double(simd_double2 __x) { return __builtin_convertvector(__x, simd_double2); } +static simd_double3 SIMD_CFUNC simd_double(simd_double3 __x) { return __builtin_convertvector(__x, simd_double3); } +static simd_double4 SIMD_CFUNC simd_double(simd_double4 __x) { return __builtin_convertvector(__x, simd_double4); } +static simd_double8 SIMD_CFUNC simd_double(simd_double8 __x) { return __builtin_convertvector(__x, simd_double8); } + + +#ifdef __cplusplus +} +#endif +#endif // SIMD_COMPILER_HAS_REQUIRED_FEATURES +#endif // __SIMD_CONVERSION_HEADER__
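To make the behavior of the conversions above concrete, here is a minimal usage sketch. It is illustrative only and not part of the diff: it assumes a platform where these headers ship, so that <simd/simd.h> pulls in the conversion and vector-make interfaces, and it contrasts the truncating simd_int, the round-to-nearest-even simd_int_rte, and the saturating simd_int_sat conversions declared above:

    #include <simd/simd.h>
    #include <stdio.h>

    int main(void) {
        simd_float2 f = simd_make_float2(1.5f, 2.5f);

        // simd_int truncates toward zero, like a scalar C cast: {1, 2}.
        simd_int2 t = simd_int(f);

        // simd_int_rte rounds to nearest, ties to even: {2, 2}.
        simd_int2 r = simd_int_rte(f);

        // simd_int_sat clamps values outside [INT32_MIN, INT32_MAX] to the
        // nearest representable int instead of leaving the result
        // undefined: {2147483647, -2147483648}.
        simd_int2 s = simd_int_sat(simd_make_float2(3.0e9f, -3.0e9f));

        printf("%d %d | %d %d | %d %d\n", t.x, t.y, r.x, r.y, s.x, s.y);
        return 0;
    }

Expected output: "1 2 | 2 2 | 2147483647 -2147483648". Note also the pattern used by the small-integer-to-float conversions above: adding 0x4b400000 to the sign-extended integer, then applying a vector cast (which reinterprets bits rather than converting values), yields the float 0x1.8p23 + n with the integer sitting in the low mantissa bits, so subtracting 0x1.8p23f recovers n exactly without a lane-by-lane conversion.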
\ No newline at end of file
diff --git a/lib/libc/include/aarch64-macos-gnu/simd/logic.h b/lib/libc/include/aarch64-macos-gnu/simd/logic.h
new file mode 100644
index 0000000000..fdefcb632d
--- /dev/null
+++ b/lib/libc/include/aarch64-macos-gnu/simd/logic.h
@@ -0,0 +1,1315 @@
+/*! @header
+ *  The interfaces declared in this header provide logical and bitwise
+ *  operations on vectors. Some of these functions operate elementwise,
+ *  and some produce a scalar result that depends on all lanes of the input.
+ *
+ *  For functions returning a boolean value, the return type in C and
+ *  Objective-C is _Bool; for C++ it is bool.
+ *
+ *  Function                    Result
+ *  ------------------------------------------------------------------
+ *  simd_all(comparison)        True if and only if the comparison is true
+ *                              in every vector lane. e.g.:
+ *
+ *                                if (simd_all(x == 0.0f)) {
+ *                                  // executed if every lane of x
+ *                                  // contains zero.
+ *                                }
+ *
+ *                              The precise function of simd_all is to
+ *                              return the high-order bit of the result
+ *                              of a horizontal bitwise AND of all vector
+ *                              lanes.
+ *
+ *  simd_any(comparison)        True if and only if the comparison is true
+ *                              in at least one vector lane. e.g.:
+ *
+ *                                if (simd_any(x < 0.0f)) {
+ *                                  // executed if any lane of x
+ *                                  // contains a negative value.
+ *                                }
+ *
+ *                              The precise function of simd_any is to
+ *                              return the high-order bit of the result
+ *                              of a horizontal bitwise OR of all vector
+ *                              lanes.
+ *
+ *  simd_select(x,y,mask)       For each lane in the result, selects the
+ *                              corresponding element of x if the high-
+ *                              order bit of the corresponding element of
+ *                              mask is 0, and the corresponding element
+ *                              of y otherwise.
+ *
+ *  simd_bitselect(x,y,mask)    For each bit in the result, selects the
+ *                              corresponding bit of x if the corresponding
+ *                              bit of mask is clear, and the corresponding
+ *                              bit of y otherwise.
+ *
+ *  In C++, these functions are available under the simd:: namespace:
+ *
+ *  C++ Function                    Equivalent C Function
+ *  --------------------------------------------------------------------
+ *  simd::all(comparison)           simd_all(comparison)
+ *  simd::any(comparison)           simd_any(comparison)
+ *  simd::select(x,y,mask)          simd_select(x,y,mask)
+ *  simd::bitselect(x,y,mask)       simd_bitselect(x,y,mask)
+ *
+ *  @copyright 2014-2017 Apple, Inc. All rights reserved.
+ *  @unsorted */

+#ifndef SIMD_LOGIC_HEADER
+#define SIMD_LOGIC_HEADER

+#include <simd/base.h>
+#if SIMD_COMPILER_HAS_REQUIRED_FEATURES
+#include <simd/vector_make.h>
+#include <stdint.h>

+#ifdef __cplusplus
+extern "C" {
+#endif

+/*! @abstract True if and only if the high-order bit of any lane of the
+ *  vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_char2 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ *  vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_char3 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ *  vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_char4 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ *  vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_char8 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ *  vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_char16 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ *  vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_char32 x);
+/*!
@abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_char64 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_uchar2 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_uchar3 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_uchar4 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_uchar8 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_uchar16 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_uchar32 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_uchar64 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_short2 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_short3 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_short4 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_short8 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_short16 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_short32 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_ushort2 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_ushort3 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_ushort4 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_ushort8 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_ushort16 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_ushort32 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_int2 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_int3 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. 
*/ +static inline SIMD_CFUNC simd_bool simd_any(simd_int4 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_int8 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_int16 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_uint2 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_uint3 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_uint4 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_uint8 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_uint16 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_long2 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_long3 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_long4 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_long8 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_ulong2 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_ulong3 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_ulong4 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_any(simd_ulong8 x); +/*! @abstract True if and only if the high-order bit of any lane of the + * vector is set. + * @discussion Deprecated. Use simd_any instead. */ +#define vector_any simd_any + +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_char2 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_char3 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_char4 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_char8 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_char16 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_char32 x); +/*! 
@abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_char64 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_uchar2 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_uchar3 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_uchar4 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_uchar8 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_uchar16 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_uchar32 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_uchar64 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_short2 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_short3 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_short4 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_short8 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_short16 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_short32 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_ushort2 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_ushort3 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_ushort4 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_ushort8 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_ushort16 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_ushort32 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_int2 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_int3 x); +/*! 
@abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_int4 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_int8 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_int16 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_uint2 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_uint3 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_uint4 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_uint8 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_uint16 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_long2 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_long3 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_long4 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_long8 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_ulong2 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_ulong3 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_ulong4 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. */ +static inline SIMD_CFUNC simd_bool simd_all(simd_ulong8 x); +/*! @abstract True if and only if the high-order bit of every lane of the + * vector is set. + * @discussion Deprecated. Use simd_all instead. */ +#define vector_all simd_all + +/*! @abstract For each lane in the result, selects the corresponding element + * of x or y according to whether the high-order bit of the corresponding + * lane of mask is 0 or 1, respectively. */ +static inline SIMD_CFUNC simd_float2 simd_select(simd_float2 x, simd_float2 y, simd_int2 mask); +/*! @abstract For each lane in the result, selects the corresponding element + * of x or y according to whether the high-order bit of the corresponding + * lane of mask is 0 or 1, respectively. */ +static inline SIMD_CFUNC simd_float3 simd_select(simd_float3 x, simd_float3 y, simd_int3 mask); +/*! @abstract For each lane in the result, selects the corresponding element + * of x or y according to whether the high-order bit of the corresponding + * lane of mask is 0 or 1, respectively. 
*/ +static inline SIMD_CFUNC simd_float4 simd_select(simd_float4 x, simd_float4 y, simd_int4 mask); +/*! @abstract For each lane in the result, selects the corresponding element + * of x or y according to whether the high-order bit of the corresponding + * lane of mask is 0 or 1, respectively. */ +static inline SIMD_CFUNC simd_float8 simd_select(simd_float8 x, simd_float8 y, simd_int8 mask); +/*! @abstract For each lane in the result, selects the corresponding element + * of x or y according to whether the high-order bit of the corresponding + * lane of mask is 0 or 1, respectively. */ +static inline SIMD_CFUNC simd_float16 simd_select(simd_float16 x, simd_float16 y, simd_int16 mask); +/*! @abstract For each lane in the result, selects the corresponding element + * of x or y according to whether the high-order bit of the corresponding + * lane of mask is 0 or 1, respectively. */ +static inline SIMD_CFUNC simd_double2 simd_select(simd_double2 x, simd_double2 y, simd_long2 mask); +/*! @abstract For each lane in the result, selects the corresponding element + * of x or y according to whether the high-order bit of the corresponding + * lane of mask is 0 or 1, respectively. */ +static inline SIMD_CFUNC simd_double3 simd_select(simd_double3 x, simd_double3 y, simd_long3 mask); +/*! @abstract For each lane in the result, selects the corresponding element + * of x or y according to whether the high-order bit of the corresponding + * lane of mask is 0 or 1, respectively. */ +static inline SIMD_CFUNC simd_double4 simd_select(simd_double4 x, simd_double4 y, simd_long4 mask); +/*! @abstract For each lane in the result, selects the corresponding element + * of x or y according to whether the high-order bit of the corresponding + * lane of mask is 0 or 1, respectively. */ +static inline SIMD_CFUNC simd_double8 simd_select(simd_double8 x, simd_double8 y, simd_long8 mask); +/*! @abstract For each lane in the result, selects the corresponding element + * of x or y according to whether the high-order bit of the corresponding + * lane of mask is 0 or 1, respectively. + * @discussion Deprecated. Use simd_select instead. */ +#define vector_select simd_select + +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_char2 simd_bitselect(simd_char2 x, simd_char2 y, simd_char2 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_char3 simd_bitselect(simd_char3 x, simd_char3 y, simd_char3 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_char4 simd_bitselect(simd_char4 x, simd_char4 y, simd_char4 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_char8 simd_bitselect(simd_char8 x, simd_char8 y, simd_char8 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_char16 simd_bitselect(simd_char16 x, simd_char16 y, simd_char16 mask); +/*! 
@abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_char32 simd_bitselect(simd_char32 x, simd_char32 y, simd_char32 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_char64 simd_bitselect(simd_char64 x, simd_char64 y, simd_char64 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_uchar2 simd_bitselect(simd_uchar2 x, simd_uchar2 y, simd_char2 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_uchar3 simd_bitselect(simd_uchar3 x, simd_uchar3 y, simd_char3 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_uchar4 simd_bitselect(simd_uchar4 x, simd_uchar4 y, simd_char4 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_uchar8 simd_bitselect(simd_uchar8 x, simd_uchar8 y, simd_char8 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_uchar16 simd_bitselect(simd_uchar16 x, simd_uchar16 y, simd_char16 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_uchar32 simd_bitselect(simd_uchar32 x, simd_uchar32 y, simd_char32 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_uchar64 simd_bitselect(simd_uchar64 x, simd_uchar64 y, simd_char64 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_short2 simd_bitselect(simd_short2 x, simd_short2 y, simd_short2 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_short3 simd_bitselect(simd_short3 x, simd_short3 y, simd_short3 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_short4 simd_bitselect(simd_short4 x, simd_short4 y, simd_short4 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_short8 simd_bitselect(simd_short8 x, simd_short8 y, simd_short8 mask); +/*! 
@abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_short16 simd_bitselect(simd_short16 x, simd_short16 y, simd_short16 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_short32 simd_bitselect(simd_short32 x, simd_short32 y, simd_short32 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_ushort2 simd_bitselect(simd_ushort2 x, simd_ushort2 y, simd_short2 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_ushort3 simd_bitselect(simd_ushort3 x, simd_ushort3 y, simd_short3 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_ushort4 simd_bitselect(simd_ushort4 x, simd_ushort4 y, simd_short4 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_ushort8 simd_bitselect(simd_ushort8 x, simd_ushort8 y, simd_short8 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_ushort16 simd_bitselect(simd_ushort16 x, simd_ushort16 y, simd_short16 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_ushort32 simd_bitselect(simd_ushort32 x, simd_ushort32 y, simd_short32 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_int2 simd_bitselect(simd_int2 x, simd_int2 y, simd_int2 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_int3 simd_bitselect(simd_int3 x, simd_int3 y, simd_int3 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_int4 simd_bitselect(simd_int4 x, simd_int4 y, simd_int4 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_int8 simd_bitselect(simd_int8 x, simd_int8 y, simd_int8 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_int16 simd_bitselect(simd_int16 x, simd_int16 y, simd_int16 mask); +/*! 
@abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_uint2 simd_bitselect(simd_uint2 x, simd_uint2 y, simd_int2 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_uint3 simd_bitselect(simd_uint3 x, simd_uint3 y, simd_int3 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_uint4 simd_bitselect(simd_uint4 x, simd_uint4 y, simd_int4 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_uint8 simd_bitselect(simd_uint8 x, simd_uint8 y, simd_int8 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_uint16 simd_bitselect(simd_uint16 x, simd_uint16 y, simd_int16 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_float2 simd_bitselect(simd_float2 x, simd_float2 y, simd_int2 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_float3 simd_bitselect(simd_float3 x, simd_float3 y, simd_int3 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_float4 simd_bitselect(simd_float4 x, simd_float4 y, simd_int4 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_float8 simd_bitselect(simd_float8 x, simd_float8 y, simd_int8 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_float16 simd_bitselect(simd_float16 x, simd_float16 y, simd_int16 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_long2 simd_bitselect(simd_long2 x, simd_long2 y, simd_long2 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_long3 simd_bitselect(simd_long3 x, simd_long3 y, simd_long3 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_long4 simd_bitselect(simd_long4 x, simd_long4 y, simd_long4 mask); +/*! 
@abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_long8 simd_bitselect(simd_long8 x, simd_long8 y, simd_long8 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_ulong2 simd_bitselect(simd_ulong2 x, simd_ulong2 y, simd_long2 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_ulong3 simd_bitselect(simd_ulong3 x, simd_ulong3 y, simd_long3 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_ulong4 simd_bitselect(simd_ulong4 x, simd_ulong4 y, simd_long4 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_ulong8 simd_bitselect(simd_ulong8 x, simd_ulong8 y, simd_long8 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_double2 simd_bitselect(simd_double2 x, simd_double2 y, simd_long2 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_double3 simd_bitselect(simd_double3 x, simd_double3 y, simd_long3 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_double4 simd_bitselect(simd_double4 x, simd_double4 y, simd_long4 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ +static inline SIMD_CFUNC simd_double8 simd_bitselect(simd_double8 x, simd_double8 y, simd_long8 mask); +/*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. + * @discussion Deprecated. Use simd_bitselect instead. */ +#define vector_bitselect simd_bitselect + +#ifdef __cplusplus +} /* extern "C" */ + +namespace simd { + /*! @abstract True if and only if the high-order bit of every lane is set. */ + template <typename inttypeN> static SIMD_CPPFUNC simd_bool all(const inttypeN predicate) { return ::simd_all(predicate); } + /*! @abstract True if and only if the high-order bit of any lane is set. */ + template <typename inttypeN> static SIMD_CPPFUNC simd_bool any(const inttypeN predicate) { return ::simd_any(predicate); } + /*! @abstract Each lane of the result is selected from the corresponding lane + * of x or y according to whether the high-order bit of the corresponding + * lane of mask is 0 or 1, respectively. 
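+   * For instance (an illustrative sketch): since a lanewise comparison
+   * yields -1 (all bits set) where it holds and 0 elsewhere,
+   * simd::select(x, -x, x < 0) computes a lanewise absolute value of a
+   * floating-point vector x.
+   *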
*/ + template <typename inttypeN, typename fptypeN> static SIMD_CPPFUNC fptypeN select(const fptypeN x, const fptypeN y, const inttypeN predicate) { return ::simd_select(x,y,predicate); } + /*! @abstract For each bit in the result, selects the corresponding bit of x + * or y according to whether the corresponding bit of mask is 0 or 1, + * respectively. */ + template <typename inttypeN, typename typeN> static SIMD_CPPFUNC typeN bitselect(const typeN x, const typeN y, const inttypeN mask) { return ::simd_bitselect(x,y,mask); } +} + +extern "C" { +#endif /* __cplusplus */ + +#pragma mark - Implementations + +static inline SIMD_CFUNC simd_bool simd_any(simd_char2 x) { +#if defined __SSE2__ + return (_mm_movemask_epi8((__m128i)simd_make_char16_undef(x)) & 0x3); +#elif defined __arm64__ + return simd_any(x.xyxy); +#else + union { uint16_t i; simd_char2 v; } u = { .v = x }; + return (u.i & 0x8080); +#endif +} +static inline SIMD_CFUNC simd_bool simd_any(simd_char3 x) { +#if defined __SSE2__ + return (_mm_movemask_epi8((__m128i)simd_make_char16_undef(x)) & 0x7); +#elif defined __arm64__ + return simd_any(x.xyzz); +#else + union { uint32_t i; simd_char3 v; } u = { .v = x }; + return (u.i & 0x808080); +#endif +} +static inline SIMD_CFUNC simd_bool simd_any(simd_char4 x) { +#if defined __SSE2__ + return (_mm_movemask_epi8((__m128i)simd_make_char16_undef(x)) & 0xf); +#elif defined __arm64__ + return simd_any(x.xyzwxyzw); +#else + union { uint32_t i; simd_char4 v; } u = { .v = x }; + return (u.i & 0x80808080); +#endif +} +static inline SIMD_CFUNC simd_bool simd_any(simd_char8 x) { +#if defined __SSE2__ + return (_mm_movemask_epi8((__m128i)simd_make_char16_undef(x)) & 0xff); +#elif defined __arm64__ + return vmaxv_u8(x) & 0x80; +#else + union { uint64_t i; simd_char8 v; } u = { .v = x }; + return (u.i & 0x8080808080808080); +#endif +} +static inline SIMD_CFUNC simd_bool simd_any(simd_char16 x) { +#if defined __SSE2__ + return _mm_movemask_epi8((__m128i)x); +#elif defined __arm64__ + return vmaxvq_u8(x) & 0x80; +#else + return simd_any(x.lo | x.hi); +#endif +} +static inline SIMD_CFUNC simd_bool simd_any(simd_char32 x) { +#if defined __AVX2__ + return _mm256_movemask_epi8(x); +#else + return simd_any(x.lo | x.hi); +#endif +} +static inline SIMD_CFUNC simd_bool simd_any(simd_char64 x) { + return simd_any(x.lo | x.hi); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_uchar2 x) { + return simd_any((simd_char2)x); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_uchar3 x) { + return simd_any((simd_char3)x); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_uchar4 x) { + return simd_any((simd_char4)x); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_uchar8 x) { + return simd_any((simd_char8)x); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_uchar16 x) { + return simd_any((simd_char16)x); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_uchar32 x) { + return simd_any((simd_char32)x); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_uchar64 x) { + return simd_any((simd_char64)x); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_short2 x) { +#if defined __SSE2__ + return (_mm_movemask_epi8((__m128i)simd_make_short8_undef(x)) & 0xa); +#elif defined __arm64__ + return simd_any(x.xyxy); +#else + union { uint32_t i; simd_short2 v; } u = { .v = x }; + return (u.i & 0x80008000); +#endif +} +static inline SIMD_CFUNC simd_bool simd_any(simd_short3 x) { +#if defined __SSE2__ + return (_mm_movemask_epi8((__m128i)simd_make_short8_undef(x)) & 0x2a); +#elif defined __arm64__ + return 
simd_any(x.xyzz); +#else + union { uint64_t i; simd_short3 v; } u = { .v = x }; + return (u.i & 0x800080008000); +#endif +} +static inline SIMD_CFUNC simd_bool simd_any(simd_short4 x) { +#if defined __SSE2__ + return (_mm_movemask_epi8((__m128i)simd_make_short8_undef(x)) & 0xaa); +#elif defined __arm64__ + return vmaxv_u16(x) & 0x8000; +#else + union { uint64_t i; simd_short4 v; } u = { .v = x }; + return (u.i & 0x8000800080008000); +#endif +} +static inline SIMD_CFUNC simd_bool simd_any(simd_short8 x) { +#if defined __SSE2__ + return (_mm_movemask_epi8((__m128i)x) & 0xaaaa); +#elif defined __arm64__ + return vmaxvq_u16(x) & 0x8000; +#else + return simd_any(x.lo | x.hi); +#endif +} +static inline SIMD_CFUNC simd_bool simd_any(simd_short16 x) { +#if defined __AVX2__ + return (_mm256_movemask_epi8(x) & 0xaaaaaaaa); +#else + return simd_any(x.lo | x.hi); +#endif +} +static inline SIMD_CFUNC simd_bool simd_any(simd_short32 x) { + return simd_any(x.lo | x.hi); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_ushort2 x) { + return simd_any((simd_short2)x); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_ushort3 x) { + return simd_any((simd_short3)x); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_ushort4 x) { + return simd_any((simd_short4)x); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_ushort8 x) { + return simd_any((simd_short8)x); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_ushort16 x) { + return simd_any((simd_short16)x); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_ushort32 x) { + return simd_any((simd_short32)x); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_int2 x) { +#if defined __SSE2__ + return (_mm_movemask_ps((__m128)simd_make_int4_undef(x)) & 0x3); +#elif defined __arm64__ + return vmaxv_u32(x) & 0x80000000; +#else + union { uint64_t i; simd_int2 v; } u = { .v = x }; + return (u.i & 0x8000000080000000); +#endif +} +static inline SIMD_CFUNC simd_bool simd_any(simd_int3 x) { +#if defined __SSE2__ + return (_mm_movemask_ps((__m128)simd_make_int4_undef(x)) & 0x7); +#elif defined __arm64__ + return simd_any(x.xyzz); +#else + return (x.x | x.y | x.z) & 0x80000000; +#endif +} +static inline SIMD_CFUNC simd_bool simd_any(simd_int4 x) { +#if defined __SSE2__ + return _mm_movemask_ps((__m128)x); +#elif defined __arm64__ + return vmaxvq_u32(x) & 0x80000000; +#else + return simd_any(x.lo | x.hi); +#endif +} +static inline SIMD_CFUNC simd_bool simd_any(simd_int8 x) { +#if defined __AVX__ + return _mm256_movemask_ps(x); +#else + return simd_any(x.lo | x.hi); +#endif +} +static inline SIMD_CFUNC simd_bool simd_any(simd_int16 x) { + return simd_any(x.lo | x.hi); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_uint2 x) { + return simd_any((simd_int2)x); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_uint3 x) { + return simd_any((simd_int3)x); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_uint4 x) { + return simd_any((simd_int4)x); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_uint8 x) { + return simd_any((simd_int8)x); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_uint16 x) { + return simd_any((simd_int16)x); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_long2 x) { +#if defined __SSE2__ + return _mm_movemask_pd((__m128d)x); +#elif defined __arm64__ + return (x.x | x.y) & 0x8000000000000000U; +#else + return (x.x | x.y) & 0x8000000000000000U; +#endif +} +static inline SIMD_CFUNC simd_bool simd_any(simd_long3 x) { +#if defined __AVX__ + return (_mm256_movemask_pd(simd_make_long4_undef(x)) & 0x7); +#else + return 
(x.x | x.y | x.z) & 0x8000000000000000U; +#endif +} +static inline SIMD_CFUNC simd_bool simd_any(simd_long4 x) { +#if defined __AVX__ + return _mm256_movemask_pd(x); +#else + return simd_any(x.lo | x.hi); +#endif +} +static inline SIMD_CFUNC simd_bool simd_any(simd_long8 x) { + return simd_any(x.lo | x.hi); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_ulong2 x) { + return simd_any((simd_long2)x); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_ulong3 x) { + return simd_any((simd_long3)x); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_ulong4 x) { + return simd_any((simd_long4)x); +} +static inline SIMD_CFUNC simd_bool simd_any(simd_ulong8 x) { + return simd_any((simd_long8)x); +} + +static inline SIMD_CFUNC simd_bool simd_all(simd_char2 x) { +#if defined __SSE2__ + return (_mm_movemask_epi8((__m128i)simd_make_char16_undef(x)) & 0x3) == 0x3; +#elif defined __arm64__ + return simd_all(x.xyxy); +#else + union { uint16_t i; simd_char2 v; } u = { .v = x }; + return (u.i & 0x8080) == 0x8080; +#endif +} +static inline SIMD_CFUNC simd_bool simd_all(simd_char3 x) { +#if defined __SSE2__ + return (_mm_movemask_epi8((__m128i)simd_make_char16_undef(x)) & 0x7) == 0x7; +#elif defined __arm64__ + return simd_all(x.xyzz); +#else + union { uint32_t i; simd_char3 v; } u = { .v = x }; + return (u.i & 0x808080) == 0x808080; +#endif +} +static inline SIMD_CFUNC simd_bool simd_all(simd_char4 x) { +#if defined __SSE2__ + return (_mm_movemask_epi8((__m128i)simd_make_char16_undef(x)) & 0xf) == 0xf; +#elif defined __arm64__ + return simd_all(x.xyzwxyzw); +#else + union { uint32_t i; simd_char4 v; } u = { .v = x }; + return (u.i & 0x80808080) == 0x80808080; +#endif +} +static inline SIMD_CFUNC simd_bool simd_all(simd_char8 x) { +#if defined __SSE2__ + return (_mm_movemask_epi8((__m128i)simd_make_char16_undef(x)) & 0xff) == 0xff; +#elif defined __arm64__ + return vminv_u8(x) & 0x80; +#else + union { uint64_t i; simd_char8 v; } u = { .v = x }; + return (u.i & 0x8080808080808080) == 0x8080808080808080; +#endif +} +static inline SIMD_CFUNC simd_bool simd_all(simd_char16 x) { +#if defined __SSE2__ + return _mm_movemask_epi8((__m128i)x) == 0xffff; +#elif defined __arm64__ + return vminvq_u8(x) & 0x80; +#else + return simd_all(x.lo & x.hi); +#endif +} +static inline SIMD_CFUNC simd_bool simd_all(simd_char32 x) { +#if defined __AVX2__ + return _mm256_movemask_epi8(x) == 0xffffffff; +#else + return simd_all(x.lo & x.hi); +#endif +} +static inline SIMD_CFUNC simd_bool simd_all(simd_char64 x) { + return simd_all(x.lo & x.hi); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_uchar2 x) { + return simd_all((simd_char2)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_uchar3 x) { + return simd_all((simd_char3)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_uchar4 x) { + return simd_all((simd_char4)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_uchar8 x) { + return simd_all((simd_char8)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_uchar16 x) { + return simd_all((simd_char16)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_uchar32 x) { + return simd_all((simd_char32)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_uchar64 x) { + return simd_all((simd_char64)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_short2 x) { +#if defined __SSE2__ + return (_mm_movemask_epi8((__m128i)simd_make_short8_undef(x)) & 0xa) == 0xa; +#elif defined __arm64__ + return simd_all(x.xyxy); +#else + union { uint32_t i; simd_short2 v; } u = { .v = x }; + return (u.i & 
0x80008000) == 0x80008000; +#endif +} +static inline SIMD_CFUNC simd_bool simd_all(simd_short3 x) { +#if defined __SSE2__ + return (_mm_movemask_epi8((__m128i)simd_make_short8_undef(x)) & 0x2a) == 0x2a; +#elif defined __arm64__ + return simd_all(x.xyzz); +#else + union { uint64_t i; simd_short3 v; } u = { .v = x }; + return (u.i & 0x800080008000) == 0x800080008000; +#endif +} +static inline SIMD_CFUNC simd_bool simd_all(simd_short4 x) { +#if defined __SSE2__ + return (_mm_movemask_epi8((__m128i)simd_make_short8_undef(x)) & 0xaa) == 0xaa; +#elif defined __arm64__ + return vminv_u16(x) & 0x8000; +#else + union { uint64_t i; simd_short4 v; } u = { .v = x }; + return (u.i & 0x8000800080008000) == 0x8000800080008000; +#endif +} +static inline SIMD_CFUNC simd_bool simd_all(simd_short8 x) { +#if defined __SSE2__ + return (_mm_movemask_epi8((__m128i)x) & 0xaaaa) == 0xaaaa; +#elif defined __arm64__ + return vminvq_u16(x) & 0x8000; +#else + return simd_all(x.lo & x.hi); +#endif +} +static inline SIMD_CFUNC simd_bool simd_all(simd_short16 x) { +#if defined __AVX2__ + return (_mm256_movemask_epi8(x) & 0xaaaaaaaa) == 0xaaaaaaaa; +#else + return simd_all(x.lo & x.hi); +#endif +} +static inline SIMD_CFUNC simd_bool simd_all(simd_short32 x) { + return simd_all(x.lo & x.hi); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_ushort2 x) { + return simd_all((simd_short2)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_ushort3 x) { + return simd_all((simd_short3)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_ushort4 x) { + return simd_all((simd_short4)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_ushort8 x) { + return simd_all((simd_short8)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_ushort16 x) { + return simd_all((simd_short16)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_ushort32 x) { + return simd_all((simd_short32)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_int2 x) { +#if defined __SSE2__ + return (_mm_movemask_ps((__m128)simd_make_int4_undef(x)) & 0x3) == 0x3; +#elif defined __arm64__ + return vminv_u32(x) & 0x80000000; +#else + union { uint64_t i; simd_int2 v; } u = { .v = x }; + return (u.i & 0x8000000080000000) == 0x8000000080000000; +#endif +} +static inline SIMD_CFUNC simd_bool simd_all(simd_int3 x) { +#if defined __SSE2__ + return (_mm_movemask_ps((__m128)simd_make_int4_undef(x)) & 0x7) == 0x7; +#elif defined __arm64__ + return simd_all(x.xyzz); +#else + return (x.x & x.y & x.z) & 0x80000000; +#endif +} +static inline SIMD_CFUNC simd_bool simd_all(simd_int4 x) { +#if defined __SSE2__ + return _mm_movemask_ps((__m128)x) == 0xf; +#elif defined __arm64__ + return vminvq_u32(x) & 0x80000000; +#else + return simd_all(x.lo & x.hi); +#endif +} +static inline SIMD_CFUNC simd_bool simd_all(simd_int8 x) { +#if defined __AVX__ + return _mm256_movemask_ps(x) == 0xff; +#else + return simd_all(x.lo & x.hi); +#endif +} +static inline SIMD_CFUNC simd_bool simd_all(simd_int16 x) { + return simd_all(x.lo & x.hi); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_uint2 x) { + return simd_all((simd_int2)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_uint3 x) { + return simd_all((simd_int3)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_uint4 x) { + return simd_all((simd_int4)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_uint8 x) { + return simd_all((simd_int8)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_uint16 x) { + return simd_all((simd_int16)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_long2 
x) { +#if defined __SSE2__ + return _mm_movemask_pd((__m128d)x) == 0x3; +#elif defined __arm64__ + return (x.x & x.y) & 0x8000000000000000U; +#else + return (x.x & x.y) & 0x8000000000000000U; +#endif +} +static inline SIMD_CFUNC simd_bool simd_all(simd_long3 x) { +#if defined __AVX__ + return (_mm256_movemask_pd(simd_make_long4_undef(x)) & 0x7) == 0x7; +#else + return (x.x & x.y & x.z) & 0x8000000000000000U; +#endif +} +static inline SIMD_CFUNC simd_bool simd_all(simd_long4 x) { +#if defined __AVX__ + return _mm256_movemask_pd(x) == 0xf; +#else + return simd_all(x.lo & x.hi); +#endif +} +static inline SIMD_CFUNC simd_bool simd_all(simd_long8 x) { + return simd_all(x.lo & x.hi); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_ulong2 x) { + return simd_all((simd_long2)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_ulong3 x) { + return simd_all((simd_long3)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_ulong4 x) { + return simd_all((simd_long4)x); +} +static inline SIMD_CFUNC simd_bool simd_all(simd_ulong8 x) { + return simd_all((simd_long8)x); +} + +static inline SIMD_CFUNC simd_float2 simd_select(simd_float2 x, simd_float2 y, simd_int2 mask) { + return simd_make_float2(simd_select(simd_make_float4_undef(x), simd_make_float4_undef(y), simd_make_int4_undef(mask))); +} +static inline SIMD_CFUNC simd_float3 simd_select(simd_float3 x, simd_float3 y, simd_int3 mask) { + return simd_make_float3(simd_select(simd_make_float4_undef(x), simd_make_float4_undef(y), simd_make_int4_undef(mask))); +} +static inline SIMD_CFUNC simd_float4 simd_select(simd_float4 x, simd_float4 y, simd_int4 mask) { +#if defined __SSE4_1__ + return _mm_blendv_ps(x, y, (__m128)mask); +#else + return simd_bitselect(x, y, mask >> 31); +#endif +} +static inline SIMD_CFUNC simd_float8 simd_select(simd_float8 x, simd_float8 y, simd_int8 mask) { +#if defined __AVX__ + return _mm256_blendv_ps(x, y, mask); +#else + return simd_bitselect(x, y, mask >> 31); +#endif +} +static inline SIMD_CFUNC simd_float16 simd_select(simd_float16 x, simd_float16 y, simd_int16 mask) { + return simd_bitselect(x, y, mask >> 31); +} +static inline SIMD_CFUNC simd_double2 simd_select(simd_double2 x, simd_double2 y, simd_long2 mask) { +#if defined __SSE4_1__ + return _mm_blendv_pd(x, y, (__m128d)mask); +#else + return simd_bitselect(x, y, mask >> 63); +#endif +} +static inline SIMD_CFUNC simd_double3 simd_select(simd_double3 x, simd_double3 y, simd_long3 mask) { + return simd_make_double3(simd_select(simd_make_double4_undef(x), simd_make_double4_undef(y), simd_make_long4_undef(mask))); +} +static inline SIMD_CFUNC simd_double4 simd_select(simd_double4 x, simd_double4 y, simd_long4 mask) { +#if defined __AVX__ + return _mm256_blendv_pd(x, y, mask); +#else + return simd_bitselect(x, y, mask >> 63); +#endif +} +static inline SIMD_CFUNC simd_double8 simd_select(simd_double8 x, simd_double8 y, simd_long8 mask) { + return simd_bitselect(x, y, mask >> 63); +} + +static inline SIMD_CFUNC simd_char2 simd_bitselect(simd_char2 x, simd_char2 y, simd_char2 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_char3 simd_bitselect(simd_char3 x, simd_char3 y, simd_char3 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_char4 simd_bitselect(simd_char4 x, simd_char4 y, simd_char4 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_char8 simd_bitselect(simd_char8 x, simd_char8 y, simd_char8 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_char16 
simd_bitselect(simd_char16 x, simd_char16 y, simd_char16 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_char32 simd_bitselect(simd_char32 x, simd_char32 y, simd_char32 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_char64 simd_bitselect(simd_char64 x, simd_char64 y, simd_char64 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_uchar2 simd_bitselect(simd_uchar2 x, simd_uchar2 y, simd_char2 mask) { + return (simd_uchar2)simd_bitselect((simd_char2)x, (simd_char2)y, mask); +} +static inline SIMD_CFUNC simd_uchar3 simd_bitselect(simd_uchar3 x, simd_uchar3 y, simd_char3 mask) { + return (simd_uchar3)simd_bitselect((simd_char3)x, (simd_char3)y, mask); +} +static inline SIMD_CFUNC simd_uchar4 simd_bitselect(simd_uchar4 x, simd_uchar4 y, simd_char4 mask) { + return (simd_uchar4)simd_bitselect((simd_char4)x, (simd_char4)y, mask); +} +static inline SIMD_CFUNC simd_uchar8 simd_bitselect(simd_uchar8 x, simd_uchar8 y, simd_char8 mask) { + return (simd_uchar8)simd_bitselect((simd_char8)x, (simd_char8)y, mask); +} +static inline SIMD_CFUNC simd_uchar16 simd_bitselect(simd_uchar16 x, simd_uchar16 y, simd_char16 mask) { + return (simd_uchar16)simd_bitselect((simd_char16)x, (simd_char16)y, mask); +} +static inline SIMD_CFUNC simd_uchar32 simd_bitselect(simd_uchar32 x, simd_uchar32 y, simd_char32 mask) { + return (simd_uchar32)simd_bitselect((simd_char32)x, (simd_char32)y, mask); +} +static inline SIMD_CFUNC simd_uchar64 simd_bitselect(simd_uchar64 x, simd_uchar64 y, simd_char64 mask) { + return (simd_uchar64)simd_bitselect((simd_char64)x, (simd_char64)y, mask); +} +static inline SIMD_CFUNC simd_short2 simd_bitselect(simd_short2 x, simd_short2 y, simd_short2 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_short3 simd_bitselect(simd_short3 x, simd_short3 y, simd_short3 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_short4 simd_bitselect(simd_short4 x, simd_short4 y, simd_short4 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_short8 simd_bitselect(simd_short8 x, simd_short8 y, simd_short8 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_short16 simd_bitselect(simd_short16 x, simd_short16 y, simd_short16 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_short32 simd_bitselect(simd_short32 x, simd_short32 y, simd_short32 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_ushort2 simd_bitselect(simd_ushort2 x, simd_ushort2 y, simd_short2 mask) { + return (simd_ushort2)simd_bitselect((simd_short2)x, (simd_short2)y, mask); +} +static inline SIMD_CFUNC simd_ushort3 simd_bitselect(simd_ushort3 x, simd_ushort3 y, simd_short3 mask) { + return (simd_ushort3)simd_bitselect((simd_short3)x, (simd_short3)y, mask); +} +static inline SIMD_CFUNC simd_ushort4 simd_bitselect(simd_ushort4 x, simd_ushort4 y, simd_short4 mask) { + return (simd_ushort4)simd_bitselect((simd_short4)x, (simd_short4)y, mask); +} +static inline SIMD_CFUNC simd_ushort8 simd_bitselect(simd_ushort8 x, simd_ushort8 y, simd_short8 mask) { + return (simd_ushort8)simd_bitselect((simd_short8)x, (simd_short8)y, mask); +} +static inline SIMD_CFUNC simd_ushort16 simd_bitselect(simd_ushort16 x, simd_ushort16 y, simd_short16 mask) { + return (simd_ushort16)simd_bitselect((simd_short16)x, (simd_short16)y, mask); +} +static inline SIMD_CFUNC simd_ushort32 simd_bitselect(simd_ushort32 x, simd_ushort32 y, simd_short32 
mask) { + return (simd_ushort32)simd_bitselect((simd_short32)x, (simd_short32)y, mask); +} +static inline SIMD_CFUNC simd_int2 simd_bitselect(simd_int2 x, simd_int2 y, simd_int2 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_int3 simd_bitselect(simd_int3 x, simd_int3 y, simd_int3 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_int4 simd_bitselect(simd_int4 x, simd_int4 y, simd_int4 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_int8 simd_bitselect(simd_int8 x, simd_int8 y, simd_int8 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_int16 simd_bitselect(simd_int16 x, simd_int16 y, simd_int16 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_uint2 simd_bitselect(simd_uint2 x, simd_uint2 y, simd_int2 mask) { + return (simd_uint2)simd_bitselect((simd_int2)x, (simd_int2)y, mask); +} +static inline SIMD_CFUNC simd_uint3 simd_bitselect(simd_uint3 x, simd_uint3 y, simd_int3 mask) { + return (simd_uint3)simd_bitselect((simd_int3)x, (simd_int3)y, mask); +} +static inline SIMD_CFUNC simd_uint4 simd_bitselect(simd_uint4 x, simd_uint4 y, simd_int4 mask) { + return (simd_uint4)simd_bitselect((simd_int4)x, (simd_int4)y, mask); +} +static inline SIMD_CFUNC simd_uint8 simd_bitselect(simd_uint8 x, simd_uint8 y, simd_int8 mask) { + return (simd_uint8)simd_bitselect((simd_int8)x, (simd_int8)y, mask); +} +static inline SIMD_CFUNC simd_uint16 simd_bitselect(simd_uint16 x, simd_uint16 y, simd_int16 mask) { + return (simd_uint16)simd_bitselect((simd_int16)x, (simd_int16)y, mask); +} +static inline SIMD_CFUNC simd_float2 simd_bitselect(simd_float2 x, simd_float2 y, simd_int2 mask) { + return (simd_float2)simd_bitselect((simd_int2)x, (simd_int2)y, mask); +} +static inline SIMD_CFUNC simd_float3 simd_bitselect(simd_float3 x, simd_float3 y, simd_int3 mask) { + return (simd_float3)simd_bitselect((simd_int3)x, (simd_int3)y, mask); +} +static inline SIMD_CFUNC simd_float4 simd_bitselect(simd_float4 x, simd_float4 y, simd_int4 mask) { + return (simd_float4)simd_bitselect((simd_int4)x, (simd_int4)y, mask); +} +static inline SIMD_CFUNC simd_float8 simd_bitselect(simd_float8 x, simd_float8 y, simd_int8 mask) { + return (simd_float8)simd_bitselect((simd_int8)x, (simd_int8)y, mask); +} +static inline SIMD_CFUNC simd_float16 simd_bitselect(simd_float16 x, simd_float16 y, simd_int16 mask) { + return (simd_float16)simd_bitselect((simd_int16)x, (simd_int16)y, mask); +} +static inline SIMD_CFUNC simd_long2 simd_bitselect(simd_long2 x, simd_long2 y, simd_long2 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_long3 simd_bitselect(simd_long3 x, simd_long3 y, simd_long3 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_long4 simd_bitselect(simd_long4 x, simd_long4 y, simd_long4 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_long8 simd_bitselect(simd_long8 x, simd_long8 y, simd_long8 mask) { + return (x & ~mask) | (y & mask); +} +static inline SIMD_CFUNC simd_ulong2 simd_bitselect(simd_ulong2 x, simd_ulong2 y, simd_long2 mask) { + return (simd_ulong2)simd_bitselect((simd_long2)x, (simd_long2)y, mask); +} +static inline SIMD_CFUNC simd_ulong3 simd_bitselect(simd_ulong3 x, simd_ulong3 y, simd_long3 mask) { + return (simd_ulong3)simd_bitselect((simd_long3)x, (simd_long3)y, mask); +} +static inline SIMD_CFUNC simd_ulong4 simd_bitselect(simd_ulong4 x, simd_ulong4 y, simd_long4 mask) { + return 
(simd_ulong4)simd_bitselect((simd_long4)x, (simd_long4)y, mask);
+}
+static inline SIMD_CFUNC simd_ulong8 simd_bitselect(simd_ulong8 x, simd_ulong8 y, simd_long8 mask) {
+  return (simd_ulong8)simd_bitselect((simd_long8)x, (simd_long8)y, mask);
+}
+static inline SIMD_CFUNC simd_double2 simd_bitselect(simd_double2 x, simd_double2 y, simd_long2 mask) {
+  return (simd_double2)simd_bitselect((simd_long2)x, (simd_long2)y, mask);
+}
+static inline SIMD_CFUNC simd_double3 simd_bitselect(simd_double3 x, simd_double3 y, simd_long3 mask) {
+  return (simd_double3)simd_bitselect((simd_long3)x, (simd_long3)y, mask);
+}
+static inline SIMD_CFUNC simd_double4 simd_bitselect(simd_double4 x, simd_double4 y, simd_long4 mask) {
+  return (simd_double4)simd_bitselect((simd_long4)x, (simd_long4)y, mask);
+}
+static inline SIMD_CFUNC simd_double8 simd_bitselect(simd_double8 x, simd_double8 y, simd_long8 mask) {
+  return (simd_double8)simd_bitselect((simd_long8)x, (simd_long8)y, mask);
+}
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* SIMD_COMPILER_HAS_REQUIRED_FEATURES */
+#endif /* __SIMD_LOGIC_HEADER__ */
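Taken together, the logic.h operations compose cleanly: lanewise comparisons produce 0/-1 masks, which feed simd_any/simd_all for early-out tests and simd_bitselect for branch-free lane updates. A minimal sketch of that pattern (assuming an Apple toolchain where the umbrella header <simd/simd.h> is available; the variable names are illustrative only):

    #include <simd/simd.h>
    #include <stdio.h>

    int main(void) {
      simd_int4 x   = { 1, -2, 3, -4 };
      simd_int4 neg = x < 0;               // -1 in negative lanes, 0 elsewhere
      if (simd_any(neg))                   // true iff some lane is negative
        x = simd_bitselect(x, -x, neg);    // branch-free: negate only those lanes
      printf("%d %d %d %d\n", x.x, x.y, x.z, x.w);  // prints: 1 2 3 4
      return 0;
    }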
\ No newline at end of file
diff --git a/lib/libc/include/aarch64-macos-gnu/simd/math.h b/lib/libc/include/aarch64-macos-gnu/simd/math.h
new file mode 100644
index 0000000000..4d5c654f69
--- /dev/null
+++ b/lib/libc/include/aarch64-macos-gnu/simd/math.h
@@ -0,0 +1,5380 @@
+/*! @header
+ * The interfaces declared in this header provide elementwise math operations
+ * on vectors; each lane of the result vector depends only on the data in the
+ * corresponding lane of the argument(s) to the function.
+ *
+ * You should not use the C functions declared in this header directly (these
+ * are functions with names like `__tg_cos(x)`). These are merely
+ * implementation details of <tgmath.h> overloading; instead of calling
+ * `__tg_cos(x)`, call `cos(x)`. If you are writing C++, use `simd::cos(x)`.
+ *
+ * Note that while these vector functions are relatively recent additions,
+ * scalar fallback is provided for all of them, so they are available even
+ * when targeting older OS versions.
+ *
+ * The following functions are available:
+ *
+ *      C name            C++ name              Notes
+ *   ----------------------------------------------------------------------
+ *      acos(x)           simd::acos(x)
+ *      asin(x)           simd::asin(x)
+ *      atan(x)           simd::atan(x)
+ *      atan2(y,x)        simd::atan2(y,x)      The argument order matches the scalar
+ *                                              atan2 function, which gives the angle
+ *                                              of a line with slope y/x.
+ *      cos(x)            simd::cos(x)
+ *      sin(x)            simd::sin(x)
+ *      tan(x)            simd::tan(x)
+ *
+ *      cospi(x)          simd::cospi(x)        Returns cos(pi*x), sin(pi*x), tan(pi*x)
+ *      sinpi(x)          simd::sinpi(x)        more efficiently and accurately than
+ *      tanpi(x)          simd::tanpi(x)        would otherwise be possible
+ *
+ *      acosh(x)          simd::acosh(x)
+ *      asinh(x)          simd::asinh(x)
+ *      atanh(x)          simd::atanh(x)
+ *
+ *      cosh(x)           simd::cosh(x)
+ *      sinh(x)           simd::sinh(x)
+ *      tanh(x)           simd::tanh(x)
+ *
+ *      exp(x)            simd::exp(x)
+ *      exp2(x)           simd::exp2(x)
+ *      exp10(x)          simd::exp10(x)        More efficient than pow(10,x).
+ *      expm1(x)          simd::expm1(x)        exp(x)-1, accurate even for tiny x.
+ *
+ *      log(x)            simd::log(x)
+ *      log2(x)           simd::log2(x)
+ *      log10(x)          simd::log10(x)
+ *      log1p(x)          simd::log1p(x)        log(1+x), accurate even for tiny x.
+ *
+ *      fabs(x)           simd::fabs(x)
+ *      cbrt(x)           simd::cbrt(x)
+ *      sqrt(x)           simd::sqrt(x)
+ *      pow(x,y)          simd::pow(x,y)
+ *      copysign(x,y)     simd::copysign(x,y)
+ *      hypot(x,y)        simd::hypot(x,y)      sqrt(x*x + y*y), computed without
+ *                                              overflow.
+ *      erf(x)            simd::erf(x)
+ *      erfc(x)           simd::erfc(x)
+ *      tgamma(x)         simd::tgamma(x)
+ *
+ *      fmod(x,y)         simd::fmod(x,y)
+ *      remainder(x,y)    simd::remainder(x,y)
+ *
+ *      ceil(x)           simd::ceil(x)
+ *      floor(x)          simd::floor(x)
+ *      rint(x)           simd::rint(x)
+ *      round(x)          simd::round(x)
+ *      trunc(x)          simd::trunc(x)
+ *
+ *      fdim(x,y)         simd::fdim(x,y)
+ *      fmax(x,y)         simd::fmax(x,y)       When one argument to fmin or fmax is
+ *      fmin(x,y)         simd::fmin(x,y)       constant, use it as the *second* (y)
+ *                                              argument to get better codegen on some
+ *                                              architectures. E.g., write fmin(x,2)
+ *                                              instead of fmin(2,x).
+ *      fma(x,y,z)        simd::fma(x,y,z)      Fast on arm64 and when targeting AVX2
+ *                                              and later; may be quite expensive on
+ *                                              older hardware.
+ *      simd_muladd(x,y,z) simd::muladd(x,y,z)
+ *
+ * @copyright 2014-2017 Apple, Inc. All rights reserved.
+ * @unsorted */
+
+#ifndef SIMD_MATH_HEADER
+#define SIMD_MATH_HEADER
+
+#include <simd/base.h>
+#if SIMD_COMPILER_HAS_REQUIRED_FEATURES
+#include <simd/vector_make.h>
+#include <simd/logic.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*! @abstract Do not call this function; instead use `acos` in C and
+ * Objective-C, and `simd::acos` in C++.
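+ * For example (illustrative; assumes a translation unit that includes
+ * <simd/simd.h>, which makes the overloaded name available):
+ *
+ *     simd_float4 x = { -1.0f, 0.0f, 0.5f, 1.0f };
+ *     simd_float4 y = acos(x);   // elementwise arc cosine; never spell it __tg_acos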
*/ +static inline SIMD_CFUNC simd_float2 __tg_acos(simd_float2 x); +/*! @abstract Do not call this function; instead use `acos` in C and + * Objective-C, and `simd::acos` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_acos(simd_float3 x); +/*! @abstract Do not call this function; instead use `acos` in C and + * Objective-C, and `simd::acos` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_acos(simd_float4 x); +/*! @abstract Do not call this function; instead use `acos` in C and + * Objective-C, and `simd::acos` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_acos(simd_float8 x); +/*! @abstract Do not call this function; instead use `acos` in C and + * Objective-C, and `simd::acos` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_acos(simd_float16 x); +/*! @abstract Do not call this function; instead use `acos` in C and + * Objective-C, and `simd::acos` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_acos(simd_double2 x); +/*! @abstract Do not call this function; instead use `acos` in C and + * Objective-C, and `simd::acos` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_acos(simd_double3 x); +/*! @abstract Do not call this function; instead use `acos` in C and + * Objective-C, and `simd::acos` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_acos(simd_double4 x); +/*! @abstract Do not call this function; instead use `acos` in C and + * Objective-C, and `simd::acos` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_acos(simd_double8 x); + +/*! @abstract Do not call this function; instead use `asin` in C and + * Objective-C, and `simd::asin` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_asin(simd_float2 x); +/*! @abstract Do not call this function; instead use `asin` in C and + * Objective-C, and `simd::asin` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_asin(simd_float3 x); +/*! @abstract Do not call this function; instead use `asin` in C and + * Objective-C, and `simd::asin` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_asin(simd_float4 x); +/*! @abstract Do not call this function; instead use `asin` in C and + * Objective-C, and `simd::asin` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_asin(simd_float8 x); +/*! @abstract Do not call this function; instead use `asin` in C and + * Objective-C, and `simd::asin` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_asin(simd_float16 x); +/*! @abstract Do not call this function; instead use `asin` in C and + * Objective-C, and `simd::asin` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_asin(simd_double2 x); +/*! @abstract Do not call this function; instead use `asin` in C and + * Objective-C, and `simd::asin` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_asin(simd_double3 x); +/*! @abstract Do not call this function; instead use `asin` in C and + * Objective-C, and `simd::asin` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_asin(simd_double4 x); +/*! @abstract Do not call this function; instead use `asin` in C and + * Objective-C, and `simd::asin` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_asin(simd_double8 x); + +/*! @abstract Do not call this function; instead use `atan` in C and + * Objective-C, and `simd::atan` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_atan(simd_float2 x); +/*! @abstract Do not call this function; instead use `atan` in C and + * Objective-C, and `simd::atan` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_atan(simd_float3 x); +/*! 
@abstract Do not call this function; instead use `atan` in C and + * Objective-C, and `simd::atan` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_atan(simd_float4 x); +/*! @abstract Do not call this function; instead use `atan` in C and + * Objective-C, and `simd::atan` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_atan(simd_float8 x); +/*! @abstract Do not call this function; instead use `atan` in C and + * Objective-C, and `simd::atan` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_atan(simd_float16 x); +/*! @abstract Do not call this function; instead use `atan` in C and + * Objective-C, and `simd::atan` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_atan(simd_double2 x); +/*! @abstract Do not call this function; instead use `atan` in C and + * Objective-C, and `simd::atan` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_atan(simd_double3 x); +/*! @abstract Do not call this function; instead use `atan` in C and + * Objective-C, and `simd::atan` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_atan(simd_double4 x); +/*! @abstract Do not call this function; instead use `atan` in C and + * Objective-C, and `simd::atan` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_atan(simd_double8 x); + +/*! @abstract Do not call this function; instead use `cos` in C and + * Objective-C, and `simd::cos` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_cos(simd_float2 x); +/*! @abstract Do not call this function; instead use `cos` in C and + * Objective-C, and `simd::cos` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_cos(simd_float3 x); +/*! @abstract Do not call this function; instead use `cos` in C and + * Objective-C, and `simd::cos` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_cos(simd_float4 x); +/*! @abstract Do not call this function; instead use `cos` in C and + * Objective-C, and `simd::cos` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_cos(simd_float8 x); +/*! @abstract Do not call this function; instead use `cos` in C and + * Objective-C, and `simd::cos` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_cos(simd_float16 x); +/*! @abstract Do not call this function; instead use `cos` in C and + * Objective-C, and `simd::cos` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_cos(simd_double2 x); +/*! @abstract Do not call this function; instead use `cos` in C and + * Objective-C, and `simd::cos` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_cos(simd_double3 x); +/*! @abstract Do not call this function; instead use `cos` in C and + * Objective-C, and `simd::cos` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_cos(simd_double4 x); +/*! @abstract Do not call this function; instead use `cos` in C and + * Objective-C, and `simd::cos` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_cos(simd_double8 x); + +/*! @abstract Do not call this function; instead use `sin` in C and + * Objective-C, and `simd::sin` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_sin(simd_float2 x); +/*! @abstract Do not call this function; instead use `sin` in C and + * Objective-C, and `simd::sin` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_sin(simd_float3 x); +/*! @abstract Do not call this function; instead use `sin` in C and + * Objective-C, and `simd::sin` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_sin(simd_float4 x); +/*! @abstract Do not call this function; instead use `sin` in C and + * Objective-C, and `simd::sin` in C++. 
*/ +static inline SIMD_CFUNC simd_float8 __tg_sin(simd_float8 x); +/*! @abstract Do not call this function; instead use `sin` in C and + * Objective-C, and `simd::sin` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_sin(simd_float16 x); +/*! @abstract Do not call this function; instead use `sin` in C and + * Objective-C, and `simd::sin` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_sin(simd_double2 x); +/*! @abstract Do not call this function; instead use `sin` in C and + * Objective-C, and `simd::sin` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_sin(simd_double3 x); +/*! @abstract Do not call this function; instead use `sin` in C and + * Objective-C, and `simd::sin` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_sin(simd_double4 x); +/*! @abstract Do not call this function; instead use `sin` in C and + * Objective-C, and `simd::sin` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_sin(simd_double8 x); + +/*! @abstract Do not call this function; instead use `tan` in C and + * Objective-C, and `simd::tan` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_tan(simd_float2 x); +/*! @abstract Do not call this function; instead use `tan` in C and + * Objective-C, and `simd::tan` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_tan(simd_float3 x); +/*! @abstract Do not call this function; instead use `tan` in C and + * Objective-C, and `simd::tan` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_tan(simd_float4 x); +/*! @abstract Do not call this function; instead use `tan` in C and + * Objective-C, and `simd::tan` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_tan(simd_float8 x); +/*! @abstract Do not call this function; instead use `tan` in C and + * Objective-C, and `simd::tan` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_tan(simd_float16 x); +/*! @abstract Do not call this function; instead use `tan` in C and + * Objective-C, and `simd::tan` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_tan(simd_double2 x); +/*! @abstract Do not call this function; instead use `tan` in C and + * Objective-C, and `simd::tan` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_tan(simd_double3 x); +/*! @abstract Do not call this function; instead use `tan` in C and + * Objective-C, and `simd::tan` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_tan(simd_double4 x); +/*! @abstract Do not call this function; instead use `tan` in C and + * Objective-C, and `simd::tan` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_tan(simd_double8 x); + +#if SIMD_LIBRARY_VERSION >= 1 +/*! @abstract Do not call this function; instead use `cospi` in C and + * Objective-C, and `simd::cospi` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_cospi(simd_float2 x); +/*! @abstract Do not call this function; instead use `cospi` in C and + * Objective-C, and `simd::cospi` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_cospi(simd_float3 x); +/*! @abstract Do not call this function; instead use `cospi` in C and + * Objective-C, and `simd::cospi` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_cospi(simd_float4 x); +/*! @abstract Do not call this function; instead use `cospi` in C and + * Objective-C, and `simd::cospi` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_cospi(simd_float8 x); +/*! @abstract Do not call this function; instead use `cospi` in C and + * Objective-C, and `simd::cospi` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_cospi(simd_float16 x); +/*! 
@abstract Do not call this function; instead use `cospi` in C and + * Objective-C, and `simd::cospi` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_cospi(simd_double2 x); +/*! @abstract Do not call this function; instead use `cospi` in C and + * Objective-C, and `simd::cospi` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_cospi(simd_double3 x); +/*! @abstract Do not call this function; instead use `cospi` in C and + * Objective-C, and `simd::cospi` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_cospi(simd_double4 x); +/*! @abstract Do not call this function; instead use `cospi` in C and + * Objective-C, and `simd::cospi` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_cospi(simd_double8 x); +#endif + +#if SIMD_LIBRARY_VERSION >= 1 +/*! @abstract Do not call this function; instead use `sinpi` in C and + * Objective-C, and `simd::sinpi` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_sinpi(simd_float2 x); +/*! @abstract Do not call this function; instead use `sinpi` in C and + * Objective-C, and `simd::sinpi` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_sinpi(simd_float3 x); +/*! @abstract Do not call this function; instead use `sinpi` in C and + * Objective-C, and `simd::sinpi` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_sinpi(simd_float4 x); +/*! @abstract Do not call this function; instead use `sinpi` in C and + * Objective-C, and `simd::sinpi` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_sinpi(simd_float8 x); +/*! @abstract Do not call this function; instead use `sinpi` in C and + * Objective-C, and `simd::sinpi` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_sinpi(simd_float16 x); +/*! @abstract Do not call this function; instead use `sinpi` in C and + * Objective-C, and `simd::sinpi` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_sinpi(simd_double2 x); +/*! @abstract Do not call this function; instead use `sinpi` in C and + * Objective-C, and `simd::sinpi` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_sinpi(simd_double3 x); +/*! @abstract Do not call this function; instead use `sinpi` in C and + * Objective-C, and `simd::sinpi` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_sinpi(simd_double4 x); +/*! @abstract Do not call this function; instead use `sinpi` in C and + * Objective-C, and `simd::sinpi` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_sinpi(simd_double8 x); +#endif + +#if SIMD_LIBRARY_VERSION >= 1 +/*! @abstract Do not call this function; instead use `tanpi` in C and + * Objective-C, and `simd::tanpi` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_tanpi(simd_float2 x); +/*! @abstract Do not call this function; instead use `tanpi` in C and + * Objective-C, and `simd::tanpi` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_tanpi(simd_float3 x); +/*! @abstract Do not call this function; instead use `tanpi` in C and + * Objective-C, and `simd::tanpi` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_tanpi(simd_float4 x); +/*! @abstract Do not call this function; instead use `tanpi` in C and + * Objective-C, and `simd::tanpi` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_tanpi(simd_float8 x); +/*! @abstract Do not call this function; instead use `tanpi` in C and + * Objective-C, and `simd::tanpi` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_tanpi(simd_float16 x); +/*! @abstract Do not call this function; instead use `tanpi` in C and + * Objective-C, and `simd::tanpi` in C++. 
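+ *
+ * Editorial note (not part of the original header): cospi(x), sinpi(x),
+ * and tanpi(x) compute cos(pi*x), sin(pi*x), and tan(pi*x) without first
+ * forming the rounded product pi*x, so they are exact where the
+ * mathematical result is exact. A sketch, assuming SIMD_LIBRARY_VERSION
+ * is at least 1:
+ *
+ *     simd_double2 x = { 0.5, 1.0 };
+ *     simd_double2 s = sinpi(x);    // { 1.0, 0.0 }: sin(pi/2), sin(pi)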
*/ +static inline SIMD_CFUNC simd_double2 __tg_tanpi(simd_double2 x); +/*! @abstract Do not call this function; instead use `tanpi` in C and + * Objective-C, and `simd::tanpi` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_tanpi(simd_double3 x); +/*! @abstract Do not call this function; instead use `tanpi` in C and + * Objective-C, and `simd::tanpi` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_tanpi(simd_double4 x); +/*! @abstract Do not call this function; instead use `tanpi` in C and + * Objective-C, and `simd::tanpi` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_tanpi(simd_double8 x); +#endif + +/*! @abstract Do not call this function; instead use `acosh` in C and + * Objective-C, and `simd::acosh` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_acosh(simd_float2 x); +/*! @abstract Do not call this function; instead use `acosh` in C and + * Objective-C, and `simd::acosh` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_acosh(simd_float3 x); +/*! @abstract Do not call this function; instead use `acosh` in C and + * Objective-C, and `simd::acosh` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_acosh(simd_float4 x); +/*! @abstract Do not call this function; instead use `acosh` in C and + * Objective-C, and `simd::acosh` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_acosh(simd_float8 x); +/*! @abstract Do not call this function; instead use `acosh` in C and + * Objective-C, and `simd::acosh` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_acosh(simd_float16 x); +/*! @abstract Do not call this function; instead use `acosh` in C and + * Objective-C, and `simd::acosh` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_acosh(simd_double2 x); +/*! @abstract Do not call this function; instead use `acosh` in C and + * Objective-C, and `simd::acosh` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_acosh(simd_double3 x); +/*! @abstract Do not call this function; instead use `acosh` in C and + * Objective-C, and `simd::acosh` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_acosh(simd_double4 x); +/*! @abstract Do not call this function; instead use `acosh` in C and + * Objective-C, and `simd::acosh` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_acosh(simd_double8 x); + +/*! @abstract Do not call this function; instead use `asinh` in C and + * Objective-C, and `simd::asinh` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_asinh(simd_float2 x); +/*! @abstract Do not call this function; instead use `asinh` in C and + * Objective-C, and `simd::asinh` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_asinh(simd_float3 x); +/*! @abstract Do not call this function; instead use `asinh` in C and + * Objective-C, and `simd::asinh` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_asinh(simd_float4 x); +/*! @abstract Do not call this function; instead use `asinh` in C and + * Objective-C, and `simd::asinh` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_asinh(simd_float8 x); +/*! @abstract Do not call this function; instead use `asinh` in C and + * Objective-C, and `simd::asinh` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_asinh(simd_float16 x); +/*! @abstract Do not call this function; instead use `asinh` in C and + * Objective-C, and `simd::asinh` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_asinh(simd_double2 x); +/*! @abstract Do not call this function; instead use `asinh` in C and + * Objective-C, and `simd::asinh` in C++. 
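+ *
+ * Editorial note (not part of the original header): the inverse hyperbolic
+ * functions keep their scalar domains per lane -- acosh needs x >= 1, and
+ * atanh is finite only for |x| < 1 -- so an out-of-domain input produces a
+ * NaN in that lane alone:
+ *
+ *     simd_double3 v = { 0.5, 1.0, 2.0 };
+ *     simd_double3 a = acosh(v);    // { NaN, 0.0, acosh(2.0) }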
*/ +static inline SIMD_CFUNC simd_double3 __tg_asinh(simd_double3 x); +/*! @abstract Do not call this function; instead use `asinh` in C and + * Objective-C, and `simd::asinh` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_asinh(simd_double4 x); +/*! @abstract Do not call this function; instead use `asinh` in C and + * Objective-C, and `simd::asinh` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_asinh(simd_double8 x); + +/*! @abstract Do not call this function; instead use `atanh` in C and + * Objective-C, and `simd::atanh` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_atanh(simd_float2 x); +/*! @abstract Do not call this function; instead use `atanh` in C and + * Objective-C, and `simd::atanh` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_atanh(simd_float3 x); +/*! @abstract Do not call this function; instead use `atanh` in C and + * Objective-C, and `simd::atanh` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_atanh(simd_float4 x); +/*! @abstract Do not call this function; instead use `atanh` in C and + * Objective-C, and `simd::atanh` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_atanh(simd_float8 x); +/*! @abstract Do not call this function; instead use `atanh` in C and + * Objective-C, and `simd::atanh` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_atanh(simd_float16 x); +/*! @abstract Do not call this function; instead use `atanh` in C and + * Objective-C, and `simd::atanh` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_atanh(simd_double2 x); +/*! @abstract Do not call this function; instead use `atanh` in C and + * Objective-C, and `simd::atanh` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_atanh(simd_double3 x); +/*! @abstract Do not call this function; instead use `atanh` in C and + * Objective-C, and `simd::atanh` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_atanh(simd_double4 x); +/*! @abstract Do not call this function; instead use `atanh` in C and + * Objective-C, and `simd::atanh` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_atanh(simd_double8 x); + +/*! @abstract Do not call this function; instead use `cosh` in C and + * Objective-C, and `simd::cosh` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_cosh(simd_float2 x); +/*! @abstract Do not call this function; instead use `cosh` in C and + * Objective-C, and `simd::cosh` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_cosh(simd_float3 x); +/*! @abstract Do not call this function; instead use `cosh` in C and + * Objective-C, and `simd::cosh` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_cosh(simd_float4 x); +/*! @abstract Do not call this function; instead use `cosh` in C and + * Objective-C, and `simd::cosh` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_cosh(simd_float8 x); +/*! @abstract Do not call this function; instead use `cosh` in C and + * Objective-C, and `simd::cosh` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_cosh(simd_float16 x); +/*! @abstract Do not call this function; instead use `cosh` in C and + * Objective-C, and `simd::cosh` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_cosh(simd_double2 x); +/*! @abstract Do not call this function; instead use `cosh` in C and + * Objective-C, and `simd::cosh` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_cosh(simd_double3 x); +/*! @abstract Do not call this function; instead use `cosh` in C and + * Objective-C, and `simd::cosh` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_cosh(simd_double4 x); +/*! 
@abstract Do not call this function; instead use `cosh` in C and + * Objective-C, and `simd::cosh` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_cosh(simd_double8 x); + +/*! @abstract Do not call this function; instead use `sinh` in C and + * Objective-C, and `simd::sinh` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_sinh(simd_float2 x); +/*! @abstract Do not call this function; instead use `sinh` in C and + * Objective-C, and `simd::sinh` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_sinh(simd_float3 x); +/*! @abstract Do not call this function; instead use `sinh` in C and + * Objective-C, and `simd::sinh` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_sinh(simd_float4 x); +/*! @abstract Do not call this function; instead use `sinh` in C and + * Objective-C, and `simd::sinh` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_sinh(simd_float8 x); +/*! @abstract Do not call this function; instead use `sinh` in C and + * Objective-C, and `simd::sinh` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_sinh(simd_float16 x); +/*! @abstract Do not call this function; instead use `sinh` in C and + * Objective-C, and `simd::sinh` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_sinh(simd_double2 x); +/*! @abstract Do not call this function; instead use `sinh` in C and + * Objective-C, and `simd::sinh` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_sinh(simd_double3 x); +/*! @abstract Do not call this function; instead use `sinh` in C and + * Objective-C, and `simd::sinh` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_sinh(simd_double4 x); +/*! @abstract Do not call this function; instead use `sinh` in C and + * Objective-C, and `simd::sinh` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_sinh(simd_double8 x); + +/*! @abstract Do not call this function; instead use `tanh` in C and + * Objective-C, and `simd::tanh` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_tanh(simd_float2 x); +/*! @abstract Do not call this function; instead use `tanh` in C and + * Objective-C, and `simd::tanh` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_tanh(simd_float3 x); +/*! @abstract Do not call this function; instead use `tanh` in C and + * Objective-C, and `simd::tanh` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_tanh(simd_float4 x); +/*! @abstract Do not call this function; instead use `tanh` in C and + * Objective-C, and `simd::tanh` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_tanh(simd_float8 x); +/*! @abstract Do not call this function; instead use `tanh` in C and + * Objective-C, and `simd::tanh` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_tanh(simd_float16 x); +/*! @abstract Do not call this function; instead use `tanh` in C and + * Objective-C, and `simd::tanh` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_tanh(simd_double2 x); +/*! @abstract Do not call this function; instead use `tanh` in C and + * Objective-C, and `simd::tanh` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_tanh(simd_double3 x); +/*! @abstract Do not call this function; instead use `tanh` in C and + * Objective-C, and `simd::tanh` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_tanh(simd_double4 x); +/*! @abstract Do not call this function; instead use `tanh` in C and + * Objective-C, and `simd::tanh` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_tanh(simd_double8 x); + +/*! @abstract Do not call this function; instead use `exp` in C and + * Objective-C, and `simd::exp` in C++. 
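+ *
+ * Editorial sketch (not part of the original header): exp, exp2, and (when
+ * SIMD_LIBRARY_VERSION >= 1) exp10 apply e**x, 2**x, and 10**x per lane:
+ *
+ *     simd_float2 x = { 0.0f, 1.0f };
+ *     simd_float2 e = exp(x);    // { 1.0f, 2.7182817f }
+ *     simd_float2 p = exp2(x);   // { 1.0f, 2.0f }, exact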
*/ +static inline SIMD_CFUNC simd_float2 __tg_exp(simd_float2 x); +/*! @abstract Do not call this function; instead use `exp` in C and + * Objective-C, and `simd::exp` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_exp(simd_float3 x); +/*! @abstract Do not call this function; instead use `exp` in C and + * Objective-C, and `simd::exp` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_exp(simd_float4 x); +/*! @abstract Do not call this function; instead use `exp` in C and + * Objective-C, and `simd::exp` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_exp(simd_float8 x); +/*! @abstract Do not call this function; instead use `exp` in C and + * Objective-C, and `simd::exp` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_exp(simd_float16 x); +/*! @abstract Do not call this function; instead use `exp` in C and + * Objective-C, and `simd::exp` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_exp(simd_double2 x); +/*! @abstract Do not call this function; instead use `exp` in C and + * Objective-C, and `simd::exp` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_exp(simd_double3 x); +/*! @abstract Do not call this function; instead use `exp` in C and + * Objective-C, and `simd::exp` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_exp(simd_double4 x); +/*! @abstract Do not call this function; instead use `exp` in C and + * Objective-C, and `simd::exp` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_exp(simd_double8 x); + +/*! @abstract Do not call this function; instead use `exp2` in C and + * Objective-C, and `simd::exp2` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_exp2(simd_float2 x); +/*! @abstract Do not call this function; instead use `exp2` in C and + * Objective-C, and `simd::exp2` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_exp2(simd_float3 x); +/*! @abstract Do not call this function; instead use `exp2` in C and + * Objective-C, and `simd::exp2` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_exp2(simd_float4 x); +/*! @abstract Do not call this function; instead use `exp2` in C and + * Objective-C, and `simd::exp2` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_exp2(simd_float8 x); +/*! @abstract Do not call this function; instead use `exp2` in C and + * Objective-C, and `simd::exp2` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_exp2(simd_float16 x); +/*! @abstract Do not call this function; instead use `exp2` in C and + * Objective-C, and `simd::exp2` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_exp2(simd_double2 x); +/*! @abstract Do not call this function; instead use `exp2` in C and + * Objective-C, and `simd::exp2` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_exp2(simd_double3 x); +/*! @abstract Do not call this function; instead use `exp2` in C and + * Objective-C, and `simd::exp2` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_exp2(simd_double4 x); +/*! @abstract Do not call this function; instead use `exp2` in C and + * Objective-C, and `simd::exp2` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_exp2(simd_double8 x); + +#if SIMD_LIBRARY_VERSION >= 1 +/*! @abstract Do not call this function; instead use `exp10` in C and + * Objective-C, and `simd::exp10` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_exp10(simd_float2 x); +/*! @abstract Do not call this function; instead use `exp10` in C and + * Objective-C, and `simd::exp10` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_exp10(simd_float3 x); +/*! 
@abstract Do not call this function; instead use `exp10` in C and + * Objective-C, and `simd::exp10` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_exp10(simd_float4 x); +/*! @abstract Do not call this function; instead use `exp10` in C and + * Objective-C, and `simd::exp10` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_exp10(simd_float8 x); +/*! @abstract Do not call this function; instead use `exp10` in C and + * Objective-C, and `simd::exp10` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_exp10(simd_float16 x); +/*! @abstract Do not call this function; instead use `exp10` in C and + * Objective-C, and `simd::exp10` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_exp10(simd_double2 x); +/*! @abstract Do not call this function; instead use `exp10` in C and + * Objective-C, and `simd::exp10` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_exp10(simd_double3 x); +/*! @abstract Do not call this function; instead use `exp10` in C and + * Objective-C, and `simd::exp10` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_exp10(simd_double4 x); +/*! @abstract Do not call this function; instead use `exp10` in C and + * Objective-C, and `simd::exp10` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_exp10(simd_double8 x); +#endif + +/*! @abstract Do not call this function; instead use `expm1` in C and + * Objective-C, and `simd::expm1` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_expm1(simd_float2 x); +/*! @abstract Do not call this function; instead use `expm1` in C and + * Objective-C, and `simd::expm1` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_expm1(simd_float3 x); +/*! @abstract Do not call this function; instead use `expm1` in C and + * Objective-C, and `simd::expm1` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_expm1(simd_float4 x); +/*! @abstract Do not call this function; instead use `expm1` in C and + * Objective-C, and `simd::expm1` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_expm1(simd_float8 x); +/*! @abstract Do not call this function; instead use `expm1` in C and + * Objective-C, and `simd::expm1` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_expm1(simd_float16 x); +/*! @abstract Do not call this function; instead use `expm1` in C and + * Objective-C, and `simd::expm1` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_expm1(simd_double2 x); +/*! @abstract Do not call this function; instead use `expm1` in C and + * Objective-C, and `simd::expm1` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_expm1(simd_double3 x); +/*! @abstract Do not call this function; instead use `expm1` in C and + * Objective-C, and `simd::expm1` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_expm1(simd_double4 x); +/*! @abstract Do not call this function; instead use `expm1` in C and + * Objective-C, and `simd::expm1` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_expm1(simd_double8 x); + +/*! @abstract Do not call this function; instead use `log` in C and + * Objective-C, and `simd::log` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_log(simd_float2 x); +/*! @abstract Do not call this function; instead use `log` in C and + * Objective-C, and `simd::log` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_log(simd_float3 x); +/*! @abstract Do not call this function; instead use `log` in C and + * Objective-C, and `simd::log` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_log(simd_float4 x); +/*! 
@abstract Do not call this function; instead use `log` in C and + * Objective-C, and `simd::log` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_log(simd_float8 x); +/*! @abstract Do not call this function; instead use `log` in C and + * Objective-C, and `simd::log` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_log(simd_float16 x); +/*! @abstract Do not call this function; instead use `log` in C and + * Objective-C, and `simd::log` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_log(simd_double2 x); +/*! @abstract Do not call this function; instead use `log` in C and + * Objective-C, and `simd::log` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_log(simd_double3 x); +/*! @abstract Do not call this function; instead use `log` in C and + * Objective-C, and `simd::log` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_log(simd_double4 x); +/*! @abstract Do not call this function; instead use `log` in C and + * Objective-C, and `simd::log` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_log(simd_double8 x); + +/*! @abstract Do not call this function; instead use `log2` in C and + * Objective-C, and `simd::log2` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_log2(simd_float2 x); +/*! @abstract Do not call this function; instead use `log2` in C and + * Objective-C, and `simd::log2` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_log2(simd_float3 x); +/*! @abstract Do not call this function; instead use `log2` in C and + * Objective-C, and `simd::log2` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_log2(simd_float4 x); +/*! @abstract Do not call this function; instead use `log2` in C and + * Objective-C, and `simd::log2` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_log2(simd_float8 x); +/*! @abstract Do not call this function; instead use `log2` in C and + * Objective-C, and `simd::log2` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_log2(simd_float16 x); +/*! @abstract Do not call this function; instead use `log2` in C and + * Objective-C, and `simd::log2` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_log2(simd_double2 x); +/*! @abstract Do not call this function; instead use `log2` in C and + * Objective-C, and `simd::log2` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_log2(simd_double3 x); +/*! @abstract Do not call this function; instead use `log2` in C and + * Objective-C, and `simd::log2` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_log2(simd_double4 x); +/*! @abstract Do not call this function; instead use `log2` in C and + * Objective-C, and `simd::log2` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_log2(simd_double8 x); + +/*! @abstract Do not call this function; instead use `log10` in C and + * Objective-C, and `simd::log10` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_log10(simd_float2 x); +/*! @abstract Do not call this function; instead use `log10` in C and + * Objective-C, and `simd::log10` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_log10(simd_float3 x); +/*! @abstract Do not call this function; instead use `log10` in C and + * Objective-C, and `simd::log10` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_log10(simd_float4 x); +/*! @abstract Do not call this function; instead use `log10` in C and + * Objective-C, and `simd::log10` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_log10(simd_float8 x); +/*! @abstract Do not call this function; instead use `log10` in C and + * Objective-C, and `simd::log10` in C++. 
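+ *
+ * Editorial note (not part of the original header): for arguments near
+ * zero, prefer log1p(x) over log(1+x); the addition 1+x discards the low
+ * bits of a tiny x before the logarithm is taken:
+ *
+ *     simd_float2 t = { 1e-8f, 1e-9f };
+ *     simd_float2 good = log1p(t);      // ~= t, tiny relative error
+ *     simd_float2 bad  = log(1.0f + t); // 1+t rounds to 1, result is 0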
*/ +static inline SIMD_CFUNC simd_float16 __tg_log10(simd_float16 x); +/*! @abstract Do not call this function; instead use `log10` in C and + * Objective-C, and `simd::log10` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_log10(simd_double2 x); +/*! @abstract Do not call this function; instead use `log10` in C and + * Objective-C, and `simd::log10` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_log10(simd_double3 x); +/*! @abstract Do not call this function; instead use `log10` in C and + * Objective-C, and `simd::log10` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_log10(simd_double4 x); +/*! @abstract Do not call this function; instead use `log10` in C and + * Objective-C, and `simd::log10` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_log10(simd_double8 x); + +/*! @abstract Do not call this function; instead use `log1p` in C and + * Objective-C, and `simd::log1p` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_log1p(simd_float2 x); +/*! @abstract Do not call this function; instead use `log1p` in C and + * Objective-C, and `simd::log1p` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_log1p(simd_float3 x); +/*! @abstract Do not call this function; instead use `log1p` in C and + * Objective-C, and `simd::log1p` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_log1p(simd_float4 x); +/*! @abstract Do not call this function; instead use `log1p` in C and + * Objective-C, and `simd::log1p` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_log1p(simd_float8 x); +/*! @abstract Do not call this function; instead use `log1p` in C and + * Objective-C, and `simd::log1p` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_log1p(simd_float16 x); +/*! @abstract Do not call this function; instead use `log1p` in C and + * Objective-C, and `simd::log1p` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_log1p(simd_double2 x); +/*! @abstract Do not call this function; instead use `log1p` in C and + * Objective-C, and `simd::log1p` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_log1p(simd_double3 x); +/*! @abstract Do not call this function; instead use `log1p` in C and + * Objective-C, and `simd::log1p` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_log1p(simd_double4 x); +/*! @abstract Do not call this function; instead use `log1p` in C and + * Objective-C, and `simd::log1p` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_log1p(simd_double8 x); + +/*! @abstract Do not call this function; instead use `fabs` in C and + * Objective-C, and `simd::fabs` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_fabs(simd_float2 x); +/*! @abstract Do not call this function; instead use `fabs` in C and + * Objective-C, and `simd::fabs` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_fabs(simd_float3 x); +/*! @abstract Do not call this function; instead use `fabs` in C and + * Objective-C, and `simd::fabs` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_fabs(simd_float4 x); +/*! @abstract Do not call this function; instead use `fabs` in C and + * Objective-C, and `simd::fabs` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_fabs(simd_float8 x); +/*! @abstract Do not call this function; instead use `fabs` in C and + * Objective-C, and `simd::fabs` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_fabs(simd_float16 x); +/*! @abstract Do not call this function; instead use `fabs` in C and + * Objective-C, and `simd::fabs` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_fabs(simd_double2 x); +/*! 
@abstract Do not call this function; instead use `fabs` in C and + * Objective-C, and `simd::fabs` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_fabs(simd_double3 x); +/*! @abstract Do not call this function; instead use `fabs` in C and + * Objective-C, and `simd::fabs` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_fabs(simd_double4 x); +/*! @abstract Do not call this function; instead use `fabs` in C and + * Objective-C, and `simd::fabs` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_fabs(simd_double8 x); + +/*! @abstract Do not call this function; instead use `cbrt` in C and + * Objective-C, and `simd::cbrt` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_cbrt(simd_float2 x); +/*! @abstract Do not call this function; instead use `cbrt` in C and + * Objective-C, and `simd::cbrt` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_cbrt(simd_float3 x); +/*! @abstract Do not call this function; instead use `cbrt` in C and + * Objective-C, and `simd::cbrt` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_cbrt(simd_float4 x); +/*! @abstract Do not call this function; instead use `cbrt` in C and + * Objective-C, and `simd::cbrt` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_cbrt(simd_float8 x); +/*! @abstract Do not call this function; instead use `cbrt` in C and + * Objective-C, and `simd::cbrt` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_cbrt(simd_float16 x); +/*! @abstract Do not call this function; instead use `cbrt` in C and + * Objective-C, and `simd::cbrt` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_cbrt(simd_double2 x); +/*! @abstract Do not call this function; instead use `cbrt` in C and + * Objective-C, and `simd::cbrt` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_cbrt(simd_double3 x); +/*! @abstract Do not call this function; instead use `cbrt` in C and + * Objective-C, and `simd::cbrt` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_cbrt(simd_double4 x); +/*! @abstract Do not call this function; instead use `cbrt` in C and + * Objective-C, and `simd::cbrt` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_cbrt(simd_double8 x); + +/*! @abstract Do not call this function; instead use `sqrt` in C and + * Objective-C, and `simd::sqrt` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_sqrt(simd_float2 x); +/*! @abstract Do not call this function; instead use `sqrt` in C and + * Objective-C, and `simd::sqrt` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_sqrt(simd_float3 x); +/*! @abstract Do not call this function; instead use `sqrt` in C and + * Objective-C, and `simd::sqrt` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_sqrt(simd_float4 x); +/*! @abstract Do not call this function; instead use `sqrt` in C and + * Objective-C, and `simd::sqrt` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_sqrt(simd_float8 x); +/*! @abstract Do not call this function; instead use `sqrt` in C and + * Objective-C, and `simd::sqrt` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_sqrt(simd_float16 x); +/*! @abstract Do not call this function; instead use `sqrt` in C and + * Objective-C, and `simd::sqrt` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_sqrt(simd_double2 x); +/*! @abstract Do not call this function; instead use `sqrt` in C and + * Objective-C, and `simd::sqrt` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_sqrt(simd_double3 x); +/*! @abstract Do not call this function; instead use `sqrt` in C and + * Objective-C, and `simd::sqrt` in C++. 
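+ *
+ * Editorial sketch (not part of the original header): sqrt operates per
+ * lane, and a negative input yields NaN only in its own lane:
+ *
+ *     simd_double4 v = { 4.0, 0.25, -1.0, 2.0 };
+ *     simd_double4 r = sqrt(v);     // { 2.0, 0.5, NaN, sqrt(2.0) }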
*/ +static inline SIMD_CFUNC simd_double4 __tg_sqrt(simd_double4 x); +/*! @abstract Do not call this function; instead use `sqrt` in C and + * Objective-C, and `simd::sqrt` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_sqrt(simd_double8 x); + +/*! @abstract Do not call this function; instead use `erf` in C and + * Objective-C, and `simd::erf` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_erf(simd_float2 x); +/*! @abstract Do not call this function; instead use `erf` in C and + * Objective-C, and `simd::erf` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_erf(simd_float3 x); +/*! @abstract Do not call this function; instead use `erf` in C and + * Objective-C, and `simd::erf` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_erf(simd_float4 x); +/*! @abstract Do not call this function; instead use `erf` in C and + * Objective-C, and `simd::erf` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_erf(simd_float8 x); +/*! @abstract Do not call this function; instead use `erf` in C and + * Objective-C, and `simd::erf` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_erf(simd_float16 x); +/*! @abstract Do not call this function; instead use `erf` in C and + * Objective-C, and `simd::erf` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_erf(simd_double2 x); +/*! @abstract Do not call this function; instead use `erf` in C and + * Objective-C, and `simd::erf` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_erf(simd_double3 x); +/*! @abstract Do not call this function; instead use `erf` in C and + * Objective-C, and `simd::erf` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_erf(simd_double4 x); +/*! @abstract Do not call this function; instead use `erf` in C and + * Objective-C, and `simd::erf` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_erf(simd_double8 x); + +/*! @abstract Do not call this function; instead use `erfc` in C and + * Objective-C, and `simd::erfc` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_erfc(simd_float2 x); +/*! @abstract Do not call this function; instead use `erfc` in C and + * Objective-C, and `simd::erfc` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_erfc(simd_float3 x); +/*! @abstract Do not call this function; instead use `erfc` in C and + * Objective-C, and `simd::erfc` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_erfc(simd_float4 x); +/*! @abstract Do not call this function; instead use `erfc` in C and + * Objective-C, and `simd::erfc` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_erfc(simd_float8 x); +/*! @abstract Do not call this function; instead use `erfc` in C and + * Objective-C, and `simd::erfc` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_erfc(simd_float16 x); +/*! @abstract Do not call this function; instead use `erfc` in C and + * Objective-C, and `simd::erfc` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_erfc(simd_double2 x); +/*! @abstract Do not call this function; instead use `erfc` in C and + * Objective-C, and `simd::erfc` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_erfc(simd_double3 x); +/*! @abstract Do not call this function; instead use `erfc` in C and + * Objective-C, and `simd::erfc` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_erfc(simd_double4 x); +/*! @abstract Do not call this function; instead use `erfc` in C and + * Objective-C, and `simd::erfc` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_erfc(simd_double8 x); + +/*! 
@abstract Do not call this function; instead use `tgamma` in C and + * Objective-C, and `simd::tgamma` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_tgamma(simd_float2 x); +/*! @abstract Do not call this function; instead use `tgamma` in C and + * Objective-C, and `simd::tgamma` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_tgamma(simd_float3 x); +/*! @abstract Do not call this function; instead use `tgamma` in C and + * Objective-C, and `simd::tgamma` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_tgamma(simd_float4 x); +/*! @abstract Do not call this function; instead use `tgamma` in C and + * Objective-C, and `simd::tgamma` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_tgamma(simd_float8 x); +/*! @abstract Do not call this function; instead use `tgamma` in C and + * Objective-C, and `simd::tgamma` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_tgamma(simd_float16 x); +/*! @abstract Do not call this function; instead use `tgamma` in C and + * Objective-C, and `simd::tgamma` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_tgamma(simd_double2 x); +/*! @abstract Do not call this function; instead use `tgamma` in C and + * Objective-C, and `simd::tgamma` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_tgamma(simd_double3 x); +/*! @abstract Do not call this function; instead use `tgamma` in C and + * Objective-C, and `simd::tgamma` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_tgamma(simd_double4 x); +/*! @abstract Do not call this function; instead use `tgamma` in C and + * Objective-C, and `simd::tgamma` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_tgamma(simd_double8 x); + +/*! @abstract Do not call this function; instead use `ceil` in C and + * Objective-C, and `simd::ceil` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_ceil(simd_float2 x); +/*! @abstract Do not call this function; instead use `ceil` in C and + * Objective-C, and `simd::ceil` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_ceil(simd_float3 x); +/*! @abstract Do not call this function; instead use `ceil` in C and + * Objective-C, and `simd::ceil` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_ceil(simd_float4 x); +/*! @abstract Do not call this function; instead use `ceil` in C and + * Objective-C, and `simd::ceil` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_ceil(simd_float8 x); +/*! @abstract Do not call this function; instead use `ceil` in C and + * Objective-C, and `simd::ceil` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_ceil(simd_float16 x); +/*! @abstract Do not call this function; instead use `ceil` in C and + * Objective-C, and `simd::ceil` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_ceil(simd_double2 x); +/*! @abstract Do not call this function; instead use `ceil` in C and + * Objective-C, and `simd::ceil` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_ceil(simd_double3 x); +/*! @abstract Do not call this function; instead use `ceil` in C and + * Objective-C, and `simd::ceil` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_ceil(simd_double4 x); +/*! @abstract Do not call this function; instead use `ceil` in C and + * Objective-C, and `simd::ceil` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_ceil(simd_double8 x); + +/*! @abstract Do not call this function; instead use `floor` in C and + * Objective-C, and `simd::floor` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_floor(simd_float2 x); +/*! 
@abstract Do not call this function; instead use `floor` in C and + * Objective-C, and `simd::floor` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_floor(simd_float3 x); +/*! @abstract Do not call this function; instead use `floor` in C and + * Objective-C, and `simd::floor` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_floor(simd_float4 x); +/*! @abstract Do not call this function; instead use `floor` in C and + * Objective-C, and `simd::floor` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_floor(simd_float8 x); +/*! @abstract Do not call this function; instead use `floor` in C and + * Objective-C, and `simd::floor` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_floor(simd_float16 x); +/*! @abstract Do not call this function; instead use `floor` in C and + * Objective-C, and `simd::floor` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_floor(simd_double2 x); +/*! @abstract Do not call this function; instead use `floor` in C and + * Objective-C, and `simd::floor` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_floor(simd_double3 x); +/*! @abstract Do not call this function; instead use `floor` in C and + * Objective-C, and `simd::floor` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_floor(simd_double4 x); +/*! @abstract Do not call this function; instead use `floor` in C and + * Objective-C, and `simd::floor` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_floor(simd_double8 x); + +/*! @abstract Do not call this function; instead use `rint` in C and + * Objective-C, and `simd::rint` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_rint(simd_float2 x); +/*! @abstract Do not call this function; instead use `rint` in C and + * Objective-C, and `simd::rint` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_rint(simd_float3 x); +/*! @abstract Do not call this function; instead use `rint` in C and + * Objective-C, and `simd::rint` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_rint(simd_float4 x); +/*! @abstract Do not call this function; instead use `rint` in C and + * Objective-C, and `simd::rint` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_rint(simd_float8 x); +/*! @abstract Do not call this function; instead use `rint` in C and + * Objective-C, and `simd::rint` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_rint(simd_float16 x); +/*! @abstract Do not call this function; instead use `rint` in C and + * Objective-C, and `simd::rint` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_rint(simd_double2 x); +/*! @abstract Do not call this function; instead use `rint` in C and + * Objective-C, and `simd::rint` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_rint(simd_double3 x); +/*! @abstract Do not call this function; instead use `rint` in C and + * Objective-C, and `simd::rint` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_rint(simd_double4 x); +/*! @abstract Do not call this function; instead use `rint` in C and + * Objective-C, and `simd::rint` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_rint(simd_double8 x); + +/*! @abstract Do not call this function; instead use `round` in C and + * Objective-C, and `simd::round` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_round(simd_float2 x); +/*! @abstract Do not call this function; instead use `round` in C and + * Objective-C, and `simd::round` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_round(simd_float3 x); +/*! 
@abstract Do not call this function; instead use `round` in C and + * Objective-C, and `simd::round` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_round(simd_float4 x); +/*! @abstract Do not call this function; instead use `round` in C and + * Objective-C, and `simd::round` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_round(simd_float8 x); +/*! @abstract Do not call this function; instead use `round` in C and + * Objective-C, and `simd::round` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_round(simd_float16 x); +/*! @abstract Do not call this function; instead use `round` in C and + * Objective-C, and `simd::round` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_round(simd_double2 x); +/*! @abstract Do not call this function; instead use `round` in C and + * Objective-C, and `simd::round` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_round(simd_double3 x); +/*! @abstract Do not call this function; instead use `round` in C and + * Objective-C, and `simd::round` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_round(simd_double4 x); +/*! @abstract Do not call this function; instead use `round` in C and + * Objective-C, and `simd::round` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_round(simd_double8 x); + +/*! @abstract Do not call this function; instead use `trunc` in C and + * Objective-C, and `simd::trunc` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_trunc(simd_float2 x); +/*! @abstract Do not call this function; instead use `trunc` in C and + * Objective-C, and `simd::trunc` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_trunc(simd_float3 x); +/*! @abstract Do not call this function; instead use `trunc` in C and + * Objective-C, and `simd::trunc` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_trunc(simd_float4 x); +/*! @abstract Do not call this function; instead use `trunc` in C and + * Objective-C, and `simd::trunc` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_trunc(simd_float8 x); +/*! @abstract Do not call this function; instead use `trunc` in C and + * Objective-C, and `simd::trunc` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_trunc(simd_float16 x); +/*! @abstract Do not call this function; instead use `trunc` in C and + * Objective-C, and `simd::trunc` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_trunc(simd_double2 x); +/*! @abstract Do not call this function; instead use `trunc` in C and + * Objective-C, and `simd::trunc` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_trunc(simd_double3 x); +/*! @abstract Do not call this function; instead use `trunc` in C and + * Objective-C, and `simd::trunc` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_trunc(simd_double4 x); +/*! @abstract Do not call this function; instead use `trunc` in C and + * Objective-C, and `simd::trunc` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_trunc(simd_double8 x); + + +/*! @abstract Do not call this function; instead use `atan2` in C and + * Objective-C, and `simd::atan2` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_atan2(simd_float2 y, simd_float2 x); +/*! @abstract Do not call this function; instead use `atan2` in C and + * Objective-C, and `simd::atan2` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_atan2(simd_float3 y, simd_float3 x); +/*! @abstract Do not call this function; instead use `atan2` in C and + * Objective-C, and `simd::atan2` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_atan2(simd_float4 y, simd_float4 x); +/*! 
@abstract Do not call this function; instead use `atan2` in C and + * Objective-C, and `simd::atan2` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_atan2(simd_float8 y, simd_float8 x); +/*! @abstract Do not call this function; instead use `atan2` in C and + * Objective-C, and `simd::atan2` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_atan2(simd_float16 y, simd_float16 x); +/*! @abstract Do not call this function; instead use `atan2` in C and + * Objective-C, and `simd::atan2` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_atan2(simd_double2 y, simd_double2 x); +/*! @abstract Do not call this function; instead use `atan2` in C and + * Objective-C, and `simd::atan2` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_atan2(simd_double3 y, simd_double3 x); +/*! @abstract Do not call this function; instead use `atan2` in C and + * Objective-C, and `simd::atan2` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_atan2(simd_double4 y, simd_double4 x); +/*! @abstract Do not call this function; instead use `atan2` in C and + * Objective-C, and `simd::atan2` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_atan2(simd_double8 y, simd_double8 x); + +/*! @abstract Do not call this function; instead use `hypot` in C and + * Objective-C, and `simd::hypot` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_hypot(simd_float2 x, simd_float2 y); +/*! @abstract Do not call this function; instead use `hypot` in C and + * Objective-C, and `simd::hypot` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_hypot(simd_float3 x, simd_float3 y); +/*! @abstract Do not call this function; instead use `hypot` in C and + * Objective-C, and `simd::hypot` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_hypot(simd_float4 x, simd_float4 y); +/*! @abstract Do not call this function; instead use `hypot` in C and + * Objective-C, and `simd::hypot` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_hypot(simd_float8 x, simd_float8 y); +/*! @abstract Do not call this function; instead use `hypot` in C and + * Objective-C, and `simd::hypot` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_hypot(simd_float16 x, simd_float16 y); +/*! @abstract Do not call this function; instead use `hypot` in C and + * Objective-C, and `simd::hypot` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_hypot(simd_double2 x, simd_double2 y); +/*! @abstract Do not call this function; instead use `hypot` in C and + * Objective-C, and `simd::hypot` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_hypot(simd_double3 x, simd_double3 y); +/*! @abstract Do not call this function; instead use `hypot` in C and + * Objective-C, and `simd::hypot` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_hypot(simd_double4 x, simd_double4 y); +/*! @abstract Do not call this function; instead use `hypot` in C and + * Objective-C, and `simd::hypot` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_hypot(simd_double8 x, simd_double8 y); + +/*! @abstract Do not call this function; instead use `pow` in C and + * Objective-C, and `simd::pow` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_pow(simd_float2 x, simd_float2 y); +/*! @abstract Do not call this function; instead use `pow` in C and + * Objective-C, and `simd::pow` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_pow(simd_float3 x, simd_float3 y); +/*! @abstract Do not call this function; instead use `pow` in C and + * Objective-C, and `simd::pow` in C++. 
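+ *
+ * Editorial note (not part of the original header): the two-argument
+ * functions follow their scalar <math.h> conventions per lane -- atan2
+ * takes (y, x) in that order, and hypot(x, y) is sqrt(x*x + y*y) computed
+ * without undue overflow or underflow:
+ *
+ *     simd_float4 x  = { 1.0f, 0.0f, -1.0f, 1.0f };
+ *     simd_float4 y  = { 0.0f, 1.0f,  0.0f, 1.0f };
+ *     simd_float4 th = atan2(y, x);  // { 0, pi/2, pi, pi/4 }
+ *     simd_float4 r  = hypot(x, y);  // { 1, 1, 1, sqrt(2) }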
*/ +static inline SIMD_CFUNC simd_float4 __tg_pow(simd_float4 x, simd_float4 y); +/*! @abstract Do not call this function; instead use `pow` in C and + * Objective-C, and `simd::pow` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_pow(simd_float8 x, simd_float8 y); +/*! @abstract Do not call this function; instead use `pow` in C and + * Objective-C, and `simd::pow` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_pow(simd_float16 x, simd_float16 y); +/*! @abstract Do not call this function; instead use `pow` in C and + * Objective-C, and `simd::pow` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_pow(simd_double2 x, simd_double2 y); +/*! @abstract Do not call this function; instead use `pow` in C and + * Objective-C, and `simd::pow` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_pow(simd_double3 x, simd_double3 y); +/*! @abstract Do not call this function; instead use `pow` in C and + * Objective-C, and `simd::pow` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_pow(simd_double4 x, simd_double4 y); +/*! @abstract Do not call this function; instead use `pow` in C and + * Objective-C, and `simd::pow` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_pow(simd_double8 x, simd_double8 y); + +/*! @abstract Do not call this function; instead use `fmod` in C and + * Objective-C, and `simd::fmod` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_fmod(simd_float2 x, simd_float2 y); +/*! @abstract Do not call this function; instead use `fmod` in C and + * Objective-C, and `simd::fmod` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_fmod(simd_float3 x, simd_float3 y); +/*! @abstract Do not call this function; instead use `fmod` in C and + * Objective-C, and `simd::fmod` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_fmod(simd_float4 x, simd_float4 y); +/*! @abstract Do not call this function; instead use `fmod` in C and + * Objective-C, and `simd::fmod` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_fmod(simd_float8 x, simd_float8 y); +/*! @abstract Do not call this function; instead use `fmod` in C and + * Objective-C, and `simd::fmod` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_fmod(simd_float16 x, simd_float16 y); +/*! @abstract Do not call this function; instead use `fmod` in C and + * Objective-C, and `simd::fmod` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_fmod(simd_double2 x, simd_double2 y); +/*! @abstract Do not call this function; instead use `fmod` in C and + * Objective-C, and `simd::fmod` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_fmod(simd_double3 x, simd_double3 y); +/*! @abstract Do not call this function; instead use `fmod` in C and + * Objective-C, and `simd::fmod` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_fmod(simd_double4 x, simd_double4 y); +/*! @abstract Do not call this function; instead use `fmod` in C and + * Objective-C, and `simd::fmod` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_fmod(simd_double8 x, simd_double8 y); + +/*! @abstract Do not call this function; instead use `remainder` in C and + * Objective-C, and `simd::remainder` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_remainder(simd_float2 x, simd_float2 y); +/*! @abstract Do not call this function; instead use `remainder` in C and + * Objective-C, and `simd::remainder` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_remainder(simd_float3 x, simd_float3 y); +/*! @abstract Do not call this function; instead use `remainder` in C and + * Objective-C, and `simd::remainder` in C++. 
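+ *
+ * Editorial note (not part of the original header): fmod and remainder
+ * round the quotient differently -- fmod(x,y) is x - trunc(x/y)*y and
+ * keeps the sign of x, while remainder(x,y) rounds x/y to the nearest
+ * integer, so its result lies in [-|y|/2, +|y|/2]:
+ *
+ *     simd_float4 x = { 5.5f, -5.5f, 7.0f, 8.0f };
+ *     simd_float4 y = { 2.0f,  2.0f, 3.0f, 3.0f };
+ *     // fmod(x, y)      == {  1.5f, -1.5f, 1.0f,  2.0f }
+ *     // remainder(x, y) == { -0.5f,  0.5f, 1.0f, -1.0f }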
*/ +static inline SIMD_CFUNC simd_float4 __tg_remainder(simd_float4 x, simd_float4 y); +/*! @abstract Do not call this function; instead use `remainder` in C and + * Objective-C, and `simd::remainder` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_remainder(simd_float8 x, simd_float8 y); +/*! @abstract Do not call this function; instead use `remainder` in C and + * Objective-C, and `simd::remainder` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_remainder(simd_float16 x, simd_float16 y); +/*! @abstract Do not call this function; instead use `remainder` in C and + * Objective-C, and `simd::remainder` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_remainder(simd_double2 x, simd_double2 y); +/*! @abstract Do not call this function; instead use `remainder` in C and + * Objective-C, and `simd::remainder` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_remainder(simd_double3 x, simd_double3 y); +/*! @abstract Do not call this function; instead use `remainder` in C and + * Objective-C, and `simd::remainder` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_remainder(simd_double4 x, simd_double4 y); +/*! @abstract Do not call this function; instead use `remainder` in C and + * Objective-C, and `simd::remainder` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_remainder(simd_double8 x, simd_double8 y); + +/*! @abstract Do not call this function; instead use `copysign` in C and + * Objective-C, and `simd::copysign` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_copysign(simd_float2 x, simd_float2 y); +/*! @abstract Do not call this function; instead use `copysign` in C and + * Objective-C, and `simd::copysign` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_copysign(simd_float3 x, simd_float3 y); +/*! @abstract Do not call this function; instead use `copysign` in C and + * Objective-C, and `simd::copysign` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_copysign(simd_float4 x, simd_float4 y); +/*! @abstract Do not call this function; instead use `copysign` in C and + * Objective-C, and `simd::copysign` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_copysign(simd_float8 x, simd_float8 y); +/*! @abstract Do not call this function; instead use `copysign` in C and + * Objective-C, and `simd::copysign` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_copysign(simd_float16 x, simd_float16 y); +/*! @abstract Do not call this function; instead use `copysign` in C and + * Objective-C, and `simd::copysign` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_copysign(simd_double2 x, simd_double2 y); +/*! @abstract Do not call this function; instead use `copysign` in C and + * Objective-C, and `simd::copysign` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_copysign(simd_double3 x, simd_double3 y); +/*! @abstract Do not call this function; instead use `copysign` in C and + * Objective-C, and `simd::copysign` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_copysign(simd_double4 x, simd_double4 y); +/*! @abstract Do not call this function; instead use `copysign` in C and + * Objective-C, and `simd::copysign` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_copysign(simd_double8 x, simd_double8 y); + +/*! @abstract Do not call this function; instead use `nextafter` in C and + * Objective-C, and `simd::nextafter` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_nextafter(simd_float2 x, simd_float2 y); +/*! 
@abstract Do not call this function; instead use `nextafter` in C and + * Objective-C, and `simd::nextafter` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_nextafter(simd_float3 x, simd_float3 y); +/*! @abstract Do not call this function; instead use `nextafter` in C and + * Objective-C, and `simd::nextafter` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_nextafter(simd_float4 x, simd_float4 y); +/*! @abstract Do not call this function; instead use `nextafter` in C and + * Objective-C, and `simd::nextafter` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_nextafter(simd_float8 x, simd_float8 y); +/*! @abstract Do not call this function; instead use `nextafter` in C and + * Objective-C, and `simd::nextafter` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_nextafter(simd_float16 x, simd_float16 y); +/*! @abstract Do not call this function; instead use `nextafter` in C and + * Objective-C, and `simd::nextafter` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_nextafter(simd_double2 x, simd_double2 y); +/*! @abstract Do not call this function; instead use `nextafter` in C and + * Objective-C, and `simd::nextafter` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_nextafter(simd_double3 x, simd_double3 y); +/*! @abstract Do not call this function; instead use `nextafter` in C and + * Objective-C, and `simd::nextafter` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_nextafter(simd_double4 x, simd_double4 y); +/*! @abstract Do not call this function; instead use `nextafter` in C and + * Objective-C, and `simd::nextafter` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_nextafter(simd_double8 x, simd_double8 y); + +/*! @abstract Do not call this function; instead use `fdim` in C and + * Objective-C, and `simd::fdim` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_fdim(simd_float2 x, simd_float2 y); +/*! @abstract Do not call this function; instead use `fdim` in C and + * Objective-C, and `simd::fdim` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_fdim(simd_float3 x, simd_float3 y); +/*! @abstract Do not call this function; instead use `fdim` in C and + * Objective-C, and `simd::fdim` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_fdim(simd_float4 x, simd_float4 y); +/*! @abstract Do not call this function; instead use `fdim` in C and + * Objective-C, and `simd::fdim` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_fdim(simd_float8 x, simd_float8 y); +/*! @abstract Do not call this function; instead use `fdim` in C and + * Objective-C, and `simd::fdim` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_fdim(simd_float16 x, simd_float16 y); +/*! @abstract Do not call this function; instead use `fdim` in C and + * Objective-C, and `simd::fdim` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_fdim(simd_double2 x, simd_double2 y); +/*! @abstract Do not call this function; instead use `fdim` in C and + * Objective-C, and `simd::fdim` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_fdim(simd_double3 x, simd_double3 y); +/*! @abstract Do not call this function; instead use `fdim` in C and + * Objective-C, and `simd::fdim` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_fdim(simd_double4 x, simd_double4 y); +/*! @abstract Do not call this function; instead use `fdim` in C and + * Objective-C, and `simd::fdim` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_fdim(simd_double8 x, simd_double8 y); + +/*! 
@abstract Do not call this function; instead use `fmax` in C and + * Objective-C, and `simd::fmax` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_fmax(simd_float2 x, simd_float2 y); +/*! @abstract Do not call this function; instead use `fmax` in C and + * Objective-C, and `simd::fmax` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_fmax(simd_float3 x, simd_float3 y); +/*! @abstract Do not call this function; instead use `fmax` in C and + * Objective-C, and `simd::fmax` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_fmax(simd_float4 x, simd_float4 y); +/*! @abstract Do not call this function; instead use `fmax` in C and + * Objective-C, and `simd::fmax` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_fmax(simd_float8 x, simd_float8 y); +/*! @abstract Do not call this function; instead use `fmax` in C and + * Objective-C, and `simd::fmax` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_fmax(simd_float16 x, simd_float16 y); +/*! @abstract Do not call this function; instead use `fmax` in C and + * Objective-C, and `simd::fmax` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_fmax(simd_double2 x, simd_double2 y); +/*! @abstract Do not call this function; instead use `fmax` in C and + * Objective-C, and `simd::fmax` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_fmax(simd_double3 x, simd_double3 y); +/*! @abstract Do not call this function; instead use `fmax` in C and + * Objective-C, and `simd::fmax` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_fmax(simd_double4 x, simd_double4 y); +/*! @abstract Do not call this function; instead use `fmax` in C and + * Objective-C, and `simd::fmax` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_fmax(simd_double8 x, simd_double8 y); + +/*! @abstract Do not call this function; instead use `fmin` in C and + * Objective-C, and `simd::fmin` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_fmin(simd_float2 x, simd_float2 y); +/*! @abstract Do not call this function; instead use `fmin` in C and + * Objective-C, and `simd::fmin` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_fmin(simd_float3 x, simd_float3 y); +/*! @abstract Do not call this function; instead use `fmin` in C and + * Objective-C, and `simd::fmin` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_fmin(simd_float4 x, simd_float4 y); +/*! @abstract Do not call this function; instead use `fmin` in C and + * Objective-C, and `simd::fmin` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_fmin(simd_float8 x, simd_float8 y); +/*! @abstract Do not call this function; instead use `fmin` in C and + * Objective-C, and `simd::fmin` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_fmin(simd_float16 x, simd_float16 y); +/*! @abstract Do not call this function; instead use `fmin` in C and + * Objective-C, and `simd::fmin` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_fmin(simd_double2 x, simd_double2 y); +/*! @abstract Do not call this function; instead use `fmin` in C and + * Objective-C, and `simd::fmin` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_fmin(simd_double3 x, simd_double3 y); +/*! @abstract Do not call this function; instead use `fmin` in C and + * Objective-C, and `simd::fmin` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_fmin(simd_double4 x, simd_double4 y); +/*! @abstract Do not call this function; instead use `fmin` in C and + * Objective-C, and `simd::fmin` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_fmin(simd_double8 x, simd_double8 y); + + +/*! 
@abstract Do not call this function; instead use `fma` in C and Objective-C, + * and `simd::fma` in C++. */ +static inline SIMD_CFUNC simd_float2 __tg_fma(simd_float2 x, simd_float2 y, simd_float2 z); +/*! @abstract Do not call this function; instead use `fma` in C and Objective-C, + * and `simd::fma` in C++. */ +static inline SIMD_CFUNC simd_float3 __tg_fma(simd_float3 x, simd_float3 y, simd_float3 z); +/*! @abstract Do not call this function; instead use `fma` in C and Objective-C, + * and `simd::fma` in C++. */ +static inline SIMD_CFUNC simd_float4 __tg_fma(simd_float4 x, simd_float4 y, simd_float4 z); +/*! @abstract Do not call this function; instead use `fma` in C and Objective-C, + * and `simd::fma` in C++. */ +static inline SIMD_CFUNC simd_float8 __tg_fma(simd_float8 x, simd_float8 y, simd_float8 z); +/*! @abstract Do not call this function; instead use `fma` in C and Objective-C, + * and `simd::fma` in C++. */ +static inline SIMD_CFUNC simd_float16 __tg_fma(simd_float16 x, simd_float16 y, simd_float16 z); +/*! @abstract Do not call this function; instead use `fma` in C and Objective-C, + * and `simd::fma` in C++. */ +static inline SIMD_CFUNC simd_double2 __tg_fma(simd_double2 x, simd_double2 y, simd_double2 z); +/*! @abstract Do not call this function; instead use `fma` in C and Objective-C, + * and `simd::fma` in C++. */ +static inline SIMD_CFUNC simd_double3 __tg_fma(simd_double3 x, simd_double3 y, simd_double3 z); +/*! @abstract Do not call this function; instead use `fma` in C and Objective-C, + * and `simd::fma` in C++. */ +static inline SIMD_CFUNC simd_double4 __tg_fma(simd_double4 x, simd_double4 y, simd_double4 z); +/*! @abstract Do not call this function; instead use `fma` in C and Objective-C, + * and `simd::fma` in C++. */ +static inline SIMD_CFUNC simd_double8 __tg_fma(simd_double8 x, simd_double8 y, simd_double8 z); + +/*! @abstract Computes x*y + z by the most efficient means available; + * either a fused multiply add or separate multiply and add instructions. */ +static inline SIMD_CFUNC float simd_muladd(float x, float y, float z); +/*! @abstract Computes x*y + z by the most efficient means available; + * either a fused multiply add or separate multiply and add instructions. */ +static inline SIMD_CFUNC simd_float2 simd_muladd(simd_float2 x, simd_float2 y, simd_float2 z); +/*! @abstract Computes x*y + z by the most efficient means available; + * either a fused multiply add or separate multiply and add instructions. */ +static inline SIMD_CFUNC simd_float3 simd_muladd(simd_float3 x, simd_float3 y, simd_float3 z); +/*! @abstract Computes x*y + z by the most efficient means available; + * either a fused multiply add or separate multiply and add instructions. */ +static inline SIMD_CFUNC simd_float4 simd_muladd(simd_float4 x, simd_float4 y, simd_float4 z); +/*! @abstract Computes x*y + z by the most efficient means available; + * either a fused multiply add or separate multiply and add instructions. */ +static inline SIMD_CFUNC simd_float8 simd_muladd(simd_float8 x, simd_float8 y, simd_float8 z); +/*! @abstract Computes x*y + z by the most efficient means available; + * either a fused multiply add or separate multiply and add instructions. */ +static inline SIMD_CFUNC simd_float16 simd_muladd(simd_float16 x, simd_float16 y, simd_float16 z); +/*! @abstract Computes x*y + z by the most efficient means available; + * either a fused multiply add or separate multiply and add instructions. */ +static inline SIMD_CFUNC double simd_muladd(double x, double y, double z); +/*! @abstract Computes x*y + z by the most efficient means available; + * either a fused multiply add or separate multiply and add instructions. */ +static inline SIMD_CFUNC simd_double2 simd_muladd(simd_double2 x, simd_double2 y, simd_double2 z); +/*! @abstract Computes x*y + z by the most efficient means available; + * either a fused multiply add or separate multiply and add instructions. */ +static inline SIMD_CFUNC simd_double3 simd_muladd(simd_double3 x, simd_double3 y, simd_double3 z); +/*! @abstract Computes x*y + z by the most efficient means available; + * either a fused multiply add or separate multiply and add instructions. */ +static inline SIMD_CFUNC simd_double4 simd_muladd(simd_double4 x, simd_double4 y, simd_double4 z); +/*! @abstract Computes x*y + z by the most efficient means available; + * either a fused multiply add or separate multiply and add instructions. */ +static inline SIMD_CFUNC simd_double8 simd_muladd(simd_double8 x, simd_double8 y, simd_double8 z);
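+
+/* Usage sketch (illustrative; the helper below is not part of this header):
+ * simd_muladd(x, y, z) evaluates x*y + z in each lane, so a vector linear
+ * interpolation can be written as
+ *
+ *     static simd_float4 lerp4(simd_float4 a, simd_float4 b, simd_float4 t) {
+ *       return simd_muladd(b - a, t, a);   // a + (b - a)*t
+ *     }
+ *
+ * Unlike fma, which always fuses, simd_muladd may round the multiply and the
+ * add separately on targets without a fused multiply-add instruction. */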
+ +#ifdef __cplusplus +} /* extern "C" */ + +#include <cmath> +/*! @abstract Do not call this function directly; use simd::acos instead. */ +static SIMD_CPPFUNC float __tg_acos(float x) { return ::acos(x); } +/*! @abstract Do not call this function directly; use simd::acos instead. */ +static SIMD_CPPFUNC double __tg_acos(double x) { return ::acos(x); } +/*! @abstract Do not call this function directly; use simd::asin instead. */ +static SIMD_CPPFUNC float __tg_asin(float x) { return ::asin(x); } +/*! @abstract Do not call this function directly; use simd::asin instead. */ +static SIMD_CPPFUNC double __tg_asin(double x) { return ::asin(x); } +/*! @abstract Do not call this function directly; use simd::atan instead. */ +static SIMD_CPPFUNC float __tg_atan(float x) { return ::atan(x); } +/*! @abstract Do not call this function directly; use simd::atan instead. */ +static SIMD_CPPFUNC double __tg_atan(double x) { return ::atan(x); } +/*! @abstract Do not call this function directly; use simd::cos instead. */ +static SIMD_CPPFUNC float __tg_cos(float x) { return ::cos(x); } +/*! @abstract Do not call this function directly; use simd::cos instead. */ +static SIMD_CPPFUNC double __tg_cos(double x) { return ::cos(x); } +/*! @abstract Do not call this function directly; use simd::sin instead. */ +static SIMD_CPPFUNC float __tg_sin(float x) { return ::sin(x); } +/*! @abstract Do not call this function directly; use simd::sin instead. */ +static SIMD_CPPFUNC double __tg_sin(double x) { return ::sin(x); } +/*! @abstract Do not call this function directly; use simd::tan instead. */ +static SIMD_CPPFUNC float __tg_tan(float x) { return ::tan(x); } +/*! @abstract Do not call this function directly; use simd::tan instead. */ +static SIMD_CPPFUNC double __tg_tan(double x) { return ::tan(x); } +/*! @abstract Do not call this function directly; use simd::cospi instead. */ +static SIMD_CPPFUNC float __tg_cospi(float x) { return ::__cospi(x); } +/*! @abstract Do not call this function directly; use simd::cospi instead. */ +static SIMD_CPPFUNC double __tg_cospi(double x) { return ::__cospi(x); } +/*! @abstract Do not call this function directly; use simd::sinpi instead. */ +static SIMD_CPPFUNC float __tg_sinpi(float x) { return ::__sinpi(x); } +/*! @abstract Do not call this function directly; use simd::sinpi instead. */ +static SIMD_CPPFUNC double __tg_sinpi(double x) { return ::__sinpi(x); } +/*!
@abstract Do not call this function directly; use simd::tanpi instead. */ +static SIMD_CPPFUNC float __tg_tanpi(float x) { return ::__tanpi(x); } +/*! @abstract Do not call this function directly; use simd::tanpi instead. */ +static SIMD_CPPFUNC double __tg_tanpi(double x) { return ::__tanpi(x); } +/*! @abstract Do not call this function directly; use simd::acosh instead. */ +static SIMD_CPPFUNC float __tg_acosh(float x) { return ::acosh(x); } +/*! @abstract Do not call this function directly; use simd::acosh instead. */ +static SIMD_CPPFUNC double __tg_acosh(double x) { return ::acosh(x); } +/*! @abstract Do not call this function directly; use simd::asinh instead. */ +static SIMD_CPPFUNC float __tg_asinh(float x) { return ::asinh(x); } +/*! @abstract Do not call this function directly; use simd::asinh instead. */ +static SIMD_CPPFUNC double __tg_asinh(double x) { return ::asinh(x); } +/*! @abstract Do not call this function directly; use simd::atanh instead. */ +static SIMD_CPPFUNC float __tg_atanh(float x) { return ::atanh(x); } +/*! @abstract Do not call this function directly; use simd::atanh instead. */ +static SIMD_CPPFUNC double __tg_atanh(double x) { return ::atanh(x); } +/*! @abstract Do not call this function directly; use simd::cosh instead. */ +static SIMD_CPPFUNC float __tg_cosh(float x) { return ::cosh(x); } +/*! @abstract Do not call this function directly; use simd::cosh instead. */ +static SIMD_CPPFUNC double __tg_cosh(double x) { return ::cosh(x); } +/*! @abstract Do not call this function directly; use simd::sinh instead. */ +static SIMD_CPPFUNC float __tg_sinh(float x) { return ::sinh(x); } +/*! @abstract Do not call this function directly; use simd::sinh instead. */ +static SIMD_CPPFUNC double __tg_sinh(double x) { return ::sinh(x); } +/*! @abstract Do not call this function directly; use simd::tanh instead. */ +static SIMD_CPPFUNC float __tg_tanh(float x) { return ::tanh(x); } +/*! @abstract Do not call this function directly; use simd::tanh instead. */ +static SIMD_CPPFUNC double __tg_tanh(double x) { return ::tanh(x); } +/*! @abstract Do not call this function directly; use simd::exp instead. */ +static SIMD_CPPFUNC float __tg_exp(float x) { return ::exp(x); } +/*! @abstract Do not call this function directly; use simd::exp instead. */ +static SIMD_CPPFUNC double __tg_exp(double x) { return ::exp(x); } +/*! @abstract Do not call this function directly; use simd::exp2 instead. */ +static SIMD_CPPFUNC float __tg_exp2(float x) { return ::exp2(x); } +/*! @abstract Do not call this function directly; use simd::exp2 instead. */ +static SIMD_CPPFUNC double __tg_exp2(double x) { return ::exp2(x); } +/*! @abstract Do not call this function directly; use simd::exp10 instead. */ +static SIMD_CPPFUNC float __tg_exp10(float x) { return ::__exp10(x); } +/*! @abstract Do not call this function directly; use simd::exp10 instead. */ +static SIMD_CPPFUNC double __tg_exp10(double x) { return ::__exp10(x); } +/*! @abstract Do not call this function directly; use simd::expm1 instead. */ +static SIMD_CPPFUNC float __tg_expm1(float x) { return ::expm1(x); } +/*! @abstract Do not call this function directly; use simd::expm1 instead. */ +static SIMD_CPPFUNC double __tg_expm1(double x) { return ::expm1(x); } +/*! @abstract Do not call this function directly; use simd::log instead. */ +static SIMD_CPPFUNC float __tg_log(float x) { return ::log(x); } +/*! @abstract Do not call this function directly; use simd::log instead. 
*/ +static SIMD_CPPFUNC double __tg_log(double x) { return ::log(x); } +/*! @abstract Do not call this function directly; use simd::log2 instead. */ +static SIMD_CPPFUNC float __tg_log2(float x) { return ::log2(x); } +/*! @abstract Do not call this function directly; use simd::log2 instead. */ +static SIMD_CPPFUNC double __tg_log2(double x) { return ::log2(x); } +/*! @abstract Do not call this function directly; use simd::log10 instead. */ +static SIMD_CPPFUNC float __tg_log10(float x) { return ::log10(x); } +/*! @abstract Do not call this function directly; use simd::log10 instead. */ +static SIMD_CPPFUNC double __tg_log10(double x) { return ::log10(x); } +/*! @abstract Do not call this function directly; use simd::log1p instead. */ +static SIMD_CPPFUNC float __tg_log1p(float x) { return ::log1p(x); } +/*! @abstract Do not call this function directly; use simd::log1p instead. */ +static SIMD_CPPFUNC double __tg_log1p(double x) { return ::log1p(x); } +/*! @abstract Do not call this function directly; use simd::fabs instead. */ +static SIMD_CPPFUNC float __tg_fabs(float x) { return ::fabs(x); } +/*! @abstract Do not call this function directly; use simd::fabs instead. */ +static SIMD_CPPFUNC double __tg_fabs(double x) { return ::fabs(x); } +/*! @abstract Do not call this function directly; use simd::cbrt instead. */ +static SIMD_CPPFUNC float __tg_cbrt(float x) { return ::cbrt(x); } +/*! @abstract Do not call this function directly; use simd::cbrt instead. */ +static SIMD_CPPFUNC double __tg_cbrt(double x) { return ::cbrt(x); } +/*! @abstract Do not call this function directly; use simd::sqrt instead. */ +static SIMD_CPPFUNC float __tg_sqrt(float x) { return ::sqrt(x); } +/*! @abstract Do not call this function directly; use simd::sqrt instead. */ +static SIMD_CPPFUNC double __tg_sqrt(double x) { return ::sqrt(x); } +/*! @abstract Do not call this function directly; use simd::erf instead. */ +static SIMD_CPPFUNC float __tg_erf(float x) { return ::erf(x); } +/*! @abstract Do not call this function directly; use simd::erf instead. */ +static SIMD_CPPFUNC double __tg_erf(double x) { return ::erf(x); } +/*! @abstract Do not call this function directly; use simd::erfc instead. */ +static SIMD_CPPFUNC float __tg_erfc(float x) { return ::erfc(x); } +/*! @abstract Do not call this function directly; use simd::erfc instead. */ +static SIMD_CPPFUNC double __tg_erfc(double x) { return ::erfc(x); } +/*! @abstract Do not call this function directly; use simd::tgamma instead. */ +static SIMD_CPPFUNC float __tg_tgamma(float x) { return ::tgamma(x); } +/*! @abstract Do not call this function directly; use simd::tgamma instead. */ +static SIMD_CPPFUNC double __tg_tgamma(double x) { return ::tgamma(x); } +/*! @abstract Do not call this function directly; use simd::ceil instead. */ +static SIMD_CPPFUNC float __tg_ceil(float x) { return ::ceil(x); } +/*! @abstract Do not call this function directly; use simd::ceil instead. */ +static SIMD_CPPFUNC double __tg_ceil(double x) { return ::ceil(x); } +/*! @abstract Do not call this function directly; use simd::floor instead. */ +static SIMD_CPPFUNC float __tg_floor(float x) { return ::floor(x); } +/*! @abstract Do not call this function directly; use simd::floor instead. */ +static SIMD_CPPFUNC double __tg_floor(double x) { return ::floor(x); } +/*! @abstract Do not call this function directly; use simd::rint instead. */ +static SIMD_CPPFUNC float __tg_rint(float x) { return ::rint(x); } +/*! @abstract Do not call this function directly; use simd::rint instead. 
*/ +static SIMD_CPPFUNC double __tg_rint(double x) { return ::rint(x); } +/*! @abstract Do not call this function directly; use simd::round instead. */ +static SIMD_CPPFUNC float __tg_round(float x) { return ::round(x); } +/*! @abstract Do not call this function directly; use simd::round instead. */ +static SIMD_CPPFUNC double __tg_round(double x) { return ::round(x); } +/*! @abstract Do not call this function directly; use simd::trunc instead. */ +static SIMD_CPPFUNC float __tg_trunc(float x) { return ::trunc(x); } +/*! @abstract Do not call this function directly; use simd::trunc instead. */ +static SIMD_CPPFUNC double __tg_trunc(double x) { return ::trunc(x); } +/*! @abstract Do not call this function directly; use simd::atan2 instead. */ +static SIMD_CPPFUNC float __tg_atan2(float x, float y) { return ::atan2(x, y); } +/*! @abstract Do not call this function directly; use simd::atan2 instead. */ +static SIMD_CPPFUNC double __tg_atan2(double x, double y) { return ::atan2(x, y); } +/*! @abstract Do not call this function directly; use simd::hypot instead. */ +static SIMD_CPPFUNC float __tg_hypot(float x, float y) { return ::hypot(x, y); } +/*! @abstract Do not call this function directly; use simd::hypot instead. */ +static SIMD_CPPFUNC double __tg_hypot(double x, double y) { return ::hypot(x, y); } +/*! @abstract Do not call this function directly; use simd::pow instead. */ +static SIMD_CPPFUNC float __tg_pow(float x, float y) { return ::pow(x, y); } +/*! @abstract Do not call this function directly; use simd::pow instead. */ +static SIMD_CPPFUNC double __tg_pow(double x, double y) { return ::pow(x, y); } +/*! @abstract Do not call this function directly; use simd::fmod instead. */ +static SIMD_CPPFUNC float __tg_fmod(float x, float y) { return ::fmod(x, y); } +/*! @abstract Do not call this function directly; use simd::fmod instead. */ +static SIMD_CPPFUNC double __tg_fmod(double x, double y) { return ::fmod(x, y); } +/*! @abstract Do not call this function directly; use simd::remainder + * instead. */ +static SIMD_CPPFUNC float __tg_remainder(float x, float y) { return ::remainder(x, y); } +/*! @abstract Do not call this function directly; use simd::remainder + * instead. */ +static SIMD_CPPFUNC double __tg_remainder(double x, double y) { return ::remainder(x, y); } +/*! @abstract Do not call this function directly; use simd::copysign + * instead. */ +static SIMD_CPPFUNC float __tg_copysign(float x, float y) { return ::copysign(x, y); } +/*! @abstract Do not call this function directly; use simd::copysign + * instead. */ +static SIMD_CPPFUNC double __tg_copysign(double x, double y) { return ::copysign(x, y); } +/*! @abstract Do not call this function directly; use simd::nextafter + * instead. */ +static SIMD_CPPFUNC float __tg_nextafter(float x, float y) { return ::nextafter(x, y); } +/*! @abstract Do not call this function directly; use simd::nextafter + * instead. */ +static SIMD_CPPFUNC double __tg_nextafter(double x, double y) { return ::nextafter(x, y); } +/*! @abstract Do not call this function directly; use simd::fdim instead. */ +static SIMD_CPPFUNC float __tg_fdim(float x, float y) { return ::fdim(x, y); } +/*! @abstract Do not call this function directly; use simd::fdim instead. */ +static SIMD_CPPFUNC double __tg_fdim(double x, double y) { return ::fdim(x, y); } +/*! @abstract Do not call this function directly; use simd::fmax instead. */ +static SIMD_CPPFUNC float __tg_fmax(float x, float y) { return ::fmax(x, y); } +/*! @abstract Do not call this function directly; use simd::fmax instead. */ +static SIMD_CPPFUNC double __tg_fmax(double x, double y) { return ::fmax(x, y); } +/*! @abstract Do not call this function directly; use simd::fmin instead. */ +static SIMD_CPPFUNC float __tg_fmin(float x, float y) { return ::fmin(x, y); } +/*! @abstract Do not call this function directly; use simd::fmin instead. */ +static SIMD_CPPFUNC double __tg_fmin(double x, double y) { return ::fmin(x, y); } +/*! @abstract Do not call this function directly; use simd::fma instead. */ +static SIMD_CPPFUNC float __tg_fma(float x, float y, float z) { return ::fma(x, y, z); } +/*! @abstract Do not call this function directly; use simd::fma instead. */ +static SIMD_CPPFUNC double __tg_fma(double x, double y, double z) { return ::fma(x, y, z); }
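+
+/* Usage sketch (illustrative only): these scalar wrappers let the simd::
+ * templates below accept scalars as well as vectors, while the vector
+ * overloads declared earlier handle the vector case, e.g.
+ *
+ *     simd_double4 x = { 1.0, -2.0, NAN, 4.0 };
+ *     simd_double4 y = { 0.0,  3.0, 5.0, NAN };
+ *     simd_double4 m = simd::fmax(x, y);   // { 1, 3, 5, 4 }: a NaN lane
+ *                                          // yields the other operand
+ */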
+ +namespace simd { +/*! @abstract Generalizes the <cmath> function acos to operate on vectors of + * floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN acos(fptypeN x) { return ::__tg_acos(x); } + +/*! @abstract Generalizes the <cmath> function asin to operate on vectors of + * floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN asin(fptypeN x) { return ::__tg_asin(x); } + +/*! @abstract Generalizes the <cmath> function atan to operate on vectors of + * floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN atan(fptypeN x) { return ::__tg_atan(x); } + +/*! @abstract Generalizes the <cmath> function cos to operate on vectors of + * floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN cos(fptypeN x) { return ::__tg_cos(x); } + +/*! @abstract Generalizes the <cmath> function sin to operate on vectors of + * floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN sin(fptypeN x) { return ::__tg_sin(x); } + +/*! @abstract Generalizes the <cmath> function tan to operate on vectors of + * floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN tan(fptypeN x) { return ::__tg_tan(x); } + +#if SIMD_LIBRARY_VERSION >= 1 +/*! @abstract Generalizes the <cmath> function cospi to operate on vectors + * of floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN cospi(fptypeN x) { return ::__tg_cospi(x); } +#endif + +#if SIMD_LIBRARY_VERSION >= 1 +/*! @abstract Generalizes the <cmath> function sinpi to operate on vectors + * of floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN sinpi(fptypeN x) { return ::__tg_sinpi(x); } +#endif + +#if SIMD_LIBRARY_VERSION >= 1 +/*! @abstract Generalizes the <cmath> function tanpi to operate on vectors + * of floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN tanpi(fptypeN x) { return ::__tg_tanpi(x); } +#endif + +/*! @abstract Generalizes the <cmath> function acosh to operate on vectors + * of floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN acosh(fptypeN x) { return ::__tg_acosh(x); } + +/*! @abstract Generalizes the <cmath> function asinh to operate on vectors + * of floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN asinh(fptypeN x) { return ::__tg_asinh(x); } + +/*! @abstract Generalizes the <cmath> function atanh to operate on vectors + * of floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN atanh(fptypeN x) { return ::__tg_atanh(x); } + +/*!
@abstract Generalizes the <cmath> function cosh to operate on vectors of + * floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN cosh(fptypeN x) { return ::__tg_cosh(x); } + +/*! @abstract Generalizes the <cmath> function sinh to operate on vectors of + * floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN sinh(fptypeN x) { return ::__tg_sinh(x); } + +/*! @abstract Generalizes the <cmath> function tanh to operate on vectors of + * floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN tanh(fptypeN x) { return ::__tg_tanh(x); } + +/*! @abstract Generalizes the <cmath> function exp to operate on vectors of + * floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN exp(fptypeN x) { return ::__tg_exp(x); } + +/*! @abstract Generalizes the <cmath> function exp2 to operate on vectors of + * floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN exp2(fptypeN x) { return ::__tg_exp2(x); } + +#if SIMD_LIBRARY_VERSION >= 1 +/*! @abstract Generalizes the <cmath> function exp10 to operate on vectors + * of floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN exp10(fptypeN x) { return ::__tg_exp10(x); } +#endif + +/*! @abstract Generalizes the <cmath> function expm1 to operate on vectors + * of floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN expm1(fptypeN x) { return ::__tg_expm1(x); } + +/*! @abstract Generalizes the <cmath> function log to operate on vectors of + * floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN log(fptypeN x) { return ::__tg_log(x); } + +/*! @abstract Generalizes the <cmath> function log2 to operate on vectors of + * floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN log2(fptypeN x) { return ::__tg_log2(x); } + +/*! @abstract Generalizes the <cmath> function log10 to operate on vectors + * of floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN log10(fptypeN x) { return ::__tg_log10(x); } + +/*! @abstract Generalizes the <cmath> function log1p to operate on vectors + * of floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN log1p(fptypeN x) { return ::__tg_log1p(x); } + +/*! @abstract Generalizes the <cmath> function fabs to operate on vectors of + * floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN fabs(fptypeN x) { return ::__tg_fabs(x); } + +/*! @abstract Generalizes the <cmath> function cbrt to operate on vectors of + * floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN cbrt(fptypeN x) { return ::__tg_cbrt(x); } + +/*! @abstract Generalizes the <cmath> function sqrt to operate on vectors of + * floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN sqrt(fptypeN x) { return ::__tg_sqrt(x); } + +/*! @abstract Generalizes the <cmath> function erf to operate on vectors of + * floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN erf(fptypeN x) { return ::__tg_erf(x); } + +/*! @abstract Generalizes the <cmath> function erfc to operate on vectors of + * floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN erfc(fptypeN x) { return ::__tg_erfc(x); } + +/*! @abstract Generalizes the <cmath> function tgamma to operate on vectors + * of floats and doubles. 
*/ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN tgamma(fptypeN x) { return ::__tg_tgamma(x); } + +/*! @abstract Generalizes the <cmath> function ceil to operate on vectors of + * floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN ceil(fptypeN x) { return ::__tg_ceil(x); } + +/*! @abstract Generalizes the <cmath> function floor to operate on vectors + * of floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN floor(fptypeN x) { return ::__tg_floor(x); } + +/*! @abstract Generalizes the <cmath> function rint to operate on vectors of + * floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN rint(fptypeN x) { return ::__tg_rint(x); } + +/*! @abstract Generalizes the <cmath> function round to operate on vectors + * of floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN round(fptypeN x) { return ::__tg_round(x); } + +/*! @abstract Generalizes the <cmath> function trunc to operate on vectors + * of floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN trunc(fptypeN x) { return ::__tg_trunc(x); } + +/*! @abstract Generalizes the <cmath> function atan2 to operate on vectors + * of floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN atan2(fptypeN y, fptypeN x) { return ::__tg_atan2(y, x); } + +/*! @abstract Generalizes the <cmath> function hypot to operate on vectors + * of floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN hypot(fptypeN x, fptypeN y) { return ::__tg_hypot(x, y); } + +/*! @abstract Generalizes the <cmath> function pow to operate on vectors of + * floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN pow(fptypeN x, fptypeN y) { return ::__tg_pow(x, y); } + +/*! @abstract Generalizes the <cmath> function fmod to operate on vectors of + * floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN fmod(fptypeN x, fptypeN y) { return ::__tg_fmod(x, y); } + +/*! @abstract Generalizes the <cmath> function remainder to operate on + * vectors of floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN remainder(fptypeN x, fptypeN y) { return ::__tg_remainder(x, y); } + +/*! @abstract Generalizes the <cmath> function copysign to operate on + * vectors of floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN copysign(fptypeN x, fptypeN y) { return ::__tg_copysign(x, y); } + +/*! @abstract Generalizes the <cmath> function nextafter to operate on + * vectors of floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN nextafter(fptypeN x, fptypeN y) { return ::__tg_nextafter(x, y); } + +/*! @abstract Generalizes the <cmath> function fdim to operate on vectors of + * floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN fdim(fptypeN x, fptypeN y) { return ::__tg_fdim(x, y); } + +/*! @abstract Generalizes the <cmath> function fmax to operate on vectors of + * floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN fmax(fptypeN x, fptypeN y) { return ::__tg_fmax(x, y); } + +/*! @abstract Generalizes the <cmath> function fmin to operate on vectors of + * floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN fmin(fptypeN x, fptypeN y) { return ::__tg_fmin(x, y); } + +/*! 
@abstract Generalizes the <cmath> function fma to operate on vectors of + * floats and doubles. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN fma(fptypeN x, fptypeN y, fptypeN z) { return ::__tg_fma(x, y, z); } + +/*! @abstract Computes x*y + z by the most efficient means available; either + * a fused multiply add or separate multiply and add. */ + template <typename fptypeN> + static SIMD_CPPFUNC fptypeN muladd(fptypeN x, fptypeN y, fptypeN z) { return ::simd_muladd(x, y, z); } +}; + +extern "C" { +#else +#include <tgmath.h> +/* C and Objective-C, we need some infrastructure to piggyback on tgmath.h */ +static SIMD_OVERLOAD simd_float2 __tg_promote(simd_float2); +static SIMD_OVERLOAD simd_float3 __tg_promote(simd_float3); +static SIMD_OVERLOAD simd_float4 __tg_promote(simd_float4); +static SIMD_OVERLOAD simd_float8 __tg_promote(simd_float8); +static SIMD_OVERLOAD simd_float16 __tg_promote(simd_float16); +static SIMD_OVERLOAD simd_double2 __tg_promote(simd_double2); +static SIMD_OVERLOAD simd_double3 __tg_promote(simd_double3); +static SIMD_OVERLOAD simd_double4 __tg_promote(simd_double4); +static SIMD_OVERLOAD simd_double8 __tg_promote(simd_double8); + +/* Apple extensions to <math.h>, added in macOS 10.9 and iOS 7.0 */ +#if __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_9 || \ + __IPHONE_OS_VERSION_MIN_REQUIRED >= __IPHONE_7_0 || \ + __DRIVERKIT_VERSION_MIN_REQUIRED >= __DRIVERKIT_19_0 +static inline SIMD_CFUNC float __tg_cospi(float x) { return __cospif(x); } +static inline SIMD_CFUNC double __tg_cospi(double x) { return __cospi(x); } +#undef cospi +/*! @abstract `cospi(x)` computes `cos(pi * x)` without intermediate rounding. + * + * @discussion Both faster and more accurate than multiplying by `pi` and then + * calling `cos`. Defined for `float` and `double` as well as vectors of + * floats and doubles as provided by `<simd/simd.h>`. */ +#define cospi(__x) __tg_cospi(__tg_promote1((__x))(__x)) + +static inline SIMD_CFUNC float __tg_sinpi(float x) { return __sinpif(x); } +static inline SIMD_CFUNC double __tg_sinpi(double x) { return __sinpi(x); } +#undef sinpi +/*! @abstract `sinpi(x)` computes `sin(pi * x)` without intermediate rounding. + * + * @discussion Both faster and more accurate than multiplying by `pi` and then + * calling `sin`. Defined for `float` and `double` as well as vectors + * of floats and doubles as provided by `<simd/simd.h>`. */ +#define sinpi(__x) __tg_sinpi(__tg_promote1((__x))(__x)) + +static inline SIMD_CFUNC float __tg_tanpi(float x) { return __tanpif(x); } +static inline SIMD_CFUNC double __tg_tanpi(double x) { return __tanpi(x); } +#undef tanpi +/*! @abstract `tanpi(x)` computes `tan(pi * x)` without intermediate rounding. + * + * @discussion Both faster and more accurate than multiplying by `pi` and then + * calling `tan`. Defined for `float` and `double` as well as vectors of + * floats and doubles as provided by `<simd/simd.h>`. */ +#define tanpi(__x) __tg_tanpi(__tg_promote1((__x))(__x)) + +static inline SIMD_CFUNC float __tg_exp10(float x) { return __exp10f(x); } +static inline SIMD_CFUNC double __tg_exp10(double x) { return __exp10(x); } +#undef exp10 +/*! @abstract `exp10(x)` computes `10**x` more efficiently and accurately + * than `pow(10, x)`. + * + * @discussion Defined for `float` and `double` as well as vectors of floats + * and doubles as provided by `<simd/simd.h>`. 
*/ +#define exp10(__x) __tg_exp10(__tg_promote1((__x))(__x)) +#endif + + +#endif /* !__cplusplus */ + +#pragma mark - fabs implementation +static inline SIMD_CFUNC simd_float2 __tg_fabs(simd_float2 x) { return simd_bitselect(0.0, x, 0x7fffffff); } +static inline SIMD_CFUNC simd_float3 __tg_fabs(simd_float3 x) { return simd_bitselect(0.0, x, 0x7fffffff); } +static inline SIMD_CFUNC simd_float4 __tg_fabs(simd_float4 x) { return simd_bitselect(0.0, x, 0x7fffffff); } +static inline SIMD_CFUNC simd_float8 __tg_fabs(simd_float8 x) { return simd_bitselect(0.0, x, 0x7fffffff); } +static inline SIMD_CFUNC simd_float16 __tg_fabs(simd_float16 x) { return simd_bitselect(0.0, x, 0x7fffffff); } +static inline SIMD_CFUNC simd_double2 __tg_fabs(simd_double2 x) { return simd_bitselect(0.0, x, 0x7fffffffffffffffL); } +static inline SIMD_CFUNC simd_double3 __tg_fabs(simd_double3 x) { return simd_bitselect(0.0, x, 0x7fffffffffffffffL); } +static inline SIMD_CFUNC simd_double4 __tg_fabs(simd_double4 x) { return simd_bitselect(0.0, x, 0x7fffffffffffffffL); } +static inline SIMD_CFUNC simd_double8 __tg_fabs(simd_double8 x) { return simd_bitselect(0.0, x, 0x7fffffffffffffffL); } + +#pragma mark - fmin, fmax implementation +static SIMD_CFUNC simd_float2 __tg_fmin(simd_float2 x, simd_float2 y) { +#if defined __SSE2__ + return simd_make_float2(__tg_fmin(simd_make_float4_undef(x), simd_make_float4_undef(y))); +#elif defined __arm64__ + return vminnm_f32(x, y); +#elif defined __arm__ && __FINITE_MATH_ONLY__ + return vmin_f32(x, y); +#else + return simd_bitselect(y, x, (x <= y) | (y != y)); +#endif +} + +static SIMD_CFUNC simd_float3 __tg_fmin(simd_float3 x, simd_float3 y) { + return simd_make_float3(__tg_fmin(simd_make_float4_undef(x), simd_make_float4_undef(y))); +} + +static SIMD_CFUNC simd_float4 __tg_fmin(simd_float4 x, simd_float4 y) { +#if defined __AVX512DQ__ && defined __AVX512VL__ && !__FINITE_MATH_ONLY__ + return _mm_range_ps(x, y, 4); +#elif defined __SSE2__ && __FINITE_MATH_ONLY__ + return _mm_min_ps(x, y); +#elif defined __SSE2__ + return simd_bitselect(_mm_min_ps(x, y), x, y != y); +#elif defined __arm64__ + return vminnmq_f32(x, y); +#elif defined __arm__ && __FINITE_MATH_ONLY__ + return vminq_f32(x, y); +#else + return simd_bitselect(y, x, (x <= y) | (y != y)); +#endif +} + +static SIMD_CFUNC simd_float8 __tg_fmin(simd_float8 x, simd_float8 y) { +#if defined __AVX512DQ__ && defined __AVX512VL__ && !__FINITE_MATH_ONLY__ + return _mm256_range_ps(x, y, 4); +#elif defined __AVX__ && __FINITE_MATH_ONLY__ + return _mm256_min_ps(x, y); +#elif defined __AVX__ + return simd_bitselect(_mm256_min_ps(x, y), x, y != y); +#else + return simd_make_float8(__tg_fmin(x.lo, y.lo), __tg_fmin(x.hi, y.hi)); +#endif +} + +static SIMD_CFUNC simd_float16 __tg_fmin(simd_float16 x, simd_float16 y) { +#if defined __x86_64__ && defined __AVX512DQ__ && !__FINITE_MATH_ONLY__ + return _mm512_range_ps(x, y, 4); +#elif defined __x86_64__ && defined __AVX512F__ && __FINITE_MATH_ONLY__ + return _mm512_min_ps(x, y); +#elif defined __x86_64__ && defined __AVX512F__ + return simd_bitselect(_mm512_min_ps(x, y), x, y != y); +#else + return simd_make_float16(__tg_fmin(x.lo, y.lo), __tg_fmin(x.hi, y.hi)); +#endif +} + +static SIMD_CFUNC simd_double2 __tg_fmin(simd_double2 x, simd_double2 y) { +#if defined __AVX512DQ__ && defined __AVX512VL__ + return _mm_range_pd(x, y, 4); +#elif defined __SSE2__ && __FINITE_MATH_ONLY__ + return _mm_min_pd(x, y); +#elif defined __SSE2__ + return simd_bitselect(_mm_min_pd(x, y), x, y != y); +#elif defined 
__arm64__ + return vminnmq_f64(x, y); +#else + return simd_bitselect(y, x, (x <= y) | (y != y)); +#endif +} + +static SIMD_CFUNC simd_double3 __tg_fmin(simd_double3 x, simd_double3 y) { + return simd_make_double3(__tg_fmin(simd_make_double4_undef(x), simd_make_double4_undef(y))); +} + +static SIMD_CFUNC simd_double4 __tg_fmin(simd_double4 x, simd_double4 y) { +#if defined __AVX512DQ__ && defined __AVX512VL__ + return _mm256_range_pd(x, y, 4); +#elif defined __AVX__ && __FINITE_MATH_ONLY__ + return _mm256_min_pd(x, y); +#elif defined __AVX__ + return simd_bitselect(_mm256_min_pd(x, y), x, y != y); +#else + return simd_make_double4(__tg_fmin(x.lo, y.lo), __tg_fmin(x.hi, y.hi)); +#endif +} + +static SIMD_CFUNC simd_double8 __tg_fmin(simd_double8 x, simd_double8 y) { +#if defined __x86_64__ && defined __AVX512DQ__ + return _mm512_range_pd(x, y, 4); +#elif defined __x86_64__ && defined __AVX512F__ && __FINITE_MATH_ONLY__ + return _mm512_min_pd(x, y); +#elif defined __x86_64__ && defined __AVX512F__ + return simd_bitselect(_mm512_min_pd(x, y), x, y != y); +#else + return simd_make_double8(__tg_fmin(x.lo, y.lo), __tg_fmin(x.hi, y.hi)); +#endif +} + +static SIMD_CFUNC simd_float2 __tg_fmax(simd_float2 x, simd_float2 y) { +#if defined __SSE2__ + return simd_make_float2(__tg_fmax(simd_make_float4_undef(x), simd_make_float4_undef(y))); +#elif defined __arm64__ + return vmaxnm_f32(x, y); +#elif defined __arm__ && __FINITE_MATH_ONLY__ + return vmax_f32(x, y); +#else + return simd_bitselect(y, x, (x >= y) | (y != y)); +#endif +} + +static SIMD_CFUNC simd_float3 __tg_fmax(simd_float3 x, simd_float3 y) { + return simd_make_float3(__tg_fmax(simd_make_float4_undef(x), simd_make_float4_undef(y))); +} + +static SIMD_CFUNC simd_float4 __tg_fmax(simd_float4 x, simd_float4 y) { +#if defined __AVX512DQ__ && defined __AVX512VL__ && !__FINITE_MATH_ONLY__ + return _mm_range_ps(x, y, 5); +#elif defined __SSE2__ && __FINITE_MATH_ONLY__ + return _mm_max_ps(x, y); +#elif defined __SSE2__ + return simd_bitselect(_mm_max_ps(x, y), x, y != y); +#elif defined __arm64__ + return vmaxnmq_f32(x, y); +#elif defined __arm__ && __FINITE_MATH_ONLY__ + return vmaxq_f32(x, y); +#else + return simd_bitselect(y, x, (x >= y) | (y != y)); +#endif +} + +static SIMD_CFUNC simd_float8 __tg_fmax(simd_float8 x, simd_float8 y) { +#if defined __AVX512DQ__ && defined __AVX512VL__ && !__FINITE_MATH_ONLY__ + return _mm256_range_ps(x, y, 5); +#elif defined __AVX__ && __FINITE_MATH_ONLY__ + return _mm256_max_ps(x, y); +#elif defined __AVX__ + return simd_bitselect(_mm256_max_ps(x, y), x, y != y); +#else + return simd_make_float8(__tg_fmax(x.lo, y.lo), __tg_fmax(x.hi, y.hi)); +#endif +} + +static SIMD_CFUNC simd_float16 __tg_fmax(simd_float16 x, simd_float16 y) { +#if defined __x86_64__ && defined __AVX512DQ__ && !__FINITE_MATH_ONLY__ + return _mm512_range_ps(x, y, 5); +#elif defined __x86_64__ && defined __AVX512F__ && __FINITE_MATH_ONLY__ + return _mm512_max_ps(x, y); +#elif defined __x86_64__ && defined __AVX512F__ + return simd_bitselect(_mm512_max_ps(x, y), x, y != y); +#else + return simd_make_float16(__tg_fmax(x.lo, y.lo), __tg_fmax(x.hi, y.hi)); +#endif +} + +static SIMD_CFUNC simd_double2 __tg_fmax(simd_double2 x, simd_double2 y) { +#if defined __AVX512DQ__ && defined __AVX512VL__ + return _mm_range_pd(x, y, 5); +#elif defined __SSE2__ && __FINITE_MATH_ONLY__ + return _mm_max_pd(x, y); +#elif defined __SSE2__ + return simd_bitselect(_mm_max_pd(x, y), x, y != y); +#elif defined __arm64__ + return vmaxnmq_f64(x, y); +#else + return 
simd_bitselect(y, x, (x >= y) | (y != y)); +#endif +} + +static SIMD_CFUNC simd_double3 __tg_fmax(simd_double3 x, simd_double3 y) { + return simd_make_double3(__tg_fmax(simd_make_double4_undef(x), simd_make_double4_undef(y))); +} + +static SIMD_CFUNC simd_double4 __tg_fmax(simd_double4 x, simd_double4 y) { +#if defined __AVX512DQ__ && defined __AVX512VL__ + return _mm256_range_pd(x, y, 5); +#elif defined __AVX__ && __FINITE_MATH_ONLY__ + return _mm256_max_pd(x, y); +#elif defined __AVX__ + return simd_bitselect(_mm256_max_pd(x, y), x, y != y); +#else + return simd_make_double4(__tg_fmax(x.lo, y.lo), __tg_fmax(x.hi, y.hi)); +#endif +} + +static SIMD_CFUNC simd_double8 __tg_fmax(simd_double8 x, simd_double8 y) { +#if defined __x86_64__ && defined __AVX512DQ__ + return _mm512_range_pd(x, y, 5); +#elif defined __x86_64__ && defined __AVX512F__ && __FINITE_MATH_ONLY__ + return _mm512_max_pd(x, y); +#elif defined __x86_64__ && defined __AVX512F__ + return simd_bitselect(_mm512_max_pd(x, y), x, y != y); +#else + return simd_make_double8(__tg_fmax(x.lo, y.lo), __tg_fmax(x.hi, y.hi)); +#endif +} + +#pragma mark - copysign implementation +static inline SIMD_CFUNC simd_float2 __tg_copysign(simd_float2 x, simd_float2 y) { return simd_bitselect(y, x, 0x7fffffff); } +static inline SIMD_CFUNC simd_float3 __tg_copysign(simd_float3 x, simd_float3 y) { return simd_bitselect(y, x, 0x7fffffff); } +static inline SIMD_CFUNC simd_float4 __tg_copysign(simd_float4 x, simd_float4 y) { return simd_bitselect(y, x, 0x7fffffff); } +static inline SIMD_CFUNC simd_float8 __tg_copysign(simd_float8 x, simd_float8 y) { return simd_bitselect(y, x, 0x7fffffff); } +static inline SIMD_CFUNC simd_float16 __tg_copysign(simd_float16 x, simd_float16 y) { return simd_bitselect(y, x, 0x7fffffff); } +static inline SIMD_CFUNC simd_double2 __tg_copysign(simd_double2 x, simd_double2 y) { return simd_bitselect(y, x, 0x7fffffffffffffffL); } +static inline SIMD_CFUNC simd_double3 __tg_copysign(simd_double3 x, simd_double3 y) { return simd_bitselect(y, x, 0x7fffffffffffffffL); } +static inline SIMD_CFUNC simd_double4 __tg_copysign(simd_double4 x, simd_double4 y) { return simd_bitselect(y, x, 0x7fffffffffffffffL); } +static inline SIMD_CFUNC simd_double8 __tg_copysign(simd_double8 x, simd_double8 y) { return simd_bitselect(y, x, 0x7fffffffffffffffL); } + +#pragma mark - sqrt implementation +static SIMD_CFUNC simd_float2 __tg_sqrt(simd_float2 x) { +#if defined __SSE2__ + return simd_make_float2(__tg_sqrt(simd_make_float4_undef(x))); +#elif defined __arm64__ + return vsqrt_f32(x); +#else + return simd_make_float2(sqrt(x.x), sqrt(x.y)); +#endif +} + +static SIMD_CFUNC simd_float3 __tg_sqrt(simd_float3 x) { + return simd_make_float3(__tg_sqrt(simd_make_float4_undef(x))); +} + +static SIMD_CFUNC simd_float4 __tg_sqrt(simd_float4 x) { +#if defined __SSE2__ + return _mm_sqrt_ps(x); +#elif defined __arm64__ + return vsqrtq_f32(x); +#else + return simd_make_float4(__tg_sqrt(x.lo), __tg_sqrt(x.hi)); +#endif +} + +static SIMD_CFUNC simd_float8 __tg_sqrt(simd_float8 x) { +#if defined __AVX__ + return _mm256_sqrt_ps(x); +#else + return simd_make_float8(__tg_sqrt(x.lo), __tg_sqrt(x.hi)); +#endif +} + +static SIMD_CFUNC simd_float16 __tg_sqrt(simd_float16 x) { +#if defined __x86_64__ && defined __AVX512F__ + return _mm512_sqrt_ps(x); +#else + return simd_make_float16(__tg_sqrt(x.lo), __tg_sqrt(x.hi)); +#endif +} + +static SIMD_CFUNC simd_double2 __tg_sqrt(simd_double2 x) { +#if defined __SSE2__ + return _mm_sqrt_pd(x); +#elif defined __arm64__ + return 
vsqrtq_f64(x); +#else + return simd_make_double2(sqrt(x.x), sqrt(x.y)); +#endif +} + +static SIMD_CFUNC simd_double3 __tg_sqrt(simd_double3 x) { + return simd_make_double3(__tg_sqrt(simd_make_double4_undef(x))); +} + +static SIMD_CFUNC simd_double4 __tg_sqrt(simd_double4 x) { +#if defined __AVX__ + return _mm256_sqrt_pd(x); +#else + return simd_make_double4(__tg_sqrt(x.lo), __tg_sqrt(x.hi)); +#endif +} + +static SIMD_CFUNC simd_double8 __tg_sqrt(simd_double8 x) { +#if defined __x86_64__ && defined __AVX512F__ + return _mm512_sqrt_pd(x); +#else + return simd_make_double8(__tg_sqrt(x.lo), __tg_sqrt(x.hi)); +#endif +} + +#pragma mark - ceil, floor, rint, trunc implementation +static SIMD_CFUNC simd_float2 __tg_ceil(simd_float2 x) { +#if defined __arm64__ + return vrndp_f32(x); +#else + return simd_make_float2(__tg_ceil(simd_make_float4_undef(x))); +#endif +} + +static SIMD_CFUNC simd_float3 __tg_ceil(simd_float3 x) { + return simd_make_float3(__tg_ceil(simd_make_float4_undef(x))); +} + +#if defined __arm__ && SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_ceil_f4(simd_float4 x); +#endif + +static SIMD_CFUNC simd_float4 __tg_ceil(simd_float4 x) { +#if defined __SSE4_1__ + return _mm_round_ps(x, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); +#elif defined __arm64__ + return vrndpq_f32(x); +#elif defined __arm__ && SIMD_LIBRARY_VERSION >= 3 + return _simd_ceil_f4(x); +#else + simd_float4 truncated = __tg_trunc(x); + simd_float4 adjust = simd_bitselect((simd_float4)0, 1, truncated < x); + return __tg_copysign(truncated + adjust, x); +#endif +} + +static SIMD_CFUNC simd_float8 __tg_ceil(simd_float8 x) { +#if defined __AVX__ + return _mm256_round_ps(x, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); +#else + return simd_make_float8(__tg_ceil(x.lo), __tg_ceil(x.hi)); +#endif +} + +static SIMD_CFUNC simd_float16 __tg_ceil(simd_float16 x) { +#if defined __x86_64__ && defined __AVX512F__ + return _mm512_roundscale_ps(x, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); +#else + return simd_make_float16(__tg_ceil(x.lo), __tg_ceil(x.hi)); +#endif +} + +#if defined __arm__ && SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_ceil_d2(simd_double2 x); +#endif + +static SIMD_CFUNC simd_double2 __tg_ceil(simd_double2 x) { +#if defined __SSE4_1__ + return _mm_round_pd(x, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); +#elif defined __arm64__ + return vrndpq_f64(x); +#elif defined __arm__ && SIMD_LIBRARY_VERSION >= 3 + return _simd_ceil_d2(x); +#else + simd_double2 truncated = __tg_trunc(x); + simd_double2 adjust = simd_bitselect((simd_double2)0, 1, truncated < x); + return __tg_copysign(truncated + adjust, x); +#endif +} + +static SIMD_CFUNC simd_double3 __tg_ceil(simd_double3 x) { + return simd_make_double3(__tg_ceil(simd_make_double4_undef(x))); +} + +static SIMD_CFUNC simd_double4 __tg_ceil(simd_double4 x) { +#if defined __AVX__ + return _mm256_round_pd(x, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); +#else + return simd_make_double4(__tg_ceil(x.lo), __tg_ceil(x.hi)); +#endif +} + +static SIMD_CFUNC simd_double8 __tg_ceil(simd_double8 x) { +#if defined __x86_64__ && defined __AVX512F__ + return _mm512_roundscale_pd(x, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); +#else + return simd_make_double8(__tg_ceil(x.lo), __tg_ceil(x.hi)); +#endif +} + +static SIMD_CFUNC simd_float2 __tg_floor(simd_float2 x) { +#if defined __arm64__ + return vrndm_f32(x); +#else + return simd_make_float2(__tg_floor(simd_make_float4_undef(x))); +#endif +} + +static SIMD_CFUNC simd_float3 __tg_floor(simd_float3 x) { + return 
simd_make_float3(__tg_floor(simd_make_float4_undef(x))); +} + +#if defined __arm__ && SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_floor_f4(simd_float4 x); +#endif + +static SIMD_CFUNC simd_float4 __tg_floor(simd_float4 x) { +#if defined __SSE4_1__ + return _mm_round_ps(x, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); +#elif defined __arm64__ + return vrndmq_f32(x); +#elif defined __arm__ && SIMD_LIBRARY_VERSION >= 3 + return _simd_floor_f4(x); +#else + simd_float4 truncated = __tg_trunc(x); + simd_float4 adjust = simd_bitselect((simd_float4)0, 1, truncated > x); + return truncated - adjust; +#endif +} + +static SIMD_CFUNC simd_float8 __tg_floor(simd_float8 x) { +#if defined __AVX__ + return _mm256_round_ps(x, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); +#else + return simd_make_float8(__tg_floor(x.lo), __tg_floor(x.hi)); +#endif +} + +static SIMD_CFUNC simd_float16 __tg_floor(simd_float16 x) { +#if defined __x86_64__ && defined __AVX512F__ + return _mm512_roundscale_ps(x, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); +#else + return simd_make_float16(__tg_floor(x.lo), __tg_floor(x.hi)); +#endif +} + +#if defined __arm__ && SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_floor_d2(simd_double2 x); +#endif + +static SIMD_CFUNC simd_double2 __tg_floor(simd_double2 x) { +#if defined __SSE4_1__ + return _mm_round_pd(x, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); +#elif defined __arm64__ + return vrndmq_f64(x); +#elif defined __arm__ && SIMD_LIBRARY_VERSION >= 3 + return _simd_floor_d2(x); +#else + simd_double2 truncated = __tg_trunc(x); + simd_double2 adjust = simd_bitselect((simd_double2)0, 1, truncated > x); + return truncated - adjust; +#endif +} + +static SIMD_CFUNC simd_double3 __tg_floor(simd_double3 x) { + return simd_make_double3(__tg_floor(simd_make_double4_undef(x))); +} + +static SIMD_CFUNC simd_double4 __tg_floor(simd_double4 x) { +#if defined __AVX__ + return _mm256_round_pd(x, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); +#else + return simd_make_double4(__tg_floor(x.lo), __tg_floor(x.hi)); +#endif +} + +static SIMD_CFUNC simd_double8 __tg_floor(simd_double8 x) { +#if defined __x86_64__ && defined __AVX512F__ + return _mm512_roundscale_pd(x, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); +#else + return simd_make_double8(__tg_floor(x.lo), __tg_floor(x.hi)); +#endif +} + +static SIMD_CFUNC simd_float2 __tg_rint(simd_float2 x) { +#if defined __arm64__ + return vrndx_f32(x); +#else + return simd_make_float2(__tg_rint(simd_make_float4_undef(x))); +#endif +} + +static SIMD_CFUNC simd_float3 __tg_rint(simd_float3 x) { + return simd_make_float3(__tg_rint(simd_make_float4_undef(x))); +} + +#if defined __arm__ && SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_rint_f4(simd_float4 x); +#endif + +static SIMD_CFUNC simd_float4 __tg_rint(simd_float4 x) { +#if defined __SSE4_1__ + return _mm_round_ps(x, _MM_FROUND_RINT); +#elif defined __arm64__ + return vrndxq_f32(x); +#elif defined __arm__ && SIMD_LIBRARY_VERSION >= 3 + return _simd_rint_f4(x); +#else + simd_float4 magic = __tg_copysign(0x1.0p23, x); + simd_int4 x_is_small = __tg_fabs(x) < 0x1.0p23; + return simd_bitselect(x, (x + magic) - magic, x_is_small & 0x7fffffff); +#endif +} + +static SIMD_CFUNC simd_float8 __tg_rint(simd_float8 x) { +#if defined __AVX__ + return _mm256_round_ps(x, _MM_FROUND_RINT); +#else + return simd_make_float8(__tg_rint(x.lo), __tg_rint(x.hi)); +#endif +} + +static SIMD_CFUNC simd_float16 __tg_rint(simd_float16 x) { +#if defined __x86_64__ && defined __AVX512F__ + return _mm512_roundscale_ps(x, 
_MM_FROUND_RINT); +#else + return simd_make_float16(__tg_rint(x.lo), __tg_rint(x.hi)); +#endif +} + +#if defined __arm__ && SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_rint_d2(simd_double2 x); +#endif + +static SIMD_CFUNC simd_double2 __tg_rint(simd_double2 x) { +#if defined __SSE4_1__ + return _mm_round_pd(x, _MM_FROUND_RINT); +#elif defined __arm64__ + return vrndxq_f64(x); +#elif defined __arm__ && SIMD_LIBRARY_VERSION >= 3 + return _simd_rint_d2(x); +#else + simd_double2 magic = __tg_copysign(0x1.0p52, x); + simd_long2 x_is_small = __tg_fabs(x) < 0x1.0p52; + return simd_bitselect(x, (x + magic) - magic, x_is_small & 0x7fffffffffffffff); +#endif +} + +static SIMD_CFUNC simd_double3 __tg_rint(simd_double3 x) { + return simd_make_double3(__tg_rint(simd_make_double4_undef(x))); +} + +static SIMD_CFUNC simd_double4 __tg_rint(simd_double4 x) { +#if defined __AVX__ + return _mm256_round_pd(x, _MM_FROUND_RINT); +#else + return simd_make_double4(__tg_rint(x.lo), __tg_rint(x.hi)); +#endif +} + +static SIMD_CFUNC simd_double8 __tg_rint(simd_double8 x) { +#if defined __x86_64__ && defined __AVX512F__ + return _mm512_roundscale_pd(x, _MM_FROUND_RINT); +#else + return simd_make_double8(__tg_rint(x.lo), __tg_rint(x.hi)); +#endif +} + +static SIMD_CFUNC simd_float2 __tg_trunc(simd_float2 x) { +#if defined __arm64__ + return vrnd_f32(x); +#else + return simd_make_float2(__tg_trunc(simd_make_float4_undef(x))); +#endif +} + +static SIMD_CFUNC simd_float3 __tg_trunc(simd_float3 x) { + return simd_make_float3(__tg_trunc(simd_make_float4_undef(x))); +} + +#if defined __arm__ && SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_trunc_f4(simd_float4 x); +#endif + +static SIMD_CFUNC simd_float4 __tg_trunc(simd_float4 x) { +#if defined __SSE4_1__ + return _mm_round_ps(x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +#elif defined __arm64__ + return vrndq_f32(x); +#elif defined __arm__ && SIMD_LIBRARY_VERSION >= 3 + return _simd_trunc_f4(x); +#else + simd_float4 binade = simd_bitselect(0, x, 0x7f800000); + simd_int4 mask = (simd_int4)__tg_fmin(-2*binade + 1, -0); + simd_float4 result = simd_bitselect(0, x, mask); + return simd_bitselect(x, result, binade < 0x1.0p23); +#endif +} + +static SIMD_CFUNC simd_float8 __tg_trunc(simd_float8 x) { +#if defined __AVX__ + return _mm256_round_ps(x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +#else + return simd_make_float8(__tg_trunc(x.lo), __tg_trunc(x.hi)); +#endif +} + +static SIMD_CFUNC simd_float16 __tg_trunc(simd_float16 x) { +#if defined __x86_64__ && defined __AVX512F__ + return _mm512_roundscale_ps(x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +#else + return simd_make_float16(__tg_trunc(x.lo), __tg_trunc(x.hi)); +#endif +} + +#if defined __arm__ && SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_trunc_d2(simd_double2 x); +#endif + +static SIMD_CFUNC simd_double2 __tg_trunc(simd_double2 x) { +#if defined __SSE4_1__ + return _mm_round_pd(x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +#elif defined __arm64__ + return vrndq_f64(x); +#elif defined __arm__ && SIMD_LIBRARY_VERSION >= 3 + return _simd_trunc_d2(x); +#else + simd_double2 binade = simd_bitselect(0, x, 0x7ff0000000000000); + simd_long2 mask = (simd_long2)__tg_fmin(-2*binade + 1, -0); + simd_double2 result = simd_bitselect(0, x, mask); + return simd_bitselect(x, result, binade < 0x1.0p52); +#endif +} + +static SIMD_CFUNC simd_double3 __tg_trunc(simd_double3 x) { + return simd_make_double3(__tg_trunc(simd_make_double4_undef(x))); +} + +static SIMD_CFUNC simd_double4 __tg_trunc(simd_double4 x) { +#if 
defined __AVX__ + return _mm256_round_pd(x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +#else + return simd_make_double4(__tg_trunc(x.lo), __tg_trunc(x.hi)); +#endif +} + +static SIMD_CFUNC simd_double8 __tg_trunc(simd_double8 x) { +#if defined __x86_64__ && defined __AVX512F__ + return _mm512_roundscale_pd(x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); +#else + return simd_make_double8(__tg_trunc(x.lo), __tg_trunc(x.hi)); +#endif +} + +#pragma mark - sine, cosine implementation +static inline SIMD_CFUNC simd_float2 __tg_sin(simd_float2 x) { + return simd_make_float2(__tg_sin(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_sin(simd_float3 x) { + return simd_make_float3(__tg_sin(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_sin_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_sin(simd_float4 x) { + return _simd_sin_f4(x); +} +#elif SIMD_LIBRARY_VERSION == 1 +extern simd_float4 __sin_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_sin(simd_float4 x) { + return __sin_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_sin(simd_float4 x) { + return simd_make_float4(sin(x.x), sin(x.y), sin(x.z), sin(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_sin_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_sin(simd_float8 x) { + return _simd_sin_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_sin(simd_float8 x) { + return simd_make_float8(__tg_sin(x.lo), __tg_sin(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_sin_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_sin(simd_float16 x) { + return _simd_sin_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_sin(simd_float16 x) { + return simd_make_float16(__tg_sin(x.lo), __tg_sin(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_sin_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_sin(simd_double2 x) { + return _simd_sin_d2(x); +} +#elif SIMD_LIBRARY_VERSION == 1 +extern simd_double2 __sin_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_sin(simd_double2 x) { + return __sin_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_sin(simd_double2 x) { + return simd_make_double2(sin(x.x), sin(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_sin(simd_double3 x) { + return simd_make_double3(__tg_sin(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_sin_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_sin(simd_double4 x) { + return _simd_sin_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_sin(simd_double4 x) { + return simd_make_double4(__tg_sin(x.lo), __tg_sin(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_sin_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_sin(simd_double8 x) { + return _simd_sin_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_sin(simd_double8 x) { + return simd_make_double8(__tg_sin(x.lo), __tg_sin(x.hi)); +} +#endif + +static inline SIMD_CFUNC simd_float2 __tg_cos(simd_float2 x) { + return simd_make_float2(__tg_cos(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_cos(simd_float3 x) { + return simd_make_float3(__tg_cos(simd_make_float4(x))); +} + 
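+/* Dispatch sketch (descriptive only): each width below either calls a vector
+ * kernel exported by the system math library (e.g. _simd_cos_f4, when
+ * SIMD_LIBRARY_VERSION allows), splits into two halves, or falls back to the
+ * scalar routine applied lane by lane. Callers are insulated from all of
+ * this; in C,
+ *
+ *     simd_float4 a = { 0.0f, (float)M_PI, 0.5f, -0.5f };
+ *     simd_float4 c = cos(a);   // { 1, -1 (to rounding), cos(0.5f), cos(0.5f) }
+ *
+ * and simd::cos(a) in C++ compute the cosine of every lane. */
+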
+#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_cos_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_cos(simd_float4 x) { + return _simd_cos_f4(x); +} +#elif SIMD_LIBRARY_VERSION == 1 +extern simd_float4 __cos_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_cos(simd_float4 x) { + return __cos_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_cos(simd_float4 x) { + return simd_make_float4(cos(x.x), cos(x.y), cos(x.z), cos(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_cos_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_cos(simd_float8 x) { + return _simd_cos_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_cos(simd_float8 x) { + return simd_make_float8(__tg_cos(x.lo), __tg_cos(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_cos_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_cos(simd_float16 x) { + return _simd_cos_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_cos(simd_float16 x) { + return simd_make_float16(__tg_cos(x.lo), __tg_cos(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_cos_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_cos(simd_double2 x) { + return _simd_cos_d2(x); +} +#elif SIMD_LIBRARY_VERSION == 1 +extern simd_double2 __cos_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_cos(simd_double2 x) { + return __cos_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_cos(simd_double2 x) { + return simd_make_double2(cos(x.x), cos(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_cos(simd_double3 x) { + return simd_make_double3(__tg_cos(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_cos_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_cos(simd_double4 x) { + return _simd_cos_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_cos(simd_double4 x) { + return simd_make_double4(__tg_cos(x.lo), __tg_cos(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_cos_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_cos(simd_double8 x) { + return _simd_cos_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_cos(simd_double8 x) { + return simd_make_double8(__tg_cos(x.lo), __tg_cos(x.hi)); +} +#endif + + +#pragma mark - acos implementation +static inline SIMD_CFUNC simd_float2 __tg_acos(simd_float2 x) { + return simd_make_float2(__tg_acos(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_acos(simd_float3 x) { + return simd_make_float3(__tg_acos(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_acos_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_acos(simd_float4 x) { + return _simd_acos_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_acos(simd_float4 x) { + return simd_make_float4(acos(x.x), acos(x.y), acos(x.z), acos(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_acos_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_acos(simd_float8 x) { + return _simd_acos_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_acos(simd_float8 x) { + return simd_make_float8(__tg_acos(x.lo), __tg_acos(x.hi)); +} +#endif 
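/*
 * Wider vectors step the same ladder upward: on x86_64, the eight- and
 * sixteen-lane variants call AVX2 and AVX-512 kernels when those are
 * available; otherwise the vector is split into its .lo and .hi halves and
 * each half goes through the narrower implementation. As an illustration
 * (not a declaration from this header), when no AVX2 kernel is present,
 *
 *     __tg_acos(x8) == simd_make_float8(__tg_acos(x8.lo), __tg_acos(x8.hi))
 *
 * for any simd_float8 x8, so the four-lane kernel ultimately does the work.
 */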
+ +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_acos_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_acos(simd_float16 x) { + return _simd_acos_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_acos(simd_float16 x) { + return simd_make_float16(__tg_acos(x.lo), __tg_acos(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_acos_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_acos(simd_double2 x) { + return _simd_acos_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_acos(simd_double2 x) { + return simd_make_double2(acos(x.x), acos(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_acos(simd_double3 x) { + return simd_make_double3(__tg_acos(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_acos_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_acos(simd_double4 x) { + return _simd_acos_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_acos(simd_double4 x) { + return simd_make_double4(__tg_acos(x.lo), __tg_acos(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_acos_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_acos(simd_double8 x) { + return _simd_acos_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_acos(simd_double8 x) { + return simd_make_double8(__tg_acos(x.lo), __tg_acos(x.hi)); +} +#endif + +#pragma mark - asin implementation +static inline SIMD_CFUNC simd_float2 __tg_asin(simd_float2 x) { + return simd_make_float2(__tg_asin(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_asin(simd_float3 x) { + return simd_make_float3(__tg_asin(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_asin_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_asin(simd_float4 x) { + return _simd_asin_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_asin(simd_float4 x) { + return simd_make_float4(asin(x.x), asin(x.y), asin(x.z), asin(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_asin_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_asin(simd_float8 x) { + return _simd_asin_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_asin(simd_float8 x) { + return simd_make_float8(__tg_asin(x.lo), __tg_asin(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_asin_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_asin(simd_float16 x) { + return _simd_asin_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_asin(simd_float16 x) { + return simd_make_float16(__tg_asin(x.lo), __tg_asin(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_asin_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_asin(simd_double2 x) { + return _simd_asin_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_asin(simd_double2 x) { + return simd_make_double2(asin(x.x), asin(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_asin(simd_double3 x) { + return simd_make_double3(__tg_asin(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_asin_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 
__tg_asin(simd_double4 x) { + return _simd_asin_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_asin(simd_double4 x) { + return simd_make_double4(__tg_asin(x.lo), __tg_asin(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_asin_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_asin(simd_double8 x) { + return _simd_asin_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_asin(simd_double8 x) { + return simd_make_double8(__tg_asin(x.lo), __tg_asin(x.hi)); +} +#endif + +#pragma mark - atan implementation +static inline SIMD_CFUNC simd_float2 __tg_atan(simd_float2 x) { + return simd_make_float2(__tg_atan(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_atan(simd_float3 x) { + return simd_make_float3(__tg_atan(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_atan_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_atan(simd_float4 x) { + return _simd_atan_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_atan(simd_float4 x) { + return simd_make_float4(atan(x.x), atan(x.y), atan(x.z), atan(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_atan_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_atan(simd_float8 x) { + return _simd_atan_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_atan(simd_float8 x) { + return simd_make_float8(__tg_atan(x.lo), __tg_atan(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_atan_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_atan(simd_float16 x) { + return _simd_atan_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_atan(simd_float16 x) { + return simd_make_float16(__tg_atan(x.lo), __tg_atan(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_atan_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_atan(simd_double2 x) { + return _simd_atan_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_atan(simd_double2 x) { + return simd_make_double2(atan(x.x), atan(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_atan(simd_double3 x) { + return simd_make_double3(__tg_atan(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_atan_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_atan(simd_double4 x) { + return _simd_atan_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_atan(simd_double4 x) { + return simd_make_double4(__tg_atan(x.lo), __tg_atan(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_atan_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_atan(simd_double8 x) { + return _simd_atan_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_atan(simd_double8 x) { + return simd_make_double8(__tg_atan(x.lo), __tg_atan(x.hi)); +} +#endif + +#pragma mark - tan implementation +static inline SIMD_CFUNC simd_float2 __tg_tan(simd_float2 x) { + return simd_make_float2(__tg_tan(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_tan(simd_float3 x) { + return simd_make_float3(__tg_tan(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_tan_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 
__tg_tan(simd_float4 x) { + return _simd_tan_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_tan(simd_float4 x) { + return simd_make_float4(tan(x.x), tan(x.y), tan(x.z), tan(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_tan_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_tan(simd_float8 x) { + return _simd_tan_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_tan(simd_float8 x) { + return simd_make_float8(__tg_tan(x.lo), __tg_tan(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_tan_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_tan(simd_float16 x) { + return _simd_tan_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_tan(simd_float16 x) { + return simd_make_float16(__tg_tan(x.lo), __tg_tan(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_tan_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_tan(simd_double2 x) { + return _simd_tan_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_tan(simd_double2 x) { + return simd_make_double2(tan(x.x), tan(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_tan(simd_double3 x) { + return simd_make_double3(__tg_tan(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_tan_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_tan(simd_double4 x) { + return _simd_tan_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_tan(simd_double4 x) { + return simd_make_double4(__tg_tan(x.lo), __tg_tan(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_tan_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_tan(simd_double8 x) { + return _simd_tan_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_tan(simd_double8 x) { + return simd_make_double8(__tg_tan(x.lo), __tg_tan(x.hi)); +} +#endif + +#pragma mark - cospi implementation +#if SIMD_LIBRARY_VERSION >= 1 +static inline SIMD_CFUNC simd_float2 __tg_cospi(simd_float2 x) { + return simd_make_float2(__tg_cospi(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_cospi(simd_float3 x) { + return simd_make_float3(__tg_cospi(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_cospi_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_cospi(simd_float4 x) { + return _simd_cospi_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_cospi(simd_float4 x) { + return simd_make_float4(__cospi(x.x), __cospi(x.y), __cospi(x.z), __cospi(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_cospi_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_cospi(simd_float8 x) { + return _simd_cospi_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_cospi(simd_float8 x) { + return simd_make_float8(__tg_cospi(x.lo), __tg_cospi(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_cospi_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_cospi(simd_float16 x) { + return _simd_cospi_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_cospi(simd_float16 x) { + return simd_make_float16(__tg_cospi(x.lo), __tg_cospi(x.hi)); +} +#endif + +#if 
SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_cospi_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_cospi(simd_double2 x) { + return _simd_cospi_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_cospi(simd_double2 x) { + return simd_make_double2(__cospi(x.x), __cospi(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_cospi(simd_double3 x) { + return simd_make_double3(__tg_cospi(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_cospi_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_cospi(simd_double4 x) { + return _simd_cospi_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_cospi(simd_double4 x) { + return simd_make_double4(__tg_cospi(x.lo), __tg_cospi(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_cospi_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_cospi(simd_double8 x) { + return _simd_cospi_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_cospi(simd_double8 x) { + return simd_make_double8(__tg_cospi(x.lo), __tg_cospi(x.hi)); +} +#endif + +#endif /* SIMD_LIBRARY_VERSION */ +#pragma mark - sinpi implementation +#if SIMD_LIBRARY_VERSION >= 1 +static inline SIMD_CFUNC simd_float2 __tg_sinpi(simd_float2 x) { + return simd_make_float2(__tg_sinpi(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_sinpi(simd_float3 x) { + return simd_make_float3(__tg_sinpi(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_sinpi_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_sinpi(simd_float4 x) { + return _simd_sinpi_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_sinpi(simd_float4 x) { + return simd_make_float4(__sinpi(x.x), __sinpi(x.y), __sinpi(x.z), __sinpi(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_sinpi_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_sinpi(simd_float8 x) { + return _simd_sinpi_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_sinpi(simd_float8 x) { + return simd_make_float8(__tg_sinpi(x.lo), __tg_sinpi(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_sinpi_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_sinpi(simd_float16 x) { + return _simd_sinpi_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_sinpi(simd_float16 x) { + return simd_make_float16(__tg_sinpi(x.lo), __tg_sinpi(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_sinpi_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_sinpi(simd_double2 x) { + return _simd_sinpi_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_sinpi(simd_double2 x) { + return simd_make_double2(__sinpi(x.x), __sinpi(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_sinpi(simd_double3 x) { + return simd_make_double3(__tg_sinpi(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_sinpi_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_sinpi(simd_double4 x) { + return _simd_sinpi_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_sinpi(simd_double4 x) { + return simd_make_double4(__tg_sinpi(x.lo), __tg_sinpi(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && 
defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_sinpi_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_sinpi(simd_double8 x) { + return _simd_sinpi_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_sinpi(simd_double8 x) { + return simd_make_double8(__tg_sinpi(x.lo), __tg_sinpi(x.hi)); +} +#endif + +#endif /* SIMD_LIBRARY_VERSION */ +#pragma mark - tanpi implementation +#if SIMD_LIBRARY_VERSION >= 1 +static inline SIMD_CFUNC simd_float2 __tg_tanpi(simd_float2 x) { + return simd_make_float2(__tg_tanpi(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_tanpi(simd_float3 x) { + return simd_make_float3(__tg_tanpi(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_tanpi_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_tanpi(simd_float4 x) { + return _simd_tanpi_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_tanpi(simd_float4 x) { + return simd_make_float4(__tanpi(x.x), __tanpi(x.y), __tanpi(x.z), __tanpi(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_tanpi_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_tanpi(simd_float8 x) { + return _simd_tanpi_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_tanpi(simd_float8 x) { + return simd_make_float8(__tg_tanpi(x.lo), __tg_tanpi(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_tanpi_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_tanpi(simd_float16 x) { + return _simd_tanpi_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_tanpi(simd_float16 x) { + return simd_make_float16(__tg_tanpi(x.lo), __tg_tanpi(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_tanpi_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_tanpi(simd_double2 x) { + return _simd_tanpi_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_tanpi(simd_double2 x) { + return simd_make_double2(__tanpi(x.x), __tanpi(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_tanpi(simd_double3 x) { + return simd_make_double3(__tg_tanpi(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_tanpi_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_tanpi(simd_double4 x) { + return _simd_tanpi_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_tanpi(simd_double4 x) { + return simd_make_double4(__tg_tanpi(x.lo), __tg_tanpi(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_tanpi_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_tanpi(simd_double8 x) { + return _simd_tanpi_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_tanpi(simd_double8 x) { + return simd_make_double8(__tg_tanpi(x.lo), __tg_tanpi(x.hi)); +} +#endif + +#endif /* SIMD_LIBRARY_VERSION */ +#pragma mark - acosh implementation +static inline SIMD_CFUNC simd_float2 __tg_acosh(simd_float2 x) { + return simd_make_float2(__tg_acosh(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_acosh(simd_float3 x) { + return simd_make_float3(__tg_acosh(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_acosh_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_acosh(simd_float4 x) { + return _simd_acosh_f4(x); +} +#else 
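/*
 * When SIMD_LIBRARY_VERSION < 3 there is no vector acosh kernel, so the
 * definition that follows evaluates each lane with the scalar math-library
 * acosh. The same lane-by-lane fallback shape recurs in every corresponding
 * #else branch of the sections below.
 */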
+static inline SIMD_CFUNC simd_float4 __tg_acosh(simd_float4 x) { + return simd_make_float4(acosh(x.x), acosh(x.y), acosh(x.z), acosh(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_acosh_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_acosh(simd_float8 x) { + return _simd_acosh_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_acosh(simd_float8 x) { + return simd_make_float8(__tg_acosh(x.lo), __tg_acosh(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_acosh_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_acosh(simd_float16 x) { + return _simd_acosh_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_acosh(simd_float16 x) { + return simd_make_float16(__tg_acosh(x.lo), __tg_acosh(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_acosh_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_acosh(simd_double2 x) { + return _simd_acosh_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_acosh(simd_double2 x) { + return simd_make_double2(acosh(x.x), acosh(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_acosh(simd_double3 x) { + return simd_make_double3(__tg_acosh(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_acosh_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_acosh(simd_double4 x) { + return _simd_acosh_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_acosh(simd_double4 x) { + return simd_make_double4(__tg_acosh(x.lo), __tg_acosh(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_acosh_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_acosh(simd_double8 x) { + return _simd_acosh_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_acosh(simd_double8 x) { + return simd_make_double8(__tg_acosh(x.lo), __tg_acosh(x.hi)); +} +#endif + +#pragma mark - asinh implementation +static inline SIMD_CFUNC simd_float2 __tg_asinh(simd_float2 x) { + return simd_make_float2(__tg_asinh(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_asinh(simd_float3 x) { + return simd_make_float3(__tg_asinh(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_asinh_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_asinh(simd_float4 x) { + return _simd_asinh_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_asinh(simd_float4 x) { + return simd_make_float4(asinh(x.x), asinh(x.y), asinh(x.z), asinh(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_asinh_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_asinh(simd_float8 x) { + return _simd_asinh_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_asinh(simd_float8 x) { + return simd_make_float8(__tg_asinh(x.lo), __tg_asinh(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_asinh_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_asinh(simd_float16 x) { + return _simd_asinh_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_asinh(simd_float16 x) { + return simd_make_float16(__tg_asinh(x.lo), __tg_asinh(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern 
simd_double2 _simd_asinh_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_asinh(simd_double2 x) { + return _simd_asinh_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_asinh(simd_double2 x) { + return simd_make_double2(asinh(x.x), asinh(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_asinh(simd_double3 x) { + return simd_make_double3(__tg_asinh(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_asinh_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_asinh(simd_double4 x) { + return _simd_asinh_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_asinh(simd_double4 x) { + return simd_make_double4(__tg_asinh(x.lo), __tg_asinh(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_asinh_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_asinh(simd_double8 x) { + return _simd_asinh_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_asinh(simd_double8 x) { + return simd_make_double8(__tg_asinh(x.lo), __tg_asinh(x.hi)); +} +#endif + +#pragma mark - atanh implementation +static inline SIMD_CFUNC simd_float2 __tg_atanh(simd_float2 x) { + return simd_make_float2(__tg_atanh(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_atanh(simd_float3 x) { + return simd_make_float3(__tg_atanh(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_atanh_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_atanh(simd_float4 x) { + return _simd_atanh_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_atanh(simd_float4 x) { + return simd_make_float4(atanh(x.x), atanh(x.y), atanh(x.z), atanh(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_atanh_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_atanh(simd_float8 x) { + return _simd_atanh_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_atanh(simd_float8 x) { + return simd_make_float8(__tg_atanh(x.lo), __tg_atanh(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_atanh_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_atanh(simd_float16 x) { + return _simd_atanh_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_atanh(simd_float16 x) { + return simd_make_float16(__tg_atanh(x.lo), __tg_atanh(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_atanh_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_atanh(simd_double2 x) { + return _simd_atanh_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_atanh(simd_double2 x) { + return simd_make_double2(atanh(x.x), atanh(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_atanh(simd_double3 x) { + return simd_make_double3(__tg_atanh(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_atanh_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_atanh(simd_double4 x) { + return _simd_atanh_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_atanh(simd_double4 x) { + return simd_make_double4(__tg_atanh(x.lo), __tg_atanh(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_atanh_d8(simd_double8 x); +static inline 
SIMD_CFUNC simd_double8 __tg_atanh(simd_double8 x) { + return _simd_atanh_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_atanh(simd_double8 x) { + return simd_make_double8(__tg_atanh(x.lo), __tg_atanh(x.hi)); +} +#endif + +#pragma mark - cosh implementation +static inline SIMD_CFUNC simd_float2 __tg_cosh(simd_float2 x) { + return simd_make_float2(__tg_cosh(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_cosh(simd_float3 x) { + return simd_make_float3(__tg_cosh(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_cosh_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_cosh(simd_float4 x) { + return _simd_cosh_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_cosh(simd_float4 x) { + return simd_make_float4(cosh(x.x), cosh(x.y), cosh(x.z), cosh(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_cosh_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_cosh(simd_float8 x) { + return _simd_cosh_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_cosh(simd_float8 x) { + return simd_make_float8(__tg_cosh(x.lo), __tg_cosh(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_cosh_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_cosh(simd_float16 x) { + return _simd_cosh_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_cosh(simd_float16 x) { + return simd_make_float16(__tg_cosh(x.lo), __tg_cosh(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_cosh_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_cosh(simd_double2 x) { + return _simd_cosh_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_cosh(simd_double2 x) { + return simd_make_double2(cosh(x.x), cosh(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_cosh(simd_double3 x) { + return simd_make_double3(__tg_cosh(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_cosh_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_cosh(simd_double4 x) { + return _simd_cosh_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_cosh(simd_double4 x) { + return simd_make_double4(__tg_cosh(x.lo), __tg_cosh(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_cosh_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_cosh(simd_double8 x) { + return _simd_cosh_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_cosh(simd_double8 x) { + return simd_make_double8(__tg_cosh(x.lo), __tg_cosh(x.hi)); +} +#endif + +#pragma mark - sinh implementation +static inline SIMD_CFUNC simd_float2 __tg_sinh(simd_float2 x) { + return simd_make_float2(__tg_sinh(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_sinh(simd_float3 x) { + return simd_make_float3(__tg_sinh(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_sinh_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_sinh(simd_float4 x) { + return _simd_sinh_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_sinh(simd_float4 x) { + return simd_make_float4(sinh(x.x), sinh(x.y), sinh(x.z), sinh(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_sinh_f8(simd_float8 x); +static 
inline SIMD_CFUNC simd_float8 __tg_sinh(simd_float8 x) { + return _simd_sinh_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_sinh(simd_float8 x) { + return simd_make_float8(__tg_sinh(x.lo), __tg_sinh(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_sinh_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_sinh(simd_float16 x) { + return _simd_sinh_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_sinh(simd_float16 x) { + return simd_make_float16(__tg_sinh(x.lo), __tg_sinh(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_sinh_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_sinh(simd_double2 x) { + return _simd_sinh_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_sinh(simd_double2 x) { + return simd_make_double2(sinh(x.x), sinh(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_sinh(simd_double3 x) { + return simd_make_double3(__tg_sinh(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_sinh_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_sinh(simd_double4 x) { + return _simd_sinh_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_sinh(simd_double4 x) { + return simd_make_double4(__tg_sinh(x.lo), __tg_sinh(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_sinh_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_sinh(simd_double8 x) { + return _simd_sinh_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_sinh(simd_double8 x) { + return simd_make_double8(__tg_sinh(x.lo), __tg_sinh(x.hi)); +} +#endif + +#pragma mark - tanh implementation +static inline SIMD_CFUNC simd_float2 __tg_tanh(simd_float2 x) { + return simd_make_float2(__tg_tanh(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_tanh(simd_float3 x) { + return simd_make_float3(__tg_tanh(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_tanh_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_tanh(simd_float4 x) { + return _simd_tanh_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_tanh(simd_float4 x) { + return simd_make_float4(tanh(x.x), tanh(x.y), tanh(x.z), tanh(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_tanh_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_tanh(simd_float8 x) { + return _simd_tanh_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_tanh(simd_float8 x) { + return simd_make_float8(__tg_tanh(x.lo), __tg_tanh(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_tanh_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_tanh(simd_float16 x) { + return _simd_tanh_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_tanh(simd_float16 x) { + return simd_make_float16(__tg_tanh(x.lo), __tg_tanh(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_tanh_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_tanh(simd_double2 x) { + return _simd_tanh_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_tanh(simd_double2 x) { + return simd_make_double2(tanh(x.x), tanh(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_tanh(simd_double3 x) { 
+ return simd_make_double3(__tg_tanh(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_tanh_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_tanh(simd_double4 x) { + return _simd_tanh_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_tanh(simd_double4 x) { + return simd_make_double4(__tg_tanh(x.lo), __tg_tanh(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_tanh_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_tanh(simd_double8 x) { + return _simd_tanh_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_tanh(simd_double8 x) { + return simd_make_double8(__tg_tanh(x.lo), __tg_tanh(x.hi)); +} +#endif + +#pragma mark - exp implementation +static inline SIMD_CFUNC simd_float2 __tg_exp(simd_float2 x) { + return simd_make_float2(__tg_exp(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_exp(simd_float3 x) { + return simd_make_float3(__tg_exp(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_exp_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_exp(simd_float4 x) { + return _simd_exp_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_exp(simd_float4 x) { + return simd_make_float4(exp(x.x), exp(x.y), exp(x.z), exp(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_exp_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_exp(simd_float8 x) { + return _simd_exp_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_exp(simd_float8 x) { + return simd_make_float8(__tg_exp(x.lo), __tg_exp(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_exp_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_exp(simd_float16 x) { + return _simd_exp_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_exp(simd_float16 x) { + return simd_make_float16(__tg_exp(x.lo), __tg_exp(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_exp_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_exp(simd_double2 x) { + return _simd_exp_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_exp(simd_double2 x) { + return simd_make_double2(exp(x.x), exp(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_exp(simd_double3 x) { + return simd_make_double3(__tg_exp(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_exp_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_exp(simd_double4 x) { + return _simd_exp_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_exp(simd_double4 x) { + return simd_make_double4(__tg_exp(x.lo), __tg_exp(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_exp_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_exp(simd_double8 x) { + return _simd_exp_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_exp(simd_double8 x) { + return simd_make_double8(__tg_exp(x.lo), __tg_exp(x.hi)); +} +#endif + +#pragma mark - exp2 implementation +static inline SIMD_CFUNC simd_float2 __tg_exp2(simd_float2 x) { + return simd_make_float2(__tg_exp2(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_exp2(simd_float3 x) 
{ + return simd_make_float3(__tg_exp2(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_exp2_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_exp2(simd_float4 x) { + return _simd_exp2_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_exp2(simd_float4 x) { + return simd_make_float4(exp2(x.x), exp2(x.y), exp2(x.z), exp2(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_exp2_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_exp2(simd_float8 x) { + return _simd_exp2_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_exp2(simd_float8 x) { + return simd_make_float8(__tg_exp2(x.lo), __tg_exp2(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_exp2_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_exp2(simd_float16 x) { + return _simd_exp2_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_exp2(simd_float16 x) { + return simd_make_float16(__tg_exp2(x.lo), __tg_exp2(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_exp2_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_exp2(simd_double2 x) { + return _simd_exp2_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_exp2(simd_double2 x) { + return simd_make_double2(exp2(x.x), exp2(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_exp2(simd_double3 x) { + return simd_make_double3(__tg_exp2(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_exp2_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_exp2(simd_double4 x) { + return _simd_exp2_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_exp2(simd_double4 x) { + return simd_make_double4(__tg_exp2(x.lo), __tg_exp2(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_exp2_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_exp2(simd_double8 x) { + return _simd_exp2_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_exp2(simd_double8 x) { + return simd_make_double8(__tg_exp2(x.lo), __tg_exp2(x.hi)); +} +#endif + +#pragma mark - exp10 implementation +#if SIMD_LIBRARY_VERSION >= 1 +static inline SIMD_CFUNC simd_float2 __tg_exp10(simd_float2 x) { + return simd_make_float2(__tg_exp10(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_exp10(simd_float3 x) { + return simd_make_float3(__tg_exp10(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_exp10_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_exp10(simd_float4 x) { + return _simd_exp10_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_exp10(simd_float4 x) { + return simd_make_float4(__exp10(x.x), __exp10(x.y), __exp10(x.z), __exp10(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_exp10_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_exp10(simd_float8 x) { + return _simd_exp10_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_exp10(simd_float8 x) { + return simd_make_float8(__tg_exp10(x.lo), __tg_exp10(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_exp10_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 
__tg_exp10(simd_float16 x) { + return _simd_exp10_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_exp10(simd_float16 x) { + return simd_make_float16(__tg_exp10(x.lo), __tg_exp10(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_exp10_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_exp10(simd_double2 x) { + return _simd_exp10_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_exp10(simd_double2 x) { + return simd_make_double2(__exp10(x.x), __exp10(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_exp10(simd_double3 x) { + return simd_make_double3(__tg_exp10(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_exp10_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_exp10(simd_double4 x) { + return _simd_exp10_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_exp10(simd_double4 x) { + return simd_make_double4(__tg_exp10(x.lo), __tg_exp10(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_exp10_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_exp10(simd_double8 x) { + return _simd_exp10_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_exp10(simd_double8 x) { + return simd_make_double8(__tg_exp10(x.lo), __tg_exp10(x.hi)); +} +#endif + +#endif /* SIMD_LIBRARY_VERSION */ +#pragma mark - expm1 implementation +static inline SIMD_CFUNC simd_float2 __tg_expm1(simd_float2 x) { + return simd_make_float2(__tg_expm1(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_expm1(simd_float3 x) { + return simd_make_float3(__tg_expm1(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_expm1_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_expm1(simd_float4 x) { + return _simd_expm1_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_expm1(simd_float4 x) { + return simd_make_float4(expm1(x.x), expm1(x.y), expm1(x.z), expm1(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_expm1_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_expm1(simd_float8 x) { + return _simd_expm1_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_expm1(simd_float8 x) { + return simd_make_float8(__tg_expm1(x.lo), __tg_expm1(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_expm1_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_expm1(simd_float16 x) { + return _simd_expm1_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_expm1(simd_float16 x) { + return simd_make_float16(__tg_expm1(x.lo), __tg_expm1(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_expm1_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_expm1(simd_double2 x) { + return _simd_expm1_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_expm1(simd_double2 x) { + return simd_make_double2(expm1(x.x), expm1(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_expm1(simd_double3 x) { + return simd_make_double3(__tg_expm1(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_expm1_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_expm1(simd_double4 x) { + return _simd_expm1_d4(x); +} +#else 
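/*
 * Without an AVX2 kernel, the definition that follows splits the four double
 * lanes into .lo and .hi halves and defers to the two-lane expm1
 * implementation above.
 */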
+static inline SIMD_CFUNC simd_double4 __tg_expm1(simd_double4 x) { + return simd_make_double4(__tg_expm1(x.lo), __tg_expm1(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_expm1_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_expm1(simd_double8 x) { + return _simd_expm1_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_expm1(simd_double8 x) { + return simd_make_double8(__tg_expm1(x.lo), __tg_expm1(x.hi)); +} +#endif + +#pragma mark - log implementation +static inline SIMD_CFUNC simd_float2 __tg_log(simd_float2 x) { + return simd_make_float2(__tg_log(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_log(simd_float3 x) { + return simd_make_float3(__tg_log(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_log_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_log(simd_float4 x) { + return _simd_log_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_log(simd_float4 x) { + return simd_make_float4(log(x.x), log(x.y), log(x.z), log(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_log_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_log(simd_float8 x) { + return _simd_log_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_log(simd_float8 x) { + return simd_make_float8(__tg_log(x.lo), __tg_log(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_log_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_log(simd_float16 x) { + return _simd_log_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_log(simd_float16 x) { + return simd_make_float16(__tg_log(x.lo), __tg_log(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_log_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_log(simd_double2 x) { + return _simd_log_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_log(simd_double2 x) { + return simd_make_double2(log(x.x), log(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_log(simd_double3 x) { + return simd_make_double3(__tg_log(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_log_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_log(simd_double4 x) { + return _simd_log_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_log(simd_double4 x) { + return simd_make_double4(__tg_log(x.lo), __tg_log(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_log_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_log(simd_double8 x) { + return _simd_log_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_log(simd_double8 x) { + return simd_make_double8(__tg_log(x.lo), __tg_log(x.hi)); +} +#endif + +#pragma mark - log2 implementation +static inline SIMD_CFUNC simd_float2 __tg_log2(simd_float2 x) { + return simd_make_float2(__tg_log2(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_log2(simd_float3 x) { + return simd_make_float3(__tg_log2(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_log2_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_log2(simd_float4 x) { + return _simd_log2_f4(x); +} +#else +static inline SIMD_CFUNC 
simd_float4 __tg_log2(simd_float4 x) { + return simd_make_float4(log2(x.x), log2(x.y), log2(x.z), log2(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_log2_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_log2(simd_float8 x) { + return _simd_log2_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_log2(simd_float8 x) { + return simd_make_float8(__tg_log2(x.lo), __tg_log2(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_log2_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_log2(simd_float16 x) { + return _simd_log2_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_log2(simd_float16 x) { + return simd_make_float16(__tg_log2(x.lo), __tg_log2(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_log2_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_log2(simd_double2 x) { + return _simd_log2_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_log2(simd_double2 x) { + return simd_make_double2(log2(x.x), log2(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_log2(simd_double3 x) { + return simd_make_double3(__tg_log2(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_log2_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_log2(simd_double4 x) { + return _simd_log2_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_log2(simd_double4 x) { + return simd_make_double4(__tg_log2(x.lo), __tg_log2(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_log2_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_log2(simd_double8 x) { + return _simd_log2_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_log2(simd_double8 x) { + return simd_make_double8(__tg_log2(x.lo), __tg_log2(x.hi)); +} +#endif + +#pragma mark - log10 implementation +static inline SIMD_CFUNC simd_float2 __tg_log10(simd_float2 x) { + return simd_make_float2(__tg_log10(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_log10(simd_float3 x) { + return simd_make_float3(__tg_log10(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_log10_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_log10(simd_float4 x) { + return _simd_log10_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_log10(simd_float4 x) { + return simd_make_float4(log10(x.x), log10(x.y), log10(x.z), log10(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_log10_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_log10(simd_float8 x) { + return _simd_log10_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_log10(simd_float8 x) { + return simd_make_float8(__tg_log10(x.lo), __tg_log10(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_log10_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_log10(simd_float16 x) { + return _simd_log10_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_log10(simd_float16 x) { + return simd_make_float16(__tg_log10(x.lo), __tg_log10(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_log10_d2(simd_double2 x); +static inline 
SIMD_CFUNC simd_double2 __tg_log10(simd_double2 x) { + return _simd_log10_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_log10(simd_double2 x) { + return simd_make_double2(log10(x.x), log10(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_log10(simd_double3 x) { + return simd_make_double3(__tg_log10(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_log10_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_log10(simd_double4 x) { + return _simd_log10_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_log10(simd_double4 x) { + return simd_make_double4(__tg_log10(x.lo), __tg_log10(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_log10_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_log10(simd_double8 x) { + return _simd_log10_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_log10(simd_double8 x) { + return simd_make_double8(__tg_log10(x.lo), __tg_log10(x.hi)); +} +#endif + +#pragma mark - log1p implementation +static inline SIMD_CFUNC simd_float2 __tg_log1p(simd_float2 x) { + return simd_make_float2(__tg_log1p(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_log1p(simd_float3 x) { + return simd_make_float3(__tg_log1p(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_log1p_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_log1p(simd_float4 x) { + return _simd_log1p_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_log1p(simd_float4 x) { + return simd_make_float4(log1p(x.x), log1p(x.y), log1p(x.z), log1p(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_log1p_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_log1p(simd_float8 x) { + return _simd_log1p_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_log1p(simd_float8 x) { + return simd_make_float8(__tg_log1p(x.lo), __tg_log1p(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_log1p_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_log1p(simd_float16 x) { + return _simd_log1p_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_log1p(simd_float16 x) { + return simd_make_float16(__tg_log1p(x.lo), __tg_log1p(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_log1p_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_log1p(simd_double2 x) { + return _simd_log1p_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_log1p(simd_double2 x) { + return simd_make_double2(log1p(x.x), log1p(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_log1p(simd_double3 x) { + return simd_make_double3(__tg_log1p(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_log1p_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_log1p(simd_double4 x) { + return _simd_log1p_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_log1p(simd_double4 x) { + return simd_make_double4(__tg_log1p(x.lo), __tg_log1p(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_log1p_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_log1p(simd_double8 x) { + return 
_simd_log1p_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_log1p(simd_double8 x) { + return simd_make_double8(__tg_log1p(x.lo), __tg_log1p(x.hi)); +} +#endif + +#pragma mark - cbrt implementation +static inline SIMD_CFUNC simd_float2 __tg_cbrt(simd_float2 x) { + return simd_make_float2(__tg_cbrt(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_cbrt(simd_float3 x) { + return simd_make_float3(__tg_cbrt(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_cbrt_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_cbrt(simd_float4 x) { + return _simd_cbrt_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_cbrt(simd_float4 x) { + return simd_make_float4(cbrt(x.x), cbrt(x.y), cbrt(x.z), cbrt(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_cbrt_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_cbrt(simd_float8 x) { + return _simd_cbrt_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_cbrt(simd_float8 x) { + return simd_make_float8(__tg_cbrt(x.lo), __tg_cbrt(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_cbrt_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_cbrt(simd_float16 x) { + return _simd_cbrt_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_cbrt(simd_float16 x) { + return simd_make_float16(__tg_cbrt(x.lo), __tg_cbrt(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_cbrt_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_cbrt(simd_double2 x) { + return _simd_cbrt_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_cbrt(simd_double2 x) { + return simd_make_double2(cbrt(x.x), cbrt(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_cbrt(simd_double3 x) { + return simd_make_double3(__tg_cbrt(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_cbrt_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_cbrt(simd_double4 x) { + return _simd_cbrt_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_cbrt(simd_double4 x) { + return simd_make_double4(__tg_cbrt(x.lo), __tg_cbrt(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_cbrt_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_cbrt(simd_double8 x) { + return _simd_cbrt_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_cbrt(simd_double8 x) { + return simd_make_double8(__tg_cbrt(x.lo), __tg_cbrt(x.hi)); +} +#endif + +#pragma mark - erf implementation +static inline SIMD_CFUNC simd_float2 __tg_erf(simd_float2 x) { + return simd_make_float2(__tg_erf(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_erf(simd_float3 x) { + return simd_make_float3(__tg_erf(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_erf_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_erf(simd_float4 x) { + return _simd_erf_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_erf(simd_float4 x) { + return simd_make_float4(erf(x.x), erf(x.y), erf(x.z), erf(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_erf_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_erf(simd_float8 x) { + return 
_simd_erf_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_erf(simd_float8 x) { + return simd_make_float8(__tg_erf(x.lo), __tg_erf(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_erf_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_erf(simd_float16 x) { + return _simd_erf_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_erf(simd_float16 x) { + return simd_make_float16(__tg_erf(x.lo), __tg_erf(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_erf_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_erf(simd_double2 x) { + return _simd_erf_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_erf(simd_double2 x) { + return simd_make_double2(erf(x.x), erf(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_erf(simd_double3 x) { + return simd_make_double3(__tg_erf(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_erf_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_erf(simd_double4 x) { + return _simd_erf_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_erf(simd_double4 x) { + return simd_make_double4(__tg_erf(x.lo), __tg_erf(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_erf_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_erf(simd_double8 x) { + return _simd_erf_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_erf(simd_double8 x) { + return simd_make_double8(__tg_erf(x.lo), __tg_erf(x.hi)); +} +#endif + +#pragma mark - erfc implementation +static inline SIMD_CFUNC simd_float2 __tg_erfc(simd_float2 x) { + return simd_make_float2(__tg_erfc(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_erfc(simd_float3 x) { + return simd_make_float3(__tg_erfc(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_erfc_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_erfc(simd_float4 x) { + return _simd_erfc_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_erfc(simd_float4 x) { + return simd_make_float4(erfc(x.x), erfc(x.y), erfc(x.z), erfc(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_erfc_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_erfc(simd_float8 x) { + return _simd_erfc_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_erfc(simd_float8 x) { + return simd_make_float8(__tg_erfc(x.lo), __tg_erfc(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_erfc_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_erfc(simd_float16 x) { + return _simd_erfc_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_erfc(simd_float16 x) { + return simd_make_float16(__tg_erfc(x.lo), __tg_erfc(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_erfc_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_erfc(simd_double2 x) { + return _simd_erfc_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_erfc(simd_double2 x) { + return simd_make_double2(erfc(x.x), erfc(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_erfc(simd_double3 x) { + return simd_make_double3(__tg_erfc(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 
&& defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_erfc_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_erfc(simd_double4 x) { + return _simd_erfc_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_erfc(simd_double4 x) { + return simd_make_double4(__tg_erfc(x.lo), __tg_erfc(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_erfc_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_erfc(simd_double8 x) { + return _simd_erfc_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_erfc(simd_double8 x) { + return simd_make_double8(__tg_erfc(x.lo), __tg_erfc(x.hi)); +} +#endif + +#pragma mark - tgamma implementation +static inline SIMD_CFUNC simd_float2 __tg_tgamma(simd_float2 x) { + return simd_make_float2(__tg_tgamma(simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_tgamma(simd_float3 x) { + return simd_make_float3(__tg_tgamma(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_tgamma_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_tgamma(simd_float4 x) { + return _simd_tgamma_f4(x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_tgamma(simd_float4 x) { + return simd_make_float4(tgamma(x.x), tgamma(x.y), tgamma(x.z), tgamma(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_tgamma_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_tgamma(simd_float8 x) { + return _simd_tgamma_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_tgamma(simd_float8 x) { + return simd_make_float8(__tg_tgamma(x.lo), __tg_tgamma(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_tgamma_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_tgamma(simd_float16 x) { + return _simd_tgamma_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_tgamma(simd_float16 x) { + return simd_make_float16(__tg_tgamma(x.lo), __tg_tgamma(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_tgamma_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_tgamma(simd_double2 x) { + return _simd_tgamma_d2(x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_tgamma(simd_double2 x) { + return simd_make_double2(tgamma(x.x), tgamma(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_tgamma(simd_double3 x) { + return simd_make_double3(__tg_tgamma(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_tgamma_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_tgamma(simd_double4 x) { + return _simd_tgamma_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_tgamma(simd_double4 x) { + return simd_make_double4(__tg_tgamma(x.lo), __tg_tgamma(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_tgamma_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_tgamma(simd_double8 x) { + return _simd_tgamma_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_tgamma(simd_double8 x) { + return simd_make_double8(__tg_tgamma(x.lo), __tg_tgamma(x.hi)); +} +#endif + +#pragma mark - round implementation +static inline SIMD_CFUNC simd_float2 __tg_round(simd_float2 x) { + return simd_make_float2(__tg_round(simd_make_float4(x))); +} + +static inline 
SIMD_CFUNC simd_float3 __tg_round(simd_float3 x) { + return simd_make_float3(__tg_round(simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_round_f4(simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_round(simd_float4 x) { +#if defined __arm64__ + return vrndaq_f32(x); +#else + return _simd_round_f4(x); +#endif +} +#else +static inline SIMD_CFUNC simd_float4 __tg_round(simd_float4 x) { + return simd_make_float4(round(x.x), round(x.y), round(x.z), round(x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_round_f8(simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_round(simd_float8 x) { + return _simd_round_f8(x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_round(simd_float8 x) { + return simd_make_float8(__tg_round(x.lo), __tg_round(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_round_f16(simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_round(simd_float16 x) { + return _simd_round_f16(x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_round(simd_float16 x) { + return simd_make_float16(__tg_round(x.lo), __tg_round(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_round_d2(simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_round(simd_double2 x) { +#if defined __arm64__ + return vrndaq_f64(x); +#else + return _simd_round_d2(x); +#endif +} +#else +static inline SIMD_CFUNC simd_double2 __tg_round(simd_double2 x) { + return simd_make_double2(round(x.x), round(x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_round(simd_double3 x) { + return simd_make_double3(__tg_round(simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_round_d4(simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_round(simd_double4 x) { + return _simd_round_d4(x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_round(simd_double4 x) { + return simd_make_double4(__tg_round(x.lo), __tg_round(x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_round_d8(simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_round(simd_double8 x) { + return _simd_round_d8(x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_round(simd_double8 x) { + return simd_make_double8(__tg_round(x.lo), __tg_round(x.hi)); +} +#endif + +#pragma mark - atan2 implementation +static inline SIMD_CFUNC simd_float2 __tg_atan2(simd_float2 y, simd_float2 x) { + return simd_make_float2(__tg_atan2(simd_make_float4(y), simd_make_float4(x))); +} + +static inline SIMD_CFUNC simd_float3 __tg_atan2(simd_float3 y, simd_float3 x) { + return simd_make_float3(__tg_atan2(simd_make_float4(y), simd_make_float4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_atan2_f4(simd_float4 y, simd_float4 x); +static inline SIMD_CFUNC simd_float4 __tg_atan2(simd_float4 y, simd_float4 x) { + return _simd_atan2_f4(y, x); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_atan2(simd_float4 y, simd_float4 x) { + return simd_make_float4(atan2(y.x, x.x), atan2(y.y, x.y), atan2(y.z, x.z), atan2(y.w, x.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_atan2_f8(simd_float8 y, simd_float8 x); +static inline SIMD_CFUNC simd_float8 __tg_atan2(simd_float8 y, simd_float8 x) { + return 
_simd_atan2_f8(y, x); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_atan2(simd_float8 y, simd_float8 x) { + return simd_make_float8(__tg_atan2(y.lo, x.lo), __tg_atan2(y.hi, x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_atan2_f16(simd_float16 y, simd_float16 x); +static inline SIMD_CFUNC simd_float16 __tg_atan2(simd_float16 y, simd_float16 x) { + return _simd_atan2_f16(y, x); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_atan2(simd_float16 y, simd_float16 x) { + return simd_make_float16(__tg_atan2(y.lo, x.lo), __tg_atan2(y.hi, x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_atan2_d2(simd_double2 y, simd_double2 x); +static inline SIMD_CFUNC simd_double2 __tg_atan2(simd_double2 y, simd_double2 x) { + return _simd_atan2_d2(y, x); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_atan2(simd_double2 y, simd_double2 x) { + return simd_make_double2(atan2(y.x, x.x), atan2(y.y, x.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_atan2(simd_double3 y, simd_double3 x) { + return simd_make_double3(__tg_atan2(simd_make_double4(y), simd_make_double4(x))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_atan2_d4(simd_double4 y, simd_double4 x); +static inline SIMD_CFUNC simd_double4 __tg_atan2(simd_double4 y, simd_double4 x) { + return _simd_atan2_d4(y, x); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_atan2(simd_double4 y, simd_double4 x) { + return simd_make_double4(__tg_atan2(y.lo, x.lo), __tg_atan2(y.hi, x.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_atan2_d8(simd_double8 y, simd_double8 x); +static inline SIMD_CFUNC simd_double8 __tg_atan2(simd_double8 y, simd_double8 x) { + return _simd_atan2_d8(y, x); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_atan2(simd_double8 y, simd_double8 x) { + return simd_make_double8(__tg_atan2(y.lo, x.lo), __tg_atan2(y.hi, x.hi)); +} +#endif + +#pragma mark - hypot implementation +static inline SIMD_CFUNC simd_float2 __tg_hypot(simd_float2 x, simd_float2 y) { + return simd_make_float2(__tg_hypot(simd_make_float4(x), simd_make_float4(y))); +} + +static inline SIMD_CFUNC simd_float3 __tg_hypot(simd_float3 x, simd_float3 y) { + return simd_make_float3(__tg_hypot(simd_make_float4(x), simd_make_float4(y))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_hypot_f4(simd_float4 x, simd_float4 y); +static inline SIMD_CFUNC simd_float4 __tg_hypot(simd_float4 x, simd_float4 y) { + return _simd_hypot_f4(x, y); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_hypot(simd_float4 x, simd_float4 y) { + return simd_make_float4(hypot(x.x, y.x), hypot(x.y, y.y), hypot(x.z, y.z), hypot(x.w, y.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_hypot_f8(simd_float8 x, simd_float8 y); +static inline SIMD_CFUNC simd_float8 __tg_hypot(simd_float8 x, simd_float8 y) { + return _simd_hypot_f8(x, y); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_hypot(simd_float8 x, simd_float8 y) { + return simd_make_float8(__tg_hypot(x.lo, y.lo), __tg_hypot(x.hi, y.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_hypot_f16(simd_float16 x, simd_float16 y); +static inline SIMD_CFUNC simd_float16 __tg_hypot(simd_float16 x, simd_float16 y) { + return _simd_hypot_f16(x, 
y); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_hypot(simd_float16 x, simd_float16 y) { + return simd_make_float16(__tg_hypot(x.lo, y.lo), __tg_hypot(x.hi, y.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_hypot_d2(simd_double2 x, simd_double2 y); +static inline SIMD_CFUNC simd_double2 __tg_hypot(simd_double2 x, simd_double2 y) { + return _simd_hypot_d2(x, y); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_hypot(simd_double2 x, simd_double2 y) { + return simd_make_double2(hypot(x.x, y.x), hypot(x.y, y.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_hypot(simd_double3 x, simd_double3 y) { + return simd_make_double3(__tg_hypot(simd_make_double4(x), simd_make_double4(y))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_hypot_d4(simd_double4 x, simd_double4 y); +static inline SIMD_CFUNC simd_double4 __tg_hypot(simd_double4 x, simd_double4 y) { + return _simd_hypot_d4(x, y); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_hypot(simd_double4 x, simd_double4 y) { + return simd_make_double4(__tg_hypot(x.lo, y.lo), __tg_hypot(x.hi, y.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_hypot_d8(simd_double8 x, simd_double8 y); +static inline SIMD_CFUNC simd_double8 __tg_hypot(simd_double8 x, simd_double8 y) { + return _simd_hypot_d8(x, y); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_hypot(simd_double8 x, simd_double8 y) { + return simd_make_double8(__tg_hypot(x.lo, y.lo), __tg_hypot(x.hi, y.hi)); +} +#endif + +#pragma mark - pow implementation +static inline SIMD_CFUNC simd_float2 __tg_pow(simd_float2 x, simd_float2 y) { + return simd_make_float2(__tg_pow(simd_make_float4(x), simd_make_float4(y))); +} + +static inline SIMD_CFUNC simd_float3 __tg_pow(simd_float3 x, simd_float3 y) { + return simd_make_float3(__tg_pow(simd_make_float4(x), simd_make_float4(y))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_pow_f4(simd_float4 x, simd_float4 y); +static inline SIMD_CFUNC simd_float4 __tg_pow(simd_float4 x, simd_float4 y) { + return _simd_pow_f4(x, y); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_pow(simd_float4 x, simd_float4 y) { + return simd_make_float4(pow(x.x, y.x), pow(x.y, y.y), pow(x.z, y.z), pow(x.w, y.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_pow_f8(simd_float8 x, simd_float8 y); +static inline SIMD_CFUNC simd_float8 __tg_pow(simd_float8 x, simd_float8 y) { + return _simd_pow_f8(x, y); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_pow(simd_float8 x, simd_float8 y) { + return simd_make_float8(__tg_pow(x.lo, y.lo), __tg_pow(x.hi, y.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_pow_f16(simd_float16 x, simd_float16 y); +static inline SIMD_CFUNC simd_float16 __tg_pow(simd_float16 x, simd_float16 y) { + return _simd_pow_f16(x, y); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_pow(simd_float16 x, simd_float16 y) { + return simd_make_float16(__tg_pow(x.lo, y.lo), __tg_pow(x.hi, y.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_pow_d2(simd_double2 x, simd_double2 y); +static inline SIMD_CFUNC simd_double2 __tg_pow(simd_double2 x, simd_double2 y) { + return _simd_pow_d2(x, y); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_pow(simd_double2 x, simd_double2 y) { + return 
simd_make_double2(pow(x.x, y.x), pow(x.y, y.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_pow(simd_double3 x, simd_double3 y) { + return simd_make_double3(__tg_pow(simd_make_double4(x), simd_make_double4(y))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_pow_d4(simd_double4 x, simd_double4 y); +static inline SIMD_CFUNC simd_double4 __tg_pow(simd_double4 x, simd_double4 y) { + return _simd_pow_d4(x, y); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_pow(simd_double4 x, simd_double4 y) { + return simd_make_double4(__tg_pow(x.lo, y.lo), __tg_pow(x.hi, y.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_pow_d8(simd_double8 x, simd_double8 y); +static inline SIMD_CFUNC simd_double8 __tg_pow(simd_double8 x, simd_double8 y) { + return _simd_pow_d8(x, y); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_pow(simd_double8 x, simd_double8 y) { + return simd_make_double8(__tg_pow(x.lo, y.lo), __tg_pow(x.hi, y.hi)); +} +#endif + +#pragma mark - fmod implementation +static inline SIMD_CFUNC simd_float2 __tg_fmod(simd_float2 x, simd_float2 y) { + return simd_make_float2(__tg_fmod(simd_make_float4(x), simd_make_float4(y))); +} + +static inline SIMD_CFUNC simd_float3 __tg_fmod(simd_float3 x, simd_float3 y) { + return simd_make_float3(__tg_fmod(simd_make_float4(x), simd_make_float4(y))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_fmod_f4(simd_float4 x, simd_float4 y); +static inline SIMD_CFUNC simd_float4 __tg_fmod(simd_float4 x, simd_float4 y) { + return _simd_fmod_f4(x, y); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_fmod(simd_float4 x, simd_float4 y) { + return simd_make_float4(fmod(x.x, y.x), fmod(x.y, y.y), fmod(x.z, y.z), fmod(x.w, y.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_fmod_f8(simd_float8 x, simd_float8 y); +static inline SIMD_CFUNC simd_float8 __tg_fmod(simd_float8 x, simd_float8 y) { + return _simd_fmod_f8(x, y); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_fmod(simd_float8 x, simd_float8 y) { + return simd_make_float8(__tg_fmod(x.lo, y.lo), __tg_fmod(x.hi, y.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_fmod_f16(simd_float16 x, simd_float16 y); +static inline SIMD_CFUNC simd_float16 __tg_fmod(simd_float16 x, simd_float16 y) { + return _simd_fmod_f16(x, y); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_fmod(simd_float16 x, simd_float16 y) { + return simd_make_float16(__tg_fmod(x.lo, y.lo), __tg_fmod(x.hi, y.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_fmod_d2(simd_double2 x, simd_double2 y); +static inline SIMD_CFUNC simd_double2 __tg_fmod(simd_double2 x, simd_double2 y) { + return _simd_fmod_d2(x, y); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_fmod(simd_double2 x, simd_double2 y) { + return simd_make_double2(fmod(x.x, y.x), fmod(x.y, y.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_fmod(simd_double3 x, simd_double3 y) { + return simd_make_double3(__tg_fmod(simd_make_double4(x), simd_make_double4(y))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_fmod_d4(simd_double4 x, simd_double4 y); +static inline SIMD_CFUNC simd_double4 __tg_fmod(simd_double4 x, simd_double4 y) { + return _simd_fmod_d4(x, y); +} +#else +static inline 
SIMD_CFUNC simd_double4 __tg_fmod(simd_double4 x, simd_double4 y) { + return simd_make_double4(__tg_fmod(x.lo, y.lo), __tg_fmod(x.hi, y.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_fmod_d8(simd_double8 x, simd_double8 y); +static inline SIMD_CFUNC simd_double8 __tg_fmod(simd_double8 x, simd_double8 y) { + return _simd_fmod_d8(x, y); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_fmod(simd_double8 x, simd_double8 y) { + return simd_make_double8(__tg_fmod(x.lo, y.lo), __tg_fmod(x.hi, y.hi)); +} +#endif + +#pragma mark - remainder implementation +static inline SIMD_CFUNC simd_float2 __tg_remainder(simd_float2 x, simd_float2 y) { + return simd_make_float2(__tg_remainder(simd_make_float4(x), simd_make_float4(y))); +} + +static inline SIMD_CFUNC simd_float3 __tg_remainder(simd_float3 x, simd_float3 y) { + return simd_make_float3(__tg_remainder(simd_make_float4(x), simd_make_float4(y))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_remainder_f4(simd_float4 x, simd_float4 y); +static inline SIMD_CFUNC simd_float4 __tg_remainder(simd_float4 x, simd_float4 y) { + return _simd_remainder_f4(x, y); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_remainder(simd_float4 x, simd_float4 y) { + return simd_make_float4(remainder(x.x, y.x), remainder(x.y, y.y), remainder(x.z, y.z), remainder(x.w, y.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_remainder_f8(simd_float8 x, simd_float8 y); +static inline SIMD_CFUNC simd_float8 __tg_remainder(simd_float8 x, simd_float8 y) { + return _simd_remainder_f8(x, y); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_remainder(simd_float8 x, simd_float8 y) { + return simd_make_float8(__tg_remainder(x.lo, y.lo), __tg_remainder(x.hi, y.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_remainder_f16(simd_float16 x, simd_float16 y); +static inline SIMD_CFUNC simd_float16 __tg_remainder(simd_float16 x, simd_float16 y) { + return _simd_remainder_f16(x, y); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_remainder(simd_float16 x, simd_float16 y) { + return simd_make_float16(__tg_remainder(x.lo, y.lo), __tg_remainder(x.hi, y.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_remainder_d2(simd_double2 x, simd_double2 y); +static inline SIMD_CFUNC simd_double2 __tg_remainder(simd_double2 x, simd_double2 y) { + return _simd_remainder_d2(x, y); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_remainder(simd_double2 x, simd_double2 y) { + return simd_make_double2(remainder(x.x, y.x), remainder(x.y, y.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_remainder(simd_double3 x, simd_double3 y) { + return simd_make_double3(__tg_remainder(simd_make_double4(x), simd_make_double4(y))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_remainder_d4(simd_double4 x, simd_double4 y); +static inline SIMD_CFUNC simd_double4 __tg_remainder(simd_double4 x, simd_double4 y) { + return _simd_remainder_d4(x, y); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_remainder(simd_double4 x, simd_double4 y) { + return simd_make_double4(__tg_remainder(x.lo, y.lo), __tg_remainder(x.hi, y.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_remainder_d8(simd_double8 x, simd_double8 y); 
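+/* As throughout this header: when a vectorized kernel is available
+   (SIMD_LIBRARY_VERSION >= 3 plus, for the wider types, the ISA checks
+   above), the wrapper below simply forwards to it; otherwise the #else
+   branch splits the vector into its .lo/.hi halves and recurses, bottoming
+   out in lane-wise calls to the scalar libm function. */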
+static inline SIMD_CFUNC simd_double8 __tg_remainder(simd_double8 x, simd_double8 y) { + return _simd_remainder_d8(x, y); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_remainder(simd_double8 x, simd_double8 y) { + return simd_make_double8(__tg_remainder(x.lo, y.lo), __tg_remainder(x.hi, y.hi)); +} +#endif + +#pragma mark - nextafter implementation +static inline SIMD_CFUNC simd_float2 __tg_nextafter(simd_float2 x, simd_float2 y) { + return simd_make_float2(__tg_nextafter(simd_make_float4(x), simd_make_float4(y))); +} + +static inline SIMD_CFUNC simd_float3 __tg_nextafter(simd_float3 x, simd_float3 y) { + return simd_make_float3(__tg_nextafter(simd_make_float4(x), simd_make_float4(y))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_nextafter_f4(simd_float4 x, simd_float4 y); +static inline SIMD_CFUNC simd_float4 __tg_nextafter(simd_float4 x, simd_float4 y) { + return _simd_nextafter_f4(x, y); +} +#else +static inline SIMD_CFUNC simd_float4 __tg_nextafter(simd_float4 x, simd_float4 y) { + return simd_make_float4(nextafter(x.x, y.x), nextafter(x.y, y.y), nextafter(x.z, y.z), nextafter(x.w, y.w)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_float8 _simd_nextafter_f8(simd_float8 x, simd_float8 y); +static inline SIMD_CFUNC simd_float8 __tg_nextafter(simd_float8 x, simd_float8 y) { + return _simd_nextafter_f8(x, y); +} +#else +static inline SIMD_CFUNC simd_float8 __tg_nextafter(simd_float8 x, simd_float8 y) { + return simd_make_float8(__tg_nextafter(x.lo, y.lo), __tg_nextafter(x.hi, y.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_float16 _simd_nextafter_f16(simd_float16 x, simd_float16 y); +static inline SIMD_CFUNC simd_float16 __tg_nextafter(simd_float16 x, simd_float16 y) { + return _simd_nextafter_f16(x, y); +} +#else +static inline SIMD_CFUNC simd_float16 __tg_nextafter(simd_float16 x, simd_float16 y) { + return simd_make_float16(__tg_nextafter(x.lo, y.lo), __tg_nextafter(x.hi, y.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_nextafter_d2(simd_double2 x, simd_double2 y); +static inline SIMD_CFUNC simd_double2 __tg_nextafter(simd_double2 x, simd_double2 y) { + return _simd_nextafter_d2(x, y); +} +#else +static inline SIMD_CFUNC simd_double2 __tg_nextafter(simd_double2 x, simd_double2 y) { + return simd_make_double2(nextafter(x.x, y.x), nextafter(x.y, y.y)); +} +#endif + +static inline SIMD_CFUNC simd_double3 __tg_nextafter(simd_double3 x, simd_double3 y) { + return simd_make_double3(__tg_nextafter(simd_make_double4(x), simd_make_double4(y))); +} + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__ +extern simd_double4 _simd_nextafter_d4(simd_double4 x, simd_double4 y); +static inline SIMD_CFUNC simd_double4 __tg_nextafter(simd_double4 x, simd_double4 y) { + return _simd_nextafter_d4(x, y); +} +#else +static inline SIMD_CFUNC simd_double4 __tg_nextafter(simd_double4 x, simd_double4 y) { + return simd_make_double4(__tg_nextafter(x.lo, y.lo), __tg_nextafter(x.hi, y.hi)); +} +#endif + +#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__ +extern simd_double8 _simd_nextafter_d8(simd_double8 x, simd_double8 y); +static inline SIMD_CFUNC simd_double8 __tg_nextafter(simd_double8 x, simd_double8 y) { + return _simd_nextafter_d8(x, y); +} +#else +static inline SIMD_CFUNC simd_double8 __tg_nextafter(simd_double8 x, simd_double8 y) { + return simd_make_double8(__tg_nextafter(x.lo, y.lo), 
__tg_nextafter(x.hi, y.hi)); +} +#endif + +static inline SIMD_CFUNC simd_float2 __tg_fdim(simd_float2 x, simd_float2 y) { return simd_bitselect(x-y, 0, x<y); } +static inline SIMD_CFUNC simd_float3 __tg_fdim(simd_float3 x, simd_float3 y) { return simd_bitselect(x-y, 0, x<y); } +static inline SIMD_CFUNC simd_float4 __tg_fdim(simd_float4 x, simd_float4 y) { return simd_bitselect(x-y, 0, x<y); } +static inline SIMD_CFUNC simd_float8 __tg_fdim(simd_float8 x, simd_float8 y) { return simd_bitselect(x-y, 0, x<y); } +static inline SIMD_CFUNC simd_float16 __tg_fdim(simd_float16 x, simd_float16 y) { return simd_bitselect(x-y, 0, x<y); } +static inline SIMD_CFUNC simd_double2 __tg_fdim(simd_double2 x, simd_double2 y) { return simd_bitselect(x-y, 0, x<y); } +static inline SIMD_CFUNC simd_double3 __tg_fdim(simd_double3 x, simd_double3 y) { return simd_bitselect(x-y, 0, x<y); } +static inline SIMD_CFUNC simd_double4 __tg_fdim(simd_double4 x, simd_double4 y) { return simd_bitselect(x-y, 0, x<y); } +static inline SIMD_CFUNC simd_double8 __tg_fdim(simd_double8 x, simd_double8 y) { return simd_bitselect(x-y, 0, x<y); } + +static inline SIMD_CFUNC simd_float2 __tg_fma(simd_float2 x, simd_float2 y, simd_float2 z) { +#if defined __arm64__ || defined __ARM_VFPV4__ + return vfma_f32(z, x, y); +#else + return simd_make_float2(__tg_fma(simd_make_float4_undef(x), simd_make_float4_undef(y), simd_make_float4_undef(z))); +#endif +} + +static inline SIMD_CFUNC simd_float3 __tg_fma(simd_float3 x, simd_float3 y, simd_float3 z) { + return simd_make_float3(__tg_fma(simd_make_float4(x), simd_make_float4(y), simd_make_float4(z))); +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_float4 _simd_fma_f4(simd_float4 x, simd_float4 y, simd_float4 z); +#endif +static inline SIMD_CFUNC simd_float4 __tg_fma(simd_float4 x, simd_float4 y, simd_float4 z) { +#if defined __arm64__ || defined __ARM_VFPV4__ + return vfmaq_f32(z, x, y); +#elif (defined __i386__ || defined __x86_64__) && defined __FMA__ + return _mm_fmadd_ps(x, y, z); +#elif SIMD_LIBRARY_VERSION >= 3 + return _simd_fma_f4(x, y, z); +#else + return simd_make_float4(fma(x.x, y.x, z.x), fma(x.y, y.y, z.y), fma(x.z, y.z, z.z), fma(x.w, y.w, z.w)); +#endif +} + +static inline SIMD_CFUNC simd_float8 __tg_fma(simd_float8 x, simd_float8 y, simd_float8 z) { +#if (defined __i386__ || defined __x86_64__) && defined __FMA__ + return _mm256_fmadd_ps(x, y, z); +#else + return simd_make_float8(__tg_fma(x.lo, y.lo, z.lo), __tg_fma(x.hi, y.hi, z.hi)); +#endif +} + +static inline SIMD_CFUNC simd_float16 __tg_fma(simd_float16 x, simd_float16 y, simd_float16 z) { +#if defined __x86_64__ && defined __AVX512F__ + return _mm512_fmadd_ps(x, y, z); +#else + return simd_make_float16(__tg_fma(x.lo, y.lo, z.lo), __tg_fma(x.hi, y.hi, z.hi)); +#endif +} + +#if SIMD_LIBRARY_VERSION >= 3 +extern simd_double2 _simd_fma_d2(simd_double2 x, simd_double2 y, simd_double2 z); +#endif +static inline SIMD_CFUNC simd_double2 __tg_fma(simd_double2 x, simd_double2 y, simd_double2 z) { +#if defined __arm64__ + return vfmaq_f64(z, x, y); +#elif (defined __i386__ || defined __x86_64__) && defined __FMA__ + return _mm_fmadd_pd(x, y, z); +#elif SIMD_LIBRARY_VERSION >= 3 + return _simd_fma_d2(x, y, z); +#else + return simd_make_double2(fma(x.x, y.x, z.x), fma(x.y, y.y, z.y)); +#endif +} + +static inline SIMD_CFUNC simd_double3 __tg_fma(simd_double3 x, simd_double3 y, simd_double3 z) { + return simd_make_double3(__tg_fma(simd_make_double4(x), simd_make_double4(y), simd_make_double4(z))); +} + +static inline SIMD_CFUNC 
simd_double4 __tg_fma(simd_double4 x, simd_double4 y, simd_double4 z) { +#if (defined __i386__ || defined __x86_64__) && defined __FMA__ + return _mm256_fmadd_pd(x, y, z); +#else + return simd_make_double4(__tg_fma(x.lo, y.lo, z.lo), __tg_fma(x.hi, y.hi, z.hi)); +#endif +} + +static inline SIMD_CFUNC simd_double8 __tg_fma(simd_double8 x, simd_double8 y, simd_double8 z) { +#if defined __x86_64__ && defined __AVX512F__ + return _mm512_fmadd_pd(x, y, z); +#else + return simd_make_double8(__tg_fma(x.lo, y.lo, z.lo), __tg_fma(x.hi, y.hi, z.hi)); +#endif +} + +static inline SIMD_CFUNC float simd_muladd(float x, float y, float z) { +#pragma STDC FP_CONTRACT ON + return x*y + z; +} +static inline SIMD_CFUNC simd_float2 simd_muladd(simd_float2 x, simd_float2 y, simd_float2 z) { +#pragma STDC FP_CONTRACT ON + return x*y + z; +} +static inline SIMD_CFUNC simd_float3 simd_muladd(simd_float3 x, simd_float3 y, simd_float3 z) { +#pragma STDC FP_CONTRACT ON + return x*y + z; +} +static inline SIMD_CFUNC simd_float4 simd_muladd(simd_float4 x, simd_float4 y, simd_float4 z) { +#pragma STDC FP_CONTRACT ON + return x*y + z; +} +static inline SIMD_CFUNC simd_float8 simd_muladd(simd_float8 x, simd_float8 y, simd_float8 z) { +#pragma STDC FP_CONTRACT ON + return x*y + z; +} +static inline SIMD_CFUNC simd_float16 simd_muladd(simd_float16 x, simd_float16 y, simd_float16 z) { +#pragma STDC FP_CONTRACT ON + return x*y + z; +} +static inline SIMD_CFUNC double simd_muladd(double x, double y, double z) { +#pragma STDC FP_CONTRACT ON + return x*y + z; +} +static inline SIMD_CFUNC simd_double2 simd_muladd(simd_double2 x, simd_double2 y, simd_double2 z) { +#pragma STDC FP_CONTRACT ON + return x*y + z; +} +static inline SIMD_CFUNC simd_double3 simd_muladd(simd_double3 x, simd_double3 y, simd_double3 z) { +#pragma STDC FP_CONTRACT ON + return x*y + z; +} +static inline SIMD_CFUNC simd_double4 simd_muladd(simd_double4 x, simd_double4 y, simd_double4 z) { +#pragma STDC FP_CONTRACT ON + return x*y + z; +} +static inline SIMD_CFUNC simd_double8 simd_muladd(simd_double8 x, simd_double8 y, simd_double8 z) { +#pragma STDC FP_CONTRACT ON + return x*y + z; +} +#ifdef __cplusplus +} /* extern "C" */ +#endif +#endif /* SIMD_COMPILER_HAS_REQUIRED_FEATURES */ +#endif /* SIMD_MATH_HEADER */
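+/* Note: the FP_CONTRACT pragma in the simd_muladd functions above permits,
+   but does not require, the compiler to fuse x*y + z into a single fused
+   multiply-add; use the __tg_fma functions when a guaranteed fused (singly
+   rounded) result is required. A minimal sketch, for some simd_float4
+   values x, y, z:
+
+       simd_float4 a = simd_muladd(x, y, z); // fma or mul+add, per target
+       simd_float4 b = __tg_fma(x, y, z);    // always a single fused operation
+ */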
\ No newline at end of file
diff --git a/lib/libc/include/aarch64-macos-gnu/simd/packed.h b/lib/libc/include/aarch64-macos-gnu/simd/packed.h
new file mode 100644
index 0000000000..ddbd861090
--- /dev/null
+++ b/lib/libc/include/aarch64-macos-gnu/simd/packed.h
@@ -0,0 +1,1031 @@
+/*! @header
+ * This header defines fixed-size vector types with relaxed alignment. For
+ * each vector type defined by <simd/vector_types.h> that is not a 1- or 3-
+ * element vector, there is a corresponding type defined by this header that
+ * requires only the alignment matching that of the underlying scalar type.
+ *
+ * These types should be used to access buffers that may not be sufficiently
+ * aligned to allow them to be accessed using the "normal" simd vector types.
+ * As an example of this usage, suppose that you want to load a vector of
+ * four floats from an array of floats. The type simd_float4 has sixteen byte
+ * alignment, whereas an array of floats has only four byte alignment.
+ * Thus, naively casting a pointer into the array to (simd_float4 *) would
+ * invoke undefined behavior, and likely produce an alignment fault at
+ * runtime. Instead, use the corresponding packed type to load from the array:
+ *
+ * <pre>
+ * @textblock
+ * simd_float4 vector = *(simd_packed_float4 *)&array[i];
+ * // do something with vector ...
+ * @/textblock
+ * </pre>
+ *
+ * It's important to note that the packed types are only needed to work with
+ * memory; once the data is loaded, we simply operate on it as usual using
+ * the simd_float4 type, as illustrated above.
+ *
+ * @copyright 2014-2017 Apple, Inc. All rights reserved.
+ * @unsorted */
+
+#ifndef SIMD_PACKED_TYPES
+#define SIMD_PACKED_TYPES
+
+# include <simd/vector_types.h>
+# if SIMD_COMPILER_HAS_REQUIRED_FEATURES
+/*! @abstract A vector of two 8-bit signed (twos-complement) integers with
+ * relaxed alignment.
+ * @description In C++ and Metal, this type is also available as
+ * simd::packed::char2. The alignment of this type is that of the
+ * underlying scalar element type, so you can use it to load or store from
+ * an array of that type. */
+typedef __attribute__((__ext_vector_type__(2),__aligned__(1))) char simd_packed_char2;
+
+/*! @abstract A vector of four 8-bit signed (twos-complement) integers with
+ * relaxed alignment.
+ * @description In C++ and Metal, this type is also available as
+ * simd::packed::char4. The alignment of this type is that of the
+ * underlying scalar element type, so you can use it to load or store from
+ * an array of that type. */
+typedef __attribute__((__ext_vector_type__(4),__aligned__(1))) char simd_packed_char4;
+
+/*! @abstract A vector of eight 8-bit signed (twos-complement) integers with
+ * relaxed alignment.
+ * @description In C++ this type is also available as simd::packed::char8.
+ * This type is not available in Metal. The alignment of this type is only
+ * that of the underlying scalar element type, so you can use it to load or
+ * store from an array of that type. */
+typedef __attribute__((__ext_vector_type__(8),__aligned__(1))) char simd_packed_char8;
+
+/*! @abstract A vector of sixteen 8-bit signed (twos-complement) integers
+ * with relaxed alignment.
+ * @description In C++ this type is also available as simd::packed::char16.
+ * This type is not available in Metal. The alignment of this type is only
+ * that of the underlying scalar element type, so you can use it to load or
+ * store from an array of that type.
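+ *
+ * For example, a sketch of a relaxed-alignment load (the buffer name and
+ * index are illustrative):
+ *
+ * <pre>
+ * @textblock
+ * char bytes[17];
+ * simd_char16 v = *(simd_packed_char16 *)&bytes[1]; // 1-byte alignment ok
+ * @/textblock
+ * </pre>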
*/ +typedef __attribute__((__ext_vector_type__(16),__aligned__(1))) char simd_packed_char16; + +/*! @abstract A vector of thirty-two 8-bit signed (twos-complement) integers + * with relaxed alignment. + * @description In C++ this type is also available as simd::packed::char32. + * This type is not available in Metal. The alignment of this type is only + * that of the underlying scalar element type, so you can use it to load or + * store from an array of that type. */ +typedef __attribute__((__ext_vector_type__(32),__aligned__(1))) char simd_packed_char32; + +/*! @abstract A vector of sixty-four 8-bit signed (twos-complement) integers + * with relaxed alignment. + * @description In C++ this type is also available as simd::packed::char64. + * This type is not available in Metal. The alignment of this type is only + * that of the underlying scalar element type, so you can use it to load or + * store from an array of that type. */ +typedef __attribute__((__ext_vector_type__(64),__aligned__(1))) char simd_packed_char64; + +/*! @abstract A vector of two 8-bit unsigned integers with relaxed + * alignment. + * @description In C++ and Metal, this type is also available as + * simd::packed::uchar2. The alignment of this type is that of the + * underlying scalar element type, so you can use it to load or store from + * an array of that type. */ +typedef __attribute__((__ext_vector_type__(2),__aligned__(1))) unsigned char simd_packed_uchar2; + +/*! @abstract A vector of four 8-bit unsigned integers with relaxed + * alignment. + * @description In C++ and Metal, this type is also available as + * simd::packed::uchar4. The alignment of this type is that of the + * underlying scalar element type, so you can use it to load or store from + * an array of that type. */ +typedef __attribute__((__ext_vector_type__(4),__aligned__(1))) unsigned char simd_packed_uchar4; + +/*! @abstract A vector of eight 8-bit unsigned integers with relaxed + * alignment. + * @description In C++ this type is also available as simd::packed::uchar8. + * This type is not available in Metal. The alignment of this type is only + * that of the underlying scalar element type, so you can use it to load or + * store from an array of that type. */ +typedef __attribute__((__ext_vector_type__(8),__aligned__(1))) unsigned char simd_packed_uchar8; + +/*! @abstract A vector of sixteen 8-bit unsigned integers with relaxed + * alignment. + * @description In C++ this type is also available as + * simd::packed::uchar16. This type is not available in Metal. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef __attribute__((__ext_vector_type__(16),__aligned__(1))) unsigned char simd_packed_uchar16; + +/*! @abstract A vector of thirty-two 8-bit unsigned integers with relaxed + * alignment. + * @description In C++ this type is also available as + * simd::packed::uchar32. This type is not available in Metal. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef __attribute__((__ext_vector_type__(32),__aligned__(1))) unsigned char simd_packed_uchar32; + +/*! @abstract A vector of sixty-four 8-bit unsigned integers with relaxed + * alignment. + * @description In C++ this type is also available as + * simd::packed::uchar64. This type is not available in Metal. 
The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef __attribute__((__ext_vector_type__(64),__aligned__(1))) unsigned char simd_packed_uchar64; + +/*! @abstract A vector of two 16-bit signed (twos-complement) integers with + * relaxed alignment. + * @description In C++ and Metal, this type is also available as + * simd::packed::short2. The alignment of this type is that of the + * underlying scalar element type, so you can use it to load or store from + * an array of that type. */ +typedef __attribute__((__ext_vector_type__(2),__aligned__(2))) short simd_packed_short2; + +/*! @abstract A vector of four 16-bit signed (twos-complement) integers with + * relaxed alignment. + * @description In C++ and Metal, this type is also available as + * simd::packed::short4. The alignment of this type is that of the + * underlying scalar element type, so you can use it to load or store from + * an array of that type. */ +typedef __attribute__((__ext_vector_type__(4),__aligned__(2))) short simd_packed_short4; + +/*! @abstract A vector of eight 16-bit signed (twos-complement) integers + * with relaxed alignment. + * @description In C++ this type is also available as simd::packed::short8. + * This type is not available in Metal. The alignment of this type is only + * that of the underlying scalar element type, so you can use it to load or + * store from an array of that type. */ +typedef __attribute__((__ext_vector_type__(8),__aligned__(2))) short simd_packed_short8; + +/*! @abstract A vector of sixteen 16-bit signed (twos-complement) integers + * with relaxed alignment. + * @description In C++ this type is also available as + * simd::packed::short16. This type is not available in Metal. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef __attribute__((__ext_vector_type__(16),__aligned__(2))) short simd_packed_short16; + +/*! @abstract A vector of thirty-two 16-bit signed (twos-complement) + * integers with relaxed alignment. + * @description In C++ this type is also available as + * simd::packed::short32. This type is not available in Metal. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef __attribute__((__ext_vector_type__(32),__aligned__(2))) short simd_packed_short32; + +/*! @abstract A vector of two 16-bit unsigned integers with relaxed + * alignment. + * @description In C++ and Metal, this type is also available as + * simd::packed::ushort2. The alignment of this type is that of the + * underlying scalar element type, so you can use it to load or store from + * an array of that type. */ +typedef __attribute__((__ext_vector_type__(2),__aligned__(2))) unsigned short simd_packed_ushort2; + +/*! @abstract A vector of four 16-bit unsigned integers with relaxed + * alignment. + * @description In C++ and Metal, this type is also available as + * simd::packed::ushort4. The alignment of this type is that of the + * underlying scalar element type, so you can use it to load or store from + * an array of that type. */ +typedef __attribute__((__ext_vector_type__(4),__aligned__(2))) unsigned short simd_packed_ushort4; + +/*! @abstract A vector of eight 16-bit unsigned integers with relaxed + * alignment. + * @description In C++ this type is also available as + * simd::packed::ushort8. 
This type is not available in Metal. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef __attribute__((__ext_vector_type__(8),__aligned__(2))) unsigned short simd_packed_ushort8; + +/*! @abstract A vector of sixteen 16-bit unsigned integers with relaxed + * alignment. + * @description In C++ this type is also available as + * simd::packed::ushort16. This type is not available in Metal. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef __attribute__((__ext_vector_type__(16),__aligned__(2))) unsigned short simd_packed_ushort16; + +/*! @abstract A vector of thirty-two 16-bit unsigned integers with relaxed + * alignment. + * @description In C++ this type is also available as + * simd::packed::ushort32. This type is not available in Metal. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef __attribute__((__ext_vector_type__(32),__aligned__(2))) unsigned short simd_packed_ushort32; + +/*! @abstract A vector of two 32-bit signed (twos-complement) integers with + * relaxed alignment. + * @description In C++ and Metal, this type is also available as + * simd::packed::int2. The alignment of this type is that of the underlying + * scalar element type, so you can use it to load or store from an array of + * that type. */ +typedef __attribute__((__ext_vector_type__(2),__aligned__(4))) int simd_packed_int2; + +/*! @abstract A vector of four 32-bit signed (twos-complement) integers with + * relaxed alignment. + * @description In C++ and Metal, this type is also available as + * simd::packed::int4. The alignment of this type is that of the underlying + * scalar element type, so you can use it to load or store from an array of + * that type. */ +typedef __attribute__((__ext_vector_type__(4),__aligned__(4))) int simd_packed_int4; + +/*! @abstract A vector of eight 32-bit signed (twos-complement) integers + * with relaxed alignment. + * @description In C++ this type is also available as simd::packed::int8. + * This type is not available in Metal. The alignment of this type is only + * that of the underlying scalar element type, so you can use it to load or + * store from an array of that type. */ +typedef __attribute__((__ext_vector_type__(8),__aligned__(4))) int simd_packed_int8; + +/*! @abstract A vector of sixteen 32-bit signed (twos-complement) integers + * with relaxed alignment. + * @description In C++ this type is also available as simd::packed::int16. + * This type is not available in Metal. The alignment of this type is only + * that of the underlying scalar element type, so you can use it to load or + * store from an array of that type. */ +typedef __attribute__((__ext_vector_type__(16),__aligned__(4))) int simd_packed_int16; + +/*! @abstract A vector of two 32-bit unsigned integers with relaxed + * alignment. + * @description In C++ and Metal, this type is also available as + * simd::packed::uint2. The alignment of this type is that of the + * underlying scalar element type, so you can use it to load or store from + * an array of that type. */ +typedef __attribute__((__ext_vector_type__(2),__aligned__(4))) unsigned int simd_packed_uint2; + +/*! @abstract A vector of four 32-bit unsigned integers with relaxed + * alignment. 
+ * @description In C++ and Metal, this type is also available as + * simd::packed::uint4. The alignment of this type is that of the + * underlying scalar element type, so you can use it to load or store from + * an array of that type. */ +typedef __attribute__((__ext_vector_type__(4),__aligned__(4))) unsigned int simd_packed_uint4; + +/*! @abstract A vector of eight 32-bit unsigned integers with relaxed + * alignment. + * @description In C++ this type is also available as simd::packed::uint8. + * This type is not available in Metal. The alignment of this type is only + * that of the underlying scalar element type, so you can use it to load or + * store from an array of that type. */ +typedef __attribute__((__ext_vector_type__(8),__aligned__(4))) unsigned int simd_packed_uint8; + +/*! @abstract A vector of sixteen 32-bit unsigned integers with relaxed + * alignment. + * @description In C++ this type is also available as simd::packed::uint16. + * This type is not available in Metal. The alignment of this type is only + * that of the underlying scalar element type, so you can use it to load or + * store from an array of that type. */ +typedef __attribute__((__ext_vector_type__(16),__aligned__(4))) unsigned int simd_packed_uint16; + +/*! @abstract A vector of two 32-bit floating-point numbers with relaxed + * alignment. + * @description In C++ and Metal, this type is also available as + * simd::packed::float2. The alignment of this type is that of the + * underlying scalar element type, so you can use it to load or store from + * an array of that type. */ +typedef __attribute__((__ext_vector_type__(2),__aligned__(4))) float simd_packed_float2; + +/*! @abstract A vector of four 32-bit floating-point numbers with relaxed + * alignment. + * @description In C++ and Metal, this type is also available as + * simd::packed::float4. The alignment of this type is that of the + * underlying scalar element type, so you can use it to load or store from + * an array of that type. */ +typedef __attribute__((__ext_vector_type__(4),__aligned__(4))) float simd_packed_float4; + +/*! @abstract A vector of eight 32-bit floating-point numbers with relaxed + * alignment. + * @description In C++ this type is also available as simd::packed::float8. + * This type is not available in Metal. The alignment of this type is only + * that of the underlying scalar element type, so you can use it to load or + * store from an array of that type. */ +typedef __attribute__((__ext_vector_type__(8),__aligned__(4))) float simd_packed_float8; + +/*! @abstract A vector of sixteen 32-bit floating-point numbers with relaxed + * alignment. + * @description In C++ this type is also available as + * simd::packed::float16. This type is not available in Metal. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef __attribute__((__ext_vector_type__(16),__aligned__(4))) float simd_packed_float16; + +/*! @abstract A vector of two 64-bit signed (twos-complement) integers with + * relaxed alignment. + * @description In C++ and Metal, this type is also available as + * simd::packed::long2. The alignment of this type is that of the + * underlying scalar element type, so you can use it to load or store from + * an array of that type. 
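+ * (As the conditional compilation below selects, that alignment is eight
+ * bytes on LP64 targets and four bytes otherwise.)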
*/ +#if defined __LP64__ +typedef __attribute__((__ext_vector_type__(2),__aligned__(8))) simd_long1 simd_packed_long2; +#else +typedef __attribute__((__ext_vector_type__(2),__aligned__(4))) simd_long1 simd_packed_long2; +#endif + +/*! @abstract A vector of four 64-bit signed (twos-complement) integers with + * relaxed alignment. + * @description In C++ and Metal, this type is also available as + * simd::packed::long4. The alignment of this type is that of the + * underlying scalar element type, so you can use it to load or store from + * an array of that type. */ +#if defined __LP64__ +typedef __attribute__((__ext_vector_type__(4),__aligned__(8))) simd_long1 simd_packed_long4; +#else +typedef __attribute__((__ext_vector_type__(4),__aligned__(4))) simd_long1 simd_packed_long4; +#endif + +/*! @abstract A vector of eight 64-bit signed (twos-complement) integers + * with relaxed alignment. + * @description In C++ this type is also available as simd::packed::long8. + * This type is not available in Metal. The alignment of this type is only + * that of the underlying scalar element type, so you can use it to load or + * store from an array of that type. */ +#if defined __LP64__ +typedef __attribute__((__ext_vector_type__(8),__aligned__(8))) simd_long1 simd_packed_long8; +#else +typedef __attribute__((__ext_vector_type__(8),__aligned__(4))) simd_long1 simd_packed_long8; +#endif + +/*! @abstract A vector of two 64-bit unsigned integers with relaxed + * alignment. + * @description In C++ and Metal, this type is also available as + * simd::packed::ulong2. The alignment of this type is that of the + * underlying scalar element type, so you can use it to load or store from + * an array of that type. */ +#if defined __LP64__ +typedef __attribute__((__ext_vector_type__(2),__aligned__(8))) simd_ulong1 simd_packed_ulong2; +#else +typedef __attribute__((__ext_vector_type__(2),__aligned__(4))) simd_ulong1 simd_packed_ulong2; +#endif + +/*! @abstract A vector of four 64-bit unsigned integers with relaxed + * alignment. + * @description In C++ and Metal, this type is also available as + * simd::packed::ulong4. The alignment of this type is that of the + * underlying scalar element type, so you can use it to load or store from + * an array of that type. */ +#if defined __LP64__ +typedef __attribute__((__ext_vector_type__(4),__aligned__(8))) simd_ulong1 simd_packed_ulong4; +#else +typedef __attribute__((__ext_vector_type__(4),__aligned__(4))) simd_ulong1 simd_packed_ulong4; +#endif + +/*! @abstract A vector of eight 64-bit unsigned integers with relaxed + * alignment. + * @description In C++ this type is also available as simd::packed::ulong8. + * This type is not available in Metal. The alignment of this type is only + * that of the underlying scalar element type, so you can use it to load or + * store from an array of that type. */ +#if defined __LP64__ +typedef __attribute__((__ext_vector_type__(8),__aligned__(8))) simd_ulong1 simd_packed_ulong8; +#else +typedef __attribute__((__ext_vector_type__(8),__aligned__(4))) simd_ulong1 simd_packed_ulong8; +#endif + +/*! @abstract A vector of two 64-bit floating-point numbers with relaxed + * alignment. + * @description In C++ and Metal, this type is also available as + * simd::packed::double2. The alignment of this type is that of the + * underlying scalar element type, so you can use it to load or store from + * an array of that type. 
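+ *
+ * A sketch of the typical use, loading from a position that is only
+ * 8-byte aligned (names are illustrative):
+ *
+ * <pre>
+ * @textblock
+ * double a[3] = {0.0, 1.0, 2.0};
+ * simd_double2 v = *(simd_packed_double2 *)&a[1];
+ * @/textblock
+ * </pre>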
*/ +#if defined __LP64__ +typedef __attribute__((__ext_vector_type__(2),__aligned__(8))) double simd_packed_double2; +#else +typedef __attribute__((__ext_vector_type__(2),__aligned__(4))) double simd_packed_double2; +#endif + +/*! @abstract A vector of four 64-bit floating-point numbers with relaxed + * alignment. + * @description In C++ and Metal, this type is also available as + * simd::packed::double4. The alignment of this type is that of the + * underlying scalar element type, so you can use it to load or store from + * an array of that type. */ +#if defined __LP64__ +typedef __attribute__((__ext_vector_type__(4),__aligned__(8))) double simd_packed_double4; +#else +typedef __attribute__((__ext_vector_type__(4),__aligned__(4))) double simd_packed_double4; +#endif + +/*! @abstract A vector of eight 64-bit floating-point numbers with relaxed + * alignment. + * @description In C++ this type is also available as + * simd::packed::double8. This type is not available in Metal. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +#if defined __LP64__ +typedef __attribute__((__ext_vector_type__(8),__aligned__(8))) double simd_packed_double8; +#else +typedef __attribute__((__ext_vector_type__(8),__aligned__(4))) double simd_packed_double8; +#endif + +/* MARK: C++ vector types */ +#if defined __cplusplus +namespace simd { + namespace packed { + /*! @abstract A vector of two 8-bit signed (twos-complement) integers + * with relaxed alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_char2. The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. */ +typedef ::simd_packed_char2 char2; + + /*! @abstract A vector of four 8-bit signed (twos-complement) integers + * with relaxed alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_char4. The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. */ +typedef ::simd_packed_char4 char4; + + /*! @abstract A vector of eight 8-bit signed (twos-complement) integers + * with relaxed alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_char8. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_char8 char8; + + /*! @abstract A vector of sixteen 8-bit signed (twos-complement) + * integers with relaxed alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_char16. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_char16 char16; + + /*! @abstract A vector of thirty-two 8-bit signed (twos-complement) + * integers with relaxed alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_char32. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_char32 char32; + + /*! 
@abstract A vector of sixty-four 8-bit signed (twos-complement) + * integers with relaxed alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_char64. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_char64 char64; + + /*! @abstract A vector of two 8-bit unsigned integers with relaxed + * alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_uchar2. The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. */ +typedef ::simd_packed_uchar2 uchar2; + + /*! @abstract A vector of four 8-bit unsigned integers with relaxed + * alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_uchar4. The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. */ +typedef ::simd_packed_uchar4 uchar4; + + /*! @abstract A vector of eight 8-bit unsigned integers with relaxed + * alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_uchar8. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_uchar8 uchar8; + + /*! @abstract A vector of sixteen 8-bit unsigned integers with relaxed + * alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_uchar16. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_uchar16 uchar16; + + /*! @abstract A vector of thirty-two 8-bit unsigned integers with + * relaxed alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_uchar32. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_uchar32 uchar32; + + /*! @abstract A vector of sixty-four 8-bit unsigned integers with + * relaxed alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_uchar64. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_uchar64 uchar64; + + /*! @abstract A vector of two 16-bit signed (twos-complement) integers + * with relaxed alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_short2. The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. */ +typedef ::simd_packed_short2 short2; + + /*! @abstract A vector of four 16-bit signed (twos-complement) integers + * with relaxed alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_short4. The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. 
*/ +typedef ::simd_packed_short4 short4; + + /*! @abstract A vector of eight 16-bit signed (twos-complement) integers + * with relaxed alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_short8. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_short8 short8; + + /*! @abstract A vector of sixteen 16-bit signed (twos-complement) + * integers with relaxed alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_short16. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_short16 short16; + + /*! @abstract A vector of thirty-two 16-bit signed (twos-complement) + * integers with relaxed alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_short32. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_short32 short32; + + /*! @abstract A vector of two 16-bit unsigned integers with relaxed + * alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_ushort2. The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. */ +typedef ::simd_packed_ushort2 ushort2; + + /*! @abstract A vector of four 16-bit unsigned integers with relaxed + * alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_ushort4. The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. */ +typedef ::simd_packed_ushort4 ushort4; + + /*! @abstract A vector of eight 16-bit unsigned integers with relaxed + * alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_ushort8. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_ushort8 ushort8; + + /*! @abstract A vector of sixteen 16-bit unsigned integers with relaxed + * alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_ushort16. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_ushort16 ushort16; + + /*! @abstract A vector of thirty-two 16-bit unsigned integers with + * relaxed alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_ushort32. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_ushort32 ushort32; + + /*! @abstract A vector of two 32-bit signed (twos-complement) integers + * with relaxed alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_int2. 
The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. */ +typedef ::simd_packed_int2 int2; + + /*! @abstract A vector of four 32-bit signed (twos-complement) integers + * with relaxed alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_int4. The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. */ +typedef ::simd_packed_int4 int4; + + /*! @abstract A vector of eight 32-bit signed (twos-complement) integers + * with relaxed alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_int8. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_int8 int8; + + /*! @abstract A vector of sixteen 32-bit signed (twos-complement) + * integers with relaxed alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_int16. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_int16 int16; + + /*! @abstract A vector of two 32-bit unsigned integers with relaxed + * alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_uint2. The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. */ +typedef ::simd_packed_uint2 uint2; + + /*! @abstract A vector of four 32-bit unsigned integers with relaxed + * alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_uint4. The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. */ +typedef ::simd_packed_uint4 uint4; + + /*! @abstract A vector of eight 32-bit unsigned integers with relaxed + * alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_uint8. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_uint8 uint8; + + /*! @abstract A vector of sixteen 32-bit unsigned integers with relaxed + * alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_uint16. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_uint16 uint16; + + /*! @abstract A vector of two 32-bit floating-point numbers with relaxed + * alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_float2. The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. */ +typedef ::simd_packed_float2 float2; + + /*! @abstract A vector of four 32-bit floating-point numbers with + * relaxed alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_float4. 
The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. */ +typedef ::simd_packed_float4 float4; + + /*! @abstract A vector of eight 32-bit floating-point numbers with + * relaxed alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_float8. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_float8 float8; + + /*! @abstract A vector of sixteen 32-bit floating-point numbers with + * relaxed alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_float16. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_float16 float16; + + /*! @abstract A vector of two 64-bit signed (twos-complement) integers + * with relaxed alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_long2. The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. */ +typedef ::simd_packed_long2 long2; + + /*! @abstract A vector of four 64-bit signed (twos-complement) integers + * with relaxed alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_long4. The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. */ +typedef ::simd_packed_long4 long4; + + /*! @abstract A vector of eight 64-bit signed (twos-complement) integers + * with relaxed alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_long8. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_long8 long8; + + /*! @abstract A vector of two 64-bit unsigned integers with relaxed + * alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_ulong2. The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. */ +typedef ::simd_packed_ulong2 ulong2; + + /*! @abstract A vector of four 64-bit unsigned integers with relaxed + * alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_ulong4. The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. */ +typedef ::simd_packed_ulong4 ulong4; + + /*! @abstract A vector of eight 64-bit unsigned integers with relaxed + * alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_ulong8. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_ulong8 ulong8; + + /*! @abstract A vector of two 64-bit floating-point numbers with relaxed + * alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_double2. 
The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. */ +typedef ::simd_packed_double2 double2; + + /*! @abstract A vector of four 64-bit floating-point numbers with + * relaxed alignment. + * @description In C or Objective-C, this type is available as + * simd_packed_double4. The alignment of this type is only that of the + * underlying scalar element type, so you can use it to load or store + * from an array of that type. */ +typedef ::simd_packed_double4 double4; + + /*! @abstract A vector of eight 64-bit floating-point numbers with + * relaxed alignment. + * @description This type is not available in Metal. In C or + * Objective-C, this type is available as simd_packed_double8. The + * alignment of this type is only that of the underlying scalar element + * type, so you can use it to load or store from an array of that type. */ +typedef ::simd_packed_double8 double8; + + } /* namespace simd::packed:: */ +} /* namespace simd:: */ +#endif /* __cplusplus */ + +/* MARK: Deprecated vector types */ +/*! @group Deprecated vector types + * @discussion These are the original types used by earlier versions of the + * simd library; they are provided here for compatibility with existing source + * files. Use the new ("simd_"-prefixed) types for future development. */ +/*! @abstract A vector of two 8-bit signed (twos-complement) integers with + * relaxed alignment. + * @description This type is deprecated; you should use simd_packed_char2 + * or simd::packed::char2 instead. */ +typedef simd_packed_char2 packed_char2; + +/*! @abstract A vector of four 8-bit signed (twos-complement) integers with + * relaxed alignment. + * @description This type is deprecated; you should use simd_packed_char4 + * or simd::packed::char4 instead. */ +typedef simd_packed_char4 packed_char4; + +/*! @abstract A vector of eight 8-bit signed (twos-complement) integers with + * relaxed alignment. + * @description This type is deprecated; you should use simd_packed_char8 + * or simd::packed::char8 instead. */ +typedef simd_packed_char8 packed_char8; + +/*! @abstract A vector of sixteen 8-bit signed (twos-complement) integers + * with relaxed alignment. + * @description This type is deprecated; you should use simd_packed_char16 + * or simd::packed::char16 instead. */ +typedef simd_packed_char16 packed_char16; + +/*! @abstract A vector of thirty-two 8-bit signed (twos-complement) integers + * with relaxed alignment. + * @description This type is deprecated; you should use simd_packed_char32 + * or simd::packed::char32 instead. */ +typedef simd_packed_char32 packed_char32; + +/*! @abstract A vector of sixty-four 8-bit signed (twos-complement) integers + * with relaxed alignment. + * @description This type is deprecated; you should use simd_packed_char64 + * or simd::packed::char64 instead. */ +typedef simd_packed_char64 packed_char64; + +/*! @abstract A vector of two 8-bit unsigned integers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_uchar2 + * or simd::packed::uchar2 instead. */ +typedef simd_packed_uchar2 packed_uchar2; + +/*! @abstract A vector of four 8-bit unsigned integers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_uchar4 + * or simd::packed::uchar4 instead. */ +typedef simd_packed_uchar4 packed_uchar4; + +/*! @abstract A vector of eight 8-bit unsigned integers with relaxed + * alignment.
+ * @description This type is deprecated; you should use simd_packed_uchar8 + * or simd::packed::uchar8 instead. */ +typedef simd_packed_uchar8 packed_uchar8; + +/*! @abstract A vector of sixteen 8-bit unsigned integers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_uchar16 + * or simd::packed::uchar16 instead. */ +typedef simd_packed_uchar16 packed_uchar16; + +/*! @abstract A vector of thirty-two 8-bit unsigned integers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_uchar32 + * or simd::packed::uchar32 instead. */ +typedef simd_packed_uchar32 packed_uchar32; + +/*! @abstract A vector of sixty-four 8-bit unsigned integers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_uchar64 + * or simd::packed::uchar64 instead. */ +typedef simd_packed_uchar64 packed_uchar64; + +/*! @abstract A vector of two 16-bit signed (twos-complement) integers with + * relaxed alignment. + * @description This type is deprecated; you should use simd_packed_short2 + * or simd::packed::short2 instead. */ +typedef simd_packed_short2 packed_short2; + +/*! @abstract A vector of four 16-bit signed (twos-complement) integers with + * relaxed alignment. + * @description This type is deprecated; you should use simd_packed_short4 + * or simd::packed::short4 instead. */ +typedef simd_packed_short4 packed_short4; + +/*! @abstract A vector of eight 16-bit signed (twos-complement) integers + * with relaxed alignment. + * @description This type is deprecated; you should use simd_packed_short8 + * or simd::packed::short8 instead. */ +typedef simd_packed_short8 packed_short8; + +/*! @abstract A vector of sixteen 16-bit signed (twos-complement) integers + * with relaxed alignment. + * @description This type is deprecated; you should use simd_packed_short16 + * or simd::packed::short16 instead. */ +typedef simd_packed_short16 packed_short16; + +/*! @abstract A vector of thirty-two 16-bit signed (twos-complement) + * integers with relaxed alignment. + * @description This type is deprecated; you should use simd_packed_short32 + * or simd::packed::short32 instead. */ +typedef simd_packed_short32 packed_short32; + +/*! @abstract A vector of two 16-bit unsigned integers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_ushort2 + * or simd::packed::ushort2 instead. */ +typedef simd_packed_ushort2 packed_ushort2; + +/*! @abstract A vector of four 16-bit unsigned integers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_ushort4 + * or simd::packed::ushort4 instead. */ +typedef simd_packed_ushort4 packed_ushort4; + +/*! @abstract A vector of eight 16-bit unsigned integers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_ushort8 + * or simd::packed::ushort8 instead. */ +typedef simd_packed_ushort8 packed_ushort8; + +/*! @abstract A vector of sixteen 16-bit unsigned integers with relaxed + * alignment. + * @description This type is deprecated; you should use + * simd_packed_ushort16 or simd::packed::ushort16 instead. */ +typedef simd_packed_ushort16 packed_ushort16; + +/*! @abstract A vector of thirty-two 16-bit unsigned integers with relaxed + * alignment. + * @description This type is deprecated; you should use + * simd_packed_ushort32 or simd::packed::ushort32 instead. */ +typedef simd_packed_ushort32 packed_ushort32; + +/*! 
@abstract A vector of two 32-bit signed (twos-complement) integers with + * relaxed alignment. + * @description This type is deprecated; you should use simd_packed_int2 or + * simd::packed::int2 instead. */ +typedef simd_packed_int2 packed_int2; + +/*! @abstract A vector of four 32-bit signed (twos-complement) integers with + * relaxed alignment. + * @description This type is deprecated; you should use simd_packed_int4 or + * simd::packed::int4 instead. */ +typedef simd_packed_int4 packed_int4; + +/*! @abstract A vector of eight 32-bit signed (twos-complement) integers + * with relaxed alignment. + * @description This type is deprecated; you should use simd_packed_int8 or + * simd::packed::int8 instead. */ +typedef simd_packed_int8 packed_int8; + +/*! @abstract A vector of sixteen 32-bit signed (twos-complement) integers + * with relaxed alignment. + * @description This type is deprecated; you should use simd_packed_int16 + * or simd::packed::int16 instead. */ +typedef simd_packed_int16 packed_int16; + +/*! @abstract A vector of two 32-bit unsigned integers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_uint2 + * or simd::packed::uint2 instead. */ +typedef simd_packed_uint2 packed_uint2; + +/*! @abstract A vector of four 32-bit unsigned integers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_uint4 + * or simd::packed::uint4 instead. */ +typedef simd_packed_uint4 packed_uint4; + +/*! @abstract A vector of eight 32-bit unsigned integers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_uint8 + * or simd::packed::uint8 instead. */ +typedef simd_packed_uint8 packed_uint8; + +/*! @abstract A vector of sixteen 32-bit unsigned integers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_uint16 + * or simd::packed::uint16 instead. */ +typedef simd_packed_uint16 packed_uint16; + +/*! @abstract A vector of two 32-bit floating-point numbers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_float2 + * or simd::packed::float2 instead. */ +typedef simd_packed_float2 packed_float2; + +/*! @abstract A vector of four 32-bit floating-point numbers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_float4 + * or simd::packed::float4 instead. */ +typedef simd_packed_float4 packed_float4; + +/*! @abstract A vector of eight 32-bit floating-point numbers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_float8 + * or simd::packed::float8 instead. */ +typedef simd_packed_float8 packed_float8; + +/*! @abstract A vector of sixteen 32-bit floating-point numbers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_float16 + * or simd::packed::float16 instead. */ +typedef simd_packed_float16 packed_float16; + +/*! @abstract A vector of two 64-bit signed (twos-complement) integers with + * relaxed alignment. + * @description This type is deprecated; you should use simd_packed_long2 + * or simd::packed::long2 instead. */ +typedef simd_packed_long2 packed_long2; + +/*! @abstract A vector of four 64-bit signed (twos-complement) integers with + * relaxed alignment. + * @description This type is deprecated; you should use simd_packed_long4 + * or simd::packed::long4 instead. */ +typedef simd_packed_long4 packed_long4; + +/*! 
@abstract A vector of eight 64-bit signed (twos-complement) integers + * with relaxed alignment. + * @description This type is deprecated; you should use simd_packed_long8 + * or simd::packed::long8 instead. */ +typedef simd_packed_long8 packed_long8; + +/*! @abstract A vector of two 64-bit unsigned integers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_ulong2 + * or simd::packed::ulong2 instead. */ +typedef simd_packed_ulong2 packed_ulong2; + +/*! @abstract A vector of four 64-bit unsigned integers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_ulong4 + * or simd::packed::ulong4 instead. */ +typedef simd_packed_ulong4 packed_ulong4; + +/*! @abstract A vector of eight 64-bit unsigned integers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_ulong8 + * or simd::packed::ulong8 instead. */ +typedef simd_packed_ulong8 packed_ulong8; + +/*! @abstract A vector of two 64-bit floating-point numbers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_double2 + * or simd::packed::double2 instead. */ +typedef simd_packed_double2 packed_double2; + +/*! @abstract A vector of four 64-bit floating-point numbers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_double4 + * or simd::packed::double4 instead. */ +typedef simd_packed_double4 packed_double4; + +/*! @abstract A vector of eight 64-bit floating-point numbers with relaxed + * alignment. + * @description This type is deprecated; you should use simd_packed_double8 + * or simd::packed::double8 instead. */ +typedef simd_packed_double8 packed_double8; + +# endif /* SIMD_COMPILER_HAS_REQUIRED_FEATURES */ +#endif
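The relaxed-alignment guarantee documented above is the whole point of the packed types: they let you do vector loads and stores on arrays that are only aligned for the scalar element type. A minimal sketch of the idiom follows (not part of the header; `sum4` is a hypothetical helper introduced here for illustration):

    #include <stddef.h>
    #include <simd/simd.h>

    /* Sums each group of four floats in an arbitrary float array. v + i is
     * only guaranteed 4-byte alignment, so we load through the packed type;
     * dereferencing a simd_float4 pointer here would wrongly assume 16-byte
     * alignment. The packed value converts implicitly to simd_float4. */
    static float sum4(const float *v, size_t n) {
      float total = 0;
      for (size_t i = 0; i + 4 <= n; i += 4) {
        simd_float4 x = *(const simd_packed_float4 *)(v + i);
        total += simd_reduce_add(x);
      }
      return total;
    }

The quaternion constructors in the next header use exactly this pattern to read a simd_packed_float4 from a plain scalar array.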
\ No newline at end of file diff --git a/lib/libc/include/aarch64-macos-gnu/simd/quaternion.h b/lib/libc/include/aarch64-macos-gnu/simd/quaternion.h new file mode 100644 index 0000000000..b7c5e2909d --- /dev/null +++ b/lib/libc/include/aarch64-macos-gnu/simd/quaternion.h @@ -0,0 +1,1194 @@ +/*! @header + * This header defines functions for constructing and using quaternions. + * @copyright 2015-2016 Apple, Inc. All rights reserved. + * @unsorted */ + +#ifndef SIMD_QUATERNIONS +#define SIMD_QUATERNIONS + +#include <simd/base.h> +#if SIMD_COMPILER_HAS_REQUIRED_FEATURES +#include <simd/vector.h> +#include <simd/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* MARK: - C and Objective-C float interfaces */ + +/*! @abstract Constructs a quaternion from four scalar values. + * + * @param ix The first component of the imaginary (vector) part. + * @param iy The second component of the imaginary (vector) part. + * @param iz The third component of the imaginary (vector) part. + * + * @param r The real (scalar) part. */ +static inline SIMD_CFUNC simd_quatf simd_quaternion(float ix, float iy, float iz, float r) { + return (simd_quatf){ { ix, iy, iz, r } }; +} + +/*! @abstract Constructs a quaternion from an array of four scalars. + * + * @discussion Note that the imaginary part of the quaternion comes from + * array elements 0, 1, and 2, and the real part comes from element 3. */ +static inline SIMD_NONCONST simd_quatf simd_quaternion(const float xyzr[4]) { + return (simd_quatf){ *(const simd_packed_float4 *)xyzr }; +} + +/*! @abstract Constructs a quaternion from a four-element vector. + * + * @discussion Note that the imaginary (vector) part of the quaternion comes + * from lanes 0, 1, and 2 of the vector, and the real (scalar) part comes from + * lane 3. */ +static inline SIMD_CFUNC simd_quatf simd_quaternion(simd_float4 xyzr) { + return (simd_quatf){ xyzr }; +} + +/*! @abstract Constructs a quaternion that rotates by `angle` radians about + * `axis`. */ +static inline SIMD_CFUNC simd_quatf simd_quaternion(float angle, simd_float3 axis); + +/*! @abstract Construct a quaternion that rotates from one vector to another. + * + * @param from A normalized three-element vector. + * @param to A normalized three-element vector. + * + * @discussion The rotation axis is `simd_cross(from, to)`. If `from` and + * `to` point in opposite directions (to within machine precision), an + * arbitrary rotation axis is chosen, and the angle is pi radians. */ +static SIMD_NOINLINE simd_quatf simd_quaternion(simd_float3 from, simd_float3 to); + +/*! @abstract Construct a quaternion from a 3x3 rotation `matrix`. + * + * @discussion If `matrix` is not orthogonal with determinant 1, the result + * is undefined. */ +static SIMD_NOINLINE simd_quatf simd_quaternion(simd_float3x3 matrix); + +/*! @abstract Construct a quaternion from a 4x4 rotation `matrix`. + * + * @discussion The last row and column of the matrix are ignored. This + * function is equivalent to calling simd_quaternion with the upper-left 3x3 + * submatrix. */ +static SIMD_NOINLINE simd_quatf simd_quaternion(simd_float4x4 matrix); + +/*! @abstract The real (scalar) part of the quaternion `q`. */ +static inline SIMD_CFUNC float simd_real(simd_quatf q) { + return q.vector.w; +} + +/*! @abstract The imaginary (vector) part of the quaternion `q`. */ +static inline SIMD_CFUNC simd_float3 simd_imag(simd_quatf q) { + return q.vector.xyz; +} + +/*! @abstract The angle (in radians) of rotation represented by `q`.
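+ * For example, for any unit-length `axis` and any `t` in [0, pi], + * simd_angle(simd_quaternion(t, axis)) recovers `t` (up to rounding).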
*/ +static inline SIMD_CFUNC float simd_angle(simd_quatf q); + +/*! @abstract The normalized axis (a 3-element vector) around which the + * action of the quaternion `q` rotates. */ +static inline SIMD_CFUNC simd_float3 simd_axis(simd_quatf q); + +/*! @abstract The sum of the quaternions `p` and `q`. */ +static inline SIMD_CFUNC simd_quatf simd_add(simd_quatf p, simd_quatf q); + +/*! @abstract The difference of the quaternions `p` and `q`. */ +static inline SIMD_CFUNC simd_quatf simd_sub(simd_quatf p, simd_quatf q); + +/*! @abstract The product of the quaternions `p` and `q`. */ +static inline SIMD_CFUNC simd_quatf simd_mul(simd_quatf p, simd_quatf q); + +/*! @abstract The quaternion `q` scaled by the real value `a`. */ +static inline SIMD_CFUNC simd_quatf simd_mul(simd_quatf q, float a); + +/*! @abstract The quaternion `q` scaled by the real value `a`. */ +static inline SIMD_CFUNC simd_quatf simd_mul(float a, simd_quatf q); + +/*! @abstract The conjugate of the quaternion `q`. */ +static inline SIMD_CFUNC simd_quatf simd_conjugate(simd_quatf q); + +/*! @abstract The (multiplicative) inverse of the quaternion `q`. */ +static inline SIMD_CFUNC simd_quatf simd_inverse(simd_quatf q); + +/*! @abstract The negation (additive inverse) of the quaternion `q`. */ +static inline SIMD_CFUNC simd_quatf simd_negate(simd_quatf q); + +/*! @abstract The dot product of the quaternions `p` and `q` interpreted as + * four-dimensional vectors. */ +static inline SIMD_CFUNC float simd_dot(simd_quatf p, simd_quatf q); + +/*! @abstract The length of the quaternion `q`. */ +static inline SIMD_CFUNC float simd_length(simd_quatf q); + +/*! @abstract The unit quaternion obtained by normalizing `q`. */ +static inline SIMD_CFUNC simd_quatf simd_normalize(simd_quatf q); + +/*! @abstract Rotates the vector `v` by the quaternion `q`. */ +static inline SIMD_CFUNC simd_float3 simd_act(simd_quatf q, simd_float3 v); + +/*! @abstract Logarithm of the quaternion `q`. + * @discussion Do not call this function directly; use `log(q)` instead. + * + * We can write a quaternion `q` in the form: `r(cos(t) + sin(t)v)` where + * `r` is the length of `q`, `t` is an angle, and `v` is a unit 3-vector. + * The logarithm of `q` is `log(r) + tv`, just like the logarithm of the + * complex number `r*(cos(t) + i sin(t))` is `log(r) + it`. + * + * Note that this function is not robust against poorly-scaled non-unit + * quaternions, because it is primarily used for spline interpolation of + * unit quaternions. If you need to compute a robust logarithm of general + * quaternions, you can use the following approach: + * + * scale = simd_reduce_max(simd_abs(q.vector)); + * logq = log(simd_recip(scale)*q); + * logq.real += log(scale); + * return logq; */ +static SIMD_NOINLINE simd_quatf __tg_log(simd_quatf q); + +/*! @abstract Inverse of `log( )`; the exponential map on quaternions. + * @discussion Do not call this function directly; use `exp(q)` instead. */ +static SIMD_NOINLINE simd_quatf __tg_exp(simd_quatf q); + +/*! @abstract Spherical linear interpolation along the shortest arc between + * quaternions `q0` and `q1`. */ +static SIMD_NOINLINE simd_quatf simd_slerp(simd_quatf q0, simd_quatf q1, float t); + +/*! @abstract Spherical linear interpolation along the longest arc between + * quaternions `q0` and `q1`. */ +static SIMD_NOINLINE simd_quatf simd_slerp_longest(simd_quatf q0, simd_quatf q1, float t); + +/*! @abstract Interpolate between quaternions along a spherical cubic spline. + * + * @discussion The function interpolates between q1 and q2. 
q0 is the left + * endpoint of the previous interval, and q3 is the right endpoint of the next + * interval. Use this function to smoothly interpolate between a sequence of + * rotations. */ +static SIMD_NOINLINE simd_quatf simd_spline(simd_quatf q0, simd_quatf q1, simd_quatf q2, simd_quatf q3, float t); + +/*! @abstract Spherical cubic Bezier interpolation between quaternions. + * + * @discussion The function treats q0 ... q3 as control points and uses slerp + * in place of lerp in the De Casteljau algorithm. The endpoints of + * interpolation are thus q0 and q3, and the curve will not generally pass + * through q1 or q2. Note that the convex hull property of "standard" Bezier + * curves does not hold on the sphere. */ +static SIMD_NOINLINE simd_quatf simd_bezier(simd_quatf q0, simd_quatf q1, simd_quatf q2, simd_quatf q3, float t); + +#ifdef __cplusplus +} /* extern "C" */ +/* MARK: - C++ float interfaces */ + +namespace simd { + struct quatf : ::simd_quatf { + /*! @abstract The identity quaternion. */ + quatf( ) : ::simd_quatf(::simd_quaternion((float4){0,0,0,1})) { } + + /*! @abstract Constructs a C++ quaternion from a C quaternion. */ + quatf(::simd_quatf q) : ::simd_quatf(q) { } + + /*! @abstract Constructs a quaternion from components. */ + quatf(float ix, float iy, float iz, float r) : ::simd_quatf(::simd_quaternion(ix, iy, iz, r)) { } + + /*! @abstract Constructs a quaternion from an array of scalars. */ + quatf(const float xyzr[4]) : ::simd_quatf(::simd_quaternion(xyzr)) { } + + /*! @abstract Constructs a quaternion from a vector. */ + quatf(float4 xyzr) : ::simd_quatf(::simd_quaternion(xyzr)) { } + + /*! @abstract Quaternion representing rotation about `axis` by `angle` + * radians. */ + quatf(float angle, float3 axis) : ::simd_quatf(::simd_quaternion(angle, axis)) { } + + /*! @abstract Quaternion that rotates `from` into `to`. */ + quatf(float3 from, float3 to) : ::simd_quatf(::simd_quaternion(from, to)) { } + + /*! @abstract Constructs a quaternion from a rotation matrix. */ + quatf(::simd_float3x3 matrix) : ::simd_quatf(::simd_quaternion(matrix)) { } + + /*! @abstract Constructs a quaternion from a rotation matrix. */ + quatf(::simd_float4x4 matrix) : ::simd_quatf(::simd_quaternion(matrix)) { } + + /*! @abstract The real (scalar) part of the quaternion. */ + float real(void) const { return ::simd_real(*this); } + + /*! @abstract The imaginary (vector) part of the quaternion. */ + float3 imag(void) const { return ::simd_imag(*this); } + + /*! @abstract The angle the quaternion rotates by. */ + float angle(void) const { return ::simd_angle(*this); } + + /*! @abstract The axis the quaternion rotates about. */ + float3 axis(void) const { return ::simd_axis(*this); } + + /*! @abstract The length of the quaternion. */ + float length(void) const { return ::simd_length(*this); } + + /*! @abstract Act on the vector `v` by rotation.
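+ * For example, rotating (1, 0, 0) by quatf(M_PI/2, (float3){0, 0, 1}) + * gives approximately (0, 1, 0): a quarter turn about the z axis carries + * the x axis to the y axis.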
*/ + float3 operator()(const ::simd_float3 v) const { return ::simd_act(*this, v); } + }; + + static SIMD_CPPFUNC quatf operator+(const ::simd_quatf p, const ::simd_quatf q) { return ::simd_add(p, q); } + static SIMD_CPPFUNC quatf operator-(const ::simd_quatf p, const ::simd_quatf q) { return ::simd_sub(p, q); } + static SIMD_CPPFUNC quatf operator-(const ::simd_quatf p) { return ::simd_negate(p); } + static SIMD_CPPFUNC quatf operator*(const float r, const ::simd_quatf p) { return ::simd_mul(r, p); } + static SIMD_CPPFUNC quatf operator*(const ::simd_quatf p, const float r) { return ::simd_mul(p, r); } + static SIMD_CPPFUNC quatf operator*(const ::simd_quatf p, const ::simd_quatf q) { return ::simd_mul(p, q); } + static SIMD_CPPFUNC quatf operator/(const ::simd_quatf p, const ::simd_quatf q) { return ::simd_mul(p, ::simd_inverse(q)); } + static SIMD_CPPFUNC quatf operator+=(quatf &p, const ::simd_quatf q) { return p = p+q; } + static SIMD_CPPFUNC quatf operator-=(quatf &p, const ::simd_quatf q) { return p = p-q; } + static SIMD_CPPFUNC quatf operator*=(quatf &p, const float r) { return p = p*r; } + static SIMD_CPPFUNC quatf operator*=(quatf &p, const ::simd_quatf q) { return p = p*q; } + static SIMD_CPPFUNC quatf operator/=(quatf &p, const ::simd_quatf q) { return p = p/q; } + + /*! @abstract The conjugate of the quaternion `q`. */ + static SIMD_CPPFUNC quatf conjugate(const ::simd_quatf p) { return ::simd_conjugate(p); } + + /*! @abstract The (multiplicative) inverse of the quaternion `q`. */ + static SIMD_CPPFUNC quatf inverse(const ::simd_quatf p) { return ::simd_inverse(p); } + + /*! @abstract The dot product of the quaternions `p` and `q` interpreted as + * four-dimensional vectors. */ + static SIMD_CPPFUNC float dot(const ::simd_quatf p, const ::simd_quatf q) { return ::simd_dot(p, q); } + + /*! @abstract The unit quaternion obtained by normalizing `q`. */ + static SIMD_CPPFUNC quatf normalize(const ::simd_quatf p) { return ::simd_normalize(p); } + + /*! @abstract Logarithm of the quaternion `q`. */ + static SIMD_CPPFUNC quatf log(const ::simd_quatf q) { return ::__tg_log(q); } + + /*! @abstract Exponential map of the quaternion `q`. */ + static SIMD_CPPFUNC quatf exp(const ::simd_quatf q) { return ::__tg_exp(q); } + + /*! @abstract Spherical linear interpolation along the shortest arc between + * quaternions `q0` and `q1`. */ + static SIMD_CPPFUNC quatf slerp(const ::simd_quatf p0, const ::simd_quatf p1, float t) { return ::simd_slerp(p0, p1, t); } + + /*! @abstract Spherical linear interpolation along the longest arc between + * quaternions `q0` and `q1`. */ + static SIMD_CPPFUNC quatf slerp_longest(const ::simd_quatf p0, const ::simd_quatf p1, float t) { return ::simd_slerp_longest(p0, p1, t); } + + /*! @abstract Interpolate between quaternions along a spherical cubic spline. + * + * @discussion The function interpolates between q1 and q2. q0 is the left + * endpoint of the previous interval, and q3 is the right endpoint of the next + * interval. Use this function to smoothly interpolate between a sequence of + * rotations. */ + static SIMD_CPPFUNC quatf spline(const ::simd_quatf p0, const ::simd_quatf p1, const ::simd_quatf p2, const ::simd_quatf p3, float t) { return ::simd_spline(p0, p1, p2, p3, t); } + + /*! @abstract Spherical cubic Bezier interpolation between quaternions. + * + * @discussion The function treats q0 ... q3 as control points and uses slerp + * in place of lerp in the De Casteljau algorithm.
The endpoints of + * interpolation are thus q0 and q3, and the curve will not generally pass + * through q1 or q2. Note that the convex hull property of "standard" Bezier + * curves does not hold on the sphere. */ + static SIMD_CPPFUNC quatf bezier(const ::simd_quatf p0, const ::simd_quatf p1, const ::simd_quatf p2, const ::simd_quatf p3, float t) { return ::simd_bezier(p0, p1, p2, p3, t); } +} + +extern "C" { +#endif /* __cplusplus */ + +/* MARK: - float implementations */ + +#include <simd/math.h> +#include <simd/geometry.h> + +/* tg_promote is implementation gobbledygook that enables the compile-time + * dispatching in tgmath.h to work its magic. */ +static simd_quatf __attribute__((__overloadable__)) __tg_promote(simd_quatf); + +/*! @abstract Constructs a quaternion from imaginary and real parts. + * @discussion This function is hidden behind an underscore to avoid confusion + * with the angle-axis constructor. */ +static inline SIMD_CFUNC simd_quatf _simd_quaternion(simd_float3 imag, float real) { + return simd_quaternion(simd_make_float4(imag, real)); +} + +static inline SIMD_CFUNC simd_quatf simd_quaternion(float angle, simd_float3 axis) { + return _simd_quaternion(sin(angle/2) * axis, cos(angle/2)); +} + +static inline SIMD_CFUNC float simd_angle(simd_quatf q) { + return 2*atan2(simd_length(q.vector.xyz), q.vector.w); +} + +static inline SIMD_CFUNC simd_float3 simd_axis(simd_quatf q) { + return simd_normalize(q.vector.xyz); +} + +static inline SIMD_CFUNC simd_quatf simd_add(simd_quatf p, simd_quatf q) { + return simd_quaternion(p.vector + q.vector); +} + +static inline SIMD_CFUNC simd_quatf simd_sub(simd_quatf p, simd_quatf q) { + return simd_quaternion(p.vector - q.vector); +} + +static inline SIMD_CFUNC simd_quatf simd_mul(simd_quatf p, simd_quatf q) { + #pragma STDC FP_CONTRACT ON + return simd_quaternion((p.vector.x * __builtin_shufflevector(q.vector, -q.vector, 3,6,1,4) + + p.vector.y * __builtin_shufflevector(q.vector, -q.vector, 2,3,4,5)) + + (p.vector.z * __builtin_shufflevector(q.vector, -q.vector, 5,0,3,6) + + p.vector.w * q.vector)); +} + +static inline SIMD_CFUNC simd_quatf simd_mul(simd_quatf q, float a) { + return simd_quaternion(a * q.vector); +} + +static inline SIMD_CFUNC simd_quatf simd_mul(float a, simd_quatf q) { + return simd_mul(q,a); +} + +static inline SIMD_CFUNC simd_quatf simd_conjugate(simd_quatf q) { + return simd_quaternion(q.vector * (simd_float4){-1,-1,-1, 1}); +} + +static inline SIMD_CFUNC simd_quatf simd_inverse(simd_quatf q) { + return simd_quaternion(simd_conjugate(q).vector * simd_recip(simd_length_squared(q.vector))); +} + +static inline SIMD_CFUNC simd_quatf simd_negate(simd_quatf q) { + return simd_quaternion(-q.vector); +} + +static inline SIMD_CFUNC float simd_dot(simd_quatf p, simd_quatf q) { + return simd_dot(p.vector, q.vector); +} + +static inline SIMD_CFUNC float simd_length(simd_quatf q) { + return simd_length(q.vector); +} + +static inline SIMD_CFUNC simd_quatf simd_normalize(simd_quatf q) { + float length_squared = simd_length_squared(q.vector); + if (length_squared == 0) { + return simd_quaternion((simd_float4){0,0,0,1}); + } + return simd_quaternion(q.vector * simd_rsqrt(length_squared)); +} + +#if defined __arm__ || defined __arm64__ +/*! @abstract Multiplies the vector `v` by the quaternion `q`. + * + * @discussion This IS NOT the action of `q` on `v` (i.e. this is not rotation + * by `q`); that operation is provided by `simd_act(q, v)`. This function is an + * implementation detail and you should not call it directly.
It may be + * removed or modified in future versions of the simd module. */ +static inline SIMD_CFUNC simd_quatf _simd_mul_vq(simd_float3 v, simd_quatf q) { + #pragma STDC FP_CONTRACT ON + return simd_quaternion(v.x * __builtin_shufflevector(q.vector, -q.vector, 3,6,1,4) + + v.y * __builtin_shufflevector(q.vector, -q.vector, 2,3,4,5) + + v.z * __builtin_shufflevector(q.vector, -q.vector, 5,0,3,6)); +} +#endif + +static inline SIMD_CFUNC simd_float3 simd_act(simd_quatf q, simd_float3 v) { +#if defined __arm__ || defined __arm64__ + return simd_mul(q, _simd_mul_vq(v, simd_conjugate(q))).vector.xyz; +#else + #pragma STDC FP_CONTRACT ON + simd_float3 t = 2*simd_cross(simd_imag(q),v); + return v + simd_real(q)*t + simd_cross(simd_imag(q), t); +#endif +} + +static SIMD_NOINLINE simd_quatf __tg_log(simd_quatf q) { + float real = __tg_log(simd_length_squared(q.vector))/2; + if (simd_equal(simd_imag(q), 0)) return _simd_quaternion(0, real); + simd_float3 imag = __tg_acos(simd_real(q)/simd_length(q)) * simd_normalize(simd_imag(q)); + return _simd_quaternion(imag, real); +} + +static SIMD_NOINLINE simd_quatf __tg_exp(simd_quatf q) { + // angle is actually *twice* the angle of the rotation corresponding to + // the resulting quaternion, which is why we don't simply use the (angle, + // axis) constructor to generate `unit`. + float angle = simd_length(simd_imag(q)); + if (angle == 0) return _simd_quaternion(0, exp(simd_real(q))); + simd_float3 axis = simd_normalize(simd_imag(q)); + simd_quatf unit = _simd_quaternion(sin(angle)*axis, cosf(angle)); + return simd_mul(exp(simd_real(q)), unit); +} + +/*! @abstract Implementation detail of the `simd_quaternion(from, to)` + * initializer. + * + * @discussion Computes the quaternion rotation `from` to `to` if they are + * separated by less than 90 degrees. Not numerically stable for larger + * angles. This function is an implementation detail and you should not + * call it directly. It may be removed or modified in future versions of the + * simd module. */ +static inline SIMD_CFUNC simd_quatf _simd_quaternion_reduced(simd_float3 from, simd_float3 to) { + simd_float3 half = simd_normalize(from + to); + return _simd_quaternion(simd_cross(from, half), simd_dot(from, half)); +} + +static SIMD_NOINLINE simd_quatf simd_quaternion(simd_float3 from, simd_float3 to) { + + // If the angle between from and to is not too big, we can compute the + // rotation accurately using a simple implementation. + if (simd_dot(from, to) >= 0) { + return _simd_quaternion_reduced(from, to); + } + + // Because from and to are more than 90 degrees apart, we compute the + // rotation in two stages (from -> half), (half -> to) to preserve numerical + // accuracy. + simd_float3 half = from + to; + + if (simd_length_squared(half) == 0) { + // half is nearly zero, so from and to point in nearly opposite directions + // and the rotation is numerically underspecified. Pick an axis orthogonal + // to the vectors, and use an angle of pi radians. + simd_float3 abs_from = simd_abs(from); + if (abs_from.x <= abs_from.y && abs_from.x <= abs_from.z) + return _simd_quaternion(simd_normalize(simd_cross(from, (simd_float3){1,0,0})), 0.f); + else if (abs_from.y <= abs_from.z) + return _simd_quaternion(simd_normalize(simd_cross(from, (simd_float3){0,1,0})), 0.f); + else + return _simd_quaternion(simd_normalize(simd_cross(from, (simd_float3){0,0,1})), 0.f); + } + + // Compute the two-step rotation. 
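+ // half is nonzero here: the opposite-direction case returned early + // above, so this normalization is well-defined.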
+ half = simd_normalize(half); + return simd_mul(_simd_quaternion_reduced(from, half), + _simd_quaternion_reduced(half, to)); +} + +static SIMD_NOINLINE simd_quatf simd_quaternion(simd_float3x3 matrix) { + const simd_float3 *mat = matrix.columns; + float trace = mat[0][0] + mat[1][1] + mat[2][2]; + if (trace >= 0.0) { + float r = 2*sqrt(1 + trace); + float rinv = simd_recip(r); + return simd_quaternion(rinv*(mat[1][2] - mat[2][1]), + rinv*(mat[2][0] - mat[0][2]), + rinv*(mat[0][1] - mat[1][0]), + r/4); + } else if (mat[0][0] >= mat[1][1] && mat[0][0] >= mat[2][2]) { + float r = 2*sqrt(1 - mat[1][1] - mat[2][2] + mat[0][0]); + float rinv = simd_recip(r); + return simd_quaternion(r/4, + rinv*(mat[0][1] + mat[1][0]), + rinv*(mat[0][2] + mat[2][0]), + rinv*(mat[1][2] - mat[2][1])); + } else if (mat[1][1] >= mat[2][2]) { + float r = 2*sqrt(1 - mat[0][0] - mat[2][2] + mat[1][1]); + float rinv = simd_recip(r); + return simd_quaternion(rinv*(mat[0][1] + mat[1][0]), + r/4, + rinv*(mat[1][2] + mat[2][1]), + rinv*(mat[2][0] - mat[0][2])); + } else { + float r = 2*sqrt(1 - mat[0][0] - mat[1][1] + mat[2][2]); + float rinv = simd_recip(r); + return simd_quaternion(rinv*(mat[0][2] + mat[2][0]), + rinv*(mat[1][2] + mat[2][1]), + r/4, + rinv*(mat[0][1] - mat[1][0])); + } +} + +static SIMD_NOINLINE simd_quatf simd_quaternion(simd_float4x4 matrix) { + const simd_float4 *mat = matrix.columns; + float trace = mat[0][0] + mat[1][1] + mat[2][2]; + if (trace >= 0.0) { + float r = 2*sqrt(1 + trace); + float rinv = simd_recip(r); + return simd_quaternion(rinv*(mat[1][2] - mat[2][1]), + rinv*(mat[2][0] - mat[0][2]), + rinv*(mat[0][1] - mat[1][0]), + r/4); + } else if (mat[0][0] >= mat[1][1] && mat[0][0] >= mat[2][2]) { + float r = 2*sqrt(1 - mat[1][1] - mat[2][2] + mat[0][0]); + float rinv = simd_recip(r); + return simd_quaternion(r/4, + rinv*(mat[0][1] + mat[1][0]), + rinv*(mat[0][2] + mat[2][0]), + rinv*(mat[1][2] - mat[2][1])); + } else if (mat[1][1] >= mat[2][2]) { + float r = 2*sqrt(1 - mat[0][0] - mat[2][2] + mat[1][1]); + float rinv = simd_recip(r); + return simd_quaternion(rinv*(mat[0][1] + mat[1][0]), + r/4, + rinv*(mat[1][2] + mat[2][1]), + rinv*(mat[2][0] - mat[0][2])); + } else { + float r = 2*sqrt(1 - mat[0][0] - mat[1][1] + mat[2][2]); + float rinv = simd_recip(r); + return simd_quaternion(rinv*(mat[0][2] + mat[2][0]), + rinv*(mat[1][2] + mat[2][1]), + r/4, + rinv*(mat[0][1] - mat[1][0])); + } +} + +/*! @abstract The angle between p and q interpreted as 4-dimensional vectors. + * + * @discussion This function is an implementation detail and you should not + * call it directly. It may be removed or modified in future versions of the + * simd module. */ +static SIMD_NOINLINE float _simd_angle(simd_quatf p, simd_quatf q) { + return 2*atan2(simd_length(p.vector - q.vector), simd_length(p.vector + q.vector)); +} + +/*! @abstract sin(x)/x. + * + * @discussion This function is an implementation detail and you should not + * call it directly. It may be removed or modified in future versions of the + * simd module. */ +static SIMD_CFUNC float _simd_sinc(float x) { + if (x == 0) return 1; + return sin(x)/x; +} + +/*! @abstract Spherical lerp between q0 and q1. + * + * @discussion This function may interpolate along either the longer or + * shorter path between q0 and q1; it is used as an implementation detail + * in `simd_slerp` and `simd_slerp_longest`; you should use those functions + * instead of calling this directly.
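+ * Numerically it evaluates the standard slerp formula + * (sin((1-t)a)*q0 + sin(t*a)*q1)/sin(a), where `a` is the angle between + * q0 and q1; writing the weights with sinc keeps them well-defined as + * `a` approaches zero, where the result reduces to a normalized lerp.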
*/ +static SIMD_NOINLINE simd_quatf _simd_slerp_internal(simd_quatf q0, simd_quatf q1, float t) { + float s = 1 - t; + float a = _simd_angle(q0, q1); + float r = simd_recip(_simd_sinc(a)); + return simd_normalize(simd_quaternion(_simd_sinc(s*a)*r*s*q0.vector + _simd_sinc(t*a)*r*t*q1.vector)); +} + +static SIMD_NOINLINE simd_quatf simd_slerp(simd_quatf q0, simd_quatf q1, float t) { + if (simd_dot(q0, q1) >= 0) + return _simd_slerp_internal(q0, q1, t); + return _simd_slerp_internal(q0, simd_negate(q1), t); +} + +static SIMD_NOINLINE simd_quatf simd_slerp_longest(simd_quatf q0, simd_quatf q1, float t) { + if (simd_dot(q0, q1) >= 0) + return _simd_slerp_internal(q0, simd_negate(q1), t); + return _simd_slerp_internal(q0, q1, t); +} + +/*! @discussion This function is an implementation detail and you should not + * call it directly. It may be removed or modified in future versions of the + * simd module. */ +static SIMD_NOINLINE simd_quatf _simd_intermediate(simd_quatf q0, simd_quatf q1, simd_quatf q2) { + simd_quatf p0 = __tg_log(simd_mul(q0, simd_inverse(q1))); + simd_quatf p2 = __tg_log(simd_mul(q2, simd_inverse(q1))); + return simd_normalize(simd_mul(q1, __tg_exp(simd_mul(-0.25, simd_add(p0,p2))))); +} + +/*! @discussion This function is an implementation detail and you should not + * call it directly. It may be removed or modified in future versions of the + * simd module. */ +static SIMD_NOINLINE simd_quatf _simd_squad(simd_quatf q0, simd_quatf qa, simd_quatf qb, simd_quatf q1, float t) { + simd_quatf r0 = _simd_slerp_internal(q0, q1, t); + simd_quatf r1 = _simd_slerp_internal(qa, qb, t); + return _simd_slerp_internal(r0, r1, 2*t*(1 - t)); +} + +static SIMD_NOINLINE simd_quatf simd_spline(simd_quatf q0, simd_quatf q1, simd_quatf q2, simd_quatf q3, float t) { + simd_quatf qa = _simd_intermediate(q0, q1, q2); + simd_quatf qb = _simd_intermediate(q1, q2, q3); + return _simd_squad(q1, qa, qb, q2, t); +} + +static SIMD_NOINLINE simd_quatf simd_bezier(simd_quatf q0, simd_quatf q1, simd_quatf q2, simd_quatf q3, float t) { + simd_quatf q01 = _simd_slerp_internal(q0, q1, t); + simd_quatf q12 = _simd_slerp_internal(q1, q2, t); + simd_quatf q23 = _simd_slerp_internal(q2, q3, t); + simd_quatf q012 = _simd_slerp_internal(q01, q12, t); + simd_quatf q123 = _simd_slerp_internal(q12, q23, t); + return _simd_slerp_internal(q012, q123, t); +} + +/* MARK: - C and Objective-C double interfaces */ + +/*! @abstract Constructs a quaternion from four scalar values. + * + * @param ix The first component of the imaginary (vector) part. + * @param iy The second component of the imaginary (vector) part. + * @param iz The third component of the imaginary (vector) part. + * + * @param r The real (scalar) part. */ +static inline SIMD_CFUNC simd_quatd simd_quaternion(double ix, double iy, double iz, double r) { + return (simd_quatd){ { ix, iy, iz, r } }; +} + +/*! @abstract Constructs a quaternion from an array of four scalars. + * + * @discussion Note that the imaginary part of the quaternion comes from + * array elements 0, 1, and 2, and the real part comes from element 3. */ +static inline SIMD_NONCONST simd_quatd simd_quaternion(const double xyzr[4]) { + return (simd_quatd){ *(const simd_packed_double4 *)xyzr }; +} + +/*! @abstract Constructs a quaternion from a four-element vector. + * + * @discussion Note that the imaginary (vector) part of the quaternion comes + * from lanes 0, 1, and 2 of the vector, and the real (scalar) part comes from + * lane 3. 
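+ * For example, simd_quaternion((simd_double4){0, 0, 0, 1}) is the + * identity quaternion.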
*/ +static inline SIMD_CFUNC simd_quatd simd_quaternion(simd_double4 xyzr) { + return (simd_quatd){ xyzr }; +} + +/*! @abstract Constructs a quaternion that rotates by `angle` radians about + * `axis`. */ +static inline SIMD_CFUNC simd_quatd simd_quaternion(double angle, simd_double3 axis); + +/*! @abstract Construct a quaternion that rotates from one vector to another. + * + * @param from A normalized three-element vector. + * @param to A normalized three-element vector. + * + * @discussion The rotation axis is `simd_cross(from, to)`. If `from` and + * `to` point in opposite directions (to within machine precision), an + * arbitrary rotation axis is chosen, and the angle is pi radians. */ +static SIMD_NOINLINE simd_quatd simd_quaternion(simd_double3 from, simd_double3 to); + +/*! @abstract Construct a quaternion from a 3x3 rotation `matrix`. + * + * @discussion If `matrix` is not orthogonal with determinant 1, the result + * is undefined. */ +static SIMD_NOINLINE simd_quatd simd_quaternion(simd_double3x3 matrix); + +/*! @abstract Construct a quaternion from a 4x4 rotation `matrix`. + * + * @discussion The last row and column of the matrix are ignored. This + * function is equivalent to calling simd_quaternion with the upper-left 3x3 + * submatrix. */ +static SIMD_NOINLINE simd_quatd simd_quaternion(simd_double4x4 matrix); + +/*! @abstract The real (scalar) part of the quaternion `q`. */ +static inline SIMD_CFUNC double simd_real(simd_quatd q) { + return q.vector.w; +} + +/*! @abstract The imaginary (vector) part of the quaternion `q`. */ +static inline SIMD_CFUNC simd_double3 simd_imag(simd_quatd q) { + return q.vector.xyz; +} + +/*! @abstract The angle (in radians) of rotation represented by `q`. */ +static inline SIMD_CFUNC double simd_angle(simd_quatd q); + +/*! @abstract The normalized axis (a 3-element vector) around which the + * action of the quaternion `q` rotates. */ +static inline SIMD_CFUNC simd_double3 simd_axis(simd_quatd q); + +/*! @abstract The sum of the quaternions `p` and `q`. */ +static inline SIMD_CFUNC simd_quatd simd_add(simd_quatd p, simd_quatd q); + +/*! @abstract The difference of the quaternions `p` and `q`. */ +static inline SIMD_CFUNC simd_quatd simd_sub(simd_quatd p, simd_quatd q); + +/*! @abstract The product of the quaternions `p` and `q`. */ +static inline SIMD_CFUNC simd_quatd simd_mul(simd_quatd p, simd_quatd q); + +/*! @abstract The quaternion `q` scaled by the real value `a`. */ +static inline SIMD_CFUNC simd_quatd simd_mul(simd_quatd q, double a); + +/*! @abstract The quaternion `q` scaled by the real value `a`. */ +static inline SIMD_CFUNC simd_quatd simd_mul(double a, simd_quatd q); + +/*! @abstract The conjugate of the quaternion `q`. */ +static inline SIMD_CFUNC simd_quatd simd_conjugate(simd_quatd q); + +/*! @abstract The (multiplicative) inverse of the quaternion `q`. */ +static inline SIMD_CFUNC simd_quatd simd_inverse(simd_quatd q); + +/*! @abstract The negation (additive inverse) of the quaternion `q`. */ +static inline SIMD_CFUNC simd_quatd simd_negate(simd_quatd q); + +/*! @abstract The dot product of the quaternions `p` and `q` interpreted as + * four-dimensional vectors. */ +static inline SIMD_CFUNC double simd_dot(simd_quatd p, simd_quatd q); + +/*! @abstract The length of the quaternion `q`. */ +static inline SIMD_CFUNC double simd_length(simd_quatd q); + +/*! @abstract The unit quaternion obtained by normalizing `q`. */ +static inline SIMD_CFUNC simd_quatd simd_normalize(simd_quatd q); + +/*!
+/*! @abstract Rotates the vector `v` by the quaternion `q`. */
+static inline SIMD_CFUNC simd_double3 simd_act(simd_quatd q, simd_double3 v);
+
+/*! @abstract Logarithm of the quaternion `q`.
+ * @discussion Do not call this function directly; use `log(q)` instead.
+ *
+ * We can write a quaternion `q` in the form: `r(cos(t) + sin(t)v)` where
+ * `r` is the length of `q`, `t` is an angle, and `v` is a unit 3-vector.
+ * The logarithm of `q` is `log(r) + tv`, just like the logarithm of the
+ * complex number `r*(cos(t) + i sin(t))` is `log(r) + it`.
+ *
+ * Note that this function is not robust against poorly-scaled non-unit
+ * quaternions, because it is primarily used for spline interpolation of
+ * unit quaternions. If you need to compute a robust logarithm of general
+ * quaternions, you can use the following approach:
+ *
+ *   scale = simd_reduce_max(simd_abs(q.vector));
+ *   logq = log(simd_recip(scale)*q);
+ *   logq.real += log(scale);
+ *   return logq; */
+static SIMD_NOINLINE simd_quatd __tg_log(simd_quatd q);
+
+/*! @abstract Inverse of `log( )`; the exponential map on quaternions.
+ * @discussion Do not call this function directly; use `exp(q)` instead. */
+static SIMD_NOINLINE simd_quatd __tg_exp(simd_quatd q);
+
+/*! @abstract Spherical linear interpolation along the shortest arc between
+ * quaternions `q0` and `q1`. */
+static SIMD_NOINLINE simd_quatd simd_slerp(simd_quatd q0, simd_quatd q1, double t);
+
+/*! @abstract Spherical linear interpolation along the longest arc between
+ * quaternions `q0` and `q1`. */
+static SIMD_NOINLINE simd_quatd simd_slerp_longest(simd_quatd q0, simd_quatd q1, double t);
+
+/*! @abstract Interpolate between quaternions along a spherical cubic spline.
+ *
+ * @discussion The function interpolates between q1 and q2. q0 is the left
+ * endpoint of the previous interval, and q3 is the right endpoint of the next
+ * interval. Use this function to smoothly interpolate between a sequence of
+ * rotations. */
+static SIMD_NOINLINE simd_quatd simd_spline(simd_quatd q0, simd_quatd q1, simd_quatd q2, simd_quatd q3, double t);
+
+/*! @abstract Spherical cubic Bezier interpolation between quaternions.
+ *
+ * @discussion The function treats q0 ... q3 as control points and uses slerp
+ * in place of lerp in the De Casteljau algorithm. The endpoints of
+ * interpolation are thus q0 and q3, and the curve will not generally pass
+ * through q1 or q2. Note that the convex hull property of "standard" Bezier
+ * curves does not hold on the sphere. */
+static SIMD_NOINLINE simd_quatd simd_bezier(simd_quatd q0, simd_quatd q1, simd_quatd q2, simd_quatd q3, double t);
+
+#ifdef __cplusplus
+} /* extern "C" */
+/* MARK: - C++ double interfaces */
+
+namespace simd {
+  struct quatd : ::simd_quatd {
+    /*! @abstract The identity quaternion. */
+    quatd( ) : ::simd_quatd(::simd_quaternion((double4){0,0,0,1})) { }
+
+    /*! @abstract Constructs a C++ quaternion from a C quaternion. */
+    quatd(::simd_quatd q) : ::simd_quatd(q) { }
+
+    /*! @abstract Constructs a quaternion from components. */
+    quatd(double ix, double iy, double iz, double r) : ::simd_quatd(::simd_quaternion(ix, iy, iz, r)) { }
+
+    /*! @abstract Constructs a quaternion from an array of scalars. */
+    quatd(const double xyzr[4]) : ::simd_quatd(::simd_quaternion(xyzr)) { }
+
+    /*! @abstract Constructs a quaternion from a vector. */
+    quatd(double4 xyzr) : ::simd_quatd(::simd_quaternion(xyzr)) { }
+
+    /*! @abstract Quaternion representing rotation about `axis` by `angle`
+     * radians. */
+    quatd(double angle, double3 axis) : ::simd_quatd(::simd_quaternion(angle, axis)) { }
+
+    /*! @abstract Quaternion that rotates `from` into `to`. */
+    quatd(double3 from, double3 to) : ::simd_quatd(::simd_quaternion(from, to)) { }
+
+    /*! @abstract Constructs a quaternion from a rotation matrix. */
+    quatd(::simd_double3x3 matrix) : ::simd_quatd(::simd_quaternion(matrix)) { }
+
+    /*! @abstract Constructs a quaternion from a rotation matrix. */
+    quatd(::simd_double4x4 matrix) : ::simd_quatd(::simd_quaternion(matrix)) { }
+
+    /*! @abstract The real (scalar) part of the quaternion. */
+    double real(void) const { return ::simd_real(*this); }
+
+    /*! @abstract The imaginary (vector) part of the quaternion. */
+    double3 imag(void) const { return ::simd_imag(*this); }
+
+    /*! @abstract The angle the quaternion rotates by. */
+    double angle(void) const { return ::simd_angle(*this); }
+
+    /*! @abstract The axis the quaternion rotates about. */
+    double3 axis(void) const { return ::simd_axis(*this); }
+
+    /*! @abstract The length of the quaternion. */
+    double length(void) const { return ::simd_length(*this); }
+
+    /*! @abstract Act on the vector `v` by rotation. */
+    double3 operator()(const ::simd_double3 v) const { return ::simd_act(*this, v); }
+  };
+
+  static SIMD_CPPFUNC quatd operator+(const ::simd_quatd p, const ::simd_quatd q) { return ::simd_add(p, q); }
+  static SIMD_CPPFUNC quatd operator-(const ::simd_quatd p, const ::simd_quatd q) { return ::simd_sub(p, q); }
+  static SIMD_CPPFUNC quatd operator-(const ::simd_quatd p) { return ::simd_negate(p); }
+  static SIMD_CPPFUNC quatd operator*(const double r, const ::simd_quatd p) { return ::simd_mul(r, p); }
+  static SIMD_CPPFUNC quatd operator*(const ::simd_quatd p, const double r) { return ::simd_mul(p, r); }
+  static SIMD_CPPFUNC quatd operator*(const ::simd_quatd p, const ::simd_quatd q) { return ::simd_mul(p, q); }
+  static SIMD_CPPFUNC quatd operator/(const ::simd_quatd p, const ::simd_quatd q) { return ::simd_mul(p, ::simd_inverse(q)); }
+  static SIMD_CPPFUNC quatd operator+=(quatd &p, const ::simd_quatd q) { return p = p+q; }
+  static SIMD_CPPFUNC quatd operator-=(quatd &p, const ::simd_quatd q) { return p = p-q; }
+  static SIMD_CPPFUNC quatd operator*=(quatd &p, const double r) { return p = p*r; }
+  static SIMD_CPPFUNC quatd operator*=(quatd &p, const ::simd_quatd q) { return p = p*q; }
+  static SIMD_CPPFUNC quatd operator/=(quatd &p, const ::simd_quatd q) { return p = p/q; }
+
+  /*! @abstract The conjugate of the quaternion `q`. */
+  static SIMD_CPPFUNC quatd conjugate(const ::simd_quatd p) { return ::simd_conjugate(p); }
+
+  /*! @abstract The (multiplicative) inverse of the quaternion `q`. */
+  static SIMD_CPPFUNC quatd inverse(const ::simd_quatd p) { return ::simd_inverse(p); }
+
+  /*! @abstract The dot product of the quaternions `p` and `q` interpreted as
+   * four-dimensional vectors. */
+  static SIMD_CPPFUNC double dot(const ::simd_quatd p, const ::simd_quatd q) { return ::simd_dot(p, q); }
+
+  /*! @abstract The unit quaternion obtained by normalizing `q`. */
+  static SIMD_CPPFUNC quatd normalize(const ::simd_quatd p) { return ::simd_normalize(p); }
+
+  /*! @abstract Logarithm of the quaternion `q`. */
+  static SIMD_CPPFUNC quatd log(const ::simd_quatd q) { return ::__tg_log(q); }
+
+  /*! @abstract Exponential map of the quaternion `q`. */
+  static SIMD_CPPFUNC quatd exp(const ::simd_quatd q) { return ::__tg_exp(q); }
+
+  /*! @abstract Spherical linear interpolation along the shortest arc between
+   * quaternions `q0` and `q1`. */
+  static SIMD_CPPFUNC quatd slerp(const ::simd_quatd p0, const ::simd_quatd p1, double t) { return ::simd_slerp(p0, p1, t); }
+
+  /*! @abstract Spherical linear interpolation along the longest arc between
+   * quaternions `q0` and `q1`. */
+  static SIMD_CPPFUNC quatd slerp_longest(const ::simd_quatd p0, const ::simd_quatd p1, double t) { return ::simd_slerp_longest(p0, p1, t); }
+
+  /*! @abstract Interpolate between quaternions along a spherical cubic spline.
+   *
+   * @discussion The function interpolates between q1 and q2. q0 is the left
+   * endpoint of the previous interval, and q3 is the right endpoint of the next
+   * interval. Use this function to smoothly interpolate between a sequence of
+   * rotations. */
+  static SIMD_CPPFUNC quatd spline(const ::simd_quatd p0, const ::simd_quatd p1, const ::simd_quatd p2, const ::simd_quatd p3, double t) { return ::simd_spline(p0, p1, p2, p3, t); }
+
+  /*! @abstract Spherical cubic Bezier interpolation between quaternions.
+   *
+   * @discussion The function treats q0 ... q3 as control points and uses slerp
+   * in place of lerp in the De Casteljau algorithm. The endpoints of
+   * interpolation are thus q0 and q3, and the curve will not generally pass
+   * through q1 or q2. Note that the convex hull property of "standard" Bezier
+   * curves does not hold on the sphere. */
+  static SIMD_CPPFUNC quatd bezier(const ::simd_quatd p0, const ::simd_quatd p1, const ::simd_quatd p2, const ::simd_quatd p3, double t) { return ::simd_bezier(p0, p1, p2, p3, t); }
+}
+
+extern "C" {
+#endif /* __cplusplus */
+
+/* MARK: - double implementations */
+
+#include <simd/math.h>
+#include <simd/geometry.h>
+
+/* tg_promote is implementation gobbledygook that enables the compile-time
+ * dispatching in tgmath.h to work its magic. */
+static simd_quatd __attribute__((__overloadable__)) __tg_promote(simd_quatd);
+
+/*! @abstract Constructs a quaternion from imaginary and real parts.
+ * @discussion This function is hidden behind an underscore to avoid confusion
+ * with the angle-axis constructor. */
+static inline SIMD_CFUNC simd_quatd _simd_quaternion(simd_double3 imag, double real) {
+  return simd_quaternion(simd_make_double4(imag, real));
+}
+
+static inline SIMD_CFUNC simd_quatd simd_quaternion(double angle, simd_double3 axis) {
+  return _simd_quaternion(sin(angle/2) * axis, cos(angle/2));
+}
+
+static inline SIMD_CFUNC double simd_angle(simd_quatd q) {
+  return 2*atan2(simd_length(q.vector.xyz), q.vector.w);
+}
+
+static inline SIMD_CFUNC simd_double3 simd_axis(simd_quatd q) {
+  return simd_normalize(q.vector.xyz);
+}
+
+static inline SIMD_CFUNC simd_quatd simd_add(simd_quatd p, simd_quatd q) {
+  return simd_quaternion(p.vector + q.vector);
+}
+
+static inline SIMD_CFUNC simd_quatd simd_sub(simd_quatd p, simd_quatd q) {
+  return simd_quaternion(p.vector - q.vector);
+}
+
+static inline SIMD_CFUNC simd_quatd simd_mul(simd_quatd p, simd_quatd q) {
+  #pragma STDC FP_CONTRACT ON
+  return simd_quaternion((p.vector.x * __builtin_shufflevector(q.vector, -q.vector, 3,6,1,4) +
+                          p.vector.y * __builtin_shufflevector(q.vector, -q.vector, 2,3,4,5)) +
+                         (p.vector.z * __builtin_shufflevector(q.vector, -q.vector, 5,0,3,6) +
+                          p.vector.w * q.vector));
+}
+
+static inline SIMD_CFUNC simd_quatd simd_mul(simd_quatd q, double a) {
+  return simd_quaternion(a * q.vector);
+}
+
+static inline SIMD_CFUNC simd_quatd simd_mul(double a, simd_quatd q) {
+  return simd_mul(q,a);
+}
+
+static inline SIMD_CFUNC simd_quatd simd_conjugate(simd_quatd q) {
+  return simd_quaternion(q.vector * (simd_double4){-1,-1,-1, 1});
+}
+
+static inline SIMD_CFUNC simd_quatd simd_inverse(simd_quatd q) {
+  return simd_quaternion(simd_conjugate(q).vector * simd_recip(simd_length_squared(q.vector)));
+}
+
+static inline SIMD_CFUNC simd_quatd simd_negate(simd_quatd q) {
+  return simd_quaternion(-q.vector);
+}
+
+static inline SIMD_CFUNC double simd_dot(simd_quatd p, simd_quatd q) {
+  return simd_dot(p.vector, q.vector);
+}
+
+static inline SIMD_CFUNC double simd_length(simd_quatd q) {
+  return simd_length(q.vector);
+}
+
+static inline SIMD_CFUNC simd_quatd simd_normalize(simd_quatd q) {
+  double length_squared = simd_length_squared(q.vector);
+  if (length_squared == 0) {
+    return simd_quaternion((simd_double4){0,0,0,1});
+  }
+  return simd_quaternion(q.vector * simd_rsqrt(length_squared));
+}
+
+#if defined __arm__ || defined __arm64__
+/*! @abstract Multiplies the vector `v` by the quaternion `q`.
+ *
+ * @discussion This IS NOT the action of `q` on `v` (i.e. this is not rotation
+ * by `q`). That operation is provided by `simd_act(q, v)`. This function is an
+ * implementation detail and you should not call it directly. It may be
+ * removed or modified in future versions of the simd module. */
+static inline SIMD_CFUNC simd_quatd _simd_mul_vq(simd_double3 v, simd_quatd q) {
+  #pragma STDC FP_CONTRACT ON
+  return simd_quaternion(v.x * __builtin_shufflevector(q.vector, -q.vector, 3,6,1,4) +
+                         v.y * __builtin_shufflevector(q.vector, -q.vector, 2,3,4,5) +
+                         v.z * __builtin_shufflevector(q.vector, -q.vector, 5,0,3,6));
+}
+#endif
+
+static inline SIMD_CFUNC simd_double3 simd_act(simd_quatd q, simd_double3 v) {
+#if defined __arm__ || defined __arm64__
+  return simd_mul(q, _simd_mul_vq(v, simd_conjugate(q))).vector.xyz;
+#else
+  #pragma STDC FP_CONTRACT ON
+  simd_double3 t = 2*simd_cross(simd_imag(q),v);
+  return v + simd_real(q)*t + simd_cross(simd_imag(q), t);
+#endif
+}
+
+static SIMD_NOINLINE simd_quatd __tg_log(simd_quatd q) {
+  double real = __tg_log(simd_length_squared(q.vector))/2;
+  if (simd_equal(simd_imag(q), 0)) return _simd_quaternion(0, real);
+  simd_double3 imag = __tg_acos(simd_real(q)/simd_length(q)) * simd_normalize(simd_imag(q));
+  return _simd_quaternion(imag, real);
+}
+
+static SIMD_NOINLINE simd_quatd __tg_exp(simd_quatd q) {
+  // angle is actually *twice* the angle of the rotation corresponding to
+  // the resulting quaternion, which is why we don't simply use the (angle,
+  // axis) constructor to generate `unit`.
+  double angle = simd_length(simd_imag(q));
+  if (angle == 0) return _simd_quaternion(0, exp(simd_real(q)));
+  simd_double3 axis = simd_normalize(simd_imag(q));
+  simd_quatd unit = _simd_quaternion(sin(angle)*axis, cos(angle));
+  return simd_mul(exp(simd_real(q)), unit);
+}
+
+/*! @abstract Implementation detail of the `simd_quaternion(from, to)`
+ * initializer.
+ *
+ * @discussion Computes the quaternion rotation `from` to `to` if they are
+ * separated by less than 90 degrees. Not numerically stable for larger
+ * angles. This function is an implementation detail and you should not
+ * call it directly. It may be removed or modified in future versions of the
+ * simd module. */
+static inline SIMD_CFUNC simd_quatd _simd_quaternion_reduced(simd_double3 from, simd_double3 to) {
+  simd_double3 half = simd_normalize(from + to);
+  return _simd_quaternion(simd_cross(from, half), simd_dot(from, half));
+}
+
+static SIMD_NOINLINE simd_quatd simd_quaternion(simd_double3 from, simd_double3 to) {
+
+  // If the angle between from and to is not too big, we can compute the
+  // rotation accurately using a simple implementation.
+  if (simd_dot(from, to) >= 0) {
+    return _simd_quaternion_reduced(from, to);
+  }
+
+  // Because from and to are more than 90 degrees apart, we compute the
+  // rotation in two stages (from -> half), (half -> to) to preserve numerical
+  // accuracy.
+  simd_double3 half = from + to;
+
+  if (simd_length_squared(half) == 0) {
+    // half is nearly zero, so from and to point in nearly opposite directions
+    // and the rotation is numerically underspecified. Pick an axis orthogonal
+    // to the vectors, and use an angle of pi radians.
+    simd_double3 abs_from = simd_abs(from);
+    if (abs_from.x <= abs_from.y && abs_from.x <= abs_from.z)
+      return _simd_quaternion(simd_normalize(simd_cross(from, (simd_double3){1,0,0})), 0.f);
+    else if (abs_from.y <= abs_from.z)
+      return _simd_quaternion(simd_normalize(simd_cross(from, (simd_double3){0,1,0})), 0.f);
+    else
+      return _simd_quaternion(simd_normalize(simd_cross(from, (simd_double3){0,0,1})), 0.f);
+  }
+
+  // Compute the two-step rotation.
+  half = simd_normalize(half);
+  return simd_mul(_simd_quaternion_reduced(from, half),
+                  _simd_quaternion_reduced(half, to));
+}
+
+static SIMD_NOINLINE simd_quatd simd_quaternion(simd_double3x3 matrix) {
+  const simd_double3 *mat = matrix.columns;
+  double trace = mat[0][0] + mat[1][1] + mat[2][2];
+  if (trace >= 0.0) {
+    double r = 2*sqrt(1 + trace);
+    double rinv = simd_recip(r);
+    return simd_quaternion(rinv*(mat[1][2] - mat[2][1]),
+                           rinv*(mat[2][0] - mat[0][2]),
+                           rinv*(mat[0][1] - mat[1][0]),
+                           r/4);
+  } else if (mat[0][0] >= mat[1][1] && mat[0][0] >= mat[2][2]) {
+    double r = 2*sqrt(1 - mat[1][1] - mat[2][2] + mat[0][0]);
+    double rinv = simd_recip(r);
+    return simd_quaternion(r/4,
+                           rinv*(mat[0][1] + mat[1][0]),
+                           rinv*(mat[0][2] + mat[2][0]),
+                           rinv*(mat[1][2] - mat[2][1]));
+  } else if (mat[1][1] >= mat[2][2]) {
+    double r = 2*sqrt(1 - mat[0][0] - mat[2][2] + mat[1][1]);
+    double rinv = simd_recip(r);
+    return simd_quaternion(rinv*(mat[0][1] + mat[1][0]),
+                           r/4,
+                           rinv*(mat[1][2] + mat[2][1]),
+                           rinv*(mat[2][0] - mat[0][2]));
+  } else {
+    double r = 2*sqrt(1 - mat[0][0] - mat[1][1] + mat[2][2]);
+    double rinv = simd_recip(r);
+    return simd_quaternion(rinv*(mat[0][2] + mat[2][0]),
+                           rinv*(mat[1][2] + mat[2][1]),
+                           r/4,
+                           rinv*(mat[0][1] - mat[1][0]));
+  }
+}
+
+static SIMD_NOINLINE simd_quatd simd_quaternion(simd_double4x4 matrix) {
+  const simd_double4 *mat = matrix.columns;
+  double trace = mat[0][0] + mat[1][1] + mat[2][2];
+  if (trace >= 0.0) {
+    double r = 2*sqrt(1 + trace);
+    double rinv = simd_recip(r);
+    return simd_quaternion(rinv*(mat[1][2] - mat[2][1]),
+                           rinv*(mat[2][0] - mat[0][2]),
+                           rinv*(mat[0][1] - mat[1][0]),
+                           r/4);
+  } else if (mat[0][0] >= mat[1][1] && mat[0][0] >= mat[2][2]) {
+    double r = 2*sqrt(1 - mat[1][1] - mat[2][2] + mat[0][0]);
+    double rinv = simd_recip(r);
+    return simd_quaternion(r/4,
+                           rinv*(mat[0][1] + mat[1][0]),
+                           rinv*(mat[0][2] + mat[2][0]),
+                           rinv*(mat[1][2] - mat[2][1]));
+  } else if (mat[1][1] >= mat[2][2]) {
+    double r = 2*sqrt(1 - mat[0][0] - mat[2][2] + mat[1][1]);
+    double rinv = simd_recip(r);
+    return simd_quaternion(rinv*(mat[0][1] + mat[1][0]),
+                           r/4,
+                           rinv*(mat[1][2] + mat[2][1]),
+                           rinv*(mat[2][0] - mat[0][2]));
+  } else {
+    double r = 2*sqrt(1 - mat[0][0] - mat[1][1] + mat[2][2]);
+    double rinv = simd_recip(r);
+    return simd_quaternion(rinv*(mat[0][2] + mat[2][0]),
+                           rinv*(mat[1][2] + mat[2][1]),
+                           r/4,
+                           rinv*(mat[0][1] - mat[1][0]));
+  }
+}
+
+/*! @abstract The angle between p and q interpreted as 4-dimensional vectors.
+ *
+ * @discussion This function is an implementation detail and you should not
+ * call it directly. It may be removed or modified in future versions of the
+ * simd module. */
+static SIMD_NOINLINE double _simd_angle(simd_quatd p, simd_quatd q) {
+  return 2*atan2(simd_length(p.vector - q.vector), simd_length(p.vector + q.vector));
+}
+
+/*! @abstract sin(x)/x.
+ *
+ * @discussion This function is an implementation detail and you should not
+ * call it directly. It may be removed or modified in future versions of the
+ * simd module. */
+static SIMD_CFUNC double _simd_sinc(double x) {
+  if (x == 0) return 1;
+  return sin(x)/x;
+}
+
+/*! @abstract Spherical lerp between q0 and q1.
+ *
+ * @discussion This function may interpolate along either the longer or
+ * shorter path between q0 and q1; it is used as an implementation detail
+ * in `simd_slerp` and `simd_slerp_longest`; you should use those functions
+ * instead of calling this directly.
*/ +static SIMD_NOINLINE simd_quatd _simd_slerp_internal(simd_quatd q0, simd_quatd q1, double t) { + double s = 1 - t; + double a = _simd_angle(q0, q1); + double r = simd_recip(_simd_sinc(a)); + return simd_normalize(simd_quaternion(_simd_sinc(s*a)*r*s*q0.vector + _simd_sinc(t*a)*r*t*q1.vector)); +} + +static SIMD_NOINLINE simd_quatd simd_slerp(simd_quatd q0, simd_quatd q1, double t) { + if (simd_dot(q0, q1) >= 0) + return _simd_slerp_internal(q0, q1, t); + return _simd_slerp_internal(q0, simd_negate(q1), t); +} + +static SIMD_NOINLINE simd_quatd simd_slerp_longest(simd_quatd q0, simd_quatd q1, double t) { + if (simd_dot(q0, q1) >= 0) + return _simd_slerp_internal(q0, simd_negate(q1), t); + return _simd_slerp_internal(q0, q1, t); +} + +/*! @discussion This function is an implementation detail and you should not + * call it directly. It may be removed or modified in future versions of the + * simd module. */ +static SIMD_NOINLINE simd_quatd _simd_intermediate(simd_quatd q0, simd_quatd q1, simd_quatd q2) { + simd_quatd p0 = __tg_log(simd_mul(q0, simd_inverse(q1))); + simd_quatd p2 = __tg_log(simd_mul(q2, simd_inverse(q1))); + return simd_normalize(simd_mul(q1, __tg_exp(simd_mul(-0.25, simd_add(p0,p2))))); +} + +/*! @discussion This function is an implementation detail and you should not + * call it directly. It may be removed or modified in future versions of the + * simd module. */ +static SIMD_NOINLINE simd_quatd _simd_squad(simd_quatd q0, simd_quatd qa, simd_quatd qb, simd_quatd q1, double t) { + simd_quatd r0 = _simd_slerp_internal(q0, q1, t); + simd_quatd r1 = _simd_slerp_internal(qa, qb, t); + return _simd_slerp_internal(r0, r1, 2*t*(1 - t)); +} + +static SIMD_NOINLINE simd_quatd simd_spline(simd_quatd q0, simd_quatd q1, simd_quatd q2, simd_quatd q3, double t) { + simd_quatd qa = _simd_intermediate(q0, q1, q2); + simd_quatd qb = _simd_intermediate(q1, q2, q3); + return _simd_squad(q1, qa, qb, q2, t); +} + +static SIMD_NOINLINE simd_quatd simd_bezier(simd_quatd q0, simd_quatd q1, simd_quatd q2, simd_quatd q3, double t) { + simd_quatd q01 = _simd_slerp_internal(q0, q1, t); + simd_quatd q12 = _simd_slerp_internal(q1, q2, t); + simd_quatd q23 = _simd_slerp_internal(q2, q3, t); + simd_quatd q012 = _simd_slerp_internal(q01, q12, t); + simd_quatd q123 = _simd_slerp_internal(q12, q23, t); + return _simd_slerp_internal(q012, q123, t); +} + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ +#endif /* SIMD_COMPILER_HAS_REQUIRED_FEATURES */ +#endif /* SIMD_QUATERNIONS */
\ No newline at end of file
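
As a quick orientation for the API vendored above, here is a minimal usage sketch. It is not part of the header; it assumes an Apple toolchain where <simd/simd.h> is available (the overloaded C names such as simd_quaternion rely on clang's overloadable-function extension), and the printed values are approximate.

  #include <simd/simd.h>
  #include <math.h>
  #include <stdio.h>

  int main(void) {
    // A rotation of pi/2 radians about the z axis, via the
    // (angle, axis) constructor declared above.
    simd_quatd qz = simd_quaternion(M_PI_2, (simd_double3){0, 0, 1});

    // simd_act rotates a vector: the x axis should map (approximately)
    // to the y axis. Internally this is the sandwich product q v q^-1,
    // computed via the cross-product formula on non-ARM targets.
    simd_double3 v = simd_act(qz, (simd_double3){1, 0, 0});
    printf("%f %f %f\n", v.x, v.y, v.z);          // ~ 0 1 0

    // Halfway along the shortest arc from the identity to qz is a
    // pi/4 rotation about the same axis.
    simd_quatd id  = simd_quaternion((simd_double4){0, 0, 0, 1});
    simd_quatd mid = simd_slerp(id, qz, 0.5);
    printf("%f\n", simd_angle(mid));              // ~ 0.785398
    return 0;
  }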

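One detail of _simd_slerp_internal (both the float and double versions above) deserves a note: with r = simd_recip(_simd_sinc(a)), the coefficient _simd_sinc(s*a)*r*s is algebraically sin(s*a)/sin(a), the textbook slerp weight, but unlike the textbook form it stays well-defined as the angle a between the quaternions tends to zero. A scalar sketch for intuition (the helper names here are illustrative, not part of the header):

  #include <math.h>
  #include <stdio.h>

  // sin(x)/x, continuously extended through x == 0, mirroring _simd_sinc.
  static double sinc(double x) { return x == 0 ? 1 : sin(x)/x; }

  // Weight for the q0 term, written the way _simd_slerp_internal computes it:
  // sinc(s*a)/sinc(a) * s == (sin(s*a)/(s*a)) * (a/sin(a)) * s == sin(s*a)/sin(a).
  static double weight_sinc(double a, double s) { return sinc(s*a)/sinc(a)*s; }

  // Textbook form, singular at a == 0.
  static double weight_naive(double a, double s) { return sin(s*a)/sin(a); }

  int main(void) {
    double a = 1.0, t = 0.25, s = 1 - t;
    printf("%.17g %.17g\n", weight_sinc(a, s), weight_naive(a, s)); // agree
    printf("%.17g\n", weight_sinc(0.0, s));   // limit value s, no NaN
    return 0;
  }

The simd_normalize at the end of _simd_slerp_internal then cleans up any residual rounding error in the weighted sum.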