author     Jakub Konka <kubkon@jakubkonka.com>   2020-12-19 12:13:03 +0100
committer  GitHub <noreply@github.com>           2020-12-19 12:13:03 +0100
commit     b090451646904006ac41b2b99e532489d89ea837 (patch)
tree       b0a5ec423dc42f5bf6dcf533b90f8c67a69e9b99 /lib/libc/include/aarch64-macos-gnu/simd
parent     506af7e52e0985b410ea089bf5fa3247ab2377cb (diff)
parent     3f81ddb735bfc8e6fb1776df7407ace213816252 (diff)
Merge pull request #7318 from kubkon/cc-macho
stage1: cross compile to x86_64 and arm64 macOS from anywhere with LLVM
Diffstat (limited to 'lib/libc/include/aarch64-macos-gnu/simd')
-rw-r--r--   lib/libc/include/aarch64-macos-gnu/simd/common.h       4458
-rw-r--r--   lib/libc/include/aarch64-macos-gnu/simd/conversion.h   1966
-rw-r--r--   lib/libc/include/aarch64-macos-gnu/simd/logic.h        1315
-rw-r--r--   lib/libc/include/aarch64-macos-gnu/simd/math.h         5380
-rw-r--r--   lib/libc/include/aarch64-macos-gnu/simd/packed.h       1031
-rw-r--r--   lib/libc/include/aarch64-macos-gnu/simd/quaternion.h   1194
6 files changed, 15344 insertions, 0 deletions
diff --git a/lib/libc/include/aarch64-macos-gnu/simd/common.h b/lib/libc/include/aarch64-macos-gnu/simd/common.h
new file mode 100644
index 0000000000..5408c535fd
--- /dev/null
+++ b/lib/libc/include/aarch64-macos-gnu/simd/common.h
@@ -0,0 +1,4458 @@
+/*! @header
+ * The interfaces declared in this header provide "common" elementwise
+ * operations that are neither math nor logic functions. These are available
+ * only for floating-point vectors and scalars, except for min, max, abs,
+ * clamp, and the reduce operations, which also support integer vectors.
+ *
+ * simd_abs(x) Absolute value of x. Also available as fabs
+ * for floating-point vectors. If x is the
+ * smallest signed integer, x is returned.
+ *
+ * simd_max(x,y) Returns the maximum of x and y. Also available
+ * as fmax for floating-point vectors.
+ *
+ * simd_min(x,y) Returns the minimum of x and y. Also available
+ * as fmin for floating-point vectors.
+ *
+ * simd_clamp(x,min,max) x clamped to the range [min, max].
+ *
+ * simd_sign(x) -1 if x is less than zero, 0 if x is zero or
+ * NaN, and +1 if x is greater than zero.
+ *
+ * simd_mix(x,y,t) If t is not in the range [0,1], the result is
+ * undefined. Otherwise the result is x+(y-x)*t,
+ * which linearly interpolates between x and y.
+ *
+ * simd_recip(x) An approximation to 1/x. If x is very near the
+ * limits of representable values, or is infinity
+ * or NaN, the result is undefined. There are
+ * two variants of this function:
+ *
+ * simd_precise_recip(x)
+ *
+ * and
+ *
+ * simd_fast_recip(x).
+ *
+ * The "precise" variant is accurate to a few ULPs,
+ * whereas the "fast" variant may have as little
+ * as 11 bits of accuracy in float and about 22
+ * bits in double.
+ *
+ * The function simd_recip(x) resolves to
+ * simd_precise_recip(x) ordinarily, but to
+ * simd_fast_recip(x) when used in a translation
+ * unit compiled with -ffast-math (when
+ * -ffast-math is in effect, you may still use the
+ * precise version of this function by calling it
+ * explicitly by name).
+ *
+ * simd_rsqrt(x) An approximation to 1/sqrt(x). If x is
+ * infinity or NaN, the result is undefined.
+ * There are two variants of this function:
+ *
+ * simd_precise_rsqrt(x)
+ *
+ * and
+ *
+ * simd_fast_rsqrt(x).
+ *
+ * The "precise" variant is accurate to a few ULPs,
+ * whereas the "fast" variant may have as little
+ * as 11 bits of accuracy in float and about 22
+ * bits in double.
+ *
+ * The function simd_rsqrt(x) resolves to
+ * simd_precise_rsqrt(x) ordinarily, but to
+ * simd_fast_rsqrt(x) when used in a translation
+ * unit compiled with -ffast-math (when
+ * -ffast-math is in effect, you may still use the
+ * precise version of this function by calling it
+ * explicitly by name).
+ *
+ * simd_fract(x) The "fractional part" of x, which lies strictly
+ * in the range [0, 0x1.fffffep-1].
+ *
+ * simd_step(edge,x) 0 if x < edge, and 1 otherwise.
+ *
+ * simd_smoothstep(edge0,edge1,x) 0 if x <= edge0, 1 if x >= edge1, and
+ * a Hermite interpolation between 0 and 1 if
+ * edge0 < x < edge1.
+ *
+ * simd_reduce_add(x) Sum of the elements of x.
+ *
+ * simd_reduce_min(x) Minimum of the elements of x.
+ *
+ * simd_reduce_max(x) Maximum of the elements of x.
+ *
+ * simd_equal(x,y) True if and only if every lane of x is equal
+ * to the corresponding lane of y.
+ *
+ * The following common functions are available in the simd:: namespace:
+ *
+ * C++ Function Equivalent C Function
+ * --------------------------------------------------------------------
+ * simd::abs(x) simd_abs(x)
+ * simd::max(x,y) simd_max(x,y)
+ * simd::min(x,y) simd_min(x,y)
+ * simd::clamp(x,min,max) simd_clamp(x,min,max)
+ * simd::sign(x) simd_sign(x)
+ * simd::mix(x,y,t) simd_mix(x,y,t)
+ * simd::recip(x) simd_recip(x)
+ * simd::rsqrt(x) simd_rsqrt(x)
+ * simd::fract(x) simd_fract(x)
+ * simd::step(edge,x) simd_step(edge,x)
+ * simd::smoothstep(e0,e1,x) simd_smoothstep(e0,e1,x)
+ * simd::reduce_add(x) simd_reduce_add(x)
+ * simd::reduce_max(x) simd_reduce_max(x)
+ * simd::reduce_min(x) simd_reduce_min(x)
+ * simd::equal(x,y) simd_equal(x,y)
+ *
+ * simd::precise::recip(x) simd_precise_recip(x)
+ * simd::precise::rsqrt(x) simd_precise_rsqrt(x)
+ *
+ * simd::fast::recip(x) simd_fast_recip(x)
+ * simd::fast::rsqrt(x) simd_fast_rsqrt(x)
+ *
+ * @copyright 2014-2017 Apple, Inc. All rights reserved.
+ * @unsorted */
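+
+/* Editor's sketch (not part of the upstream header): a brief illustration
+ * of the operations documented above, assuming user code includes the
+ * umbrella header <simd/simd.h>.
+ *
+ *     simd_float4 x = simd_make_float4(-2.0f, 0.5f, 3.0f, -0.25f);
+ *     simd_float4 c = simd_clamp(x, -1.0f, 1.0f);  // { -1, 0.5, 1, -0.25 }
+ *     float       s = simd_reduce_add(c);          // 0.25
+ *     simd_float4 t = simd_make_float4(0.5f, 0.5f, 0.5f, 0.5f);
+ *     simd_float4 m = simd_mix(x, c, t);           // { -1.5, 0.5, 2, -0.25 }
+ */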
+
+#ifndef SIMD_COMMON_HEADER
+#define SIMD_COMMON_HEADER
+
+#include <simd/base.h>
+#if SIMD_COMPILER_HAS_REQUIRED_FEATURES
+#include <simd/vector_make.h>
+#include <simd/logic.h>
+#include <simd/math.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*! @abstract The elementwise absolute value of x. */
+static inline SIMD_CFUNC simd_char2 simd_abs(simd_char2 x);
+/*! @abstract The elementwise absolute value of x. */
+static inline SIMD_CFUNC simd_char3 simd_abs(simd_char3 x);
+/*! @abstract The elementwise absolute value of x. */
+static inline SIMD_CFUNC simd_char4 simd_abs(simd_char4 x);
+/*! @abstract The elementwise absolute value of x. */
+static inline SIMD_CFUNC simd_char8 simd_abs(simd_char8 x);
+/*! @abstract The elementwise absolute value of x. */
+static inline SIMD_CFUNC simd_char16 simd_abs(simd_char16 x);
+/*! @abstract The elementwise absolute value of x. */
+static inline SIMD_CFUNC simd_char32 simd_abs(simd_char32 x);
+/*! @abstract The elementwise absolute value of x. */
+static inline SIMD_CFUNC simd_char64 simd_abs(simd_char64 x);
+/*! @abstract The elementwise absolute value of x. */
+static inline SIMD_CFUNC simd_short2 simd_abs(simd_short2 x);
+/*! @abstract The elementwise absolute value of x. */
+static inline SIMD_CFUNC simd_short3 simd_abs(simd_short3 x);
+/*! @abstract The elementwise absolute value of x. */
+static inline SIMD_CFUNC simd_short4 simd_abs(simd_short4 x);
+/*! @abstract The elementwise absolute value of x. */
+static inline SIMD_CFUNC simd_short8 simd_abs(simd_short8 x);
+/*! @abstract The elementwise absolute value of x. */
+static inline SIMD_CFUNC simd_short16 simd_abs(simd_short16 x);
+/*! @abstract The elementwise absolute value of x. */
+static inline SIMD_CFUNC simd_short32 simd_abs(simd_short32 x);
+/*! @abstract The elementwise absolute value of x. */
+static inline SIMD_CFUNC simd_int2 simd_abs(simd_int2 x);
+/*! @abstract The elementwise absolute value of x. */
+static inline SIMD_CFUNC simd_int3 simd_abs(simd_int3 x);
+/*! @abstract The elementwise absolute value of x. */
+static inline SIMD_CFUNC simd_int4 simd_abs(simd_int4 x);
+/*! @abstract The elementwise absolute value of x. */
+static inline SIMD_CFUNC simd_int8 simd_abs(simd_int8 x);
+/*! @abstract The elementwise absolute value of x. */
+static inline SIMD_CFUNC simd_int16 simd_abs(simd_int16 x);
+/*! @abstract The elementwise absolute value of x. */
+static inline SIMD_CFUNC simd_float2 simd_abs(simd_float2 x);
+/*! @abstract The elementwise absolute value of x. */
+static inline SIMD_CFUNC simd_float3 simd_abs(simd_float3 x);
+/*! @abstract The elementwise absolute value of x. */
+static inline SIMD_CFUNC simd_float4 simd_abs(simd_float4 x);
+/*! @abstract The elementwise absolute value of x. */
+static inline SIMD_CFUNC simd_float8 simd_abs(simd_float8 x);
+/*! @abstract The elementwise absolute value of x. */
+static inline SIMD_CFUNC simd_float16 simd_abs(simd_float16 x);
+/*! @abstract The elementwise absolute value of x. */
+static inline SIMD_CFUNC simd_long2 simd_abs(simd_long2 x);
+/*! @abstract The elementwise absolute value of x. */
+static inline SIMD_CFUNC simd_long3 simd_abs(simd_long3 x);
+/*! @abstract The elementwise absolute value of x. */
+static inline SIMD_CFUNC simd_long4 simd_abs(simd_long4 x);
+/*! @abstract The elementwise absolute value of x. */
+static inline SIMD_CFUNC simd_long8 simd_abs(simd_long8 x);
+/*! @abstract The elementwise absolute value of x. */
+static inline SIMD_CFUNC simd_double2 simd_abs(simd_double2 x);
+/*! @abstract The elementwise absolute value of x. */
+static inline SIMD_CFUNC simd_double3 simd_abs(simd_double3 x);
+/*! @abstract The elementwise absolute value of x. */
+static inline SIMD_CFUNC simd_double4 simd_abs(simd_double4 x);
+/*! @abstract The elementwise absolute value of x. */
+static inline SIMD_CFUNC simd_double8 simd_abs(simd_double8 x);
+/*! @abstract The elementwise absolute value of x.
+ * @discussion Deprecated. Use simd_abs(x) instead. */
+#define vector_abs simd_abs
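+
+/* Editor's sketch (not part of the upstream header): the integer overloads
+ * cannot represent |INT32_MIN|, so that input is returned unchanged, as the
+ * header summary notes. Assumes <stdint.h> for INT32_MIN.
+ *
+ *     simd_int2 v = simd_make_int2(-7, INT32_MIN);
+ *     simd_int2 a = simd_abs(v);   // { 7, INT32_MIN }
+ */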
+
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_char2 simd_max(simd_char2 x, simd_char2 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_char3 simd_max(simd_char3 x, simd_char3 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_char4 simd_max(simd_char4 x, simd_char4 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_char8 simd_max(simd_char8 x, simd_char8 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_char16 simd_max(simd_char16 x, simd_char16 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_char32 simd_max(simd_char32 x, simd_char32 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_char64 simd_max(simd_char64 x, simd_char64 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_uchar2 simd_max(simd_uchar2 x, simd_uchar2 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_uchar3 simd_max(simd_uchar3 x, simd_uchar3 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_uchar4 simd_max(simd_uchar4 x, simd_uchar4 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_uchar8 simd_max(simd_uchar8 x, simd_uchar8 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_uchar16 simd_max(simd_uchar16 x, simd_uchar16 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_uchar32 simd_max(simd_uchar32 x, simd_uchar32 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_uchar64 simd_max(simd_uchar64 x, simd_uchar64 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_short2 simd_max(simd_short2 x, simd_short2 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_short3 simd_max(simd_short3 x, simd_short3 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_short4 simd_max(simd_short4 x, simd_short4 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_short8 simd_max(simd_short8 x, simd_short8 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_short16 simd_max(simd_short16 x, simd_short16 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_short32 simd_max(simd_short32 x, simd_short32 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_ushort2 simd_max(simd_ushort2 x, simd_ushort2 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_ushort3 simd_max(simd_ushort3 x, simd_ushort3 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_ushort4 simd_max(simd_ushort4 x, simd_ushort4 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_ushort8 simd_max(simd_ushort8 x, simd_ushort8 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_ushort16 simd_max(simd_ushort16 x, simd_ushort16 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_ushort32 simd_max(simd_ushort32 x, simd_ushort32 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_int2 simd_max(simd_int2 x, simd_int2 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_int3 simd_max(simd_int3 x, simd_int3 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_int4 simd_max(simd_int4 x, simd_int4 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_int8 simd_max(simd_int8 x, simd_int8 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_int16 simd_max(simd_int16 x, simd_int16 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_uint2 simd_max(simd_uint2 x, simd_uint2 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_uint3 simd_max(simd_uint3 x, simd_uint3 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_uint4 simd_max(simd_uint4 x, simd_uint4 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_uint8 simd_max(simd_uint8 x, simd_uint8 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_uint16 simd_max(simd_uint16 x, simd_uint16 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC float simd_max(float x, float y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_float2 simd_max(simd_float2 x, simd_float2 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_float3 simd_max(simd_float3 x, simd_float3 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_float4 simd_max(simd_float4 x, simd_float4 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_float8 simd_max(simd_float8 x, simd_float8 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_float16 simd_max(simd_float16 x, simd_float16 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_long2 simd_max(simd_long2 x, simd_long2 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_long3 simd_max(simd_long3 x, simd_long3 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_long4 simd_max(simd_long4 x, simd_long4 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_long8 simd_max(simd_long8 x, simd_long8 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_ulong2 simd_max(simd_ulong2 x, simd_ulong2 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_ulong3 simd_max(simd_ulong3 x, simd_ulong3 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_ulong4 simd_max(simd_ulong4 x, simd_ulong4 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_ulong8 simd_max(simd_ulong8 x, simd_ulong8 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC double simd_max(double x, double y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_double2 simd_max(simd_double2 x, simd_double2 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_double3 simd_max(simd_double3 x, simd_double3 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_double4 simd_max(simd_double4 x, simd_double4 y);
+/*! @abstract The elementwise maximum of x and y. */
+static inline SIMD_CFUNC simd_double8 simd_max(simd_double8 x, simd_double8 y);
+/*! @abstract The elementwise maximum of x and y.
+ * @discussion Deprecated. Use simd_max(x,y) instead. */
+#define vector_max simd_max
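+
+/* Editor's sketch (not part of the upstream header): lanewise maximum; for
+ * floating-point vectors the same operation is also exposed as fmax.
+ *
+ *     simd_float2 m = simd_max(simd_make_float2(1.0f, -3.0f),
+ *                              simd_make_float2(0.0f,  2.0f)); // { 1, 2 }
+ */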
+
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_char2 simd_min(simd_char2 x, simd_char2 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_char3 simd_min(simd_char3 x, simd_char3 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_char4 simd_min(simd_char4 x, simd_char4 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_char8 simd_min(simd_char8 x, simd_char8 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_char16 simd_min(simd_char16 x, simd_char16 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_char32 simd_min(simd_char32 x, simd_char32 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_char64 simd_min(simd_char64 x, simd_char64 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_uchar2 simd_min(simd_uchar2 x, simd_uchar2 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_uchar3 simd_min(simd_uchar3 x, simd_uchar3 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_uchar4 simd_min(simd_uchar4 x, simd_uchar4 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_uchar8 simd_min(simd_uchar8 x, simd_uchar8 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_uchar16 simd_min(simd_uchar16 x, simd_uchar16 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_uchar32 simd_min(simd_uchar32 x, simd_uchar32 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_uchar64 simd_min(simd_uchar64 x, simd_uchar64 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_short2 simd_min(simd_short2 x, simd_short2 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_short3 simd_min(simd_short3 x, simd_short3 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_short4 simd_min(simd_short4 x, simd_short4 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_short8 simd_min(simd_short8 x, simd_short8 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_short16 simd_min(simd_short16 x, simd_short16 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_short32 simd_min(simd_short32 x, simd_short32 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_ushort2 simd_min(simd_ushort2 x, simd_ushort2 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_ushort3 simd_min(simd_ushort3 x, simd_ushort3 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_ushort4 simd_min(simd_ushort4 x, simd_ushort4 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_ushort8 simd_min(simd_ushort8 x, simd_ushort8 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_ushort16 simd_min(simd_ushort16 x, simd_ushort16 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_ushort32 simd_min(simd_ushort32 x, simd_ushort32 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_int2 simd_min(simd_int2 x, simd_int2 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_int3 simd_min(simd_int3 x, simd_int3 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_int4 simd_min(simd_int4 x, simd_int4 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_int8 simd_min(simd_int8 x, simd_int8 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_int16 simd_min(simd_int16 x, simd_int16 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_uint2 simd_min(simd_uint2 x, simd_uint2 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_uint3 simd_min(simd_uint3 x, simd_uint3 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_uint4 simd_min(simd_uint4 x, simd_uint4 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_uint8 simd_min(simd_uint8 x, simd_uint8 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_uint16 simd_min(simd_uint16 x, simd_uint16 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC float simd_min(float x, float y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_float2 simd_min(simd_float2 x, simd_float2 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_float3 simd_min(simd_float3 x, simd_float3 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_float4 simd_min(simd_float4 x, simd_float4 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_float8 simd_min(simd_float8 x, simd_float8 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_float16 simd_min(simd_float16 x, simd_float16 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_long2 simd_min(simd_long2 x, simd_long2 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_long3 simd_min(simd_long3 x, simd_long3 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_long4 simd_min(simd_long4 x, simd_long4 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_long8 simd_min(simd_long8 x, simd_long8 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_ulong2 simd_min(simd_ulong2 x, simd_ulong2 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_ulong3 simd_min(simd_ulong3 x, simd_ulong3 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_ulong4 simd_min(simd_ulong4 x, simd_ulong4 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_ulong8 simd_min(simd_ulong8 x, simd_ulong8 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC double simd_min(double x, double y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_double2 simd_min(simd_double2 x, simd_double2 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_double3 simd_min(simd_double3 x, simd_double3 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_double4 simd_min(simd_double4 x, simd_double4 y);
+/*! @abstract The elementwise minimum of x and y. */
+static inline SIMD_CFUNC simd_double8 simd_min(simd_double8 x, simd_double8 y);
+/*! @abstract The elementwise minimum of x and y.
+ * @discussion Deprecated. Use simd_min(x,y) instead. */
+#define vector_min simd_min
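+
+/* Editor's sketch (not part of the upstream header): simd_min mirrors
+ * simd_max lanewise, and is likewise available as fmin for floating-point
+ * vectors.
+ *
+ *     simd_float2 m = simd_min(simd_make_float2(1.0f, -3.0f),
+ *                              simd_make_float2(0.0f,  2.0f)); // { 0, -3 }
+ */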
+
+
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_char2 simd_clamp(simd_char2 x, simd_char2 min, simd_char2 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_char3 simd_clamp(simd_char3 x, simd_char3 min, simd_char3 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_char4 simd_clamp(simd_char4 x, simd_char4 min, simd_char4 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_char8 simd_clamp(simd_char8 x, simd_char8 min, simd_char8 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_char16 simd_clamp(simd_char16 x, simd_char16 min, simd_char16 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_char32 simd_clamp(simd_char32 x, simd_char32 min, simd_char32 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_char64 simd_clamp(simd_char64 x, simd_char64 min, simd_char64 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_uchar2 simd_clamp(simd_uchar2 x, simd_uchar2 min, simd_uchar2 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_uchar3 simd_clamp(simd_uchar3 x, simd_uchar3 min, simd_uchar3 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_uchar4 simd_clamp(simd_uchar4 x, simd_uchar4 min, simd_uchar4 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_uchar8 simd_clamp(simd_uchar8 x, simd_uchar8 min, simd_uchar8 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_uchar16 simd_clamp(simd_uchar16 x, simd_uchar16 min, simd_uchar16 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_uchar32 simd_clamp(simd_uchar32 x, simd_uchar32 min, simd_uchar32 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_uchar64 simd_clamp(simd_uchar64 x, simd_uchar64 min, simd_uchar64 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_short2 simd_clamp(simd_short2 x, simd_short2 min, simd_short2 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_short3 simd_clamp(simd_short3 x, simd_short3 min, simd_short3 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_short4 simd_clamp(simd_short4 x, simd_short4 min, simd_short4 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_short8 simd_clamp(simd_short8 x, simd_short8 min, simd_short8 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_short16 simd_clamp(simd_short16 x, simd_short16 min, simd_short16 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_short32 simd_clamp(simd_short32 x, simd_short32 min, simd_short32 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_ushort2 simd_clamp(simd_ushort2 x, simd_ushort2 min, simd_ushort2 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_ushort3 simd_clamp(simd_ushort3 x, simd_ushort3 min, simd_ushort3 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_ushort4 simd_clamp(simd_ushort4 x, simd_ushort4 min, simd_ushort4 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_ushort8 simd_clamp(simd_ushort8 x, simd_ushort8 min, simd_ushort8 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_ushort16 simd_clamp(simd_ushort16 x, simd_ushort16 min, simd_ushort16 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_ushort32 simd_clamp(simd_ushort32 x, simd_ushort32 min, simd_ushort32 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_int2 simd_clamp(simd_int2 x, simd_int2 min, simd_int2 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_int3 simd_clamp(simd_int3 x, simd_int3 min, simd_int3 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_int4 simd_clamp(simd_int4 x, simd_int4 min, simd_int4 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_int8 simd_clamp(simd_int8 x, simd_int8 min, simd_int8 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_int16 simd_clamp(simd_int16 x, simd_int16 min, simd_int16 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_uint2 simd_clamp(simd_uint2 x, simd_uint2 min, simd_uint2 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_uint3 simd_clamp(simd_uint3 x, simd_uint3 min, simd_uint3 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_uint4 simd_clamp(simd_uint4 x, simd_uint4 min, simd_uint4 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_uint8 simd_clamp(simd_uint8 x, simd_uint8 min, simd_uint8 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_uint16 simd_clamp(simd_uint16 x, simd_uint16 min, simd_uint16 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC float simd_clamp(float x, float min, float max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_float2 simd_clamp(simd_float2 x, simd_float2 min, simd_float2 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_float3 simd_clamp(simd_float3 x, simd_float3 min, simd_float3 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_float4 simd_clamp(simd_float4 x, simd_float4 min, simd_float4 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_float8 simd_clamp(simd_float8 x, simd_float8 min, simd_float8 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_float16 simd_clamp(simd_float16 x, simd_float16 min, simd_float16 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_long2 simd_clamp(simd_long2 x, simd_long2 min, simd_long2 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_long3 simd_clamp(simd_long3 x, simd_long3 min, simd_long3 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_long4 simd_clamp(simd_long4 x, simd_long4 min, simd_long4 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_long8 simd_clamp(simd_long8 x, simd_long8 min, simd_long8 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_ulong2 simd_clamp(simd_ulong2 x, simd_ulong2 min, simd_ulong2 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_ulong3 simd_clamp(simd_ulong3 x, simd_ulong3 min, simd_ulong3 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_ulong4 simd_clamp(simd_ulong4 x, simd_ulong4 min, simd_ulong4 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_ulong8 simd_clamp(simd_ulong8 x, simd_ulong8 min, simd_ulong8 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC double simd_clamp(double x, double min, double max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_double2 simd_clamp(simd_double2 x, simd_double2 min, simd_double2 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_double3 simd_clamp(simd_double3 x, simd_double3 min, simd_double3 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_double4 simd_clamp(simd_double4 x, simd_double4 min, simd_double4 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Note that if you want to clamp all lanes to the same range,
+ * you can use a scalar value for min and max. */
+static inline SIMD_CFUNC simd_double8 simd_clamp(simd_double8 x, simd_double8 min, simd_double8 max);
+/*! @abstract x clamped to the range [min, max].
+ * @discussion Deprecated. Use simd_clamp(x,min,max) instead. */
+#define vector_clamp simd_clamp
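+
+/* Editor's sketch (not part of the upstream header), using the scalar
+ * min/max broadcast described in the discussion above:
+ *
+ *     simd_float4 x = simd_make_float4(-2.0f, 0.25f, 0.75f, 9.0f);
+ *     simd_float4 c = simd_clamp(x, 0.0f, 1.0f); // { 0, 0.25, 0.75, 1 }
+ */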
+
+/*! @abstract -1 if x is negative, +1 if x is positive, and 0 otherwise. */
+static inline SIMD_CFUNC float simd_sign(float x);
+/*! @abstract -1 if x is negative, +1 if x is positive, and 0 otherwise. */
+static inline SIMD_CFUNC simd_float2 simd_sign(simd_float2 x);
+/*! @abstract -1 if x is negative, +1 if x is positive, and 0 otherwise. */
+static inline SIMD_CFUNC simd_float3 simd_sign(simd_float3 x);
+/*! @abstract -1 if x is negative, +1 if x is positive, and 0 otherwise. */
+static inline SIMD_CFUNC simd_float4 simd_sign(simd_float4 x);
+/*! @abstract -1 if x is negative, +1 if x is positive, and 0 otherwise. */
+static inline SIMD_CFUNC simd_float8 simd_sign(simd_float8 x);
+/*! @abstract -1 if x is negative, +1 if x is positive, and 0 otherwise. */
+static inline SIMD_CFUNC simd_float16 simd_sign(simd_float16 x);
+/*! @abstract -1 if x is negative, +1 if x is positive, and 0 otherwise. */
+static inline SIMD_CFUNC double simd_sign(double x);
+/*! @abstract -1 if x is negative, +1 if x is positive, and 0 otherwise. */
+static inline SIMD_CFUNC simd_double2 simd_sign(simd_double2 x);
+/*! @abstract -1 if x is negative, +1 if x is positive, and 0 otherwise. */
+static inline SIMD_CFUNC simd_double3 simd_sign(simd_double3 x);
+/*! @abstract -1 if x is negative, +1 if x is positive, and 0 otherwise. */
+static inline SIMD_CFUNC simd_double4 simd_sign(simd_double4 x);
+/*! @abstract -1 if x is negative, +1 if x is positive, and 0 otherwise. */
+static inline SIMD_CFUNC simd_double8 simd_sign(simd_double8 x);
+/*! @abstract -1 if x is negative, +1 if x is positive, and 0 otherwise.
+ * @discussion Deprecated. Use simd_sign(x) instead. */
+#define vector_sign simd_sign
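+
+/* Editor's sketch (not part of the upstream header): zero and NaN lanes
+ * both map to 0, per the header summary. Assumes <math.h> for NAN.
+ *
+ *     simd_float4 x = simd_make_float4(-4.0f, 0.0f, NAN, 2.5f);
+ *     simd_float4 s = simd_sign(x); // { -1, 0, 0, 1 }
+ */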
+
+/*! @abstract Linearly interpolates between x and y, taking the value x when
+ * t=0 and y when t=1 */
+static inline SIMD_CFUNC float simd_mix(float x, float y, float t);
+/*! @abstract Linearly interpolates between x and y, taking the value x when
+ * t=0 and y when t=1 */
+static inline SIMD_CFUNC simd_float2 simd_mix(simd_float2 x, simd_float2 y, simd_float2 t);
+/*! @abstract Linearly interpolates between x and y, taking the value x when
+ * t=0 and y when t=1 */
+static inline SIMD_CFUNC simd_float3 simd_mix(simd_float3 x, simd_float3 y, simd_float3 t);
+/*! @abstract Linearly interpolates between x and y, taking the value x when
+ * t=0 and y when t=1 */
+static inline SIMD_CFUNC simd_float4 simd_mix(simd_float4 x, simd_float4 y, simd_float4 t);
+/*! @abstract Linearly interpolates between x and y, taking the value x when
+ * t=0 and y when t=1 */
+static inline SIMD_CFUNC simd_float8 simd_mix(simd_float8 x, simd_float8 y, simd_float8 t);
+/*! @abstract Linearly interpolates between x and y, taking the value x when
+ * t=0 and y when t=1 */
+static inline SIMD_CFUNC simd_float16 simd_mix(simd_float16 x, simd_float16 y, simd_float16 t);
+/*! @abstract Linearly interpolates between x and y, taking the value x when
+ * t=0 and y when t=1 */
+static inline SIMD_CFUNC double simd_mix(double x, double y, double t);
+/*! @abstract Linearly interpolates between x and y, taking the value x when
+ * t=0 and y when t=1 */
+static inline SIMD_CFUNC simd_double2 simd_mix(simd_double2 x, simd_double2 y, simd_double2 t);
+/*! @abstract Linearly interpolates between x and y, taking the value x when
+ * t=0 and y when t=1 */
+static inline SIMD_CFUNC simd_double3 simd_mix(simd_double3 x, simd_double3 y, simd_double3 t);
+/*! @abstract Linearly interpolates between x and y, taking the value x when
+ * t=0 and y when t=1 */
+static inline SIMD_CFUNC simd_double4 simd_mix(simd_double4 x, simd_double4 y, simd_double4 t);
+/*! @abstract Linearly interpolates between x and y, taking the value x when
+ * t=0 and y when t=1 */
+static inline SIMD_CFUNC simd_double8 simd_mix(simd_double8 x, simd_double8 y, simd_double8 t);
+/*! @abstract Linearly interpolates between x and y, taking the value x when
+ * t=0 and y when t=1
+ * @discussion Deprecated. Use simd_mix(x, y, t) instead. */
+#define vector_mix simd_mix
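+
+/* Editor's sketch (not part of the upstream header): simd_mix computes
+ * x + (y-x)*t lanewise; per the header summary, the result is undefined
+ * when t lies outside [0, 1].
+ *
+ *     simd_float2 x = simd_make_float2(0.0f, 10.0f);
+ *     simd_float2 y = simd_make_float2(1.0f, 20.0f);
+ *     simd_float2 t = simd_make_float2(0.5f, 0.25f);
+ *     simd_float2 r = simd_mix(x, y, t); // { 0.5, 12.5 }
+ */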
+
+/*! @abstract A good approximation to 1/x.
+ * @discussion If x is very close to the limits of representation, the
+ * result may overflow or underflow; otherwise this function is accurate to
+ * a few units in the last place (ULPs). */
+static inline SIMD_CFUNC float simd_precise_recip(float x);
+/*! @abstract A good approximation to 1/x.
+ * @discussion If x is very close to the limits of representation, the
+ * result may overflow or underflow; otherwise this function is accurate to
+ * a few units in the last place (ULPs). */
+static inline SIMD_CFUNC simd_float2 simd_precise_recip(simd_float2 x);
+/*! @abstract A good approximation to 1/x.
+ * @discussion If x is very close to the limits of representation, the
+ * result may overflow or underflow; otherwise this function is accurate to
+ * a few units in the last place (ULPs). */
+static inline SIMD_CFUNC simd_float3 simd_precise_recip(simd_float3 x);
+/*! @abstract A good approximation to 1/x.
+ * @discussion If x is very close to the limits of representation, the
+ * result may overflow or underflow; otherwise this function is accurate to
+ * a few units in the last place (ULPs). */
+static inline SIMD_CFUNC simd_float4 simd_precise_recip(simd_float4 x);
+/*! @abstract A good approximation to 1/x.
+ * @discussion If x is very close to the limits of representation, the
+ * result may overflow or underflow; otherwise this function is accurate to
+ * a few units in the last place (ULPs). */
+static inline SIMD_CFUNC simd_float8 simd_precise_recip(simd_float8 x);
+/*! @abstract A good approximation to 1/x.
+ * @discussion If x is very close to the limits of representation, the
+ * result may overflow or underflow; otherwise this function is accurate to
+ * a few units in the last place (ULPs). */
+static inline SIMD_CFUNC simd_float16 simd_precise_recip(simd_float16 x);
+/*! @abstract A good approximation to 1/x.
+ * @discussion If x is very close to the limits of representation, the
+ * result may overflow or underflow; otherwise this function is accurate to
+ * a few units in the last place (ULPs). */
+static inline SIMD_CFUNC double simd_precise_recip(double x);
+/*! @abstract A good approximation to 1/x.
+ * @discussion If x is very close to the limits of representation, the
+ * result may overflow or underflow; otherwise this function is accurate to
+ * a few units in the last place (ULPs). */
+static inline SIMD_CFUNC simd_double2 simd_precise_recip(simd_double2 x);
+/*! @abstract A good approximation to 1/x.
+ * @discussion If x is very close to the limits of representation, the
+ * result may overflow or underflow; otherwise this function is accurate to
+ * a few units in the last place (ULPs). */
+static inline SIMD_CFUNC simd_double3 simd_precise_recip(simd_double3 x);
+/*! @abstract A good approximation to 1/x.
+ * @discussion If x is very close to the limits of representation, the
+ * result may overflow or underflow; otherwise this function is accurate to
+ * a few units in the last place (ULPs). */
+static inline SIMD_CFUNC simd_double4 simd_precise_recip(simd_double4 x);
+/*! @abstract A good approximation to 1/x.
+ * @discussion If x is very close to the limits of representation, the
+ * result may overflow or underflow; otherwise this function is accurate to
+ * a few units in the last place (ULPs). */
+static inline SIMD_CFUNC simd_double8 simd_precise_recip(simd_double8 x);
+/*! @abstract A good approximation to 1/x.
+ * @discussion Deprecated. Use simd_precise_recip(x) instead. */
+#define vector_precise_recip simd_precise_recip
+
+/*! @abstract A fast approximation to 1/x.
+ * @discussion If x is very close to the limits of representation, the
+ * result may overflow or underflow; otherwise this function is accurate to
+ * at least 11 bits for float and 22 bits for double. */
+static inline SIMD_CFUNC float simd_fast_recip(float x);
+/*! @abstract A fast approximation to 1/x.
+ * @discussion If x is very close to the limits of representation, the
+ * result may overflow or underflow; otherwise this function is accurate to
+ * at least 11 bits for float and 22 bits for double. */
+static inline SIMD_CFUNC simd_float2 simd_fast_recip(simd_float2 x);
+/*! @abstract A fast approximation to 1/x.
+ * @discussion If x is very close to the limits of representation, the
+ * result may overflow or underflow; otherwise this function is accurate to
+ * at least 11 bits for float and 22 bits for double. */
+static inline SIMD_CFUNC simd_float3 simd_fast_recip(simd_float3 x);
+/*! @abstract A fast approximation to 1/x.
+ * @discussion If x is very close to the limits of representation, the
+ * result may overflow or underflow; otherwise this function is accurate to
+ * at least 11 bits for float and 22 bits for double. */
+static inline SIMD_CFUNC simd_float4 simd_fast_recip(simd_float4 x);
+/*! @abstract A fast approximation to 1/x.
+ * @discussion If x is very close to the limits of representation, the
+ * result may overflow or underflow; otherwise this function is accurate to
+ * at least 11 bits for float and 22 bits for double. */
+static inline SIMD_CFUNC simd_float8 simd_fast_recip(simd_float8 x);
+/*! @abstract A fast approximation to 1/x.
+ * @discussion If x is very close to the limits of representation, the
+ * result may overflow or underflow; otherwise this function is accurate to
+ * at least 11 bits for float and 22 bits for double. */
+static inline SIMD_CFUNC simd_float16 simd_fast_recip(simd_float16 x);
+/*! @abstract A fast approximation to 1/x.
+ * @discussion If x is very close to the limits of representation, the
+ * result may overflow or underflow; otherwise this function is accurate to
+ * at least 11 bits for float and 22 bits for double. */
+static inline SIMD_CFUNC double simd_fast_recip(double x);
+/*! @abstract A fast approximation to 1/x.
+ * @discussion If x is very close to the limits of representation, the
+ * result may overflow or underflow; otherwise this function is accurate to
+ * at least 11 bits for float and 22 bits for double. */
+static inline SIMD_CFUNC simd_double2 simd_fast_recip(simd_double2 x);
+/*! @abstract A fast approximation to 1/x.
+ * @discussion If x is very close to the limits of representation, the
+ * result may overflow or underflow; otherwise this function is accurate to
+ * at least 11 bits for float and 22 bits for double. */
+static inline SIMD_CFUNC simd_double3 simd_fast_recip(simd_double3 x);
+/*! @abstract A fast approximation to 1/x.
+ * @discussion If x is very close to the limits of representation, the
+ * result may overflow or underflow; otherwise this function is accurate to
+ * at least 11 bits for float and 22 bits for double. */
+static inline SIMD_CFUNC simd_double4 simd_fast_recip(simd_double4 x);
+/*! @abstract A fast approximation to 1/x.
+ * @discussion If x is very close to the limits of representation, the
+ * result may overflow or underflow; otherwise this function is accurate to
+ * at least 11 bits for float and 22 bits for double. */
+static inline SIMD_CFUNC simd_double8 simd_fast_recip(simd_double8 x);
+/*! @abstract A fast approximation to 1/x.
+ * @discussion Deprecated. Use simd_fast_recip(x) instead. */
+#define vector_fast_recip simd_fast_recip
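+
+/* Editor's sketch (not part of the upstream header): both variants can be
+ * called explicitly by name regardless of the build flags.
+ *
+ *     float p = simd_precise_recip(3.0f); // ~0.33333334, within a few ULPs
+ *     float f = simd_fast_recip(3.0f);    // at least 11 bits for float
+ */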
+
+/*! @abstract An approximation to 1/x.
+ * @discussion If x is very close to the limits of representation, the
+ * result may overflow or underflow. This function maps to
+ * simd_fast_recip(x) if -ffast-math is specified, and to
+ * simd_precise_recip(x) otherwise. */
+static inline SIMD_CFUNC float simd_recip(float x);
+/*! @abstract An approximation to 1/x.
+ * @discussion If x is very close to the limits of representation, the
+ * result may overflow or underflow. This function maps to
+ * simd_fast_recip(x) if -ffast-math is specified, and to
+ * simd_precise_recip(x) otherwise. */
+static inline SIMD_CFUNC simd_float2 simd_recip(simd_float2 x);
+/*! @abstract An approximation to 1/x.
+ * @discussion If x is very close to the limits of representation, the
+ * result may overflow or underflow. This function maps to
+ * simd_fast_recip(x) if -ffast-math is specified, and to
+ * simd_precise_recip(x) otherwise. */
+static inline SIMD_CFUNC simd_float3 simd_recip(simd_float3 x);
+/*! @abstract An approximation to 1/x.
+ * @discussion If x is very close to the limits of representation, the
+ * result may overflow or underflow. This function maps to
+ * simd_fast_recip(x) if -ffast-math is specified, and to
+ * simd_precise_recip(x) otherwise. */
+static inline SIMD_CFUNC simd_float4 simd_recip(simd_float4 x);
+/*! @abstract An approximation to 1/x.
+ * @discussion If x is very close to the limits of representation, the
+ * result may overflow or underflow. This function maps to
+ * simd_fast_recip(x) if -ffast-math is specified, and to
+ * simd_precise_recip(x) otherwise. */
+static inline SIMD_CFUNC simd_float8 simd_recip(simd_float8 x);
+/*! @abstract An approximation to 1/x.
+ * @discussion If x is very close to the limits of representation, the
+ * result may overflow or underflow. This function maps to
+ * simd_fast_recip(x) if -ffast-math is specified, and to
+ * simd_precise_recip(x) otherwise. */
+static inline SIMD_CFUNC simd_float16 simd_recip(simd_float16 x);
+/*! @abstract An approximation to 1/x.
+ * @discussion If x is very close to the limits of representation, the
+ * result may overflow or underflow. This function maps to
+ * simd_fast_recip(x) if -ffast-math is specified, and to
+ * simd_precise_recip(x) otherwise. */
+static inline SIMD_CFUNC double simd_recip(double x);
+/*! @abstract An approximation to 1/x.
+ * @discussion If x is very close to the limits of representation, the
+ * result may overflow or underflow. This function maps to
+ * simd_fast_recip(x) if -ffast-math is specified, and to
+ * simd_precise_recip(x) otherwise. */
+static inline SIMD_CFUNC simd_double2 simd_recip(simd_double2 x);
+/*! @abstract An approximation to 1/x.
+ * @discussion If x is very close to the limits of representation, the
+ * result may overflow or underflow. This function maps to
+ * simd_fast_recip(x) if -ffast-math is specified, and to
+ * simd_precise_recip(x) otherwise. */
+static inline SIMD_CFUNC simd_double3 simd_recip(simd_double3 x);
+/*! @abstract An approximation to 1/x.
+ * @discussion If x is very close to the limits of representation, the
+ * result may overflow or underflow. This function maps to
+ * simd_fast_recip(x) if -ffast-math is specified, and to
+ * simd_precise_recip(x) otherwise. */
+static inline SIMD_CFUNC simd_double4 simd_recip(simd_double4 x);
+/*! @abstract An approximation to 1/x.
+ * @discussion If x is very close to the limits of representation, the
+ * result may overflow or underflow. This function maps to
+ * simd_fast_recip(x) if -ffast-math is specified, and to
+ * simd_precise_recip(x) otherwise. */
+static inline SIMD_CFUNC simd_double8 simd_recip(simd_double8 x);
+/*! @abstract An approximation to 1/x.
+ * @discussion Deprecated. Use simd_recip(x) instead. */
+#define vector_recip simd_recip
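+
+/* Editor's sketch (not part of the upstream header): the unqualified name
+ * follows the translation unit's build mode, so code that needs the precise
+ * result even under -ffast-math should name it explicitly.
+ *
+ *     simd_float4 r  = simd_recip(x);         // fast variant iff -ffast-math
+ *     simd_float4 rp = simd_precise_recip(x); // always the precise variant
+ */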
+
+/*! @abstract A good approximation to 1/sqrt(x).
+ * @discussion This function is accurate to a few units in the last place
+ * (ULPs). */
+static inline SIMD_CFUNC float simd_precise_rsqrt(float x);
+/*! @abstract A good approximation to 1/sqrt(x).
+ * @discussion This function is accurate to a few units in the last place
+ * (ULPs). */
+static inline SIMD_CFUNC simd_float2 simd_precise_rsqrt(simd_float2 x);
+/*! @abstract A good approximation to 1/sqrt(x).
+ * @discussion This function is accurate to a few units in the last place
+ * (ULPs). */
+static inline SIMD_CFUNC simd_float3 simd_precise_rsqrt(simd_float3 x);
+/*! @abstract A good approximation to 1/sqrt(x).
+ * @discussion This function is accurate to a few units in the last place
+ * (ULPs). */
+static inline SIMD_CFUNC simd_float4 simd_precise_rsqrt(simd_float4 x);
+/*! @abstract A good approximation to 1/sqrt(x).
+ * @discussion This function is accurate to a few units in the last place
+ * (ULPs). */
+static inline SIMD_CFUNC simd_float8 simd_precise_rsqrt(simd_float8 x);
+/*! @abstract A good approximation to 1/sqrt(x).
+ * @discussion This function is accurate to a few units in the last place
+ * (ULPs). */
+static inline SIMD_CFUNC simd_float16 simd_precise_rsqrt(simd_float16 x);
+/*! @abstract A good approximation to 1/sqrt(x).
+ * @discussion This function is accurate to a few units in the last place
+ * (ULPs). */
+static inline SIMD_CFUNC double simd_precise_rsqrt(double x);
+/*! @abstract A good approximation to 1/sqrt(x).
+ * @discussion This function is accurate to a few units in the last place
+ * (ULPs). */
+static inline SIMD_CFUNC simd_double2 simd_precise_rsqrt(simd_double2 x);
+/*! @abstract A good approximation to 1/sqrt(x).
+ * @discussion This function is accurate to a few units in the last place
+ * (ULPs). */
+static inline SIMD_CFUNC simd_double3 simd_precise_rsqrt(simd_double3 x);
+/*! @abstract A good approximation to 1/sqrt(x).
+ * @discussion This function is accurate to a few units in the last place
+ * (ULPs). */
+static inline SIMD_CFUNC simd_double4 simd_precise_rsqrt(simd_double4 x);
+/*! @abstract A good approximation to 1/sqrt(x).
+ * @discussion This function is accurate to a few units in the last place
+ * (ULPs). */
+static inline SIMD_CFUNC simd_double8 simd_precise_rsqrt(simd_double8 x);
+/*! @abstract A good approximation to 1/sqrt(x).
+ * @discussion Deprecated. Use simd_precise_rsqrt(x) instead. */
+#define vector_precise_rsqrt simd_precise_rsqrt
+
+/*! @abstract A fast approximation to 1/sqrt(x).
+ * @discussion This function is accurate to at least 11 bits for float and
+ * 22 bits for double. */
+static inline SIMD_CFUNC float simd_fast_rsqrt(float x);
+/*! @abstract A fast approximation to 1/sqrt(x).
+ * @discussion This function is accurate to at least 11 bits for float and
+ * 22 bits for double. */
+static inline SIMD_CFUNC simd_float2 simd_fast_rsqrt(simd_float2 x);
+/*! @abstract A fast approximation to 1/sqrt(x).
+ * @discussion This function is accurate to at least 11 bits for float and
+ * 22 bits for double. */
+static inline SIMD_CFUNC simd_float3 simd_fast_rsqrt(simd_float3 x);
+/*! @abstract A fast approximation to 1/sqrt(x).
+ * @discussion This function is accurate to at least 11 bits for float and
+ * 22 bits for double. */
+static inline SIMD_CFUNC simd_float4 simd_fast_rsqrt(simd_float4 x);
+/*! @abstract A fast approximation to 1/sqrt(x).
+ * @discussion This function is accurate to at least 11 bits for float and
+ * 22 bits for double. */
+static inline SIMD_CFUNC simd_float8 simd_fast_rsqrt(simd_float8 x);
+/*! @abstract A fast approximation to 1/sqrt(x).
+ * @discussion This function is accurate to at least 11 bits for float and
+ * 22 bits for double. */
+static inline SIMD_CFUNC simd_float16 simd_fast_rsqrt(simd_float16 x);
+/*! @abstract A fast approximation to 1/sqrt(x).
+ * @discussion This function is accurate to at least 11 bits for float and
+ * 22 bits for double. */
+static inline SIMD_CFUNC double simd_fast_rsqrt(double x);
+/*! @abstract A fast approximation to 1/sqrt(x).
+ * @discussion This function is accurate to at least 11 bits for float and
+ * 22 bits for double. */
+static inline SIMD_CFUNC simd_double2 simd_fast_rsqrt(simd_double2 x);
+/*! @abstract A fast approximation to 1/sqrt(x).
+ * @discussion This function is accurate to at least 11 bits for float and
+ * 22 bits for double. */
+static inline SIMD_CFUNC simd_double3 simd_fast_rsqrt(simd_double3 x);
+/*! @abstract A fast approximation to 1/sqrt(x).
+ * @discussion This function is accurate to at least 11 bits for float and
+ * 22 bits for double. */
+static inline SIMD_CFUNC simd_double4 simd_fast_rsqrt(simd_double4 x);
+/*! @abstract A fast approximation to 1/sqrt(x).
+ * @discussion This function is accurate to at least 11 bits for float and
+ * 22 bits for double. */
+static inline SIMD_CFUNC simd_double8 simd_fast_rsqrt(simd_double8 x);
+/*! @abstract A fast approximation to 1/sqrt(x).
+ * @discussion Deprecated. Use simd_fast_rsqrt(x) instead. */
+#define vector_fast_rsqrt simd_fast_rsqrt
+
+/*! @abstract An approximation to 1/sqrt(x).
+ * @discussion This function maps to simd_fast_rsqrt(x) if -ffast-math is
+ * specified, and to simd_precise_rsqrt(x) otherwise. */
+static inline SIMD_CFUNC float simd_rsqrt(float x);
+/*! @abstract An approximation to 1/sqrt(x).
+ * @discussion This function maps to simd_fast_rsqrt(x) if -ffast-math is
+ * specified, and to simd_precise_rsqrt(x) otherwise. */
+static inline SIMD_CFUNC simd_float2 simd_rsqrt(simd_float2 x);
+/*! @abstract An approximation to 1/sqrt(x).
+ * @discussion This function maps to simd_fast_rsqrt(x) if -ffast-math is
+ * specified, and to simd_precise_rsqrt(x) otherwise. */
+static inline SIMD_CFUNC simd_float3 simd_rsqrt(simd_float3 x);
+/*! @abstract An approximation to 1/sqrt(x).
+ * @discussion This function maps to simd_fast_rsqrt(x) if -ffast-math is
+ * specified, and to simd_precise_rsqrt(x) otherwise. */
+static inline SIMD_CFUNC simd_float4 simd_rsqrt(simd_float4 x);
+/*! @abstract An approximation to 1/sqrt(x).
+ * @discussion This function maps to simd_fast_rsqrt(x) if -ffast-math is
+ * specified, and to simd_precise_rsqrt(x) otherwise. */
+static inline SIMD_CFUNC simd_float8 simd_rsqrt(simd_float8 x);
+/*! @abstract An approximation to 1/sqrt(x).
+ * @discussion This function maps to simd_fast_rsqrt(x) if -ffast-math is
+ * specified, and to simd_precise_rsqrt(x) otherwise. */
+static inline SIMD_CFUNC simd_float16 simd_rsqrt(simd_float16 x);
+/*! @abstract An approximation to 1/sqrt(x).
+ * @discussion This function maps to simd_fast_rsqrt(x) if -ffast-math is
+ * specified, and to simd_precise_rsqrt(x) otherwise. */
+static inline SIMD_CFUNC double simd_rsqrt(double x);
+/*! @abstract An approximation to 1/sqrt(x).
+ * @discussion This function maps to simd_fast_rsqrt(x) if -ffast-math is
+ * specified, and to simd_precise_rsqrt(x) otherwise. */
+static inline SIMD_CFUNC simd_double2 simd_rsqrt(simd_double2 x);
+/*! @abstract An approximation to 1/sqrt(x).
+ * @discussion This function maps to simd_fast_rsqrt(x) if -ffast-math is
+ * specified, and to simd_precise_rsqrt(x) otherwise. */
+static inline SIMD_CFUNC simd_double3 simd_rsqrt(simd_double3 x);
+/*! @abstract An approximation to 1/sqrt(x).
+ * @discussion This function maps to simd_fast_rsqrt(x) if -ffast-math is
+ * specified, and to simd_precise_rsqrt(x) otherwise. */
+static inline SIMD_CFUNC simd_double4 simd_rsqrt(simd_double4 x);
+/*! @abstract An approximation to 1/sqrt(x).
+ * @discussion This function maps to simd_fast_rsqrt(x) if -ffast-math is
+ * specified, and to simd_precise_rsqrt(x) otherwise. */
+static inline SIMD_CFUNC simd_double8 simd_rsqrt(simd_double8 x);
+/*! @abstract An approximation to 1/sqrt(x).
+ * @discussion Deprecated. Use simd_rsqrt(x) instead. */
+#define vector_rsqrt simd_rsqrt
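+
+/* Usage sketch (illustrative; assumes <simd/simd.h> is included): normalizing
+ * a nonzero vector with rsqrt instead of dividing by sqrt. The precise and
+ * fast variants offer the same accuracy trade-off as the recip family.
+ *
+ *     simd_float4 v    = simd_make_float4(3.0f, 0.0f, 4.0f, 0.0f);
+ *     float       ss   = simd_reduce_add(v * v);   // squared length, 25
+ *     simd_float4 unit = v * simd_rsqrt(ss);       // ~{0.6, 0, 0.8, 0}
+ */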
+
+/*! @abstract The "fractional part" of x, lying in the range [0, 1).
+ * @discussion floor(x) + fract(x) is *approximately* equal to x. If x is
+ * positive and finite, then the two values are exactly equal. */
+static inline SIMD_CFUNC float simd_fract(float x);
+/*! @abstract The "fractional part" of x, lying in the range [0, 1).
+ * @discussion floor(x) + fract(x) is *approximately* equal to x. If x is
+ * positive and finite, then the two values are exactly equal. */
+static inline SIMD_CFUNC simd_float2 simd_fract(simd_float2 x);
+/*! @abstract The "fractional part" of x, lying in the range [0, 1).
+ * @discussion floor(x) + fract(x) is *approximately* equal to x. If x is
+ * positive and finite, then the two values are exactly equal. */
+static inline SIMD_CFUNC simd_float3 simd_fract(simd_float3 x);
+/*! @abstract The "fractional part" of x, lying in the range [0, 1).
+ * @discussion floor(x) + fract(x) is *approximately* equal to x. If x is
+ * positive and finite, then the two values are exactly equal. */
+static inline SIMD_CFUNC simd_float4 simd_fract(simd_float4 x);
+/*! @abstract The "fractional part" of x, lying in the range [0, 1).
+ * @discussion floor(x) + fract(x) is *approximately* equal to x. If x is
+ * positive and finite, then the two values are exactly equal. */
+static inline SIMD_CFUNC simd_float8 simd_fract(simd_float8 x);
+/*! @abstract The "fractional part" of x, lying in the range [0, 1).
+ * @discussion floor(x) + fract(x) is *approximately* equal to x. If x is
+ * positive and finite, then the two values are exactly equal. */
+static inline SIMD_CFUNC simd_float16 simd_fract(simd_float16 x);
+/*! @abstract The "fractional part" of x, lying in the range [0, 1).
+ * @discussion floor(x) + fract(x) is *approximately* equal to x. If x is
+ * positive and finite, then the two values are exactly equal. */
+static inline SIMD_CFUNC double simd_fract(double x);
+/*! @abstract The "fractional part" of x, lying in the range [0, 1).
+ * @discussion floor(x) + fract(x) is *approximately* equal to x. If x is
+ * positive and finite, then the two values are exactly equal. */
+static inline SIMD_CFUNC simd_double2 simd_fract(simd_double2 x);
+/*! @abstract The "fractional part" of x, lying in the range [0, 1).
+ * @discussion floor(x) + fract(x) is *approximately* equal to x. If x is
+ * positive and finite, then the two values are exactly equal. */
+static inline SIMD_CFUNC simd_double3 simd_fract(simd_double3 x);
+/*! @abstract The "fractional part" of x, lying in the range [0, 1).
+ * @discussion floor(x) + fract(x) is *approximately* equal to x. If x is
+ * positive and finite, then the two values are exactly equal. */
+static inline SIMD_CFUNC simd_double4 simd_fract(simd_double4 x);
+/*! @abstract The "fractional part" of x, lying in the range [0, 1).
+ * @discussion floor(x) + fract(x) is *approximately* equal to x. If x is
+ * positive and finite, then the two values are exactly equal. */
+static inline SIMD_CFUNC simd_double8 simd_fract(simd_double8 x);
+/*! @abstract The "fractional part" of x, lying in the range [0, 1).
+ * @discussion Deprecated. Use simd_fract(x) instead. */
+#define vector_fract simd_fract
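+
+/* Usage sketch (illustrative): fract maps every lane into [0, 1), which is
+ * handy for wrapping texture coordinates or phases.
+ *
+ *     simd_float2 t = simd_fract(simd_make_float2(2.75f, -0.25f));
+ *     // t is {0.75f, 0.75f}: -0.25 - floor(-0.25) == 0.75
+ */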
+
+/*! @abstract 0 if x < edge, and 1 otherwise.
+ * @discussion Use a scalar value for edge if you want to apply the same
+ * threshold to all lanes. */
+static inline SIMD_CFUNC float simd_step(float edge, float x);
+/*! @abstract 0 if x < edge, and 1 otherwise.
+ * @discussion Use a scalar value for edge if you want to apply the same
+ * threshold to all lanes. */
+static inline SIMD_CFUNC simd_float2 simd_step(simd_float2 edge, simd_float2 x);
+/*! @abstract 0 if x < edge, and 1 otherwise.
+ * @discussion Use a scalar value for edge if you want to apply the same
+ * threshold to all lanes. */
+static inline SIMD_CFUNC simd_float3 simd_step(simd_float3 edge, simd_float3 x);
+/*! @abstract 0 if x < edge, and 1 otherwise.
+ * @discussion Use a scalar value for edge if you want to apply the same
+ * threshold to all lanes. */
+static inline SIMD_CFUNC simd_float4 simd_step(simd_float4 edge, simd_float4 x);
+/*! @abstract 0 if x < edge, and 1 otherwise.
+ * @discussion Use a scalar value for edge if you want to apply the same
+ * threshold to all lanes. */
+static inline SIMD_CFUNC simd_float8 simd_step(simd_float8 edge, simd_float8 x);
+/*! @abstract 0 if x < edge, and 1 otherwise.
+ * @discussion Use a scalar value for edge if you want to apply the same
+ * threshold to all lanes. */
+static inline SIMD_CFUNC simd_float16 simd_step(simd_float16 edge, simd_float16 x);
+/*! @abstract 0 if x < edge, and 1 otherwise.
+ * @discussion Use a scalar value for edge if you want to apply the same
+ * threshold to all lanes. */
+static inline SIMD_CFUNC double simd_step(double edge, double x);
+/*! @abstract 0 if x < edge, and 1 otherwise.
+ * @discussion Use a scalar value for edge if you want to apply the same
+ * threshold to all lanes. */
+static inline SIMD_CFUNC simd_double2 simd_step(simd_double2 edge, simd_double2 x);
+/*! @abstract 0 if x < edge, and 1 otherwise.
+ * @discussion Use a scalar value for edge if you want to apply the same
+ * threshold to all lanes. */
+static inline SIMD_CFUNC simd_double3 simd_step(simd_double3 edge, simd_double3 x);
+/*! @abstract 0 if x < edge, and 1 otherwise.
+ * @discussion Use a scalar value for edge if you want to apply the same
+ * threshold to all lanes. */
+static inline SIMD_CFUNC simd_double4 simd_step(simd_double4 edge, simd_double4 x);
+/*! @abstract 0 if x < edge, and 1 otherwise.
+ * @discussion Use a scalar value for edge if you want to apply the same
+ * threshold to all lanes. */
+static inline SIMD_CFUNC simd_double8 simd_step(simd_double8 edge, simd_double8 x);
+/*! @abstract 0 if x < edge, and 1 otherwise.
+ * @discussion Deprecated. Use simd_step(edge, x) instead. */
+#define vector_step simd_step
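+
+/* Usage sketch (illustrative): thresholding all lanes against the same edge
+ * by splatting a scalar into each lane.
+ *
+ *     simd_float4 edge = simd_make_float4(0.5f, 0.5f, 0.5f, 0.5f);
+ *     simd_float4 x    = simd_make_float4(0.1f, 0.5f, 0.7f, 0.4f);
+ *     simd_float4 s    = simd_step(edge, x);   // {0, 1, 1, 0}
+ */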
+
+/*! @abstract Interpolates smoothly between 0 at edge0 and 1 at edge1.
+ * @discussion You can use a scalar value for edge0 and edge1 if you want
+ * to clamp all lanes at the same points. */
+static inline SIMD_CFUNC float simd_smoothstep(float edge0, float edge1, float x);
+/*! @abstract Interpolates smoothly between 0 at edge0 and 1 at edge1.
+ * @discussion You can use a scalar value for edge0 and edge1 if you want
+ * to clamp all lanes at the same points. */
+static inline SIMD_CFUNC simd_float2 simd_smoothstep(simd_float2 edge0, simd_float2 edge1, simd_float2 x);
+/*! @abstract Interpolates smoothly between 0 at edge0 and 1 at edge1.
+ * @discussion You can use a scalar value for edge0 and edge1 if you want
+ * to clamp all lanes at the same points. */
+static inline SIMD_CFUNC simd_float3 simd_smoothstep(simd_float3 edge0, simd_float3 edge1, simd_float3 x);
+/*! @abstract Interpolates smoothly between 0 at edge0 and 1 at edge1.
+ * @discussion You can use a scalar value for edge0 and edge1 if you want
+ * to clamp all lanes at the same points. */
+static inline SIMD_CFUNC simd_float4 simd_smoothstep(simd_float4 edge0, simd_float4 edge1, simd_float4 x);
+/*! @abstract Interpolates smoothly between 0 at edge0 and 1 at edge1.
+ * @discussion You can use a scalar value for edge0 and edge1 if you want
+ * to clamp all lanes at the same points. */
+static inline SIMD_CFUNC simd_float8 simd_smoothstep(simd_float8 edge0, simd_float8 edge1, simd_float8 x);
+/*! @abstract Interpolates smoothly between 0 at edge0 and 1 at edge1.
+ * @discussion You can use a scalar value for edge0 and edge1 if you want
+ * to clamp all lanes at the same points. */
+static inline SIMD_CFUNC simd_float16 simd_smoothstep(simd_float16 edge0, simd_float16 edge1, simd_float16 x);
+/*! @abstract Interpolates smoothly between 0 at edge0 and 1 at edge1.
+ * @discussion You can use a scalar value for edge0 and edge1 if you want
+ * to clamp all lanes at the same points. */
+static inline SIMD_CFUNC double simd_smoothstep(double edge0, double edge1, double x);
+/*! @abstract Interpolates smoothly between 0 at edge0 and 1 at edge1.
+ * @discussion You can use a scalar value for edge0 and edge1 if you want
+ * to clamp all lanes at the same points. */
+static inline SIMD_CFUNC simd_double2 simd_smoothstep(simd_double2 edge0, simd_double2 edge1, simd_double2 x);
+/*! @abstract Interpolates smoothly between 0 at edge0 and 1 at edge1.
+ * @discussion You can use a scalar value for edge0 and edge1 if you want
+ * to clamp all lanes at the same points. */
+static inline SIMD_CFUNC simd_double3 simd_smoothstep(simd_double3 edge0, simd_double3 edge1, simd_double3 x);
+/*! @abstract Interpolates smoothly between 0 at edge0 and 1 at edge1.
+ * @discussion You can use a scalar value for edge0 and edge1 if you want
+ * to clamp all lanes at the same points. */
+static inline SIMD_CFUNC simd_double4 simd_smoothstep(simd_double4 edge0, simd_double4 edge1, simd_double4 x);
+/*! @abstract Interpolates smoothly between 0 at edge0 and 1 at edge1.
+ * @discussion You can use a scalar value for edge0 and edge1 if you want
+ * to clamp all lanes at the same points. */
+static inline SIMD_CFUNC simd_double8 simd_smoothstep(simd_double8 edge0, simd_double8 edge1, simd_double8 x);
+/*! @abstract Interpolates smoothly between 0 at edge0 and 1 at edge1.
+ * @discussion Deprecated. Use simd_smoothstep(edge0, edge1, x) instead. */
+#define vector_smoothstep simd_smoothstep
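+
+/* Usage sketch (illustrative): a scalar fade that eases in and out, e.g. for
+ * fading alpha near the edges of an effect.
+ *
+ *     float a = simd_smoothstep(0.0f, 1.0f, 0.5f);   // 0.5, with zero
+ *                                                    // slope at 0 and 1
+ */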
+
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC char simd_reduce_add(simd_char2 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC char simd_reduce_add(simd_char3 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC char simd_reduce_add(simd_char4 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC char simd_reduce_add(simd_char8 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC char simd_reduce_add(simd_char16 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC char simd_reduce_add(simd_char32 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC char simd_reduce_add(simd_char64 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar2 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar3 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar4 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar8 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar16 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar32 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar64 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC short simd_reduce_add(simd_short2 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC short simd_reduce_add(simd_short3 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC short simd_reduce_add(simd_short4 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC short simd_reduce_add(simd_short8 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC short simd_reduce_add(simd_short16 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC short simd_reduce_add(simd_short32 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC unsigned short simd_reduce_add(simd_ushort2 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC unsigned short simd_reduce_add(simd_ushort3 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC unsigned short simd_reduce_add(simd_ushort4 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC unsigned short simd_reduce_add(simd_ushort8 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC unsigned short simd_reduce_add(simd_ushort16 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC unsigned short simd_reduce_add(simd_ushort32 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC int simd_reduce_add(simd_int2 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC int simd_reduce_add(simd_int3 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC int simd_reduce_add(simd_int4 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC int simd_reduce_add(simd_int8 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC int simd_reduce_add(simd_int16 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC unsigned int simd_reduce_add(simd_uint2 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC unsigned int simd_reduce_add(simd_uint3 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC unsigned int simd_reduce_add(simd_uint4 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC unsigned int simd_reduce_add(simd_uint8 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC unsigned int simd_reduce_add(simd_uint16 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC float simd_reduce_add(simd_float2 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC float simd_reduce_add(simd_float3 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC float simd_reduce_add(simd_float4 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC float simd_reduce_add(simd_float8 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC float simd_reduce_add(simd_float16 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC simd_long1 simd_reduce_add(simd_long2 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC simd_long1 simd_reduce_add(simd_long3 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC simd_long1 simd_reduce_add(simd_long4 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC simd_long1 simd_reduce_add(simd_long8 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC simd_ulong1 simd_reduce_add(simd_ulong2 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC simd_ulong1 simd_reduce_add(simd_ulong3 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC simd_ulong1 simd_reduce_add(simd_ulong4 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC simd_ulong1 simd_reduce_add(simd_ulong8 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC double simd_reduce_add(simd_double2 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC double simd_reduce_add(simd_double3 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC double simd_reduce_add(simd_double4 x);
+/*! @abstract Sum of elements in x.
+ * @discussion This computation may overflow; especially for 8-bit types you
+ * may need to convert to a wider type before reducing. */
+static inline SIMD_CFUNC double simd_reduce_add(simd_double8 x);
+/*! @abstract Sum of elements in x.
+ * @discussion Deprecated. Use simd_reduce_add(x) instead. */
+#define vector_reduce_add simd_reduce_add
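+
+/* Usage sketch (illustrative): summing sixteen bytes can wrap around at
+ * ±127, so widen to short lanes first with simd_short() from
+ * <simd/conversion.h>.
+ *
+ *     simd_char16 bytes = 100;                         // all lanes hold 100
+ *     short sum = simd_reduce_add(simd_short(bytes));  // 1600, no 8-bit wrap
+ */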
+
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC char simd_reduce_min(simd_char2 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC char simd_reduce_min(simd_char3 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC char simd_reduce_min(simd_char4 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC char simd_reduce_min(simd_char8 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC char simd_reduce_min(simd_char16 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC char simd_reduce_min(simd_char32 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC char simd_reduce_min(simd_char64 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar2 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar3 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar4 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar8 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar16 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar32 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar64 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC short simd_reduce_min(simd_short2 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC short simd_reduce_min(simd_short3 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC short simd_reduce_min(simd_short4 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC short simd_reduce_min(simd_short8 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC short simd_reduce_min(simd_short16 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC short simd_reduce_min(simd_short32 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC unsigned short simd_reduce_min(simd_ushort2 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC unsigned short simd_reduce_min(simd_ushort3 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC unsigned short simd_reduce_min(simd_ushort4 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC unsigned short simd_reduce_min(simd_ushort8 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC unsigned short simd_reduce_min(simd_ushort16 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC unsigned short simd_reduce_min(simd_ushort32 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC int simd_reduce_min(simd_int2 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC int simd_reduce_min(simd_int3 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC int simd_reduce_min(simd_int4 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC int simd_reduce_min(simd_int8 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC int simd_reduce_min(simd_int16 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC unsigned int simd_reduce_min(simd_uint2 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC unsigned int simd_reduce_min(simd_uint3 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC unsigned int simd_reduce_min(simd_uint4 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC unsigned int simd_reduce_min(simd_uint8 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC unsigned int simd_reduce_min(simd_uint16 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC float simd_reduce_min(simd_float2 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC float simd_reduce_min(simd_float3 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC float simd_reduce_min(simd_float4 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC float simd_reduce_min(simd_float8 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC float simd_reduce_min(simd_float16 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC simd_long1 simd_reduce_min(simd_long2 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC simd_long1 simd_reduce_min(simd_long3 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC simd_long1 simd_reduce_min(simd_long4 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC simd_long1 simd_reduce_min(simd_long8 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC simd_ulong1 simd_reduce_min(simd_ulong2 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC simd_ulong1 simd_reduce_min(simd_ulong3 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC simd_ulong1 simd_reduce_min(simd_ulong4 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC simd_ulong1 simd_reduce_min(simd_ulong8 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC double simd_reduce_min(simd_double2 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC double simd_reduce_min(simd_double3 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC double simd_reduce_min(simd_double4 x);
+/*! @abstract Minimum of elements in x. */
+static inline SIMD_CFUNC double simd_reduce_min(simd_double8 x);
+/*! @abstract Minimum of elements in x.
+ * @discussion Deprecated. Use simd_reduce_min(x) instead. */
+#define vector_reduce_min simd_reduce_min
+
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC char simd_reduce_max(simd_char2 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC char simd_reduce_max(simd_char3 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC char simd_reduce_max(simd_char4 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC char simd_reduce_max(simd_char8 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC char simd_reduce_max(simd_char16 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC char simd_reduce_max(simd_char32 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC char simd_reduce_max(simd_char64 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar2 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar3 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar4 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar8 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar16 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar32 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar64 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC short simd_reduce_max(simd_short2 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC short simd_reduce_max(simd_short3 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC short simd_reduce_max(simd_short4 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC short simd_reduce_max(simd_short8 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC short simd_reduce_max(simd_short16 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC short simd_reduce_max(simd_short32 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC unsigned short simd_reduce_max(simd_ushort2 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC unsigned short simd_reduce_max(simd_ushort3 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC unsigned short simd_reduce_max(simd_ushort4 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC unsigned short simd_reduce_max(simd_ushort8 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC unsigned short simd_reduce_max(simd_ushort16 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC unsigned short simd_reduce_max(simd_ushort32 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC int simd_reduce_max(simd_int2 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC int simd_reduce_max(simd_int3 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC int simd_reduce_max(simd_int4 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC int simd_reduce_max(simd_int8 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC int simd_reduce_max(simd_int16 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC unsigned int simd_reduce_max(simd_uint2 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC unsigned int simd_reduce_max(simd_uint3 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC unsigned int simd_reduce_max(simd_uint4 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC unsigned int simd_reduce_max(simd_uint8 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC unsigned int simd_reduce_max(simd_uint16 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC float simd_reduce_max(simd_float2 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC float simd_reduce_max(simd_float3 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC float simd_reduce_max(simd_float4 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC float simd_reduce_max(simd_float8 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC float simd_reduce_max(simd_float16 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC simd_long1 simd_reduce_max(simd_long2 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC simd_long1 simd_reduce_max(simd_long3 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC simd_long1 simd_reduce_max(simd_long4 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC simd_long1 simd_reduce_max(simd_long8 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC simd_ulong1 simd_reduce_max(simd_ulong2 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC simd_ulong1 simd_reduce_max(simd_ulong3 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC simd_ulong1 simd_reduce_max(simd_ulong4 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC simd_ulong1 simd_reduce_max(simd_ulong8 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC double simd_reduce_max(simd_double2 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC double simd_reduce_max(simd_double3 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC double simd_reduce_max(simd_double4 x);
+/*! @abstract Maximum of elements in x. */
+static inline SIMD_CFUNC double simd_reduce_max(simd_double8 x);
+/*! @abstract Maximum of elements in x.
+ * @discussion Deprecated. Use simd_reduce_max(x) instead. */
+#define vector_reduce_max simd_reduce_max
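+
+/* Usage sketch (illustrative): computing the range of a vector's lanes with
+ * the two reductions above.
+ *
+ *     simd_float4 v     = simd_make_float4(0.5f, -2.0f, 3.5f, 1.0f);
+ *     float       range = simd_reduce_max(v) - simd_reduce_min(v);  // 5.5
+ */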
+
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_char2 x, simd_char2 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_char3 x, simd_char3 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_char4 x, simd_char4 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_char8 x, simd_char8 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_char16 x, simd_char16 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_char32 x, simd_char32 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_char64 x, simd_char64 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_uchar2 x, simd_uchar2 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_uchar3 x, simd_uchar3 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_uchar4 x, simd_uchar4 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_uchar8 x, simd_uchar8 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_uchar16 x, simd_uchar16 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_uchar32 x, simd_uchar32 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_uchar64 x, simd_uchar64 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_short2 x, simd_short2 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_short3 x, simd_short3 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_short4 x, simd_short4 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_short8 x, simd_short8 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_short16 x, simd_short16 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_short32 x, simd_short32 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_ushort2 x, simd_ushort2 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_ushort3 x, simd_ushort3 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_ushort4 x, simd_ushort4 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_ushort8 x, simd_ushort8 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_ushort16 x, simd_ushort16 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_ushort32 x, simd_ushort32 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_int2 x, simd_int2 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_int3 x, simd_int3 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_int4 x, simd_int4 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_int8 x, simd_int8 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_int16 x, simd_int16 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_uint2 x, simd_uint2 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_uint3 x, simd_uint3 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_uint4 x, simd_uint4 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_uint8 x, simd_uint8 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_uint16 x, simd_uint16 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_float2 x, simd_float2 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_float3 x, simd_float3 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_float4 x, simd_float4 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_float8 x, simd_float8 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_float16 x, simd_float16 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_long2 x, simd_long2 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_long3 x, simd_long3 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_long4 x, simd_long4 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_long8 x, simd_long8 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_ulong2 x, simd_ulong2 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_ulong3 x, simd_ulong3 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_ulong4 x, simd_ulong4 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_ulong8 x, simd_ulong8 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_double2 x, simd_double2 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_double3 x, simd_double3 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_double4 x, simd_double4 y) {
+ return simd_all(x == y);
+}
+/*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y. */
+static inline SIMD_CFUNC simd_bool simd_equal(simd_double8 x, simd_double8 y) {
+ return simd_all(x == y);
+}
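+
+/* Usage sketch (illustrative): the == operator on vectors yields a lane mask
+ * (0 or -1 per lane), while simd_equal collapses it to a single bool.
+ *
+ *     simd_int4 a = simd_make_int4(1, 2, 3, 4);
+ *     simd_int4 b = simd_make_int4(1, 2, 0, 4);
+ *     simd_int4 m = (a == b);           // {-1, -1, 0, -1}
+ *     simd_bool e = simd_equal(a, b);   // false
+ */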
+
+#ifdef __cplusplus
+} /* extern "C" */
+
+namespace simd {
+ /*! @abstract The lanewise absolute value of x. */
+ template <typename typeN> static SIMD_CPPFUNC typeN abs(const typeN x) { return ::simd_abs(x); }
+ /*! @abstract The lanewise maximum of x and y. */
+ template <typename typeN> static SIMD_CPPFUNC typeN max(const typeN x, const typeN y) { return ::simd_max(x,y); }
+ /*! @abstract The lanewise minimum of x and y. */
+ template <typename typeN> static SIMD_CPPFUNC typeN min(const typeN x, const typeN y) { return ::simd_min(x,y); }
+ /*! @abstract x clamped to the interval [min, max]. */
+ template <typename typeN> static SIMD_CPPFUNC typeN clamp(const typeN x, const typeN min, const typeN max) { return ::simd_clamp(x,min,max); }
+ /*! @abstract -1 if x < 0, +1 if x > 0, and 0 otherwise. */
+ template <typename fptypeN> static SIMD_CPPFUNC fptypeN sign(const fptypeN x) { return ::simd_sign(x); }
+  /*! @abstract Linearly interpolates between x and y, taking the value x when t=0 and y when t=1. */
+ template <typename fptypeN> static SIMD_CPPFUNC fptypeN mix(const fptypeN x, const fptypeN y, const fptypeN t) { return ::simd_mix(x,y,t); }
+  /*! @abstract An approximation to 1/x. */
+  template <typename fptypeN> static SIMD_CPPFUNC fptypeN recip(const fptypeN x) { return ::simd_recip(x); }
+  /*! @abstract An approximation to 1/sqrt(x). */
+  template <typename fptypeN> static SIMD_CPPFUNC fptypeN rsqrt(const fptypeN x) { return ::simd_rsqrt(x); }
+ /*! @abstract The "fracional part" of x, in the range [0,1). */
+ template <typename fptypeN> static SIMD_CPPFUNC fptypeN fract(const fptypeN x) { return ::simd_fract(x); }
+ /*! @abstract 0 if x < edge, 1 otherwise. */
+ template <typename fptypeN> static SIMD_CPPFUNC fptypeN step(const fptypeN edge, const fptypeN x) { return ::simd_step(edge,x); }
+  /*! @abstract Smoothly interpolates from 0 at edge0 to 1 at edge1. */
+ template <typename fptypeN> static SIMD_CPPFUNC fptypeN smoothstep(const fptypeN edge0, const fptypeN edge1, const fptypeN x) { return ::simd_smoothstep(edge0,edge1,x); }
+ /*! @abstract True if and only if each lane of x is equal to the
+ * corresponding lane of y.
+ *
+ * @discussion This isn't operator== because that's already defined by
+ * the compiler to return a lane mask. */
+ template <typename fptypeN> static SIMD_CPPFUNC simd_bool equal(const fptypeN x, const fptypeN y) { return ::simd_equal(x, y); }
+#if __cpp_decltype_auto
+ /* If you are targeting an earlier version of the C++ standard that lacks
+     decltype(auto) support, you may use the C-style simd_reduce_* functions
+ instead. */
+ /*! @abstract The sum of the elements in x. May overflow. */
+ template <typename typeN> static SIMD_CPPFUNC auto reduce_add(typeN x) { return ::simd_reduce_add(x); }
+ /*! @abstract The least element in x. */
+ template <typename typeN> static SIMD_CPPFUNC auto reduce_min(typeN x) { return ::simd_reduce_min(x); }
+ /*! @abstract The greatest element in x. */
+ template <typename typeN> static SIMD_CPPFUNC auto reduce_max(typeN x) { return ::simd_reduce_max(x); }
+#endif
+ namespace precise {
+ /*! @abstract An approximation to 1/x. */
+ template <typename fptypeN> static SIMD_CPPFUNC fptypeN recip(const fptypeN x) { return ::simd_precise_recip(x); }
+ /*! @abstract An approximation to 1/sqrt(x). */
+ template <typename fptypeN> static SIMD_CPPFUNC fptypeN rsqrt(const fptypeN x) { return ::simd_precise_rsqrt(x); }
+ }
+ namespace fast {
+ /*! @abstract An approximation to 1/x. */
+ template <typename fptypeN> static SIMD_CPPFUNC fptypeN recip(const fptypeN x) { return ::simd_fast_recip(x); }
+ /*! @abstract An approximation to 1/sqrt(x). */
+ template <typename fptypeN> static SIMD_CPPFUNC fptypeN rsqrt(const fptypeN x) { return ::simd_fast_rsqrt(x); }
+ }
+}
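+
+/* Usage sketch (illustrative, C++): the namespaces above let you pick a
+ * precision explicitly, regardless of -ffast-math.
+ *
+ *     simd_float4 v = simd_make_float4(1.0f, 2.0f, 4.0f, 8.0f);
+ *     simd_float4 p = simd::precise::rsqrt(v);  // always a few ULPs
+ *     simd_float4 f = simd::fast::rsqrt(v);     // fastest available
+ */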
+
+extern "C" {
+#endif /* __cplusplus */
+
+#pragma mark - Implementation
+
+static inline SIMD_CFUNC simd_char2 simd_abs(simd_char2 x) {
+ return simd_make_char2(simd_abs(simd_make_char8_undef(x)));
+}
+
+static inline SIMD_CFUNC simd_char3 simd_abs(simd_char3 x) {
+ return simd_make_char3(simd_abs(simd_make_char8_undef(x)));
+}
+
+static inline SIMD_CFUNC simd_char4 simd_abs(simd_char4 x) {
+ return simd_make_char4(simd_abs(simd_make_char8_undef(x)));
+}
+
+static inline SIMD_CFUNC simd_char8 simd_abs(simd_char8 x) {
+#if defined __arm__ || defined __arm64__
+ return vabs_s8(x);
+#else
+ return simd_make_char8(simd_abs(simd_make_char16_undef(x)));
+#endif
+}
+
+static inline SIMD_CFUNC simd_char16 simd_abs(simd_char16 x) {
+#if defined __arm__ || defined __arm64__
+ return vabsq_s8(x);
+#elif defined __SSE4_1__
+ return (simd_char16) _mm_abs_epi8((__m128i)x);
+#else
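+  /* Branchless fallback: mask is 0 for non-negative lanes and all-ones for
+   * negative lanes, so (x ^ mask) - mask negates exactly the negative lanes. */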
+ simd_char16 mask = x >> 7; return (x ^ mask) - mask;
+#endif
+}
+
+static inline SIMD_CFUNC simd_char32 simd_abs(simd_char32 x) {
+#if defined __AVX2__
+ return _mm256_abs_epi8(x);
+#else
+ return simd_make_char32(simd_abs(x.lo), simd_abs(x.hi));
+#endif
+}
+
+static inline SIMD_CFUNC simd_char64 simd_abs(simd_char64 x) {
+#if defined __AVX512BW__
+ return _mm512_abs_epi8(x);
+#else
+ return simd_make_char64(simd_abs(x.lo), simd_abs(x.hi));
+#endif
+}
+
+static inline SIMD_CFUNC simd_short2 simd_abs(simd_short2 x) {
+ return simd_make_short2(simd_abs(simd_make_short4_undef(x)));
+}
+
+static inline SIMD_CFUNC simd_short3 simd_abs(simd_short3 x) {
+ return simd_make_short3(simd_abs(simd_make_short4_undef(x)));
+}
+
+static inline SIMD_CFUNC simd_short4 simd_abs(simd_short4 x) {
+#if defined __arm__ || defined __arm64__
+ return vabs_s16(x);
+#else
+ return simd_make_short4(simd_abs(simd_make_short8_undef(x)));
+#endif
+}
+
+static inline SIMD_CFUNC simd_short8 simd_abs(simd_short8 x) {
+#if defined __arm__ || defined __arm64__
+ return vabsq_s16(x);
+#elif defined __SSE4_1__
+ return (simd_short8) _mm_abs_epi16((__m128i)x);
+#else
+ simd_short8 mask = x >> 15; return (x ^ mask) - mask;
+#endif
+}
+
+static inline SIMD_CFUNC simd_short16 simd_abs(simd_short16 x) {
+#if defined __AVX2__
+ return _mm256_abs_epi16(x);
+#else
+ return simd_make_short16(simd_abs(x.lo), simd_abs(x.hi));
+#endif
+}
+
+static inline SIMD_CFUNC simd_short32 simd_abs(simd_short32 x) {
+#if defined __AVX512BW__
+ return _mm512_abs_epi16(x);
+#else
+ return simd_make_short32(simd_abs(x.lo), simd_abs(x.hi));
+#endif
+}
+
+static inline SIMD_CFUNC simd_int2 simd_abs(simd_int2 x) {
+#if defined __arm__ || defined __arm64__
+ return vabs_s32(x);
+#else
+ return simd_make_int2(simd_abs(simd_make_int4_undef(x)));
+#endif
+}
+
+static inline SIMD_CFUNC simd_int3 simd_abs(simd_int3 x) {
+ return simd_make_int3(simd_abs(simd_make_int4_undef(x)));
+}
+
+static inline SIMD_CFUNC simd_int4 simd_abs(simd_int4 x) {
+#if defined __arm__ || defined __arm64__
+ return vabsq_s32(x);
+#elif defined __SSE4_1__
+ return (simd_int4) _mm_abs_epi32((__m128i)x);
+#else
+ simd_int4 mask = x >> 31; return (x ^ mask) - mask;
+#endif
+}
+
+static inline SIMD_CFUNC simd_int8 simd_abs(simd_int8 x) {
+#if defined __AVX2__
+ return _mm256_abs_epi32(x);
+#else
+ return simd_make_int8(simd_abs(x.lo), simd_abs(x.hi));
+#endif
+}
+
+static inline SIMD_CFUNC simd_int16 simd_abs(simd_int16 x) {
+#if defined __AVX512F__
+ return _mm512_abs_epi32(x);
+#else
+ return simd_make_int16(simd_abs(x.lo), simd_abs(x.hi));
+#endif
+}
+
+static inline SIMD_CFUNC simd_float2 simd_abs(simd_float2 x) {
+ return __tg_fabs(x);
+}
+
+static inline SIMD_CFUNC simd_float3 simd_abs(simd_float3 x) {
+ return __tg_fabs(x);
+}
+
+static inline SIMD_CFUNC simd_float4 simd_abs(simd_float4 x) {
+ return __tg_fabs(x);
+}
+
+static inline SIMD_CFUNC simd_float8 simd_abs(simd_float8 x) {
+ return __tg_fabs(x);
+}
+
+static inline SIMD_CFUNC simd_float16 simd_abs(simd_float16 x) {
+ return __tg_fabs(x);
+}
+
+static inline SIMD_CFUNC simd_long2 simd_abs(simd_long2 x) {
+#if defined __arm64__
+ return vabsq_s64(x);
+#elif defined __AVX512VL__
+  return (simd_long2) _mm_abs_epi64((__m128i)x);
+#else
+ simd_long2 mask = x >> 63; return (x ^ mask) - mask;
+#endif
+}
+
+static inline SIMD_CFUNC simd_long3 simd_abs(simd_long3 x) {
+ return simd_make_long3(simd_abs(simd_make_long4_undef(x)));
+}
+
+static inline SIMD_CFUNC simd_long4 simd_abs(simd_long4 x) {
+#if defined __AVX512VL__
+ return _mm256_abs_epi64(x);
+#else
+ return simd_make_long4(simd_abs(x.lo), simd_abs(x.hi));
+#endif
+}
+
+static inline SIMD_CFUNC simd_long8 simd_abs(simd_long8 x) {
+#if defined __AVX512F__
+ return _mm512_abs_epi64(x);
+#else
+ return simd_make_long8(simd_abs(x.lo), simd_abs(x.hi));
+#endif
+}
+
+static inline SIMD_CFUNC simd_double2 simd_abs(simd_double2 x) {
+ return __tg_fabs(x);
+}
+
+static inline SIMD_CFUNC simd_double3 simd_abs(simd_double3 x) {
+ return __tg_fabs(x);
+}
+
+static inline SIMD_CFUNC simd_double4 simd_abs(simd_double4 x) {
+ return __tg_fabs(x);
+}
+
+static inline SIMD_CFUNC simd_double8 simd_abs(simd_double8 x) {
+ return __tg_fabs(x);
+}
+
+static inline SIMD_CFUNC simd_char2 simd_min(simd_char2 x, simd_char2 y) {
+ return simd_make_char2(simd_min(simd_make_char8_undef(x), simd_make_char8_undef(y)));
+}
+
+static inline SIMD_CFUNC simd_char3 simd_min(simd_char3 x, simd_char3 y) {
+ return simd_make_char3(simd_min(simd_make_char8_undef(x), simd_make_char8_undef(y)));
+}
+
+static inline SIMD_CFUNC simd_char4 simd_min(simd_char4 x, simd_char4 y) {
+ return simd_make_char4(simd_min(simd_make_char8_undef(x), simd_make_char8_undef(y)));
+}
+
+static inline SIMD_CFUNC simd_char8 simd_min(simd_char8 x, simd_char8 y) {
+#if defined __arm__ || defined __arm64__
+ return vmin_s8(x, y);
+#else
+ return simd_make_char8(simd_min(simd_make_char16_undef(x), simd_make_char16_undef(y)));
+#endif
+}
+
+static inline SIMD_CFUNC simd_char16 simd_min(simd_char16 x, simd_char16 y) {
+#if defined __arm__ || defined __arm64__
+ return vminq_s8(x, y);
+#elif defined __SSE4_1__
+ return (simd_char16) _mm_min_epi8((__m128i)x, (__m128i)y);
+#else
+ return simd_bitselect(x, y, y < x);
+#endif
+}
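+
+/* In the fallback above, the lanewise comparison y < x yields all ones
+ * where true and all zeros where false, and simd_bitselect takes bits from
+ * its second argument wherever the mask is set. A scalar sketch of the
+ * same selection (illustrative only):
+ *
+ *   static inline char min_sketch(char x, char y) {
+ *     char mask = (y < x) ? -1 : 0;             // vector compares produce 0 / -1 per lane
+ *     return (char)((x & ~mask) | (y & mask));  // mask set -> take y, else x
+ *   }
+ */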
+
+static inline SIMD_CFUNC simd_char32 simd_min(simd_char32 x, simd_char32 y) {
+#if defined __AVX2__
+ return _mm256_min_epi8(x, y);
+#else
+ return simd_bitselect(x, y, y < x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_char64 simd_min(simd_char64 x, simd_char64 y) {
+#if defined __AVX512BW__
+ return _mm512_min_epi8(x, y);
+#else
+ return simd_bitselect(x, y, y < x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_uchar2 simd_min(simd_uchar2 x, simd_uchar2 y) {
+ return simd_make_uchar2(simd_min(simd_make_uchar8_undef(x), simd_make_uchar8_undef(y)));
+}
+
+static inline SIMD_CFUNC simd_uchar3 simd_min(simd_uchar3 x, simd_uchar3 y) {
+ return simd_make_uchar3(simd_min(simd_make_uchar8_undef(x), simd_make_uchar8_undef(y)));
+}
+
+static inline SIMD_CFUNC simd_uchar4 simd_min(simd_uchar4 x, simd_uchar4 y) {
+ return simd_make_uchar4(simd_min(simd_make_uchar8_undef(x), simd_make_uchar8_undef(y)));
+}
+
+static inline SIMD_CFUNC simd_uchar8 simd_min(simd_uchar8 x, simd_uchar8 y) {
+#if defined __arm__ || defined __arm64__
+ return vmin_u8(x, y);
+#else
+ return simd_make_uchar8(simd_min(simd_make_uchar16_undef(x), simd_make_uchar16_undef(y)));
+#endif
+}
+
+static inline SIMD_CFUNC simd_uchar16 simd_min(simd_uchar16 x, simd_uchar16 y) {
+#if defined __arm__ || defined __arm64__
+ return vminq_u8(x, y);
+#elif defined __SSE4_1__
+ return (simd_uchar16) _mm_min_epu8((__m128i)x, (__m128i)y);
+#else
+ return simd_bitselect(x, y, y < x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_uchar32 simd_min(simd_uchar32 x, simd_uchar32 y) {
+#if defined __AVX2__
+ return _mm256_min_epu8(x, y);
+#else
+ return simd_bitselect(x, y, y < x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_uchar64 simd_min(simd_uchar64 x, simd_uchar64 y) {
+#if defined __AVX512BW__
+ return _mm512_min_epu8(x, y);
+#else
+ return simd_bitselect(x, y, y < x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_short2 simd_min(simd_short2 x, simd_short2 y) {
+ return simd_make_short2(simd_min(simd_make_short4_undef(x), simd_make_short4_undef(y)));
+}
+
+static inline SIMD_CFUNC simd_short3 simd_min(simd_short3 x, simd_short3 y) {
+ return simd_make_short3(simd_min(simd_make_short4_undef(x), simd_make_short4_undef(y)));
+}
+
+static inline SIMD_CFUNC simd_short4 simd_min(simd_short4 x, simd_short4 y) {
+#if defined __arm__ || defined __arm64__
+ return vmin_s16(x, y);
+#else
+ return simd_make_short4(simd_min(simd_make_short8_undef(x), simd_make_short8_undef(y)));
+#endif
+}
+
+static inline SIMD_CFUNC simd_short8 simd_min(simd_short8 x, simd_short8 y) {
+#if defined __arm__ || defined __arm64__
+ return vminq_s16(x, y);
+#elif defined __SSE4_1__
+ return (simd_short8) _mm_min_epi16((__m128i)x, (__m128i)y);
+#else
+ return simd_bitselect(x, y, y < x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_short16 simd_min(simd_short16 x, simd_short16 y) {
+#if defined __AVX2__
+ return _mm256_min_epi16(x, y);
+#else
+ return simd_bitselect(x, y, y < x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_short32 simd_min(simd_short32 x, simd_short32 y) {
+#if defined __AVX512BW__
+ return _mm512_min_epi16(x, y);
+#else
+ return simd_bitselect(x, y, y < x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_ushort2 simd_min(simd_ushort2 x, simd_ushort2 y) {
+ return simd_make_ushort2(simd_min(simd_make_ushort4_undef(x), simd_make_ushort4_undef(y)));
+}
+
+static inline SIMD_CFUNC simd_ushort3 simd_min(simd_ushort3 x, simd_ushort3 y) {
+ return simd_make_ushort3(simd_min(simd_make_ushort4_undef(x), simd_make_ushort4_undef(y)));
+}
+
+static inline SIMD_CFUNC simd_ushort4 simd_min(simd_ushort4 x, simd_ushort4 y) {
+#if defined __arm__ || defined __arm64__
+ return vmin_u16(x, y);
+#else
+ return simd_make_ushort4(simd_min(simd_make_ushort8_undef(x), simd_make_ushort8_undef(y)));
+#endif
+}
+
+static inline SIMD_CFUNC simd_ushort8 simd_min(simd_ushort8 x, simd_ushort8 y) {
+#if defined __arm__ || defined __arm64__
+ return vminq_u16(x, y);
+#elif defined __SSE4_1__
+ return (simd_ushort8) _mm_min_epu16((__m128i)x, (__m128i)y);
+#else
+ return simd_bitselect(x, y, y < x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_ushort16 simd_min(simd_ushort16 x, simd_ushort16 y) {
+#if defined __AVX2__
+ return _mm256_min_epu16(x, y);
+#else
+ return simd_bitselect(x, y, y < x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_ushort32 simd_min(simd_ushort32 x, simd_ushort32 y) {
+#if defined __AVX512BW__
+ return _mm512_min_epu16(x, y);
+#else
+ return simd_bitselect(x, y, y < x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_int2 simd_min(simd_int2 x, simd_int2 y) {
+#if defined __arm__ || defined __arm64__
+ return vmin_s32(x, y);
+#else
+ return simd_make_int2(simd_min(simd_make_int4_undef(x), simd_make_int4_undef(y)));
+#endif
+}
+
+static inline SIMD_CFUNC simd_int3 simd_min(simd_int3 x, simd_int3 y) {
+ return simd_make_int3(simd_min(simd_make_int4_undef(x), simd_make_int4_undef(y)));
+}
+
+static inline SIMD_CFUNC simd_int4 simd_min(simd_int4 x, simd_int4 y) {
+#if defined __arm__ || defined __arm64__
+ return vminq_s32(x, y);
+#elif defined __SSE4_1__
+ return (simd_int4) _mm_min_epi32((__m128i)x, (__m128i)y);
+#else
+ return simd_bitselect(x, y, y < x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_int8 simd_min(simd_int8 x, simd_int8 y) {
+#if defined __AVX2__
+ return _mm256_min_epi32(x, y);
+#else
+ return simd_bitselect(x, y, y < x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_int16 simd_min(simd_int16 x, simd_int16 y) {
+#if defined __AVX512F__
+ return _mm512_min_epi32(x, y);
+#else
+ return simd_bitselect(x, y, y < x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_uint2 simd_min(simd_uint2 x, simd_uint2 y) {
+#if defined __arm__ || defined __arm64__
+ return vmin_u32(x, y);
+#else
+ return simd_make_uint2(simd_min(simd_make_uint4_undef(x), simd_make_uint4_undef(y)));
+#endif
+}
+
+static inline SIMD_CFUNC simd_uint3 simd_min(simd_uint3 x, simd_uint3 y) {
+ return simd_make_uint3(simd_min(simd_make_uint4_undef(x), simd_make_uint4_undef(y)));
+}
+
+static inline SIMD_CFUNC simd_uint4 simd_min(simd_uint4 x, simd_uint4 y) {
+#if defined __arm__ || defined __arm64__
+ return vminq_u32(x, y);
+#elif defined __SSE4_1__
+ return (simd_uint4) _mm_min_epu32((__m128i)x, (__m128i)y);
+#else
+ return simd_bitselect(x, y, y < x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_uint8 simd_min(simd_uint8 x, simd_uint8 y) {
+#if defined __AVX2__
+ return _mm256_min_epu32(x, y);
+#else
+ return simd_bitselect(x, y, y < x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_uint16 simd_min(simd_uint16 x, simd_uint16 y) {
+#if defined __AVX512F__
+ return _mm512_min_epu32(x, y);
+#else
+ return simd_bitselect(x, y, y < x);
+#endif
+}
+
+static inline SIMD_CFUNC float simd_min(float x, float y) {
+ return __tg_fmin(x,y);
+}
+
+static inline SIMD_CFUNC simd_float2 simd_min(simd_float2 x, simd_float2 y) {
+ return __tg_fmin(x,y);
+}
+
+static inline SIMD_CFUNC simd_float3 simd_min(simd_float3 x, simd_float3 y) {
+ return __tg_fmin(x,y);
+}
+
+static inline SIMD_CFUNC simd_float4 simd_min(simd_float4 x, simd_float4 y) {
+ return __tg_fmin(x,y);
+}
+
+static inline SIMD_CFUNC simd_float8 simd_min(simd_float8 x, simd_float8 y) {
+ return __tg_fmin(x,y);
+}
+
+static inline SIMD_CFUNC simd_float16 simd_min(simd_float16 x, simd_float16 y) {
+ return __tg_fmin(x,y);
+}
+
+static inline SIMD_CFUNC simd_long2 simd_min(simd_long2 x, simd_long2 y) {
+#if defined __AVX512VL__
+ return _mm_min_epi64(x, y);
+#else
+ return simd_bitselect(x, y, y < x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_long3 simd_min(simd_long3 x, simd_long3 y) {
+ return simd_make_long3(simd_min(simd_make_long4_undef(x), simd_make_long4_undef(y)));
+}
+
+static inline SIMD_CFUNC simd_long4 simd_min(simd_long4 x, simd_long4 y) {
+#if defined __AVX512VL__
+ return _mm256_min_epi64(x, y);
+#else
+ return simd_bitselect(x, y, y < x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_long8 simd_min(simd_long8 x, simd_long8 y) {
+#if defined __AVX512F__
+ return _mm512_min_epi64(x, y);
+#else
+ return simd_bitselect(x, y, y < x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_ulong2 simd_min(simd_ulong2 x, simd_ulong2 y) {
+#if defined __AVX512VL__
+ return _mm_min_epu64(x, y);
+#else
+ return simd_bitselect(x, y, y < x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_ulong3 simd_min(simd_ulong3 x, simd_ulong3 y) {
+ return simd_make_ulong3(simd_min(simd_make_ulong4_undef(x), simd_make_ulong4_undef(y)));
+}
+
+static inline SIMD_CFUNC simd_ulong4 simd_min(simd_ulong4 x, simd_ulong4 y) {
+#if defined __AVX512VL__
+ return _mm256_min_epu64(x, y);
+#else
+ return simd_bitselect(x, y, y < x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_ulong8 simd_min(simd_ulong8 x, simd_ulong8 y) {
+#if defined __AVX512F__
+ return _mm512_min_epu64(x, y);
+#else
+ return simd_bitselect(x, y, y < x);
+#endif
+}
+
+static inline SIMD_CFUNC double simd_min(double x, double y) {
+ return __tg_fmin(x,y);
+}
+
+static inline SIMD_CFUNC simd_double2 simd_min(simd_double2 x, simd_double2 y) {
+ return __tg_fmin(x,y);
+}
+
+static inline SIMD_CFUNC simd_double3 simd_min(simd_double3 x, simd_double3 y) {
+ return __tg_fmin(x,y);
+}
+
+static inline SIMD_CFUNC simd_double4 simd_min(simd_double4 x, simd_double4 y) {
+ return __tg_fmin(x,y);
+}
+
+static inline SIMD_CFUNC simd_double8 simd_min(simd_double8 x, simd_double8 y) {
+ return __tg_fmin(x,y);
+}
+
+static inline SIMD_CFUNC simd_char2 simd_max(simd_char2 x, simd_char2 y) {
+ return simd_make_char2(simd_max(simd_make_char8_undef(x), simd_make_char8_undef(y)));
+}
+
+static inline SIMD_CFUNC simd_char3 simd_max(simd_char3 x, simd_char3 y) {
+ return simd_make_char3(simd_max(simd_make_char8_undef(x), simd_make_char8_undef(y)));
+}
+
+static inline SIMD_CFUNC simd_char4 simd_max(simd_char4 x, simd_char4 y) {
+ return simd_make_char4(simd_max(simd_make_char8_undef(x), simd_make_char8_undef(y)));
+}
+
+static inline SIMD_CFUNC simd_char8 simd_max(simd_char8 x, simd_char8 y) {
+#if defined __arm__ || defined __arm64__
+ return vmax_s8(x, y);
+#else
+ return simd_make_char8(simd_max(simd_make_char16_undef(x), simd_make_char16_undef(y)));
+#endif
+}
+
+static inline SIMD_CFUNC simd_char16 simd_max(simd_char16 x, simd_char16 y) {
+#if defined __arm__ || defined __arm64__
+ return vmaxq_s8(x, y);
+#elif defined __SSE4_1__
+ return (simd_char16) _mm_max_epi8((__m128i)x, (__m128i)y);
+#else
+ return simd_bitselect(x, y, x < y);
+#endif
+}
+
+static inline SIMD_CFUNC simd_char32 simd_max(simd_char32 x, simd_char32 y) {
+#if defined __AVX2__
+ return _mm256_max_epi8(x, y);
+#else
+ return simd_bitselect(x, y, x < y);
+#endif
+}
+
+static inline SIMD_CFUNC simd_char64 simd_max(simd_char64 x, simd_char64 y) {
+#if defined __AVX512BW__
+ return _mm512_max_epi8(x, y);
+#else
+ return simd_bitselect(x, y, x < y);
+#endif
+}
+
+static inline SIMD_CFUNC simd_uchar2 simd_max(simd_uchar2 x, simd_uchar2 y) {
+ return simd_make_uchar2(simd_max(simd_make_uchar8_undef(x), simd_make_uchar8_undef(y)));
+}
+
+static inline SIMD_CFUNC simd_uchar3 simd_max(simd_uchar3 x, simd_uchar3 y) {
+ return simd_make_uchar3(simd_max(simd_make_uchar8_undef(x), simd_make_uchar8_undef(y)));
+}
+
+static inline SIMD_CFUNC simd_uchar4 simd_max(simd_uchar4 x, simd_uchar4 y) {
+ return simd_make_uchar4(simd_max(simd_make_uchar8_undef(x), simd_make_uchar8_undef(y)));
+}
+
+static inline SIMD_CFUNC simd_uchar8 simd_max(simd_uchar8 x, simd_uchar8 y) {
+#if defined __arm__ || defined __arm64__
+ return vmax_u8(x, y);
+#else
+ return simd_make_uchar8(simd_max(simd_make_uchar16_undef(x), simd_make_uchar16_undef(y)));
+#endif
+}
+
+static inline SIMD_CFUNC simd_uchar16 simd_max(simd_uchar16 x, simd_uchar16 y) {
+#if defined __arm__ || defined __arm64__
+ return vmaxq_u8(x, y);
+#elif defined __SSE4_1__
+ return (simd_uchar16) _mm_max_epu8((__m128i)x, (__m128i)y);
+#else
+ return simd_bitselect(x, y, x < y);
+#endif
+}
+
+static inline SIMD_CFUNC simd_uchar32 simd_max(simd_uchar32 x, simd_uchar32 y) {
+#if defined __AVX2__
+ return _mm256_max_epu8(x, y);
+#else
+ return simd_bitselect(x, y, x < y);
+#endif
+}
+
+static inline SIMD_CFUNC simd_uchar64 simd_max(simd_uchar64 x, simd_uchar64 y) {
+#if defined __AVX512BW__
+ return _mm512_max_epu8(x, y);
+#else
+ return simd_bitselect(x, y, x < y);
+#endif
+}
+
+static inline SIMD_CFUNC simd_short2 simd_max(simd_short2 x, simd_short2 y) {
+ return simd_make_short2(simd_max(simd_make_short4_undef(x), simd_make_short4_undef(y)));
+}
+
+static inline SIMD_CFUNC simd_short3 simd_max(simd_short3 x, simd_short3 y) {
+ return simd_make_short3(simd_max(simd_make_short4_undef(x), simd_make_short4_undef(y)));
+}
+
+static inline SIMD_CFUNC simd_short4 simd_max(simd_short4 x, simd_short4 y) {
+#if defined __arm__ || defined __arm64__
+ return vmax_s16(x, y);
+#else
+ return simd_make_short4(simd_max(simd_make_short8_undef(x), simd_make_short8_undef(y)));
+#endif
+}
+
+static inline SIMD_CFUNC simd_short8 simd_max(simd_short8 x, simd_short8 y) {
+#if defined __arm__ || defined __arm64__
+ return vmaxq_s16(x, y);
+#elif defined __SSE4_1__
+ return (simd_short8) _mm_max_epi16((__m128i)x, (__m128i)y);
+#else
+ return simd_bitselect(x, y, x < y);
+#endif
+}
+
+static inline SIMD_CFUNC simd_short16 simd_max(simd_short16 x, simd_short16 y) {
+#if defined __AVX2__
+ return _mm256_max_epi16(x, y);
+#else
+ return simd_bitselect(x, y, x < y);
+#endif
+}
+
+static inline SIMD_CFUNC simd_short32 simd_max(simd_short32 x, simd_short32 y) {
+#if defined __AVX512BW__
+ return _mm512_max_epi16(x, y);
+#else
+ return simd_bitselect(x, y, x < y);
+#endif
+}
+
+static inline SIMD_CFUNC simd_ushort2 simd_max(simd_ushort2 x, simd_ushort2 y) {
+ return simd_make_ushort2(simd_max(simd_make_ushort4_undef(x), simd_make_ushort4_undef(y)));
+}
+
+static inline SIMD_CFUNC simd_ushort3 simd_max(simd_ushort3 x, simd_ushort3 y) {
+ return simd_make_ushort3(simd_max(simd_make_ushort4_undef(x), simd_make_ushort4_undef(y)));
+}
+
+static inline SIMD_CFUNC simd_ushort4 simd_max(simd_ushort4 x, simd_ushort4 y) {
+#if defined __arm__ || defined __arm64__
+ return vmax_u16(x, y);
+#else
+ return simd_make_ushort4(simd_max(simd_make_ushort8_undef(x), simd_make_ushort8_undef(y)));
+#endif
+}
+
+static inline SIMD_CFUNC simd_ushort8 simd_max(simd_ushort8 x, simd_ushort8 y) {
+#if defined __arm__ || defined __arm64__
+ return vmaxq_u16(x, y);
+#elif defined __SSE4_1__
+ return (simd_ushort8) _mm_max_epu16((__m128i)x, (__m128i)y);
+#else
+ return simd_bitselect(x, y, x < y);
+#endif
+}
+
+static inline SIMD_CFUNC simd_ushort16 simd_max(simd_ushort16 x, simd_ushort16 y) {
+#if defined __AVX2__
+ return _mm256_max_epu16(x, y);
+#else
+ return simd_bitselect(x, y, x < y);
+#endif
+}
+
+static inline SIMD_CFUNC simd_ushort32 simd_max(simd_ushort32 x, simd_ushort32 y) {
+#if defined __AVX512BW__
+ return _mm512_max_epu16(x, y);
+#else
+ return simd_bitselect(x, y, x < y);
+#endif
+}
+
+static inline SIMD_CFUNC simd_int2 simd_max(simd_int2 x, simd_int2 y) {
+#if defined __arm__ || defined __arm64__
+ return vmax_s32(x, y);
+#else
+ return simd_make_int2(simd_max(simd_make_int4_undef(x), simd_make_int4_undef(y)));
+#endif
+}
+
+static inline SIMD_CFUNC simd_int3 simd_max(simd_int3 x, simd_int3 y) {
+ return simd_make_int3(simd_max(simd_make_int4_undef(x), simd_make_int4_undef(y)));
+}
+
+static inline SIMD_CFUNC simd_int4 simd_max(simd_int4 x, simd_int4 y) {
+#if defined __arm__ || defined __arm64__
+ return vmaxq_s32(x, y);
+#elif defined __SSE4_1__
+ return (simd_int4) _mm_max_epi32((__m128i)x, (__m128i)y);
+#else
+ return simd_bitselect(x, y, x < y);
+#endif
+}
+
+static inline SIMD_CFUNC simd_int8 simd_max(simd_int8 x, simd_int8 y) {
+#if defined __AVX2__
+ return _mm256_max_epi32(x, y);
+#else
+ return simd_bitselect(x, y, x < y);
+#endif
+}
+
+static inline SIMD_CFUNC simd_int16 simd_max(simd_int16 x, simd_int16 y) {
+#if defined __AVX512F__
+ return _mm512_max_epi32(x, y);
+#else
+ return simd_bitselect(x, y, x < y);
+#endif
+}
+
+static inline SIMD_CFUNC simd_uint2 simd_max(simd_uint2 x, simd_uint2 y) {
+#if defined __arm__ || defined __arm64__
+ return vmax_u32(x, y);
+#else
+ return simd_make_uint2(simd_max(simd_make_uint4_undef(x), simd_make_uint4_undef(y)));
+#endif
+}
+
+static inline SIMD_CFUNC simd_uint3 simd_max(simd_uint3 x, simd_uint3 y) {
+ return simd_make_uint3(simd_max(simd_make_uint4_undef(x), simd_make_uint4_undef(y)));
+}
+
+static inline SIMD_CFUNC simd_uint4 simd_max(simd_uint4 x, simd_uint4 y) {
+#if defined __arm__ || defined __arm64__
+ return vmaxq_u32(x, y);
+#elif defined __SSE4_1__
+ return (simd_uint4) _mm_max_epu32((__m128i)x, (__m128i)y);
+#else
+ return simd_bitselect(x, y, x < y);
+#endif
+}
+
+static inline SIMD_CFUNC simd_uint8 simd_max(simd_uint8 x, simd_uint8 y) {
+#if defined __AVX2__
+ return _mm256_max_epu32(x, y);
+#else
+ return simd_bitselect(x, y, x < y);
+#endif
+}
+
+static inline SIMD_CFUNC simd_uint16 simd_max(simd_uint16 x, simd_uint16 y) {
+#if defined __AVX512F__
+ return _mm512_max_epu32(x, y);
+#else
+ return simd_bitselect(x, y, x < y);
+#endif
+}
+
+static inline SIMD_CFUNC float simd_max(float x, float y) {
+ return __tg_fmax(x,y);
+}
+
+static inline SIMD_CFUNC simd_float2 simd_max(simd_float2 x, simd_float2 y) {
+ return __tg_fmax(x,y);
+}
+
+static inline SIMD_CFUNC simd_float3 simd_max(simd_float3 x, simd_float3 y) {
+ return __tg_fmax(x,y);
+}
+
+static inline SIMD_CFUNC simd_float4 simd_max(simd_float4 x, simd_float4 y) {
+ return __tg_fmax(x,y);
+}
+
+static inline SIMD_CFUNC simd_float8 simd_max(simd_float8 x, simd_float8 y) {
+ return __tg_fmax(x,y);
+}
+
+static inline SIMD_CFUNC simd_float16 simd_max(simd_float16 x, simd_float16 y) {
+ return __tg_fmax(x,y);
+}
+
+static inline SIMD_CFUNC simd_long2 simd_max(simd_long2 x, simd_long2 y) {
+#if defined __AVX512VL__
+ return _mm_max_epi64(x, y);
+#else
+ return simd_bitselect(x, y, x < y);
+#endif
+}
+
+static inline SIMD_CFUNC simd_long3 simd_max(simd_long3 x, simd_long3 y) {
+ return simd_make_long3(simd_max(simd_make_long4_undef(x), simd_make_long4_undef(y)));
+}
+
+static inline SIMD_CFUNC simd_long4 simd_max(simd_long4 x, simd_long4 y) {
+#if defined __AVX512VL__
+ return _mm256_max_epi64(x, y);
+#else
+ return simd_bitselect(x, y, x < y);
+#endif
+}
+
+static inline SIMD_CFUNC simd_long8 simd_max(simd_long8 x, simd_long8 y) {
+#if defined __AVX512F__
+ return _mm512_max_epi64(x, y);
+#else
+ return simd_bitselect(x, y, x < y);
+#endif
+}
+
+static inline SIMD_CFUNC simd_ulong2 simd_max(simd_ulong2 x, simd_ulong2 y) {
+#if defined __AVX512VL__
+ return _mm_max_epu64(x, y);
+#else
+ return simd_bitselect(x, y, x < y);
+#endif
+}
+
+static inline SIMD_CFUNC simd_ulong3 simd_max(simd_ulong3 x, simd_ulong3 y) {
+ return simd_make_ulong3(simd_max(simd_make_ulong4_undef(x), simd_make_ulong4_undef(y)));
+}
+
+static inline SIMD_CFUNC simd_ulong4 simd_max(simd_ulong4 x, simd_ulong4 y) {
+#if defined __AVX512VL__
+ return _mm256_max_epu64(x, y);
+#else
+ return simd_bitselect(x, y, x < y);
+#endif
+}
+
+static inline SIMD_CFUNC simd_ulong8 simd_max(simd_ulong8 x, simd_ulong8 y) {
+#if defined __AVX512F__
+ return _mm512_max_epu64(x, y);
+#else
+ return simd_bitselect(x, y, x < y);
+#endif
+}
+
+static inline SIMD_CFUNC double simd_max(double x, double y) {
+ return __tg_fmax(x,y);
+}
+
+static inline SIMD_CFUNC simd_double2 simd_max(simd_double2 x, simd_double2 y) {
+ return __tg_fmax(x,y);
+}
+
+static inline SIMD_CFUNC simd_double3 simd_max(simd_double3 x, simd_double3 y) {
+ return __tg_fmax(x,y);
+}
+
+static inline SIMD_CFUNC simd_double4 simd_max(simd_double4 x, simd_double4 y) {
+ return __tg_fmax(x,y);
+}
+
+static inline SIMD_CFUNC simd_double8 simd_max(simd_double8 x, simd_double8 y) {
+ return __tg_fmax(x,y);
+}
+
+static inline SIMD_CFUNC simd_char2 simd_clamp(simd_char2 x, simd_char2 min, simd_char2 max) {
+ return simd_min(simd_max(x, min), max);
+}
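+
+/* Because simd_max is applied first and simd_min last, a caller who passes
+ * min > max gets max back; the result is only meaningful when min <= max,
+ * matching the [min, max] range documented for simd_clamp. For example
+ * (illustrative only):
+ *
+ *   simd_clamp(simd_make_char2(-5, 9),
+ *              simd_make_char2( 0, 0),
+ *              simd_make_char2( 7, 7))   // yields {0, 7}
+ */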
+
+static inline SIMD_CFUNC simd_char3 simd_clamp(simd_char3 x, simd_char3 min, simd_char3 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_char4 simd_clamp(simd_char4 x, simd_char4 min, simd_char4 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_char8 simd_clamp(simd_char8 x, simd_char8 min, simd_char8 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_char16 simd_clamp(simd_char16 x, simd_char16 min, simd_char16 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_char32 simd_clamp(simd_char32 x, simd_char32 min, simd_char32 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_char64 simd_clamp(simd_char64 x, simd_char64 min, simd_char64 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_uchar2 simd_clamp(simd_uchar2 x, simd_uchar2 min, simd_uchar2 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_uchar3 simd_clamp(simd_uchar3 x, simd_uchar3 min, simd_uchar3 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_uchar4 simd_clamp(simd_uchar4 x, simd_uchar4 min, simd_uchar4 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_uchar8 simd_clamp(simd_uchar8 x, simd_uchar8 min, simd_uchar8 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_uchar16 simd_clamp(simd_uchar16 x, simd_uchar16 min, simd_uchar16 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_uchar32 simd_clamp(simd_uchar32 x, simd_uchar32 min, simd_uchar32 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_uchar64 simd_clamp(simd_uchar64 x, simd_uchar64 min, simd_uchar64 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_short2 simd_clamp(simd_short2 x, simd_short2 min, simd_short2 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_short3 simd_clamp(simd_short3 x, simd_short3 min, simd_short3 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_short4 simd_clamp(simd_short4 x, simd_short4 min, simd_short4 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_short8 simd_clamp(simd_short8 x, simd_short8 min, simd_short8 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_short16 simd_clamp(simd_short16 x, simd_short16 min, simd_short16 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_short32 simd_clamp(simd_short32 x, simd_short32 min, simd_short32 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_ushort2 simd_clamp(simd_ushort2 x, simd_ushort2 min, simd_ushort2 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_ushort3 simd_clamp(simd_ushort3 x, simd_ushort3 min, simd_ushort3 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_ushort4 simd_clamp(simd_ushort4 x, simd_ushort4 min, simd_ushort4 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_ushort8 simd_clamp(simd_ushort8 x, simd_ushort8 min, simd_ushort8 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_ushort16 simd_clamp(simd_ushort16 x, simd_ushort16 min, simd_ushort16 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_ushort32 simd_clamp(simd_ushort32 x, simd_ushort32 min, simd_ushort32 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_int2 simd_clamp(simd_int2 x, simd_int2 min, simd_int2 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_int3 simd_clamp(simd_int3 x, simd_int3 min, simd_int3 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_int4 simd_clamp(simd_int4 x, simd_int4 min, simd_int4 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_int8 simd_clamp(simd_int8 x, simd_int8 min, simd_int8 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_int16 simd_clamp(simd_int16 x, simd_int16 min, simd_int16 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_uint2 simd_clamp(simd_uint2 x, simd_uint2 min, simd_uint2 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_uint3 simd_clamp(simd_uint3 x, simd_uint3 min, simd_uint3 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_uint4 simd_clamp(simd_uint4 x, simd_uint4 min, simd_uint4 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_uint8 simd_clamp(simd_uint8 x, simd_uint8 min, simd_uint8 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_uint16 simd_clamp(simd_uint16 x, simd_uint16 min, simd_uint16 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC float simd_clamp(float x, float min, float max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_float2 simd_clamp(simd_float2 x, simd_float2 min, simd_float2 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_float3 simd_clamp(simd_float3 x, simd_float3 min, simd_float3 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_float4 simd_clamp(simd_float4 x, simd_float4 min, simd_float4 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_float8 simd_clamp(simd_float8 x, simd_float8 min, simd_float8 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_float16 simd_clamp(simd_float16 x, simd_float16 min, simd_float16 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_long2 simd_clamp(simd_long2 x, simd_long2 min, simd_long2 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_long3 simd_clamp(simd_long3 x, simd_long3 min, simd_long3 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_long4 simd_clamp(simd_long4 x, simd_long4 min, simd_long4 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_long8 simd_clamp(simd_long8 x, simd_long8 min, simd_long8 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_ulong2 simd_clamp(simd_ulong2 x, simd_ulong2 min, simd_ulong2 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_ulong3 simd_clamp(simd_ulong3 x, simd_ulong3 min, simd_ulong3 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_ulong4 simd_clamp(simd_ulong4 x, simd_ulong4 min, simd_ulong4 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_ulong8 simd_clamp(simd_ulong8 x, simd_ulong8 min, simd_ulong8 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC double simd_clamp(double x, double min, double max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_double2 simd_clamp(simd_double2 x, simd_double2 min, simd_double2 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_double3 simd_clamp(simd_double3 x, simd_double3 min, simd_double3 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_double4 simd_clamp(simd_double4 x, simd_double4 min, simd_double4 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC simd_double8 simd_clamp(simd_double8 x, simd_double8 min, simd_double8 max) {
+ return simd_min(simd_max(x, min), max);
+}
+
+static inline SIMD_CFUNC float simd_sign(float x) {
+ return (x == 0 | x != x) ? 0 : copysign(1,x);
+}
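+
+/* The test x != x is true only for NaN, so the scalar form above returns
+ * 0 for both zero and NaN and otherwise copies the sign of x onto 1. For
+ * example, simd_sign(-3.0f) == -1.0f, simd_sign(0.0f) == 0.0f, and
+ * simd_sign(NAN) == 0.0f. The vector forms below compute the same thing
+ * lanewise with simd_bitselect.
+ */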
+
+static inline SIMD_CFUNC simd_float2 simd_sign(simd_float2 x) {
+ return simd_bitselect(__tg_copysign(1,x), 0, x == 0 | x != x);
+}
+
+static inline SIMD_CFUNC simd_float3 simd_sign(simd_float3 x) {
+ return simd_bitselect(__tg_copysign(1,x), 0, x == 0 | x != x);
+}
+
+static inline SIMD_CFUNC simd_float4 simd_sign(simd_float4 x) {
+ return simd_bitselect(__tg_copysign(1,x), 0, x == 0 | x != x);
+}
+
+static inline SIMD_CFUNC simd_float8 simd_sign(simd_float8 x) {
+ return simd_bitselect(__tg_copysign(1,x), 0, x == 0 | x != x);
+}
+
+static inline SIMD_CFUNC simd_float16 simd_sign(simd_float16 x) {
+ return simd_bitselect(__tg_copysign(1,x), 0, x == 0 | x != x);
+}
+
+static inline SIMD_CFUNC double simd_sign(double x) {
+ return (x == 0 | x != x) ? 0 : copysign(1,x);
+}
+
+static inline SIMD_CFUNC simd_double2 simd_sign(simd_double2 x) {
+ return simd_bitselect(__tg_copysign(1,x), 0, x == 0 | x != x);
+}
+
+static inline SIMD_CFUNC simd_double3 simd_sign(simd_double3 x) {
+ return simd_bitselect(__tg_copysign(1,x), 0, x == 0 | x != x);
+}
+
+static inline SIMD_CFUNC simd_double4 simd_sign(simd_double4 x) {
+ return simd_bitselect(__tg_copysign(1,x), 0, x == 0 | x != x);
+}
+
+static inline SIMD_CFUNC simd_double8 simd_sign(simd_double8 x) {
+ return simd_bitselect(__tg_copysign(1,x), 0, x == 0 | x != x);
+}
+
+static inline SIMD_CFUNC float simd_mix(float x, float y, float t) {
+ return x + t*(y - x);
+}
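+
+/* This is the standard lerp x + t*(y - x): exact at t == 0, but, because
+ * of rounding in (y - x), not guaranteed to return y exactly at t == 1.
+ * As documented above, t outside [0, 1] gives an undefined result. For
+ * example (illustrative only):
+ *
+ *   simd_mix(2.0f, 4.0f, 0.25f)   // 2 + 0.25*(4 - 2) == 2.5f
+ */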
+
+static inline SIMD_CFUNC simd_float2 simd_mix(simd_float2 x, simd_float2 y, simd_float2 t) {
+ return x + t*(y - x);
+}
+
+static inline SIMD_CFUNC simd_float3 simd_mix(simd_float3 x, simd_float3 y, simd_float3 t) {
+ return x + t*(y - x);
+}
+
+static inline SIMD_CFUNC simd_float4 simd_mix(simd_float4 x, simd_float4 y, simd_float4 t) {
+ return x + t*(y - x);
+}
+
+static inline SIMD_CFUNC simd_float8 simd_mix(simd_float8 x, simd_float8 y, simd_float8 t) {
+ return x + t*(y - x);
+}
+
+static inline SIMD_CFUNC simd_float16 simd_mix(simd_float16 x, simd_float16 y, simd_float16 t) {
+ return x + t*(y - x);
+}
+
+static inline SIMD_CFUNC double simd_mix(double x, double y, double t) {
+ return x + t*(y - x);
+}
+
+static inline SIMD_CFUNC simd_double2 simd_mix(simd_double2 x, simd_double2 y, simd_double2 t) {
+ return x + t*(y - x);
+}
+
+static inline SIMD_CFUNC simd_double3 simd_mix(simd_double3 x, simd_double3 y, simd_double3 t) {
+ return x + t*(y - x);
+}
+
+static inline SIMD_CFUNC simd_double4 simd_mix(simd_double4 x, simd_double4 y, simd_double4 t) {
+ return x + t*(y - x);
+}
+
+static inline SIMD_CFUNC simd_double8 simd_mix(simd_double8 x, simd_double8 y, simd_double8 t) {
+ return x + t*(y - x);
+}
+
+static inline SIMD_CFUNC float simd_recip(float x) {
+#if __FAST_MATH__
+ return simd_fast_recip(x);
+#else
+ return simd_precise_recip(x);
+#endif
+}
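+
+/* Under -ffast-math the translation unit defines __FAST_MATH__, so
+ * simd_recip lowers to the fast variant; the precise variant remains
+ * callable by name, as the header documents. A sketch of opting back in
+ * (illustrative only):
+ *
+ *   float r = simd_precise_recip(x);   // a few ULPs even under -ffast-math
+ */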
+
+static inline SIMD_CFUNC simd_float2 simd_recip(simd_float2 x) {
+#if __FAST_MATH__
+ return simd_fast_recip(x);
+#else
+ return simd_precise_recip(x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_float3 simd_recip(simd_float3 x) {
+#if __FAST_MATH__
+ return simd_fast_recip(x);
+#else
+ return simd_precise_recip(x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_float4 simd_recip(simd_float4 x) {
+#if __FAST_MATH__
+ return simd_fast_recip(x);
+#else
+ return simd_precise_recip(x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_float8 simd_recip(simd_float8 x) {
+#if __FAST_MATH__
+ return simd_fast_recip(x);
+#else
+ return simd_precise_recip(x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_float16 simd_recip(simd_float16 x) {
+#if __FAST_MATH__
+ return simd_fast_recip(x);
+#else
+ return simd_precise_recip(x);
+#endif
+}
+
+static inline SIMD_CFUNC double simd_recip(double x) {
+#if __FAST_MATH__
+ return simd_fast_recip(x);
+#else
+ return simd_precise_recip(x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_double2 simd_recip(simd_double2 x) {
+#if __FAST_MATH__
+ return simd_fast_recip(x);
+#else
+ return simd_precise_recip(x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_double3 simd_recip(simd_double3 x) {
+#if __FAST_MATH__
+ return simd_fast_recip(x);
+#else
+ return simd_precise_recip(x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_double4 simd_recip(simd_double4 x) {
+#if __FAST_MATH__
+ return simd_fast_recip(x);
+#else
+ return simd_precise_recip(x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_double8 simd_recip(simd_double8 x) {
+#if __FAST_MATH__
+ return simd_fast_recip(x);
+#else
+ return simd_precise_recip(x);
+#endif
+}
+
+static inline SIMD_CFUNC float simd_fast_recip(float x) {
+#if defined __AVX512VL__
+ simd_float4 x4 = simd_make_float4(x);
+ return ((simd_float4)_mm_rcp14_ss(x4, x4)).x;
+#elif defined __SSE__
+ return ((simd_float4)_mm_rcp_ss(simd_make_float4(x))).x;
+#elif defined __ARM_NEON__
+ return simd_fast_recip(simd_make_float2_undef(x)).x;
+#else
+ return simd_precise_recip(x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_float2 simd_fast_recip(simd_float2 x) {
+#if defined __SSE__
+ return simd_make_float2(simd_fast_recip(simd_make_float4_undef(x)));
+#elif defined __ARM_NEON__
+ simd_float2 r = vrecpe_f32(x);
+ return r * vrecps_f32(x, r);
+#else
+ return simd_precise_recip(x);
+#endif
+}
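+
+/* On NEON, vrecpe_f32 returns a low-precision estimate r of 1/x, and
+ * vrecps_f32(x, r) computes the Newton-Raphson factor (2 - x*r), so
+ * r * vrecps_f32(x, r) performs one refinement step. Each step roughly
+ * doubles the bits of accuracy, consistent with the ~11-bit "fast"
+ * guarantee in the header comment.
+ */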
+
+static inline SIMD_CFUNC simd_float3 simd_fast_recip(simd_float3 x) {
+ return simd_make_float3(simd_fast_recip(simd_make_float4_undef(x)));
+}
+
+static inline SIMD_CFUNC simd_float4 simd_fast_recip(simd_float4 x) {
+#if defined __AVX512VL__
+ return _mm_rcp14_ps(x);
+#elif defined __SSE__
+ return _mm_rcp_ps(x);
+#elif defined __ARM_NEON__
+ simd_float4 r = vrecpeq_f32(x);
+ return r * vrecpsq_f32(x, r);
+#else
+ return simd_precise_recip(x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_float8 simd_fast_recip(simd_float8 x) {
+#if defined __AVX512VL__
+ return _mm256_rcp14_ps(x);
+#elif defined __AVX__
+ return _mm256_rcp_ps(x);
+#else
+ return simd_make_float8(simd_fast_recip(x.lo), simd_fast_recip(x.hi));
+#endif
+}
+
+static inline SIMD_CFUNC simd_float16 simd_fast_recip(simd_float16 x) {
+#if defined __AVX512F__
+ return _mm512_rcp14_ps(x);
+#else
+ return simd_make_float16(simd_fast_recip(x.lo), simd_fast_recip(x.hi));
+#endif
+}
+
+static inline SIMD_CFUNC double simd_fast_recip(double x) {
+ return simd_precise_recip(x);
+}
+
+static inline SIMD_CFUNC simd_double2 simd_fast_recip(simd_double2 x) {
+ return simd_precise_recip(x);
+}
+
+static inline SIMD_CFUNC simd_double3 simd_fast_recip(simd_double3 x) {
+ return simd_precise_recip(x);
+}
+
+static inline SIMD_CFUNC simd_double4 simd_fast_recip(simd_double4 x) {
+ return simd_precise_recip(x);
+}
+
+static inline SIMD_CFUNC simd_double8 simd_fast_recip(simd_double8 x) {
+ return simd_precise_recip(x);
+}
+
+static inline SIMD_CFUNC float simd_precise_recip(float x) {
+#if defined __SSE__
+ float r = simd_fast_recip(x);
+ return r*(2 - (x == 0 ? -INFINITY : x)*r);
+#elif defined __ARM_NEON__
+ return simd_precise_recip(simd_make_float2_undef(x)).x;
+#else
+ return 1/x;
+#endif
+}
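+
+/* One Newton-Raphson step for the reciprocal: with r ~ 1/x, the refined
+ * value is r*(2 - x*r), which roughly doubles the number of correct bits.
+ * Substituting -INFINITY when x == 0 avoids the 0*INFINITY == NaN product
+ * the raw iteration would produce there, so the result for x == 0 stays
+ * +INFINITY:
+ *
+ *   r == INFINITY, x == 0:  r*(2 - (-INFINITY)*r) == INFINITY   (not NaN)
+ */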
+
+static inline SIMD_CFUNC simd_float2 simd_precise_recip(simd_float2 x) {
+#if defined __SSE__
+ return simd_make_float2(simd_precise_recip(simd_make_float4_undef(x)));
+#elif defined __ARM_NEON__
+ simd_float2 r = simd_fast_recip(x);
+ return r*vrecps_f32(x, r);
+#else
+ return 1/x;
+#endif
+}
+
+static inline SIMD_CFUNC simd_float3 simd_precise_recip(simd_float3 x) {
+ return simd_make_float3(simd_precise_recip(simd_make_float4_undef(x)));
+}
+
+static inline SIMD_CFUNC simd_float4 simd_precise_recip(simd_float4 x) {
+#if defined __SSE__
+ simd_float4 r = simd_fast_recip(x);
+ return r*(2 - simd_bitselect(x, -INFINITY, x == 0)*r);
+#elif defined __ARM_NEON__
+ simd_float4 r = simd_fast_recip(x);
+ return r*vrecpsq_f32(x, r);
+#else
+ return 1/x;
+#endif
+}
+
+static inline SIMD_CFUNC simd_float8 simd_precise_recip(simd_float8 x) {
+#if defined __AVX__
+ simd_float8 r = simd_fast_recip(x);
+ return r*(2 - simd_bitselect(x, -INFINITY, x == 0)*r);
+#else
+ return simd_make_float8(simd_precise_recip(x.lo), simd_precise_recip(x.hi));
+#endif
+}
+
+static inline SIMD_CFUNC simd_float16 simd_precise_recip(simd_float16 x) {
+#if defined __AVX512F__
+ simd_float16 r = simd_fast_recip(x);
+ return r*(2 - simd_bitselect(x, -INFINITY, x == 0)*r);
+#else
+ return simd_make_float16(simd_precise_recip(x.lo), simd_precise_recip(x.hi));
+#endif
+}
+
+static inline SIMD_CFUNC double simd_precise_recip(double x) {
+ return 1/x;
+}
+
+static inline SIMD_CFUNC simd_double2 simd_precise_recip(simd_double2 x) {
+ return 1/x;
+}
+
+static inline SIMD_CFUNC simd_double3 simd_precise_recip(simd_double3 x) {
+ return 1/x;
+}
+
+static inline SIMD_CFUNC simd_double4 simd_precise_recip(simd_double4 x) {
+ return 1/x;
+}
+
+static inline SIMD_CFUNC simd_double8 simd_precise_recip(simd_double8 x) {
+ return 1/x;
+}
+
+static inline SIMD_CFUNC float simd_rsqrt(float x) {
+#if __FAST_MATH__
+ return simd_fast_rsqrt(x);
+#else
+ return simd_precise_rsqrt(x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_float2 simd_rsqrt(simd_float2 x) {
+#if __FAST_MATH__
+ return simd_fast_rsqrt(x);
+#else
+ return simd_precise_rsqrt(x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_float3 simd_rsqrt(simd_float3 x) {
+#if __FAST_MATH__
+ return simd_fast_rsqrt(x);
+#else
+ return simd_precise_rsqrt(x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_float4 simd_rsqrt(simd_float4 x) {
+#if __FAST_MATH__
+ return simd_fast_rsqrt(x);
+#else
+ return simd_precise_rsqrt(x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_float8 simd_rsqrt(simd_float8 x) {
+#if __FAST_MATH__
+ return simd_fast_rsqrt(x);
+#else
+ return simd_precise_rsqrt(x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_float16 simd_rsqrt(simd_float16 x) {
+#if __FAST_MATH__
+ return simd_fast_rsqrt(x);
+#else
+ return simd_precise_rsqrt(x);
+#endif
+}
+
+static inline SIMD_CFUNC double simd_rsqrt(double x) {
+#if __FAST_MATH__
+ return simd_fast_rsqrt(x);
+#else
+ return simd_precise_rsqrt(x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_double2 simd_rsqrt(simd_double2 x) {
+#if __FAST_MATH__
+ return simd_fast_rsqrt(x);
+#else
+ return simd_precise_rsqrt(x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_double3 simd_rsqrt(simd_double3 x) {
+#if __FAST_MATH__
+ return simd_fast_rsqrt(x);
+#else
+ return simd_precise_rsqrt(x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_double4 simd_rsqrt(simd_double4 x) {
+#if __FAST_MATH__
+ return simd_fast_rsqrt(x);
+#else
+ return simd_precise_rsqrt(x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_double8 simd_rsqrt(simd_double8 x) {
+#if __FAST_MATH__
+ return simd_fast_rsqrt(x);
+#else
+ return simd_precise_rsqrt(x);
+#endif
+}
+
+static inline SIMD_CFUNC float simd_fast_rsqrt(float x) {
+#if defined __AVX512VL__
+ simd_float4 x4 = simd_make_float4(x);
+ return ((simd_float4)_mm_rsqrt14_ss(x4, x4)).x;
+#elif defined __SSE__
+ return ((simd_float4)_mm_rsqrt_ss(simd_make_float4(x))).x;
+#elif defined __ARM_NEON__
+ return simd_fast_rsqrt(simd_make_float2_undef(x)).x;
+#else
+ return simd_precise_rsqrt(x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_float2 simd_fast_rsqrt(simd_float2 x) {
+#if defined __SSE__
+ return simd_make_float2(simd_fast_rsqrt(simd_make_float4_undef(x)));
+#elif defined __ARM_NEON__
+ simd_float2 r = vrsqrte_f32(x);
+ return r * vrsqrts_f32(x, r*r);
+#else
+ return simd_precise_rsqrt(x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_float3 simd_fast_rsqrt(simd_float3 x) {
+ return simd_make_float3(simd_fast_rsqrt(simd_make_float4_undef(x)));
+}
+
+static inline SIMD_CFUNC simd_float4 simd_fast_rsqrt(simd_float4 x) {
+#if defined __AVX512VL__
+ return _mm_rsqrt14_ps(x);
+#elif defined __SSE__
+ return _mm_rsqrt_ps(x);
+#elif defined __ARM_NEON__
+ simd_float4 r = vrsqrteq_f32(x);
+ return r * vrsqrtsq_f32(x, r*r);
+#else
+ return simd_precise_rsqrt(x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_float8 simd_fast_rsqrt(simd_float8 x) {
+#if defined __AVX512VL__
+ return _mm256_rsqrt14_ps(x);
+#elif defined __AVX__
+ return _mm256_rsqrt_ps(x);
+#else
+ return simd_make_float8(simd_fast_rsqrt(x.lo), simd_fast_rsqrt(x.hi));
+#endif
+}
+
+static inline SIMD_CFUNC simd_float16 simd_fast_rsqrt(simd_float16 x) {
+#if defined __AVX512F__
+ return _mm512_rsqrt14_ps(x);
+#else
+ return simd_make_float16(simd_fast_rsqrt(x.lo), simd_fast_rsqrt(x.hi));
+#endif
+}
+
+static inline SIMD_CFUNC double simd_fast_rsqrt(double x) {
+ return simd_precise_rsqrt(x);
+}
+
+static inline SIMD_CFUNC simd_double2 simd_fast_rsqrt(simd_double2 x) {
+ return simd_precise_rsqrt(x);
+}
+
+static inline SIMD_CFUNC simd_double3 simd_fast_rsqrt(simd_double3 x) {
+ return simd_precise_rsqrt(x);
+}
+
+static inline SIMD_CFUNC simd_double4 simd_fast_rsqrt(simd_double4 x) {
+ return simd_precise_rsqrt(x);
+}
+
+static inline SIMD_CFUNC simd_double8 simd_fast_rsqrt(simd_double8 x) {
+ return simd_precise_rsqrt(x);
+}
+
+static inline SIMD_CFUNC float simd_precise_rsqrt(float x) {
+#if defined __SSE__
+ float r = simd_fast_rsqrt(x);
+ return r*(1.5f - 0.5f*(r == INFINITY ? -INFINITY : x)*r*r);
+#elif defined __ARM_NEON__
+ return simd_precise_rsqrt(simd_make_float2_undef(x)).x;
+#else
+ return 1/sqrt(x);
+#endif
+}
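+
+/* One Newton-Raphson step for 1/sqrt(x): with r ~ x^(-1/2), the refined
+ * value is r*(1.5 - 0.5*x*r*r). The r == INFINITY guard serves the same
+ * purpose as in simd_precise_recip: when x == 0 the estimate is infinite,
+ * and substituting -INFINITY for x turns the would-be 0*INFINITY == NaN
+ * into a product that keeps the result +INFINITY.
+ */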
+
+static inline SIMD_CFUNC simd_float2 simd_precise_rsqrt(simd_float2 x) {
+#if defined __SSE__
+ return simd_make_float2(simd_precise_rsqrt(simd_make_float4_undef(x)));
+#elif defined __ARM_NEON__
+ simd_float2 r = simd_fast_rsqrt(x);
+ return r*vrsqrts_f32(x, r*r);
+#else
+ return 1/__tg_sqrt(x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_float3 simd_precise_rsqrt(simd_float3 x) {
+ return simd_make_float3(simd_precise_rsqrt(simd_make_float4_undef(x)));
+}
+
+static inline SIMD_CFUNC simd_float4 simd_precise_rsqrt(simd_float4 x) {
+#if defined __SSE__
+ simd_float4 r = simd_fast_rsqrt(x);
+ return r*(1.5 - 0.5*simd_bitselect(x, -INFINITY, r == INFINITY)*r*r);
+#elif defined __ARM_NEON__
+ simd_float4 r = simd_fast_rsqrt(x);
+ return r*vrsqrtsq_f32(x, r*r);
+#else
+ return 1/__tg_sqrt(x);
+#endif
+}
+
+static inline SIMD_CFUNC simd_float8 simd_precise_rsqrt(simd_float8 x) {
+#if defined __AVX__
+ simd_float8 r = simd_fast_rsqrt(x);
+ return r*(1.5 - 0.5*simd_bitselect(x, -INFINITY, r == INFINITY)*r*r);
+#else
+ return simd_make_float8(simd_precise_rsqrt(x.lo), simd_precise_rsqrt(x.hi));
+#endif
+}
+
+static inline SIMD_CFUNC simd_float16 simd_precise_rsqrt(simd_float16 x) {
+#if defined __AVX512F__
+ simd_float16 r = simd_fast_rsqrt(x);
+ return r*(1.5 - 0.5*simd_bitselect(x, -INFINITY, r == INFINITY)*r*r);
+#else
+ return simd_make_float16(simd_precise_rsqrt(x.lo), simd_precise_rsqrt(x.hi));
+#endif
+}
+
+static inline SIMD_CFUNC double simd_precise_rsqrt(double x) {
+ return 1/sqrt(x);
+}
+
+static inline SIMD_CFUNC simd_double2 simd_precise_rsqrt(simd_double2 x) {
+ return 1/__tg_sqrt(x);
+}
+
+static inline SIMD_CFUNC simd_double3 simd_precise_rsqrt(simd_double3 x) {
+ return 1/__tg_sqrt(x);
+}
+
+static inline SIMD_CFUNC simd_double4 simd_precise_rsqrt(simd_double4 x) {
+ return 1/__tg_sqrt(x);
+}
+
+static inline SIMD_CFUNC simd_double8 simd_precise_rsqrt(simd_double8 x) {
+ return 1/__tg_sqrt(x);
+}
+
+static inline SIMD_CFUNC float simd_fract(float x) {
+ return fmin(x - floor(x), 0x1.fffffep-1f);
+}
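+
+/* x - floor(x) is mathematically in [0, 1), but in float arithmetic it can
+ * round up to exactly 1 for tiny negative x (e.g. x == -1e-9f, where
+ * floor(x) == -1 and x - (-1) rounds to 1.0f). Clamping with fmin against
+ * 0x1.fffffep-1f, the largest float below 1, keeps the result strictly
+ * inside [0, 1). For example, simd_fract(-0.25f) == 0.75f.
+ */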
+
+static inline SIMD_CFUNC simd_float2 simd_fract(simd_float2 x) {
+ return __tg_fmin(x - __tg_floor(x), 0x1.fffffep-1f);
+}
+
+static inline SIMD_CFUNC simd_float3 simd_fract(simd_float3 x) {
+ return __tg_fmin(x - __tg_floor(x), 0x1.fffffep-1f);
+}
+
+static inline SIMD_CFUNC simd_float4 simd_fract(simd_float4 x) {
+ return __tg_fmin(x - __tg_floor(x), 0x1.fffffep-1f);
+}
+
+static inline SIMD_CFUNC simd_float8 simd_fract(simd_float8 x) {
+ return __tg_fmin(x - __tg_floor(x), 0x1.fffffep-1f);
+}
+
+static inline SIMD_CFUNC simd_float16 simd_fract(simd_float16 x) {
+ return __tg_fmin(x - __tg_floor(x), 0x1.fffffep-1f);
+}
+
+static inline SIMD_CFUNC double simd_fract(double x) {
+ return fmin(x - floor(x), 0x1.fffffffffffffp-1);
+}
+
+static inline SIMD_CFUNC simd_double2 simd_fract(simd_double2 x) {
+ return __tg_fmin(x - __tg_floor(x), 0x1.fffffffffffffp-1);
+}
+
+static inline SIMD_CFUNC simd_double3 simd_fract(simd_double3 x) {
+ return __tg_fmin(x - __tg_floor(x), 0x1.fffffffffffffp-1);
+}
+
+static inline SIMD_CFUNC simd_double4 simd_fract(simd_double4 x) {
+ return __tg_fmin(x - __tg_floor(x), 0x1.fffffffffffffp-1);
+}
+
+static inline SIMD_CFUNC simd_double8 simd_fract(simd_double8 x) {
+ return __tg_fmin(x - __tg_floor(x), 0x1.fffffffffffffp-1);
+}
+
+static inline SIMD_CFUNC float simd_step(float edge, float x) {
+ return !(x < edge);
+}
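+
+/* Writing the scalar step as !(x < edge) rather than (x >= edge) makes the
+ * NaN behavior explicit: any comparison involving NaN is false, so a NaN
+ * input yields 1. For example, simd_step(1.0f, NAN) == 1.0f and
+ * simd_step(2.0f, 1.0f) == 0.0f. The vector forms below make the same
+ * choice by selecting 0 only where x < edge is true.
+ */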
+
+static inline SIMD_CFUNC simd_float2 simd_step(simd_float2 edge, simd_float2 x) {
+ return simd_bitselect((simd_float2)1, 0, x < edge);
+}
+
+static inline SIMD_CFUNC simd_float3 simd_step(simd_float3 edge, simd_float3 x) {
+ return simd_bitselect((simd_float3)1, 0, x < edge);
+}
+
+static inline SIMD_CFUNC simd_float4 simd_step(simd_float4 edge, simd_float4 x) {
+ return simd_bitselect((simd_float4)1, 0, x < edge);
+}
+
+static inline SIMD_CFUNC simd_float8 simd_step(simd_float8 edge, simd_float8 x) {
+ return simd_bitselect((simd_float8)1, 0, x < edge);
+}
+
+static inline SIMD_CFUNC simd_float16 simd_step(simd_float16 edge, simd_float16 x) {
+ return simd_bitselect((simd_float16)1, 0, x < edge);
+}
+
+static inline SIMD_CFUNC double simd_step(double edge, double x) {
+ return !(x < edge);
+}
+
+static inline SIMD_CFUNC simd_double2 simd_step(simd_double2 edge, simd_double2 x) {
+ return simd_bitselect((simd_double2)1, 0, x < edge);
+}
+
+static inline SIMD_CFUNC simd_double3 simd_step(simd_double3 edge, simd_double3 x) {
+ return simd_bitselect((simd_double3)1, 0, x < edge);
+}
+
+static inline SIMD_CFUNC simd_double4 simd_step(simd_double4 edge, simd_double4 x) {
+ return simd_bitselect((simd_double4)1, 0, x < edge);
+}
+
+static inline SIMD_CFUNC simd_double8 simd_step(simd_double8 edge, simd_double8 x) {
+ return simd_bitselect((simd_double8)1, 0, x < edge);
+}
+
+static inline SIMD_CFUNC float simd_smoothstep(float edge0, float edge1, float x) {
+ float t = simd_clamp((x - edge0)/(edge1 - edge0), 0, 1);
+ return t*t*(3 - 2*t);
+}
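+
+/* The polynomial t*t*(3 - 2*t) == 3t^2 - 2t^3 is the cubic Hermite
+ * interpolant with value 0 and slope 0 at t == 0, and value 1 and slope 0
+ * at t == 1; clamping t first extends it flat outside [edge0, edge1].
+ * For example, simd_smoothstep(0.0f, 1.0f, 0.5f) == 0.5f and
+ * simd_smoothstep(0.0f, 1.0f, 0.25f) == 0.15625f.
+ */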
+
+static inline SIMD_CFUNC simd_float2 simd_smoothstep(simd_float2 edge0, simd_float2 edge1, simd_float2 x) {
+ simd_float2 t = simd_clamp((x - edge0)/(edge1 - edge0), 0, 1);
+ return t*t*(3 - 2*t);
+}
+
+static inline SIMD_CFUNC simd_float3 simd_smoothstep(simd_float3 edge0, simd_float3 edge1, simd_float3 x) {
+ simd_float3 t = simd_clamp((x - edge0)/(edge1 - edge0), 0, 1);
+ return t*t*(3 - 2*t);
+}
+
+static inline SIMD_CFUNC simd_float4 simd_smoothstep(simd_float4 edge0, simd_float4 edge1, simd_float4 x) {
+ simd_float4 t = simd_clamp((x - edge0)/(edge1 - edge0), 0, 1);
+ return t*t*(3 - 2*t);
+}
+
+static inline SIMD_CFUNC simd_float8 simd_smoothstep(simd_float8 edge0, simd_float8 edge1, simd_float8 x) {
+ simd_float8 t = simd_clamp((x - edge0)/(edge1 - edge0), 0, 1);
+ return t*t*(3 - 2*t);
+}
+
+static inline SIMD_CFUNC simd_float16 simd_smoothstep(simd_float16 edge0, simd_float16 edge1, simd_float16 x) {
+ simd_float16 t = simd_clamp((x - edge0)/(edge1 - edge0), 0, 1);
+ return t*t*(3 - 2*t);
+}
+
+static inline SIMD_CFUNC double simd_smoothstep(double edge0, double edge1, double x) {
+ double t = simd_clamp((x - edge0)/(edge1 - edge0), 0, 1);
+ return t*t*(3 - 2*t);
+}
+
+static inline SIMD_CFUNC simd_double2 simd_smoothstep(simd_double2 edge0, simd_double2 edge1, simd_double2 x) {
+ simd_double2 t = simd_clamp((x - edge0)/(edge1 - edge0), 0, 1);
+ return t*t*(3 - 2*t);
+}
+
+static inline SIMD_CFUNC simd_double3 simd_smoothstep(simd_double3 edge0, simd_double3 edge1, simd_double3 x) {
+ simd_double3 t = simd_clamp((x - edge0)/(edge1 - edge0), 0, 1);
+ return t*t*(3 - 2*t);
+}
+
+static inline SIMD_CFUNC simd_double4 simd_smoothstep(simd_double4 edge0, simd_double4 edge1, simd_double4 x) {
+ simd_double4 t = simd_clamp((x - edge0)/(edge1 - edge0), 0, 1);
+ return t*t*(3 - 2*t);
+}
+
+static inline SIMD_CFUNC simd_double8 simd_smoothstep(simd_double8 edge0, simd_double8 edge1, simd_double8 x) {
+ simd_double8 t = simd_clamp((x - edge0)/(edge1 - edge0), 0, 1);
+ return t*t*(3 - 2*t);
+}
+
+static inline SIMD_CFUNC char simd_reduce_add(simd_char2 x) {
+ return x.x + x.y;
+}
+
+static inline SIMD_CFUNC char simd_reduce_add(simd_char3 x) {
+ return x.x + x.y + x.z;
+}
+
+static inline SIMD_CFUNC char simd_reduce_add(simd_char4 x) {
+ return simd_reduce_add(x.lo + x.hi);
+}
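+
+/* The wider reductions halve the vector with x.lo + x.hi until a two- or
+ * three-element base case remains, giving a log2(n)-depth reduction tree.
+ * As the header notes, the sum may overflow; for the narrow integer types
+ * it wraps modulo 2^8. A sketch (illustrative only; char is signed on
+ * Apple platforms):
+ *
+ *   simd_reduce_add(simd_make_char4(100, 100, 1, 2))   // 203 wraps to -53
+ */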
+
+static inline SIMD_CFUNC char simd_reduce_add(simd_char8 x) {
+ return simd_reduce_add(x.lo + x.hi);
+}
+
+static inline SIMD_CFUNC char simd_reduce_add(simd_char16 x) {
+ return simd_reduce_add(x.lo + x.hi);
+}
+
+static inline SIMD_CFUNC char simd_reduce_add(simd_char32 x) {
+ return simd_reduce_add(x.lo + x.hi);
+}
+
+static inline SIMD_CFUNC char simd_reduce_add(simd_char64 x) {
+ return simd_reduce_add(x.lo + x.hi);
+}
+
+static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar2 x) {
+ return x.x + x.y;
+}
+
+static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar3 x) {
+ return x.x + x.y + x.z;
+}
+
+static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar4 x) {
+ return simd_reduce_add(x.lo + x.hi);
+}
+
+static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar8 x) {
+ return simd_reduce_add(x.lo + x.hi);
+}
+
+static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar16 x) {
+ return simd_reduce_add(x.lo + x.hi);
+}
+
+static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar32 x) {
+ return simd_reduce_add(x.lo + x.hi);
+}
+
+static inline SIMD_CFUNC unsigned char simd_reduce_add(simd_uchar64 x) {
+ return simd_reduce_add(x.lo + x.hi);
+}
+
+static inline SIMD_CFUNC short simd_reduce_add(simd_short2 x) {
+ return x.x + x.y;
+}
+
+static inline SIMD_CFUNC short simd_reduce_add(simd_short3 x) {
+ return x.x + x.y + x.z;
+}
+
+static inline SIMD_CFUNC short simd_reduce_add(simd_short4 x) {
+ return simd_reduce_add(x.lo + x.hi);
+}
+
+static inline SIMD_CFUNC short simd_reduce_add(simd_short8 x) {
+ return simd_reduce_add(x.lo + x.hi);
+}
+
+static inline SIMD_CFUNC short simd_reduce_add(simd_short16 x) {
+ return simd_reduce_add(x.lo + x.hi);
+}
+
+static inline SIMD_CFUNC short simd_reduce_add(simd_short32 x) {
+ return simd_reduce_add(x.lo + x.hi);
+}
+
+static inline SIMD_CFUNC unsigned short simd_reduce_add(simd_ushort2 x) {
+ return x.x + x.y;
+}
+
+static inline SIMD_CFUNC unsigned short simd_reduce_add(simd_ushort3 x) {
+ return x.x + x.y + x.z;
+}
+
+static inline SIMD_CFUNC unsigned short simd_reduce_add(simd_ushort4 x) {
+ return simd_reduce_add(x.lo + x.hi);
+}
+
+static inline SIMD_CFUNC unsigned short simd_reduce_add(simd_ushort8 x) {
+ return simd_reduce_add(x.lo + x.hi);
+}
+
+static inline SIMD_CFUNC unsigned short simd_reduce_add(simd_ushort16 x) {
+ return simd_reduce_add(x.lo + x.hi);
+}
+
+static inline SIMD_CFUNC unsigned short simd_reduce_add(simd_ushort32 x) {
+ return simd_reduce_add(x.lo + x.hi);
+}
+
+static inline SIMD_CFUNC int simd_reduce_add(simd_int2 x) {
+ return x.x + x.y;
+}
+
+static inline SIMD_CFUNC int simd_reduce_add(simd_int3 x) {
+ return x.x + x.y + x.z;
+}
+
+static inline SIMD_CFUNC int simd_reduce_add(simd_int4 x) {
+ return simd_reduce_add(x.lo + x.hi);
+}
+
+static inline SIMD_CFUNC int simd_reduce_add(simd_int8 x) {
+ return simd_reduce_add(x.lo + x.hi);
+}
+
+static inline SIMD_CFUNC int simd_reduce_add(simd_int16 x) {
+ return simd_reduce_add(x.lo + x.hi);
+}
+
+static inline SIMD_CFUNC unsigned int simd_reduce_add(simd_uint2 x) {
+ return x.x + x.y;
+}
+
+static inline SIMD_CFUNC unsigned int simd_reduce_add(simd_uint3 x) {
+ return x.x + x.y + x.z;
+}
+
+static inline SIMD_CFUNC unsigned int simd_reduce_add(simd_uint4 x) {
+ return simd_reduce_add(x.lo + x.hi);
+}
+
+static inline SIMD_CFUNC unsigned int simd_reduce_add(simd_uint8 x) {
+ return simd_reduce_add(x.lo + x.hi);
+}
+
+static inline SIMD_CFUNC unsigned int simd_reduce_add(simd_uint16 x) {
+ return simd_reduce_add(x.lo + x.hi);
+}
+
+static inline SIMD_CFUNC float simd_reduce_add(simd_float2 x) {
+ return x.x + x.y;
+}
+
+static inline SIMD_CFUNC float simd_reduce_add(simd_float3 x) {
+ return x.x + x.y + x.z;
+}
+
+static inline SIMD_CFUNC float simd_reduce_add(simd_float4 x) {
+ return simd_reduce_add(x.lo + x.hi);
+}
+
+static inline SIMD_CFUNC float simd_reduce_add(simd_float8 x) {
+ return simd_reduce_add(x.lo + x.hi);
+}
+
+static inline SIMD_CFUNC float simd_reduce_add(simd_float16 x) {
+ return simd_reduce_add(x.lo + x.hi);
+}
+
+static inline SIMD_CFUNC simd_long1 simd_reduce_add(simd_long2 x) {
+ return x.x + x.y;
+}
+
+static inline SIMD_CFUNC simd_long1 simd_reduce_add(simd_long3 x) {
+ return x.x + x.y + x.z;
+}
+
+static inline SIMD_CFUNC simd_long1 simd_reduce_add(simd_long4 x) {
+ return simd_reduce_add(x.lo + x.hi);
+}
+
+static inline SIMD_CFUNC simd_long1 simd_reduce_add(simd_long8 x) {
+ return simd_reduce_add(x.lo + x.hi);
+}
+
+static inline SIMD_CFUNC simd_ulong1 simd_reduce_add(simd_ulong2 x) {
+ return x.x + x.y;
+}
+
+static inline SIMD_CFUNC simd_ulong1 simd_reduce_add(simd_ulong3 x) {
+ return x.x + x.y + x.z;
+}
+
+static inline SIMD_CFUNC simd_ulong1 simd_reduce_add(simd_ulong4 x) {
+ return simd_reduce_add(x.lo + x.hi);
+}
+
+static inline SIMD_CFUNC simd_ulong1 simd_reduce_add(simd_ulong8 x) {
+ return simd_reduce_add(x.lo + x.hi);
+}
+
+static inline SIMD_CFUNC double simd_reduce_add(simd_double2 x) {
+ return x.x + x.y;
+}
+
+static inline SIMD_CFUNC double simd_reduce_add(simd_double3 x) {
+ return x.x + x.y + x.z;
+}
+
+static inline SIMD_CFUNC double simd_reduce_add(simd_double4 x) {
+ return simd_reduce_add(x.lo + x.hi);
+}
+
+static inline SIMD_CFUNC double simd_reduce_add(simd_double8 x) {
+ return simd_reduce_add(x.lo + x.hi);
+}
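+
+/* The wider simd_reduce_add overloads above recurse by splitting the vector
+ * into lo/hi halves and adding them elementwise, so an N-lane sum costs
+ * log2(N) vector adds plus one scalar add. A minimal usage sketch (the
+ * values are illustrative, not part of this header):
+ *
+ *     simd_float4 v = {1.0f, 2.0f, 3.0f, 4.0f};
+ *     float s = simd_reduce_add(v);  // lo+hi = {4.0f, 6.0f}; sum = 10.0f
+ */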
+
+static inline SIMD_CFUNC char simd_reduce_min(simd_char2 x) {
+ return x.y < x.x ? x.y : x.x;
+}
+
+static inline SIMD_CFUNC char simd_reduce_min(simd_char3 x) {
+ char t = x.z < x.x ? x.z : x.x;
+ return x.y < t ? x.y : t;
+}
+
+static inline SIMD_CFUNC char simd_reduce_min(simd_char4 x) {
+ return simd_reduce_min(simd_min(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC char simd_reduce_min(simd_char8 x) {
+ return simd_reduce_min(simd_min(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC char simd_reduce_min(simd_char16 x) {
+ return simd_reduce_min(simd_min(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC char simd_reduce_min(simd_char32 x) {
+ return simd_reduce_min(simd_min(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC char simd_reduce_min(simd_char64 x) {
+ return simd_reduce_min(simd_min(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar2 x) {
+ return x.y < x.x ? x.y : x.x;
+}
+
+static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar3 x) {
+ unsigned char t = x.z < x.x ? x.z : x.x;
+ return x.y < t ? x.y : t;
+}
+
+static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar4 x) {
+ return simd_reduce_min(simd_min(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar8 x) {
+ return simd_reduce_min(simd_min(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar16 x) {
+ return simd_reduce_min(simd_min(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar32 x) {
+ return simd_reduce_min(simd_min(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC unsigned char simd_reduce_min(simd_uchar64 x) {
+ return simd_reduce_min(simd_min(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC short simd_reduce_min(simd_short2 x) {
+ return x.y < x.x ? x.y : x.x;
+}
+
+static inline SIMD_CFUNC short simd_reduce_min(simd_short3 x) {
+ short t = x.z < x.x ? x.z : x.x;
+ return x.y < t ? x.y : t;
+}
+
+static inline SIMD_CFUNC short simd_reduce_min(simd_short4 x) {
+ return simd_reduce_min(simd_min(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC short simd_reduce_min(simd_short8 x) {
+ return simd_reduce_min(simd_min(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC short simd_reduce_min(simd_short16 x) {
+ return simd_reduce_min(simd_min(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC short simd_reduce_min(simd_short32 x) {
+ return simd_reduce_min(simd_min(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC unsigned short simd_reduce_min(simd_ushort2 x) {
+ return x.y < x.x ? x.y : x.x;
+}
+
+static inline SIMD_CFUNC unsigned short simd_reduce_min(simd_ushort3 x) {
+ unsigned short t = x.z < x.x ? x.z : x.x;
+ return x.y < t ? x.y : t;
+}
+
+static inline SIMD_CFUNC unsigned short simd_reduce_min(simd_ushort4 x) {
+ return simd_reduce_min(simd_min(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC unsigned short simd_reduce_min(simd_ushort8 x) {
+ return simd_reduce_min(simd_min(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC unsigned short simd_reduce_min(simd_ushort16 x) {
+ return simd_reduce_min(simd_min(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC unsigned short simd_reduce_min(simd_ushort32 x) {
+ return simd_reduce_min(simd_min(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC int simd_reduce_min(simd_int2 x) {
+ return x.y < x.x ? x.y : x.x;
+}
+
+static inline SIMD_CFUNC int simd_reduce_min(simd_int3 x) {
+ int t = x.z < x.x ? x.z : x.x;
+ return x.y < t ? x.y : t;
+}
+
+static inline SIMD_CFUNC int simd_reduce_min(simd_int4 x) {
+ return simd_reduce_min(simd_min(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC int simd_reduce_min(simd_int8 x) {
+ return simd_reduce_min(simd_min(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC int simd_reduce_min(simd_int16 x) {
+ return simd_reduce_min(simd_min(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC unsigned int simd_reduce_min(simd_uint2 x) {
+ return x.y < x.x ? x.y : x.x;
+}
+
+static inline SIMD_CFUNC unsigned int simd_reduce_min(simd_uint3 x) {
+ unsigned int t = x.z < x.x ? x.z : x.x;
+ return x.y < t ? x.y : t;
+}
+
+static inline SIMD_CFUNC unsigned int simd_reduce_min(simd_uint4 x) {
+ return simd_reduce_min(simd_min(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC unsigned int simd_reduce_min(simd_uint8 x) {
+ return simd_reduce_min(simd_min(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC unsigned int simd_reduce_min(simd_uint16 x) {
+ return simd_reduce_min(simd_min(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC float simd_reduce_min(simd_float2 x) {
+ return fmin(x.x, x.y);
+}
+
+static inline SIMD_CFUNC float simd_reduce_min(simd_float3 x) {
+ return fmin(fmin(x.x, x.z), x.y);
+}
+
+static inline SIMD_CFUNC float simd_reduce_min(simd_float4 x) {
+ return simd_reduce_min(simd_min(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC float simd_reduce_min(simd_float8 x) {
+ return simd_reduce_min(simd_min(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC float simd_reduce_min(simd_float16 x) {
+ return simd_reduce_min(simd_min(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC simd_long1 simd_reduce_min(simd_long2 x) {
+ return x.y < x.x ? x.y : x.x;
+}
+
+static inline SIMD_CFUNC simd_long1 simd_reduce_min(simd_long3 x) {
+ simd_long1 t = x.z < x.x ? x.z : x.x;
+ return x.y < t ? x.y : t;
+}
+
+static inline SIMD_CFUNC simd_long1 simd_reduce_min(simd_long4 x) {
+ return simd_reduce_min(simd_min(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC simd_long1 simd_reduce_min(simd_long8 x) {
+ return simd_reduce_min(simd_min(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC simd_ulong1 simd_reduce_min(simd_ulong2 x) {
+ return x.y < x.x ? x.y : x.x;
+}
+
+static inline SIMD_CFUNC simd_ulong1 simd_reduce_min(simd_ulong3 x) {
+ simd_ulong1 t = x.z < x.x ? x.z : x.x;
+ return x.y < t ? x.y : t;
+}
+
+static inline SIMD_CFUNC simd_ulong1 simd_reduce_min(simd_ulong4 x) {
+ return simd_reduce_min(simd_min(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC simd_ulong1 simd_reduce_min(simd_ulong8 x) {
+ return simd_reduce_min(simd_min(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC double simd_reduce_min(simd_double2 x) {
+ return fmin(x.x, x.y);
+}
+
+static inline SIMD_CFUNC double simd_reduce_min(simd_double3 x) {
+ return fmin(fmin(x.x, x.z), x.y);
+}
+
+static inline SIMD_CFUNC double simd_reduce_min(simd_double4 x) {
+ return simd_reduce_min(simd_min(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC double simd_reduce_min(simd_double8 x) {
+ return simd_reduce_min(simd_min(x.lo, x.hi));
+}
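+
+/* The floating-point reductions above are built on fmin, so a quiet NaN in
+ * one lane is ignored whenever another lane is numeric (fmin(NaN, x) == x);
+ * the integer overloads use plain comparisons instead. simd_reduce_max
+ * below mirrors this structure with fmax. Illustrative sketch only:
+ *
+ *     simd_float2 v = {NAN, 2.0f};
+ *     float m = simd_reduce_min(v);  // 2.0f, per fmin's NaN handling
+ */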
+
+static inline SIMD_CFUNC char simd_reduce_max(simd_char2 x) {
+ return x.y > x.x ? x.y : x.x;
+}
+
+static inline SIMD_CFUNC char simd_reduce_max(simd_char3 x) {
+ char t = x.z > x.x ? x.z : x.x;
+ return x.y > t ? x.y : t;
+}
+
+static inline SIMD_CFUNC char simd_reduce_max(simd_char4 x) {
+ return simd_reduce_max(simd_max(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC char simd_reduce_max(simd_char8 x) {
+ return simd_reduce_max(simd_max(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC char simd_reduce_max(simd_char16 x) {
+ return simd_reduce_max(simd_max(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC char simd_reduce_max(simd_char32 x) {
+ return simd_reduce_max(simd_max(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC char simd_reduce_max(simd_char64 x) {
+ return simd_reduce_max(simd_max(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar2 x) {
+ return x.y > x.x ? x.y : x.x;
+}
+
+static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar3 x) {
+ unsigned char t = x.z > x.x ? x.z : x.x;
+ return x.y > t ? x.y : t;
+}
+
+static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar4 x) {
+ return simd_reduce_max(simd_max(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar8 x) {
+ return simd_reduce_max(simd_max(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar16 x) {
+ return simd_reduce_max(simd_max(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar32 x) {
+ return simd_reduce_max(simd_max(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC unsigned char simd_reduce_max(simd_uchar64 x) {
+ return simd_reduce_max(simd_max(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC short simd_reduce_max(simd_short2 x) {
+ return x.y > x.x ? x.y : x.x;
+}
+
+static inline SIMD_CFUNC short simd_reduce_max(simd_short3 x) {
+ short t = x.z > x.x ? x.z : x.x;
+ return x.y > t ? x.y : t;
+}
+
+static inline SIMD_CFUNC short simd_reduce_max(simd_short4 x) {
+ return simd_reduce_max(simd_max(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC short simd_reduce_max(simd_short8 x) {
+ return simd_reduce_max(simd_max(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC short simd_reduce_max(simd_short16 x) {
+ return simd_reduce_max(simd_max(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC short simd_reduce_max(simd_short32 x) {
+ return simd_reduce_max(simd_max(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC unsigned short simd_reduce_max(simd_ushort2 x) {
+ return x.y > x.x ? x.y : x.x;
+}
+
+static inline SIMD_CFUNC unsigned short simd_reduce_max(simd_ushort3 x) {
+ unsigned short t = x.z > x.x ? x.z : x.x;
+ return x.y > t ? x.y : t;
+}
+
+static inline SIMD_CFUNC unsigned short simd_reduce_max(simd_ushort4 x) {
+ return simd_reduce_max(simd_max(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC unsigned short simd_reduce_max(simd_ushort8 x) {
+ return simd_reduce_max(simd_max(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC unsigned short simd_reduce_max(simd_ushort16 x) {
+ return simd_reduce_max(simd_max(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC unsigned short simd_reduce_max(simd_ushort32 x) {
+ return simd_reduce_max(simd_max(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC int simd_reduce_max(simd_int2 x) {
+ return x.y > x.x ? x.y : x.x;
+}
+
+static inline SIMD_CFUNC int simd_reduce_max(simd_int3 x) {
+ int t = x.z > x.x ? x.z : x.x;
+ return x.y > t ? x.y : t;
+}
+
+static inline SIMD_CFUNC int simd_reduce_max(simd_int4 x) {
+ return simd_reduce_max(simd_max(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC int simd_reduce_max(simd_int8 x) {
+ return simd_reduce_max(simd_max(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC int simd_reduce_max(simd_int16 x) {
+ return simd_reduce_max(simd_max(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC unsigned int simd_reduce_max(simd_uint2 x) {
+ return x.y > x.x ? x.y : x.x;
+}
+
+static inline SIMD_CFUNC unsigned int simd_reduce_max(simd_uint3 x) {
+ unsigned int t = x.z > x.x ? x.z : x.x;
+ return x.y > t ? x.y : t;
+}
+
+static inline SIMD_CFUNC unsigned int simd_reduce_max(simd_uint4 x) {
+ return simd_reduce_max(simd_max(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC unsigned int simd_reduce_max(simd_uint8 x) {
+ return simd_reduce_max(simd_max(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC unsigned int simd_reduce_max(simd_uint16 x) {
+ return simd_reduce_max(simd_max(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC float simd_reduce_max(simd_float2 x) {
+ return fmax(x.x, x.y);
+}
+
+static inline SIMD_CFUNC float simd_reduce_max(simd_float3 x) {
+ return fmax(fmax(x.x, x.z), x.y);
+}
+
+static inline SIMD_CFUNC float simd_reduce_max(simd_float4 x) {
+ return simd_reduce_max(simd_max(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC float simd_reduce_max(simd_float8 x) {
+ return simd_reduce_max(simd_max(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC float simd_reduce_max(simd_float16 x) {
+ return simd_reduce_max(simd_max(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC simd_long1 simd_reduce_max(simd_long2 x) {
+ return x.y > x.x ? x.y : x.x;
+}
+
+static inline SIMD_CFUNC simd_long1 simd_reduce_max(simd_long3 x) {
+ simd_long1 t = x.z > x.x ? x.z : x.x;
+ return x.y > t ? x.y : t;
+}
+
+static inline SIMD_CFUNC simd_long1 simd_reduce_max(simd_long4 x) {
+ return simd_reduce_max(simd_max(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC simd_long1 simd_reduce_max(simd_long8 x) {
+ return simd_reduce_max(simd_max(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC simd_ulong1 simd_reduce_max(simd_ulong2 x) {
+ return x.y > x.x ? x.y : x.x;
+}
+
+static inline SIMD_CFUNC simd_ulong1 simd_reduce_max(simd_ulong3 x) {
+ simd_ulong1 t = x.z > x.x ? x.z : x.x;
+ return x.y > t ? x.y : t;
+}
+
+static inline SIMD_CFUNC simd_ulong1 simd_reduce_max(simd_ulong4 x) {
+ return simd_reduce_max(simd_max(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC simd_ulong1 simd_reduce_max(simd_ulong8 x) {
+ return simd_reduce_max(simd_max(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC double simd_reduce_max(simd_double2 x) {
+ return fmax(x.x, x.y);
+}
+
+static inline SIMD_CFUNC double simd_reduce_max(simd_double3 x) {
+ return fmax(fmax(x.x, x.z), x.y);
+}
+
+static inline SIMD_CFUNC double simd_reduce_max(simd_double4 x) {
+ return simd_reduce_max(simd_max(x.lo, x.hi));
+}
+
+static inline SIMD_CFUNC double simd_reduce_max(simd_double8 x) {
+ return simd_reduce_max(simd_max(x.lo, x.hi));
+}
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* SIMD_COMPILER_HAS_REQUIRED_FEATURES */
+#endif /* SIMD_COMMON_HEADER */
\ No newline at end of file
diff --git a/lib/libc/include/aarch64-macos-gnu/simd/conversion.h b/lib/libc/include/aarch64-macos-gnu/simd/conversion.h
new file mode 100644
index 0000000000..6379afde05
--- /dev/null
+++ b/lib/libc/include/aarch64-macos-gnu/simd/conversion.h
@@ -0,0 +1,1966 @@
+/* Copyright (c) 2014-2017 Apple, Inc. All rights reserved.
+ *
+ * The interfaces declared in this header provide conversions between vector
+ * types. The following functions are available:
+ *
+ * simd_char(x) simd_uchar(x)
+ * simd_short(x) simd_ushort(x)
+ * simd_int(x) simd_uint(x)
+ * simd_long(x) simd_ulong(x)
+ * simd_float(x)
+ * simd_double(x)
+ *
+ * Each of these functions converts x to a vector whose elements have the
+ * type named by the function, with the same number of elements as x. Unlike
+ * a vector cast, these functions convert the elements to the new element
+ * type. These conversions behave exactly as C scalar conversions do, except
+ * that conversions from integer vector types to signed integer vector types
+ * are guaranteed to wrap modulo 2^N (where N is the number of bits in an
+ * element of the result type).
+ *
+ * For integer vector types, saturating conversions are also available:
+ *
+ * simd_char_sat(x) simd_uchar_sat(x)
+ * simd_short_sat(x) simd_ushort_sat(x)
+ * simd_int_sat(x) simd_uint_sat(x)
+ * simd_long_sat(x) simd_ulong_sat(x)
+ *
+ * These conversions clamp x to the representable range of the result type
+ * before converting.
+ *
+ * Unlike most vector operations in <simd/>, there are no abbreviated C++
+ * names for these functions in the simd:: namespace.
+ */
+
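+/* An illustrative example of a plain versus a saturating conversion, using
+ * the wrap and clamp behavior documented above (the values are hypothetical,
+ * not part of this header):
+ *
+ *     simd_int2  i = {300, -2};
+ *     simd_char2 w = simd_char(i);      // wraps modulo 2^8: {44, -2}
+ *     simd_char2 s = simd_char_sat(i);  // clamps first:     {127, -2}
+ */
+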
+#ifndef __SIMD_CONVERSION_HEADER__
+#define __SIMD_CONVERSION_HEADER__
+
+#include <simd/base.h>
+#if SIMD_COMPILER_HAS_REQUIRED_FEATURES
+#include <simd/vector_types.h>
+#include <simd/common.h>
+#include <simd/logic.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static simd_char2 SIMD_CFUNC simd_char(simd_char2 __x);
+static simd_char3 SIMD_CFUNC simd_char(simd_char3 __x);
+static simd_char4 SIMD_CFUNC simd_char(simd_char4 __x);
+static simd_char8 SIMD_CFUNC simd_char(simd_char8 __x);
+static simd_char16 SIMD_CFUNC simd_char(simd_char16 __x);
+static simd_char32 SIMD_CFUNC simd_char(simd_char32 __x);
+static simd_char2 SIMD_CFUNC simd_char(simd_uchar2 __x);
+static simd_char3 SIMD_CFUNC simd_char(simd_uchar3 __x);
+static simd_char4 SIMD_CFUNC simd_char(simd_uchar4 __x);
+static simd_char8 SIMD_CFUNC simd_char(simd_uchar8 __x);
+static simd_char16 SIMD_CFUNC simd_char(simd_uchar16 __x);
+static simd_char32 SIMD_CFUNC simd_char(simd_uchar32 __x);
+static simd_char2 SIMD_CFUNC simd_char(simd_short2 __x);
+static simd_char3 SIMD_CFUNC simd_char(simd_short3 __x);
+static simd_char4 SIMD_CFUNC simd_char(simd_short4 __x);
+static simd_char8 SIMD_CFUNC simd_char(simd_short8 __x);
+static simd_char16 SIMD_CFUNC simd_char(simd_short16 __x);
+static simd_char32 SIMD_CFUNC simd_char(simd_short32 __x);
+static simd_char2 SIMD_CFUNC simd_char(simd_ushort2 __x);
+static simd_char3 SIMD_CFUNC simd_char(simd_ushort3 __x);
+static simd_char4 SIMD_CFUNC simd_char(simd_ushort4 __x);
+static simd_char8 SIMD_CFUNC simd_char(simd_ushort8 __x);
+static simd_char16 SIMD_CFUNC simd_char(simd_ushort16 __x);
+static simd_char32 SIMD_CFUNC simd_char(simd_ushort32 __x);
+static simd_char2 SIMD_CFUNC simd_char(simd_int2 __x);
+static simd_char3 SIMD_CFUNC simd_char(simd_int3 __x);
+static simd_char4 SIMD_CFUNC simd_char(simd_int4 __x);
+static simd_char8 SIMD_CFUNC simd_char(simd_int8 __x);
+static simd_char16 SIMD_CFUNC simd_char(simd_int16 __x);
+static simd_char2 SIMD_CFUNC simd_char(simd_uint2 __x);
+static simd_char3 SIMD_CFUNC simd_char(simd_uint3 __x);
+static simd_char4 SIMD_CFUNC simd_char(simd_uint4 __x);
+static simd_char8 SIMD_CFUNC simd_char(simd_uint8 __x);
+static simd_char16 SIMD_CFUNC simd_char(simd_uint16 __x);
+static simd_char2 SIMD_CFUNC simd_char(simd_float2 __x);
+static simd_char3 SIMD_CFUNC simd_char(simd_float3 __x);
+static simd_char4 SIMD_CFUNC simd_char(simd_float4 __x);
+static simd_char8 SIMD_CFUNC simd_char(simd_float8 __x);
+static simd_char16 SIMD_CFUNC simd_char(simd_float16 __x);
+static simd_char2 SIMD_CFUNC simd_char(simd_long2 __x);
+static simd_char3 SIMD_CFUNC simd_char(simd_long3 __x);
+static simd_char4 SIMD_CFUNC simd_char(simd_long4 __x);
+static simd_char8 SIMD_CFUNC simd_char(simd_long8 __x);
+static simd_char2 SIMD_CFUNC simd_char(simd_ulong2 __x);
+static simd_char3 SIMD_CFUNC simd_char(simd_ulong3 __x);
+static simd_char4 SIMD_CFUNC simd_char(simd_ulong4 __x);
+static simd_char8 SIMD_CFUNC simd_char(simd_ulong8 __x);
+static simd_char2 SIMD_CFUNC simd_char(simd_double2 __x);
+static simd_char3 SIMD_CFUNC simd_char(simd_double3 __x);
+static simd_char4 SIMD_CFUNC simd_char(simd_double4 __x);
+static simd_char8 SIMD_CFUNC simd_char(simd_double8 __x);
+static simd_char2 SIMD_CFUNC simd_char_sat(simd_char2 __x);
+static simd_char3 SIMD_CFUNC simd_char_sat(simd_char3 __x);
+static simd_char4 SIMD_CFUNC simd_char_sat(simd_char4 __x);
+static simd_char8 SIMD_CFUNC simd_char_sat(simd_char8 __x);
+static simd_char16 SIMD_CFUNC simd_char_sat(simd_char16 __x);
+static simd_char32 SIMD_CFUNC simd_char_sat(simd_char32 __x);
+static simd_char2 SIMD_CFUNC simd_char_sat(simd_short2 __x);
+static simd_char3 SIMD_CFUNC simd_char_sat(simd_short3 __x);
+static simd_char4 SIMD_CFUNC simd_char_sat(simd_short4 __x);
+static simd_char8 SIMD_CFUNC simd_char_sat(simd_short8 __x);
+static simd_char16 SIMD_CFUNC simd_char_sat(simd_short16 __x);
+static simd_char32 SIMD_CFUNC simd_char_sat(simd_short32 __x);
+static simd_char2 SIMD_CFUNC simd_char_sat(simd_int2 __x);
+static simd_char3 SIMD_CFUNC simd_char_sat(simd_int3 __x);
+static simd_char4 SIMD_CFUNC simd_char_sat(simd_int4 __x);
+static simd_char8 SIMD_CFUNC simd_char_sat(simd_int8 __x);
+static simd_char16 SIMD_CFUNC simd_char_sat(simd_int16 __x);
+static simd_char2 SIMD_CFUNC simd_char_sat(simd_float2 __x);
+static simd_char3 SIMD_CFUNC simd_char_sat(simd_float3 __x);
+static simd_char4 SIMD_CFUNC simd_char_sat(simd_float4 __x);
+static simd_char8 SIMD_CFUNC simd_char_sat(simd_float8 __x);
+static simd_char16 SIMD_CFUNC simd_char_sat(simd_float16 __x);
+static simd_char2 SIMD_CFUNC simd_char_sat(simd_long2 __x);
+static simd_char3 SIMD_CFUNC simd_char_sat(simd_long3 __x);
+static simd_char4 SIMD_CFUNC simd_char_sat(simd_long4 __x);
+static simd_char8 SIMD_CFUNC simd_char_sat(simd_long8 __x);
+static simd_char2 SIMD_CFUNC simd_char_sat(simd_double2 __x);
+static simd_char3 SIMD_CFUNC simd_char_sat(simd_double3 __x);
+static simd_char4 SIMD_CFUNC simd_char_sat(simd_double4 __x);
+static simd_char8 SIMD_CFUNC simd_char_sat(simd_double8 __x);
+static simd_char2 SIMD_CFUNC simd_char_sat(simd_uchar2 __x);
+static simd_char3 SIMD_CFUNC simd_char_sat(simd_uchar3 __x);
+static simd_char4 SIMD_CFUNC simd_char_sat(simd_uchar4 __x);
+static simd_char8 SIMD_CFUNC simd_char_sat(simd_uchar8 __x);
+static simd_char16 SIMD_CFUNC simd_char_sat(simd_uchar16 __x);
+static simd_char32 SIMD_CFUNC simd_char_sat(simd_uchar32 __x);
+static simd_char2 SIMD_CFUNC simd_char_sat(simd_ushort2 __x);
+static simd_char3 SIMD_CFUNC simd_char_sat(simd_ushort3 __x);
+static simd_char4 SIMD_CFUNC simd_char_sat(simd_ushort4 __x);
+static simd_char8 SIMD_CFUNC simd_char_sat(simd_ushort8 __x);
+static simd_char16 SIMD_CFUNC simd_char_sat(simd_ushort16 __x);
+static simd_char32 SIMD_CFUNC simd_char_sat(simd_ushort32 __x);
+static simd_char2 SIMD_CFUNC simd_char_sat(simd_uint2 __x);
+static simd_char3 SIMD_CFUNC simd_char_sat(simd_uint3 __x);
+static simd_char4 SIMD_CFUNC simd_char_sat(simd_uint4 __x);
+static simd_char8 SIMD_CFUNC simd_char_sat(simd_uint8 __x);
+static simd_char16 SIMD_CFUNC simd_char_sat(simd_uint16 __x);
+static simd_char2 SIMD_CFUNC simd_char_sat(simd_ulong2 __x);
+static simd_char3 SIMD_CFUNC simd_char_sat(simd_ulong3 __x);
+static simd_char4 SIMD_CFUNC simd_char_sat(simd_ulong4 __x);
+static simd_char8 SIMD_CFUNC simd_char_sat(simd_ulong8 __x);
+#define vector_char simd_char
+#define vector_char_sat simd_char_sat
+
+static simd_uchar2 SIMD_CFUNC simd_uchar(simd_char2 __x);
+static simd_uchar3 SIMD_CFUNC simd_uchar(simd_char3 __x);
+static simd_uchar4 SIMD_CFUNC simd_uchar(simd_char4 __x);
+static simd_uchar8 SIMD_CFUNC simd_uchar(simd_char8 __x);
+static simd_uchar16 SIMD_CFUNC simd_uchar(simd_char16 __x);
+static simd_uchar32 SIMD_CFUNC simd_uchar(simd_char32 __x);
+static simd_uchar2 SIMD_CFUNC simd_uchar(simd_uchar2 __x);
+static simd_uchar3 SIMD_CFUNC simd_uchar(simd_uchar3 __x);
+static simd_uchar4 SIMD_CFUNC simd_uchar(simd_uchar4 __x);
+static simd_uchar8 SIMD_CFUNC simd_uchar(simd_uchar8 __x);
+static simd_uchar16 SIMD_CFUNC simd_uchar(simd_uchar16 __x);
+static simd_uchar32 SIMD_CFUNC simd_uchar(simd_uchar32 __x);
+static simd_uchar2 SIMD_CFUNC simd_uchar(simd_short2 __x);
+static simd_uchar3 SIMD_CFUNC simd_uchar(simd_short3 __x);
+static simd_uchar4 SIMD_CFUNC simd_uchar(simd_short4 __x);
+static simd_uchar8 SIMD_CFUNC simd_uchar(simd_short8 __x);
+static simd_uchar16 SIMD_CFUNC simd_uchar(simd_short16 __x);
+static simd_uchar32 SIMD_CFUNC simd_uchar(simd_short32 __x);
+static simd_uchar2 SIMD_CFUNC simd_uchar(simd_ushort2 __x);
+static simd_uchar3 SIMD_CFUNC simd_uchar(simd_ushort3 __x);
+static simd_uchar4 SIMD_CFUNC simd_uchar(simd_ushort4 __x);
+static simd_uchar8 SIMD_CFUNC simd_uchar(simd_ushort8 __x);
+static simd_uchar16 SIMD_CFUNC simd_uchar(simd_ushort16 __x);
+static simd_uchar32 SIMD_CFUNC simd_uchar(simd_ushort32 __x);
+static simd_uchar2 SIMD_CFUNC simd_uchar(simd_int2 __x);
+static simd_uchar3 SIMD_CFUNC simd_uchar(simd_int3 __x);
+static simd_uchar4 SIMD_CFUNC simd_uchar(simd_int4 __x);
+static simd_uchar8 SIMD_CFUNC simd_uchar(simd_int8 __x);
+static simd_uchar16 SIMD_CFUNC simd_uchar(simd_int16 __x);
+static simd_uchar2 SIMD_CFUNC simd_uchar(simd_uint2 __x);
+static simd_uchar3 SIMD_CFUNC simd_uchar(simd_uint3 __x);
+static simd_uchar4 SIMD_CFUNC simd_uchar(simd_uint4 __x);
+static simd_uchar8 SIMD_CFUNC simd_uchar(simd_uint8 __x);
+static simd_uchar16 SIMD_CFUNC simd_uchar(simd_uint16 __x);
+static simd_uchar2 SIMD_CFUNC simd_uchar(simd_float2 __x);
+static simd_uchar3 SIMD_CFUNC simd_uchar(simd_float3 __x);
+static simd_uchar4 SIMD_CFUNC simd_uchar(simd_float4 __x);
+static simd_uchar8 SIMD_CFUNC simd_uchar(simd_float8 __x);
+static simd_uchar16 SIMD_CFUNC simd_uchar(simd_float16 __x);
+static simd_uchar2 SIMD_CFUNC simd_uchar(simd_long2 __x);
+static simd_uchar3 SIMD_CFUNC simd_uchar(simd_long3 __x);
+static simd_uchar4 SIMD_CFUNC simd_uchar(simd_long4 __x);
+static simd_uchar8 SIMD_CFUNC simd_uchar(simd_long8 __x);
+static simd_uchar2 SIMD_CFUNC simd_uchar(simd_ulong2 __x);
+static simd_uchar3 SIMD_CFUNC simd_uchar(simd_ulong3 __x);
+static simd_uchar4 SIMD_CFUNC simd_uchar(simd_ulong4 __x);
+static simd_uchar8 SIMD_CFUNC simd_uchar(simd_ulong8 __x);
+static simd_uchar2 SIMD_CFUNC simd_uchar(simd_double2 __x);
+static simd_uchar3 SIMD_CFUNC simd_uchar(simd_double3 __x);
+static simd_uchar4 SIMD_CFUNC simd_uchar(simd_double4 __x);
+static simd_uchar8 SIMD_CFUNC simd_uchar(simd_double8 __x);
+static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_char2 __x);
+static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_char3 __x);
+static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_char4 __x);
+static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_char8 __x);
+static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_char16 __x);
+static simd_uchar32 SIMD_CFUNC simd_uchar_sat(simd_char32 __x);
+static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_short2 __x);
+static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_short3 __x);
+static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_short4 __x);
+static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_short8 __x);
+static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_short16 __x);
+static simd_uchar32 SIMD_CFUNC simd_uchar_sat(simd_short32 __x);
+static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_int2 __x);
+static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_int3 __x);
+static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_int4 __x);
+static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_int8 __x);
+static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_int16 __x);
+static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_float2 __x);
+static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_float3 __x);
+static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_float4 __x);
+static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_float8 __x);
+static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_float16 __x);
+static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_long2 __x);
+static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_long3 __x);
+static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_long4 __x);
+static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_long8 __x);
+static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_double2 __x);
+static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_double3 __x);
+static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_double4 __x);
+static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_double8 __x);
+static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_uchar2 __x);
+static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_uchar3 __x);
+static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_uchar4 __x);
+static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_uchar8 __x);
+static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_uchar16 __x);
+static simd_uchar32 SIMD_CFUNC simd_uchar_sat(simd_uchar32 __x);
+static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_ushort2 __x);
+static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_ushort3 __x);
+static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_ushort4 __x);
+static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_ushort8 __x);
+static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_ushort16 __x);
+static simd_uchar32 SIMD_CFUNC simd_uchar_sat(simd_ushort32 __x);
+static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_uint2 __x);
+static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_uint3 __x);
+static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_uint4 __x);
+static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_uint8 __x);
+static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_uint16 __x);
+static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_ulong2 __x);
+static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_ulong3 __x);
+static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_ulong4 __x);
+static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_ulong8 __x);
+#define vector_uchar simd_uchar
+#define vector_uchar_sat simd_uchar_sat
+
+static simd_short2 SIMD_CFUNC simd_short(simd_char2 __x);
+static simd_short3 SIMD_CFUNC simd_short(simd_char3 __x);
+static simd_short4 SIMD_CFUNC simd_short(simd_char4 __x);
+static simd_short8 SIMD_CFUNC simd_short(simd_char8 __x);
+static simd_short16 SIMD_CFUNC simd_short(simd_char16 __x);
+static simd_short32 SIMD_CFUNC simd_short(simd_char32 __x);
+static simd_short2 SIMD_CFUNC simd_short(simd_uchar2 __x);
+static simd_short3 SIMD_CFUNC simd_short(simd_uchar3 __x);
+static simd_short4 SIMD_CFUNC simd_short(simd_uchar4 __x);
+static simd_short8 SIMD_CFUNC simd_short(simd_uchar8 __x);
+static simd_short16 SIMD_CFUNC simd_short(simd_uchar16 __x);
+static simd_short32 SIMD_CFUNC simd_short(simd_uchar32 __x);
+static simd_short2 SIMD_CFUNC simd_short(simd_short2 __x);
+static simd_short3 SIMD_CFUNC simd_short(simd_short3 __x);
+static simd_short4 SIMD_CFUNC simd_short(simd_short4 __x);
+static simd_short8 SIMD_CFUNC simd_short(simd_short8 __x);
+static simd_short16 SIMD_CFUNC simd_short(simd_short16 __x);
+static simd_short32 SIMD_CFUNC simd_short(simd_short32 __x);
+static simd_short2 SIMD_CFUNC simd_short(simd_ushort2 __x);
+static simd_short3 SIMD_CFUNC simd_short(simd_ushort3 __x);
+static simd_short4 SIMD_CFUNC simd_short(simd_ushort4 __x);
+static simd_short8 SIMD_CFUNC simd_short(simd_ushort8 __x);
+static simd_short16 SIMD_CFUNC simd_short(simd_ushort16 __x);
+static simd_short32 SIMD_CFUNC simd_short(simd_ushort32 __x);
+static simd_short2 SIMD_CFUNC simd_short(simd_int2 __x);
+static simd_short3 SIMD_CFUNC simd_short(simd_int3 __x);
+static simd_short4 SIMD_CFUNC simd_short(simd_int4 __x);
+static simd_short8 SIMD_CFUNC simd_short(simd_int8 __x);
+static simd_short16 SIMD_CFUNC simd_short(simd_int16 __x);
+static simd_short2 SIMD_CFUNC simd_short(simd_uint2 __x);
+static simd_short3 SIMD_CFUNC simd_short(simd_uint3 __x);
+static simd_short4 SIMD_CFUNC simd_short(simd_uint4 __x);
+static simd_short8 SIMD_CFUNC simd_short(simd_uint8 __x);
+static simd_short16 SIMD_CFUNC simd_short(simd_uint16 __x);
+static simd_short2 SIMD_CFUNC simd_short(simd_float2 __x);
+static simd_short3 SIMD_CFUNC simd_short(simd_float3 __x);
+static simd_short4 SIMD_CFUNC simd_short(simd_float4 __x);
+static simd_short8 SIMD_CFUNC simd_short(simd_float8 __x);
+static simd_short16 SIMD_CFUNC simd_short(simd_float16 __x);
+static simd_short2 SIMD_CFUNC simd_short(simd_long2 __x);
+static simd_short3 SIMD_CFUNC simd_short(simd_long3 __x);
+static simd_short4 SIMD_CFUNC simd_short(simd_long4 __x);
+static simd_short8 SIMD_CFUNC simd_short(simd_long8 __x);
+static simd_short2 SIMD_CFUNC simd_short(simd_ulong2 __x);
+static simd_short3 SIMD_CFUNC simd_short(simd_ulong3 __x);
+static simd_short4 SIMD_CFUNC simd_short(simd_ulong4 __x);
+static simd_short8 SIMD_CFUNC simd_short(simd_ulong8 __x);
+static simd_short2 SIMD_CFUNC simd_short(simd_double2 __x);
+static simd_short3 SIMD_CFUNC simd_short(simd_double3 __x);
+static simd_short4 SIMD_CFUNC simd_short(simd_double4 __x);
+static simd_short8 SIMD_CFUNC simd_short(simd_double8 __x);
+static simd_short2 SIMD_CFUNC simd_short_sat(simd_char2 __x);
+static simd_short3 SIMD_CFUNC simd_short_sat(simd_char3 __x);
+static simd_short4 SIMD_CFUNC simd_short_sat(simd_char4 __x);
+static simd_short8 SIMD_CFUNC simd_short_sat(simd_char8 __x);
+static simd_short16 SIMD_CFUNC simd_short_sat(simd_char16 __x);
+static simd_short32 SIMD_CFUNC simd_short_sat(simd_char32 __x);
+static simd_short2 SIMD_CFUNC simd_short_sat(simd_short2 __x);
+static simd_short3 SIMD_CFUNC simd_short_sat(simd_short3 __x);
+static simd_short4 SIMD_CFUNC simd_short_sat(simd_short4 __x);
+static simd_short8 SIMD_CFUNC simd_short_sat(simd_short8 __x);
+static simd_short16 SIMD_CFUNC simd_short_sat(simd_short16 __x);
+static simd_short32 SIMD_CFUNC simd_short_sat(simd_short32 __x);
+static simd_short2 SIMD_CFUNC simd_short_sat(simd_int2 __x);
+static simd_short3 SIMD_CFUNC simd_short_sat(simd_int3 __x);
+static simd_short4 SIMD_CFUNC simd_short_sat(simd_int4 __x);
+static simd_short8 SIMD_CFUNC simd_short_sat(simd_int8 __x);
+static simd_short16 SIMD_CFUNC simd_short_sat(simd_int16 __x);
+static simd_short2 SIMD_CFUNC simd_short_sat(simd_float2 __x);
+static simd_short3 SIMD_CFUNC simd_short_sat(simd_float3 __x);
+static simd_short4 SIMD_CFUNC simd_short_sat(simd_float4 __x);
+static simd_short8 SIMD_CFUNC simd_short_sat(simd_float8 __x);
+static simd_short16 SIMD_CFUNC simd_short_sat(simd_float16 __x);
+static simd_short2 SIMD_CFUNC simd_short_sat(simd_long2 __x);
+static simd_short3 SIMD_CFUNC simd_short_sat(simd_long3 __x);
+static simd_short4 SIMD_CFUNC simd_short_sat(simd_long4 __x);
+static simd_short8 SIMD_CFUNC simd_short_sat(simd_long8 __x);
+static simd_short2 SIMD_CFUNC simd_short_sat(simd_double2 __x);
+static simd_short3 SIMD_CFUNC simd_short_sat(simd_double3 __x);
+static simd_short4 SIMD_CFUNC simd_short_sat(simd_double4 __x);
+static simd_short8 SIMD_CFUNC simd_short_sat(simd_double8 __x);
+static simd_short2 SIMD_CFUNC simd_short_sat(simd_uchar2 __x);
+static simd_short3 SIMD_CFUNC simd_short_sat(simd_uchar3 __x);
+static simd_short4 SIMD_CFUNC simd_short_sat(simd_uchar4 __x);
+static simd_short8 SIMD_CFUNC simd_short_sat(simd_uchar8 __x);
+static simd_short16 SIMD_CFUNC simd_short_sat(simd_uchar16 __x);
+static simd_short32 SIMD_CFUNC simd_short_sat(simd_uchar32 __x);
+static simd_short2 SIMD_CFUNC simd_short_sat(simd_ushort2 __x);
+static simd_short3 SIMD_CFUNC simd_short_sat(simd_ushort3 __x);
+static simd_short4 SIMD_CFUNC simd_short_sat(simd_ushort4 __x);
+static simd_short8 SIMD_CFUNC simd_short_sat(simd_ushort8 __x);
+static simd_short16 SIMD_CFUNC simd_short_sat(simd_ushort16 __x);
+static simd_short32 SIMD_CFUNC simd_short_sat(simd_ushort32 __x);
+static simd_short2 SIMD_CFUNC simd_short_sat(simd_uint2 __x);
+static simd_short3 SIMD_CFUNC simd_short_sat(simd_uint3 __x);
+static simd_short4 SIMD_CFUNC simd_short_sat(simd_uint4 __x);
+static simd_short8 SIMD_CFUNC simd_short_sat(simd_uint8 __x);
+static simd_short16 SIMD_CFUNC simd_short_sat(simd_uint16 __x);
+static simd_short2 SIMD_CFUNC simd_short_sat(simd_ulong2 __x);
+static simd_short3 SIMD_CFUNC simd_short_sat(simd_ulong3 __x);
+static simd_short4 SIMD_CFUNC simd_short_sat(simd_ulong4 __x);
+static simd_short8 SIMD_CFUNC simd_short_sat(simd_ulong8 __x);
+#define vector_short simd_short
+#define vector_short_sat simd_short_sat
+
+static simd_ushort2 SIMD_CFUNC simd_ushort(simd_char2 __x);
+static simd_ushort3 SIMD_CFUNC simd_ushort(simd_char3 __x);
+static simd_ushort4 SIMD_CFUNC simd_ushort(simd_char4 __x);
+static simd_ushort8 SIMD_CFUNC simd_ushort(simd_char8 __x);
+static simd_ushort16 SIMD_CFUNC simd_ushort(simd_char16 __x);
+static simd_ushort32 SIMD_CFUNC simd_ushort(simd_char32 __x);
+static simd_ushort2 SIMD_CFUNC simd_ushort(simd_uchar2 __x);
+static simd_ushort3 SIMD_CFUNC simd_ushort(simd_uchar3 __x);
+static simd_ushort4 SIMD_CFUNC simd_ushort(simd_uchar4 __x);
+static simd_ushort8 SIMD_CFUNC simd_ushort(simd_uchar8 __x);
+static simd_ushort16 SIMD_CFUNC simd_ushort(simd_uchar16 __x);
+static simd_ushort32 SIMD_CFUNC simd_ushort(simd_uchar32 __x);
+static simd_ushort2 SIMD_CFUNC simd_ushort(simd_short2 __x);
+static simd_ushort3 SIMD_CFUNC simd_ushort(simd_short3 __x);
+static simd_ushort4 SIMD_CFUNC simd_ushort(simd_short4 __x);
+static simd_ushort8 SIMD_CFUNC simd_ushort(simd_short8 __x);
+static simd_ushort16 SIMD_CFUNC simd_ushort(simd_short16 __x);
+static simd_ushort32 SIMD_CFUNC simd_ushort(simd_short32 __x);
+static simd_ushort2 SIMD_CFUNC simd_ushort(simd_ushort2 __x);
+static simd_ushort3 SIMD_CFUNC simd_ushort(simd_ushort3 __x);
+static simd_ushort4 SIMD_CFUNC simd_ushort(simd_ushort4 __x);
+static simd_ushort8 SIMD_CFUNC simd_ushort(simd_ushort8 __x);
+static simd_ushort16 SIMD_CFUNC simd_ushort(simd_ushort16 __x);
+static simd_ushort32 SIMD_CFUNC simd_ushort(simd_ushort32 __x);
+static simd_ushort2 SIMD_CFUNC simd_ushort(simd_int2 __x);
+static simd_ushort3 SIMD_CFUNC simd_ushort(simd_int3 __x);
+static simd_ushort4 SIMD_CFUNC simd_ushort(simd_int4 __x);
+static simd_ushort8 SIMD_CFUNC simd_ushort(simd_int8 __x);
+static simd_ushort16 SIMD_CFUNC simd_ushort(simd_int16 __x);
+static simd_ushort2 SIMD_CFUNC simd_ushort(simd_uint2 __x);
+static simd_ushort3 SIMD_CFUNC simd_ushort(simd_uint3 __x);
+static simd_ushort4 SIMD_CFUNC simd_ushort(simd_uint4 __x);
+static simd_ushort8 SIMD_CFUNC simd_ushort(simd_uint8 __x);
+static simd_ushort16 SIMD_CFUNC simd_ushort(simd_uint16 __x);
+static simd_ushort2 SIMD_CFUNC simd_ushort(simd_float2 __x);
+static simd_ushort3 SIMD_CFUNC simd_ushort(simd_float3 __x);
+static simd_ushort4 SIMD_CFUNC simd_ushort(simd_float4 __x);
+static simd_ushort8 SIMD_CFUNC simd_ushort(simd_float8 __x);
+static simd_ushort16 SIMD_CFUNC simd_ushort(simd_float16 __x);
+static simd_ushort2 SIMD_CFUNC simd_ushort(simd_long2 __x);
+static simd_ushort3 SIMD_CFUNC simd_ushort(simd_long3 __x);
+static simd_ushort4 SIMD_CFUNC simd_ushort(simd_long4 __x);
+static simd_ushort8 SIMD_CFUNC simd_ushort(simd_long8 __x);
+static simd_ushort2 SIMD_CFUNC simd_ushort(simd_ulong2 __x);
+static simd_ushort3 SIMD_CFUNC simd_ushort(simd_ulong3 __x);
+static simd_ushort4 SIMD_CFUNC simd_ushort(simd_ulong4 __x);
+static simd_ushort8 SIMD_CFUNC simd_ushort(simd_ulong8 __x);
+static simd_ushort2 SIMD_CFUNC simd_ushort(simd_double2 __x);
+static simd_ushort3 SIMD_CFUNC simd_ushort(simd_double3 __x);
+static simd_ushort4 SIMD_CFUNC simd_ushort(simd_double4 __x);
+static simd_ushort8 SIMD_CFUNC simd_ushort(simd_double8 __x);
+static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_char2 __x);
+static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_char3 __x);
+static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_char4 __x);
+static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_char8 __x);
+static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_char16 __x);
+static simd_ushort32 SIMD_CFUNC simd_ushort_sat(simd_char32 __x);
+static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_short2 __x);
+static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_short3 __x);
+static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_short4 __x);
+static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_short8 __x);
+static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_short16 __x);
+static simd_ushort32 SIMD_CFUNC simd_ushort_sat(simd_short32 __x);
+static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_int2 __x);
+static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_int3 __x);
+static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_int4 __x);
+static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_int8 __x);
+static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_int16 __x);
+static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_float2 __x);
+static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_float3 __x);
+static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_float4 __x);
+static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_float8 __x);
+static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_float16 __x);
+static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_long2 __x);
+static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_long3 __x);
+static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_long4 __x);
+static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_long8 __x);
+static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_double2 __x);
+static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_double3 __x);
+static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_double4 __x);
+static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_double8 __x);
+static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_uchar2 __x);
+static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_uchar3 __x);
+static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_uchar4 __x);
+static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_uchar8 __x);
+static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_uchar16 __x);
+static simd_ushort32 SIMD_CFUNC simd_ushort_sat(simd_uchar32 __x);
+static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_ushort2 __x);
+static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_ushort3 __x);
+static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_ushort4 __x);
+static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_ushort8 __x);
+static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_ushort16 __x);
+static simd_ushort32 SIMD_CFUNC simd_ushort_sat(simd_ushort32 __x);
+static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_uint2 __x);
+static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_uint3 __x);
+static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_uint4 __x);
+static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_uint8 __x);
+static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_uint16 __x);
+static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_ulong2 __x);
+static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_ulong3 __x);
+static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_ulong4 __x);
+static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_ulong8 __x);
+#define vector_ushort simd_ushort
+#define vector_ushort_sat simd_ushort_sat
+
+static simd_int2 SIMD_CFUNC simd_int(simd_char2 __x);
+static simd_int3 SIMD_CFUNC simd_int(simd_char3 __x);
+static simd_int4 SIMD_CFUNC simd_int(simd_char4 __x);
+static simd_int8 SIMD_CFUNC simd_int(simd_char8 __x);
+static simd_int16 SIMD_CFUNC simd_int(simd_char16 __x);
+static simd_int2 SIMD_CFUNC simd_int(simd_uchar2 __x);
+static simd_int3 SIMD_CFUNC simd_int(simd_uchar3 __x);
+static simd_int4 SIMD_CFUNC simd_int(simd_uchar4 __x);
+static simd_int8 SIMD_CFUNC simd_int(simd_uchar8 __x);
+static simd_int16 SIMD_CFUNC simd_int(simd_uchar16 __x);
+static simd_int2 SIMD_CFUNC simd_int(simd_short2 __x);
+static simd_int3 SIMD_CFUNC simd_int(simd_short3 __x);
+static simd_int4 SIMD_CFUNC simd_int(simd_short4 __x);
+static simd_int8 SIMD_CFUNC simd_int(simd_short8 __x);
+static simd_int16 SIMD_CFUNC simd_int(simd_short16 __x);
+static simd_int2 SIMD_CFUNC simd_int(simd_ushort2 __x);
+static simd_int3 SIMD_CFUNC simd_int(simd_ushort3 __x);
+static simd_int4 SIMD_CFUNC simd_int(simd_ushort4 __x);
+static simd_int8 SIMD_CFUNC simd_int(simd_ushort8 __x);
+static simd_int16 SIMD_CFUNC simd_int(simd_ushort16 __x);
+static simd_int2 SIMD_CFUNC simd_int(simd_int2 __x);
+static simd_int3 SIMD_CFUNC simd_int(simd_int3 __x);
+static simd_int4 SIMD_CFUNC simd_int(simd_int4 __x);
+static simd_int8 SIMD_CFUNC simd_int(simd_int8 __x);
+static simd_int16 SIMD_CFUNC simd_int(simd_int16 __x);
+static simd_int2 SIMD_CFUNC simd_int(simd_uint2 __x);
+static simd_int3 SIMD_CFUNC simd_int(simd_uint3 __x);
+static simd_int4 SIMD_CFUNC simd_int(simd_uint4 __x);
+static simd_int8 SIMD_CFUNC simd_int(simd_uint8 __x);
+static simd_int16 SIMD_CFUNC simd_int(simd_uint16 __x);
+static simd_int2 SIMD_CFUNC simd_int(simd_float2 __x);
+static simd_int3 SIMD_CFUNC simd_int(simd_float3 __x);
+static simd_int4 SIMD_CFUNC simd_int(simd_float4 __x);
+static simd_int8 SIMD_CFUNC simd_int(simd_float8 __x);
+static simd_int16 SIMD_CFUNC simd_int(simd_float16 __x);
+static simd_int2 SIMD_CFUNC simd_int(simd_long2 __x);
+static simd_int3 SIMD_CFUNC simd_int(simd_long3 __x);
+static simd_int4 SIMD_CFUNC simd_int(simd_long4 __x);
+static simd_int8 SIMD_CFUNC simd_int(simd_long8 __x);
+static simd_int2 SIMD_CFUNC simd_int(simd_ulong2 __x);
+static simd_int3 SIMD_CFUNC simd_int(simd_ulong3 __x);
+static simd_int4 SIMD_CFUNC simd_int(simd_ulong4 __x);
+static simd_int8 SIMD_CFUNC simd_int(simd_ulong8 __x);
+static simd_int2 SIMD_CFUNC simd_int(simd_double2 __x);
+static simd_int3 SIMD_CFUNC simd_int(simd_double3 __x);
+static simd_int4 SIMD_CFUNC simd_int(simd_double4 __x);
+static simd_int8 SIMD_CFUNC simd_int(simd_double8 __x);
+static simd_int2 SIMD_CFUNC simd_int_sat(simd_char2 __x);
+static simd_int3 SIMD_CFUNC simd_int_sat(simd_char3 __x);
+static simd_int4 SIMD_CFUNC simd_int_sat(simd_char4 __x);
+static simd_int8 SIMD_CFUNC simd_int_sat(simd_char8 __x);
+static simd_int16 SIMD_CFUNC simd_int_sat(simd_char16 __x);
+static simd_int2 SIMD_CFUNC simd_int_sat(simd_short2 __x);
+static simd_int3 SIMD_CFUNC simd_int_sat(simd_short3 __x);
+static simd_int4 SIMD_CFUNC simd_int_sat(simd_short4 __x);
+static simd_int8 SIMD_CFUNC simd_int_sat(simd_short8 __x);
+static simd_int16 SIMD_CFUNC simd_int_sat(simd_short16 __x);
+static simd_int2 SIMD_CFUNC simd_int_sat(simd_int2 __x);
+static simd_int3 SIMD_CFUNC simd_int_sat(simd_int3 __x);
+static simd_int4 SIMD_CFUNC simd_int_sat(simd_int4 __x);
+static simd_int8 SIMD_CFUNC simd_int_sat(simd_int8 __x);
+static simd_int16 SIMD_CFUNC simd_int_sat(simd_int16 __x);
+static simd_int2 SIMD_CFUNC simd_int_sat(simd_float2 __x);
+static simd_int3 SIMD_CFUNC simd_int_sat(simd_float3 __x);
+static simd_int4 SIMD_CFUNC simd_int_sat(simd_float4 __x);
+static simd_int8 SIMD_CFUNC simd_int_sat(simd_float8 __x);
+static simd_int16 SIMD_CFUNC simd_int_sat(simd_float16 __x);
+static simd_int2 SIMD_CFUNC simd_int_sat(simd_long2 __x);
+static simd_int3 SIMD_CFUNC simd_int_sat(simd_long3 __x);
+static simd_int4 SIMD_CFUNC simd_int_sat(simd_long4 __x);
+static simd_int8 SIMD_CFUNC simd_int_sat(simd_long8 __x);
+static simd_int2 SIMD_CFUNC simd_int_sat(simd_double2 __x);
+static simd_int3 SIMD_CFUNC simd_int_sat(simd_double3 __x);
+static simd_int4 SIMD_CFUNC simd_int_sat(simd_double4 __x);
+static simd_int8 SIMD_CFUNC simd_int_sat(simd_double8 __x);
+static simd_int2 SIMD_CFUNC simd_int_sat(simd_uchar2 __x);
+static simd_int3 SIMD_CFUNC simd_int_sat(simd_uchar3 __x);
+static simd_int4 SIMD_CFUNC simd_int_sat(simd_uchar4 __x);
+static simd_int8 SIMD_CFUNC simd_int_sat(simd_uchar8 __x);
+static simd_int16 SIMD_CFUNC simd_int_sat(simd_uchar16 __x);
+static simd_int2 SIMD_CFUNC simd_int_sat(simd_ushort2 __x);
+static simd_int3 SIMD_CFUNC simd_int_sat(simd_ushort3 __x);
+static simd_int4 SIMD_CFUNC simd_int_sat(simd_ushort4 __x);
+static simd_int8 SIMD_CFUNC simd_int_sat(simd_ushort8 __x);
+static simd_int16 SIMD_CFUNC simd_int_sat(simd_ushort16 __x);
+static simd_int2 SIMD_CFUNC simd_int_sat(simd_uint2 __x);
+static simd_int3 SIMD_CFUNC simd_int_sat(simd_uint3 __x);
+static simd_int4 SIMD_CFUNC simd_int_sat(simd_uint4 __x);
+static simd_int8 SIMD_CFUNC simd_int_sat(simd_uint8 __x);
+static simd_int16 SIMD_CFUNC simd_int_sat(simd_uint16 __x);
+static simd_int2 SIMD_CFUNC simd_int_sat(simd_ulong2 __x);
+static simd_int3 SIMD_CFUNC simd_int_sat(simd_ulong3 __x);
+static simd_int4 SIMD_CFUNC simd_int_sat(simd_ulong4 __x);
+static simd_int8 SIMD_CFUNC simd_int_sat(simd_ulong8 __x);
+static simd_int2 SIMD_CFUNC simd_int_rte(simd_float2 __x);
+static simd_int3 SIMD_CFUNC simd_int_rte(simd_float3 __x);
+static simd_int4 SIMD_CFUNC simd_int_rte(simd_float4 __x);
+static simd_int8 SIMD_CFUNC simd_int_rte(simd_float8 __x);
+static simd_int16 SIMD_CFUNC simd_int_rte(simd_float16 __x);
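+/* The _rte variants convert by rounding to nearest, ties to even (as the
+ * suffix indicates), rather than truncating toward zero the way simd_int(x)
+ * does for a C-style cast; simd_long_rte below is the double-precision
+ * analogue. */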
+#define vector_int simd_int
+#define vector_int_sat simd_int_sat
+
+static simd_uint2 SIMD_CFUNC simd_uint(simd_char2 __x);
+static simd_uint3 SIMD_CFUNC simd_uint(simd_char3 __x);
+static simd_uint4 SIMD_CFUNC simd_uint(simd_char4 __x);
+static simd_uint8 SIMD_CFUNC simd_uint(simd_char8 __x);
+static simd_uint16 SIMD_CFUNC simd_uint(simd_char16 __x);
+static simd_uint2 SIMD_CFUNC simd_uint(simd_uchar2 __x);
+static simd_uint3 SIMD_CFUNC simd_uint(simd_uchar3 __x);
+static simd_uint4 SIMD_CFUNC simd_uint(simd_uchar4 __x);
+static simd_uint8 SIMD_CFUNC simd_uint(simd_uchar8 __x);
+static simd_uint16 SIMD_CFUNC simd_uint(simd_uchar16 __x);
+static simd_uint2 SIMD_CFUNC simd_uint(simd_short2 __x);
+static simd_uint3 SIMD_CFUNC simd_uint(simd_short3 __x);
+static simd_uint4 SIMD_CFUNC simd_uint(simd_short4 __x);
+static simd_uint8 SIMD_CFUNC simd_uint(simd_short8 __x);
+static simd_uint16 SIMD_CFUNC simd_uint(simd_short16 __x);
+static simd_uint2 SIMD_CFUNC simd_uint(simd_ushort2 __x);
+static simd_uint3 SIMD_CFUNC simd_uint(simd_ushort3 __x);
+static simd_uint4 SIMD_CFUNC simd_uint(simd_ushort4 __x);
+static simd_uint8 SIMD_CFUNC simd_uint(simd_ushort8 __x);
+static simd_uint16 SIMD_CFUNC simd_uint(simd_ushort16 __x);
+static simd_uint2 SIMD_CFUNC simd_uint(simd_int2 __x);
+static simd_uint3 SIMD_CFUNC simd_uint(simd_int3 __x);
+static simd_uint4 SIMD_CFUNC simd_uint(simd_int4 __x);
+static simd_uint8 SIMD_CFUNC simd_uint(simd_int8 __x);
+static simd_uint16 SIMD_CFUNC simd_uint(simd_int16 __x);
+static simd_uint2 SIMD_CFUNC simd_uint(simd_uint2 __x);
+static simd_uint3 SIMD_CFUNC simd_uint(simd_uint3 __x);
+static simd_uint4 SIMD_CFUNC simd_uint(simd_uint4 __x);
+static simd_uint8 SIMD_CFUNC simd_uint(simd_uint8 __x);
+static simd_uint16 SIMD_CFUNC simd_uint(simd_uint16 __x);
+static simd_uint2 SIMD_CFUNC simd_uint(simd_float2 __x);
+static simd_uint3 SIMD_CFUNC simd_uint(simd_float3 __x);
+static simd_uint4 SIMD_CFUNC simd_uint(simd_float4 __x);
+static simd_uint8 SIMD_CFUNC simd_uint(simd_float8 __x);
+static simd_uint16 SIMD_CFUNC simd_uint(simd_float16 __x);
+static simd_uint2 SIMD_CFUNC simd_uint(simd_long2 __x);
+static simd_uint3 SIMD_CFUNC simd_uint(simd_long3 __x);
+static simd_uint4 SIMD_CFUNC simd_uint(simd_long4 __x);
+static simd_uint8 SIMD_CFUNC simd_uint(simd_long8 __x);
+static simd_uint2 SIMD_CFUNC simd_uint(simd_ulong2 __x);
+static simd_uint3 SIMD_CFUNC simd_uint(simd_ulong3 __x);
+static simd_uint4 SIMD_CFUNC simd_uint(simd_ulong4 __x);
+static simd_uint8 SIMD_CFUNC simd_uint(simd_ulong8 __x);
+static simd_uint2 SIMD_CFUNC simd_uint(simd_double2 __x);
+static simd_uint3 SIMD_CFUNC simd_uint(simd_double3 __x);
+static simd_uint4 SIMD_CFUNC simd_uint(simd_double4 __x);
+static simd_uint8 SIMD_CFUNC simd_uint(simd_double8 __x);
+static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_char2 __x);
+static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_char3 __x);
+static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_char4 __x);
+static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_char8 __x);
+static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_char16 __x);
+static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_short2 __x);
+static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_short3 __x);
+static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_short4 __x);
+static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_short8 __x);
+static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_short16 __x);
+static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_int2 __x);
+static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_int3 __x);
+static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_int4 __x);
+static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_int8 __x);
+static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_int16 __x);
+static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_float2 __x);
+static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_float3 __x);
+static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_float4 __x);
+static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_float8 __x);
+static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_float16 __x);
+static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_long2 __x);
+static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_long3 __x);
+static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_long4 __x);
+static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_long8 __x);
+static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_double2 __x);
+static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_double3 __x);
+static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_double4 __x);
+static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_double8 __x);
+static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_uchar2 __x);
+static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_uchar3 __x);
+static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_uchar4 __x);
+static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_uchar8 __x);
+static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_uchar16 __x);
+static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_ushort2 __x);
+static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_ushort3 __x);
+static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_ushort4 __x);
+static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_ushort8 __x);
+static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_ushort16 __x);
+static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_uint2 __x);
+static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_uint3 __x);
+static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_uint4 __x);
+static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_uint8 __x);
+static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_uint16 __x);
+static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_ulong2 __x);
+static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_ulong3 __x);
+static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_ulong4 __x);
+static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_ulong8 __x);
+#define vector_uint simd_uint
+#define vector_uint_sat simd_uint_sat
+
+static simd_float2 SIMD_CFUNC simd_float(simd_char2 __x);
+static simd_float3 SIMD_CFUNC simd_float(simd_char3 __x);
+static simd_float4 SIMD_CFUNC simd_float(simd_char4 __x);
+static simd_float8 SIMD_CFUNC simd_float(simd_char8 __x);
+static simd_float16 SIMD_CFUNC simd_float(simd_char16 __x);
+static simd_float2 SIMD_CFUNC simd_float(simd_uchar2 __x);
+static simd_float3 SIMD_CFUNC simd_float(simd_uchar3 __x);
+static simd_float4 SIMD_CFUNC simd_float(simd_uchar4 __x);
+static simd_float8 SIMD_CFUNC simd_float(simd_uchar8 __x);
+static simd_float16 SIMD_CFUNC simd_float(simd_uchar16 __x);
+static simd_float2 SIMD_CFUNC simd_float(simd_short2 __x);
+static simd_float3 SIMD_CFUNC simd_float(simd_short3 __x);
+static simd_float4 SIMD_CFUNC simd_float(simd_short4 __x);
+static simd_float8 SIMD_CFUNC simd_float(simd_short8 __x);
+static simd_float16 SIMD_CFUNC simd_float(simd_short16 __x);
+static simd_float2 SIMD_CFUNC simd_float(simd_ushort2 __x);
+static simd_float3 SIMD_CFUNC simd_float(simd_ushort3 __x);
+static simd_float4 SIMD_CFUNC simd_float(simd_ushort4 __x);
+static simd_float8 SIMD_CFUNC simd_float(simd_ushort8 __x);
+static simd_float16 SIMD_CFUNC simd_float(simd_ushort16 __x);
+static simd_float2 SIMD_CFUNC simd_float(simd_int2 __x);
+static simd_float3 SIMD_CFUNC simd_float(simd_int3 __x);
+static simd_float4 SIMD_CFUNC simd_float(simd_int4 __x);
+static simd_float8 SIMD_CFUNC simd_float(simd_int8 __x);
+static simd_float16 SIMD_CFUNC simd_float(simd_int16 __x);
+static simd_float2 SIMD_CFUNC simd_float(simd_uint2 __x);
+static simd_float3 SIMD_CFUNC simd_float(simd_uint3 __x);
+static simd_float4 SIMD_CFUNC simd_float(simd_uint4 __x);
+static simd_float8 SIMD_CFUNC simd_float(simd_uint8 __x);
+static simd_float16 SIMD_CFUNC simd_float(simd_uint16 __x);
+static simd_float2 SIMD_CFUNC simd_float(simd_float2 __x);
+static simd_float3 SIMD_CFUNC simd_float(simd_float3 __x);
+static simd_float4 SIMD_CFUNC simd_float(simd_float4 __x);
+static simd_float8 SIMD_CFUNC simd_float(simd_float8 __x);
+static simd_float16 SIMD_CFUNC simd_float(simd_float16 __x);
+static simd_float2 SIMD_CFUNC simd_float(simd_long2 __x);
+static simd_float3 SIMD_CFUNC simd_float(simd_long3 __x);
+static simd_float4 SIMD_CFUNC simd_float(simd_long4 __x);
+static simd_float8 SIMD_CFUNC simd_float(simd_long8 __x);
+static simd_float2 SIMD_CFUNC simd_float(simd_ulong2 __x);
+static simd_float3 SIMD_CFUNC simd_float(simd_ulong3 __x);
+static simd_float4 SIMD_CFUNC simd_float(simd_ulong4 __x);
+static simd_float8 SIMD_CFUNC simd_float(simd_ulong8 __x);
+static simd_float2 SIMD_CFUNC simd_float(simd_double2 __x);
+static simd_float3 SIMD_CFUNC simd_float(simd_double3 __x);
+static simd_float4 SIMD_CFUNC simd_float(simd_double4 __x);
+static simd_float8 SIMD_CFUNC simd_float(simd_double8 __x);
+#define vector_float simd_float
+
+static simd_long2 SIMD_CFUNC simd_long(simd_char2 __x);
+static simd_long3 SIMD_CFUNC simd_long(simd_char3 __x);
+static simd_long4 SIMD_CFUNC simd_long(simd_char4 __x);
+static simd_long8 SIMD_CFUNC simd_long(simd_char8 __x);
+static simd_long2 SIMD_CFUNC simd_long(simd_uchar2 __x);
+static simd_long3 SIMD_CFUNC simd_long(simd_uchar3 __x);
+static simd_long4 SIMD_CFUNC simd_long(simd_uchar4 __x);
+static simd_long8 SIMD_CFUNC simd_long(simd_uchar8 __x);
+static simd_long2 SIMD_CFUNC simd_long(simd_short2 __x);
+static simd_long3 SIMD_CFUNC simd_long(simd_short3 __x);
+static simd_long4 SIMD_CFUNC simd_long(simd_short4 __x);
+static simd_long8 SIMD_CFUNC simd_long(simd_short8 __x);
+static simd_long2 SIMD_CFUNC simd_long(simd_ushort2 __x);
+static simd_long3 SIMD_CFUNC simd_long(simd_ushort3 __x);
+static simd_long4 SIMD_CFUNC simd_long(simd_ushort4 __x);
+static simd_long8 SIMD_CFUNC simd_long(simd_ushort8 __x);
+static simd_long2 SIMD_CFUNC simd_long(simd_int2 __x);
+static simd_long3 SIMD_CFUNC simd_long(simd_int3 __x);
+static simd_long4 SIMD_CFUNC simd_long(simd_int4 __x);
+static simd_long8 SIMD_CFUNC simd_long(simd_int8 __x);
+static simd_long2 SIMD_CFUNC simd_long(simd_uint2 __x);
+static simd_long3 SIMD_CFUNC simd_long(simd_uint3 __x);
+static simd_long4 SIMD_CFUNC simd_long(simd_uint4 __x);
+static simd_long8 SIMD_CFUNC simd_long(simd_uint8 __x);
+static simd_long2 SIMD_CFUNC simd_long(simd_float2 __x);
+static simd_long3 SIMD_CFUNC simd_long(simd_float3 __x);
+static simd_long4 SIMD_CFUNC simd_long(simd_float4 __x);
+static simd_long8 SIMD_CFUNC simd_long(simd_float8 __x);
+static simd_long2 SIMD_CFUNC simd_long(simd_long2 __x);
+static simd_long3 SIMD_CFUNC simd_long(simd_long3 __x);
+static simd_long4 SIMD_CFUNC simd_long(simd_long4 __x);
+static simd_long8 SIMD_CFUNC simd_long(simd_long8 __x);
+static simd_long2 SIMD_CFUNC simd_long(simd_ulong2 __x);
+static simd_long3 SIMD_CFUNC simd_long(simd_ulong3 __x);
+static simd_long4 SIMD_CFUNC simd_long(simd_ulong4 __x);
+static simd_long8 SIMD_CFUNC simd_long(simd_ulong8 __x);
+static simd_long2 SIMD_CFUNC simd_long(simd_double2 __x);
+static simd_long3 SIMD_CFUNC simd_long(simd_double3 __x);
+static simd_long4 SIMD_CFUNC simd_long(simd_double4 __x);
+static simd_long8 SIMD_CFUNC simd_long(simd_double8 __x);
+static simd_long2 SIMD_CFUNC simd_long_sat(simd_char2 __x);
+static simd_long3 SIMD_CFUNC simd_long_sat(simd_char3 __x);
+static simd_long4 SIMD_CFUNC simd_long_sat(simd_char4 __x);
+static simd_long8 SIMD_CFUNC simd_long_sat(simd_char8 __x);
+static simd_long2 SIMD_CFUNC simd_long_sat(simd_short2 __x);
+static simd_long3 SIMD_CFUNC simd_long_sat(simd_short3 __x);
+static simd_long4 SIMD_CFUNC simd_long_sat(simd_short4 __x);
+static simd_long8 SIMD_CFUNC simd_long_sat(simd_short8 __x);
+static simd_long2 SIMD_CFUNC simd_long_sat(simd_int2 __x);
+static simd_long3 SIMD_CFUNC simd_long_sat(simd_int3 __x);
+static simd_long4 SIMD_CFUNC simd_long_sat(simd_int4 __x);
+static simd_long8 SIMD_CFUNC simd_long_sat(simd_int8 __x);
+static simd_long2 SIMD_CFUNC simd_long_sat(simd_float2 __x);
+static simd_long3 SIMD_CFUNC simd_long_sat(simd_float3 __x);
+static simd_long4 SIMD_CFUNC simd_long_sat(simd_float4 __x);
+static simd_long8 SIMD_CFUNC simd_long_sat(simd_float8 __x);
+static simd_long2 SIMD_CFUNC simd_long_sat(simd_long2 __x);
+static simd_long3 SIMD_CFUNC simd_long_sat(simd_long3 __x);
+static simd_long4 SIMD_CFUNC simd_long_sat(simd_long4 __x);
+static simd_long8 SIMD_CFUNC simd_long_sat(simd_long8 __x);
+static simd_long2 SIMD_CFUNC simd_long_sat(simd_double2 __x);
+static simd_long3 SIMD_CFUNC simd_long_sat(simd_double3 __x);
+static simd_long4 SIMD_CFUNC simd_long_sat(simd_double4 __x);
+static simd_long8 SIMD_CFUNC simd_long_sat(simd_double8 __x);
+static simd_long2 SIMD_CFUNC simd_long_sat(simd_uchar2 __x);
+static simd_long3 SIMD_CFUNC simd_long_sat(simd_uchar3 __x);
+static simd_long4 SIMD_CFUNC simd_long_sat(simd_uchar4 __x);
+static simd_long8 SIMD_CFUNC simd_long_sat(simd_uchar8 __x);
+static simd_long2 SIMD_CFUNC simd_long_sat(simd_ushort2 __x);
+static simd_long3 SIMD_CFUNC simd_long_sat(simd_ushort3 __x);
+static simd_long4 SIMD_CFUNC simd_long_sat(simd_ushort4 __x);
+static simd_long8 SIMD_CFUNC simd_long_sat(simd_ushort8 __x);
+static simd_long2 SIMD_CFUNC simd_long_sat(simd_uint2 __x);
+static simd_long3 SIMD_CFUNC simd_long_sat(simd_uint3 __x);
+static simd_long4 SIMD_CFUNC simd_long_sat(simd_uint4 __x);
+static simd_long8 SIMD_CFUNC simd_long_sat(simd_uint8 __x);
+static simd_long2 SIMD_CFUNC simd_long_sat(simd_ulong2 __x);
+static simd_long3 SIMD_CFUNC simd_long_sat(simd_ulong3 __x);
+static simd_long4 SIMD_CFUNC simd_long_sat(simd_ulong4 __x);
+static simd_long8 SIMD_CFUNC simd_long_sat(simd_ulong8 __x);
+static simd_long2 SIMD_CFUNC simd_long_rte(simd_double2 __x);
+static simd_long3 SIMD_CFUNC simd_long_rte(simd_double3 __x);
+static simd_long4 SIMD_CFUNC simd_long_rte(simd_double4 __x);
+static simd_long8 SIMD_CFUNC simd_long_rte(simd_double8 __x);
+#define vector_long simd_long
+#define vector_long_sat simd_long_sat
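+
+/* Editor's note, not part of the upstream header: an illustrative sketch of
+ * the three conversion flavors declared above. simd_long truncates toward
+ * zero like an ordinary C conversion (undefined for out-of-range input),
+ * simd_long_sat clamps to the representable range, and simd_long_rte
+ * (presumably "round to nearest even") rounds instead of truncating:
+ *
+ *     simd_double2 d = { 2.7, -2.7 };
+ *     simd_long2 a = simd_long(d);       // truncates: {  2, -2 }
+ *     simd_long2 b = simd_long_rte(d);   // rounds:    {  3, -3 }
+ */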
+
+static simd_ulong2 SIMD_CFUNC simd_ulong(simd_char2 __x);
+static simd_ulong3 SIMD_CFUNC simd_ulong(simd_char3 __x);
+static simd_ulong4 SIMD_CFUNC simd_ulong(simd_char4 __x);
+static simd_ulong8 SIMD_CFUNC simd_ulong(simd_char8 __x);
+static simd_ulong2 SIMD_CFUNC simd_ulong(simd_uchar2 __x);
+static simd_ulong3 SIMD_CFUNC simd_ulong(simd_uchar3 __x);
+static simd_ulong4 SIMD_CFUNC simd_ulong(simd_uchar4 __x);
+static simd_ulong8 SIMD_CFUNC simd_ulong(simd_uchar8 __x);
+static simd_ulong2 SIMD_CFUNC simd_ulong(simd_short2 __x);
+static simd_ulong3 SIMD_CFUNC simd_ulong(simd_short3 __x);
+static simd_ulong4 SIMD_CFUNC simd_ulong(simd_short4 __x);
+static simd_ulong8 SIMD_CFUNC simd_ulong(simd_short8 __x);
+static simd_ulong2 SIMD_CFUNC simd_ulong(simd_ushort2 __x);
+static simd_ulong3 SIMD_CFUNC simd_ulong(simd_ushort3 __x);
+static simd_ulong4 SIMD_CFUNC simd_ulong(simd_ushort4 __x);
+static simd_ulong8 SIMD_CFUNC simd_ulong(simd_ushort8 __x);
+static simd_ulong2 SIMD_CFUNC simd_ulong(simd_int2 __x);
+static simd_ulong3 SIMD_CFUNC simd_ulong(simd_int3 __x);
+static simd_ulong4 SIMD_CFUNC simd_ulong(simd_int4 __x);
+static simd_ulong8 SIMD_CFUNC simd_ulong(simd_int8 __x);
+static simd_ulong2 SIMD_CFUNC simd_ulong(simd_uint2 __x);
+static simd_ulong3 SIMD_CFUNC simd_ulong(simd_uint3 __x);
+static simd_ulong4 SIMD_CFUNC simd_ulong(simd_uint4 __x);
+static simd_ulong8 SIMD_CFUNC simd_ulong(simd_uint8 __x);
+static simd_ulong2 SIMD_CFUNC simd_ulong(simd_float2 __x);
+static simd_ulong3 SIMD_CFUNC simd_ulong(simd_float3 __x);
+static simd_ulong4 SIMD_CFUNC simd_ulong(simd_float4 __x);
+static simd_ulong8 SIMD_CFUNC simd_ulong(simd_float8 __x);
+static simd_ulong2 SIMD_CFUNC simd_ulong(simd_long2 __x);
+static simd_ulong3 SIMD_CFUNC simd_ulong(simd_long3 __x);
+static simd_ulong4 SIMD_CFUNC simd_ulong(simd_long4 __x);
+static simd_ulong8 SIMD_CFUNC simd_ulong(simd_long8 __x);
+static simd_ulong2 SIMD_CFUNC simd_ulong(simd_ulong2 __x);
+static simd_ulong3 SIMD_CFUNC simd_ulong(simd_ulong3 __x);
+static simd_ulong4 SIMD_CFUNC simd_ulong(simd_ulong4 __x);
+static simd_ulong8 SIMD_CFUNC simd_ulong(simd_ulong8 __x);
+static simd_ulong2 SIMD_CFUNC simd_ulong(simd_double2 __x);
+static simd_ulong3 SIMD_CFUNC simd_ulong(simd_double3 __x);
+static simd_ulong4 SIMD_CFUNC simd_ulong(simd_double4 __x);
+static simd_ulong8 SIMD_CFUNC simd_ulong(simd_double8 __x);
+static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_char2 __x);
+static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_char3 __x);
+static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_char4 __x);
+static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_char8 __x);
+static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_short2 __x);
+static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_short3 __x);
+static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_short4 __x);
+static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_short8 __x);
+static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_int2 __x);
+static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_int3 __x);
+static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_int4 __x);
+static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_int8 __x);
+static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_float2 __x);
+static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_float3 __x);
+static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_float4 __x);
+static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_float8 __x);
+static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_long2 __x);
+static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_long3 __x);
+static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_long4 __x);
+static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_long8 __x);
+static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_double2 __x);
+static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_double3 __x);
+static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_double4 __x);
+static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_double8 __x);
+static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_uchar2 __x);
+static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_uchar3 __x);
+static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_uchar4 __x);
+static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_uchar8 __x);
+static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_ushort2 __x);
+static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_ushort3 __x);
+static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_ushort4 __x);
+static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_ushort8 __x);
+static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_uint2 __x);
+static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_uint3 __x);
+static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_uint4 __x);
+static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_uint8 __x);
+static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_ulong2 __x);
+static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_ulong3 __x);
+static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_ulong4 __x);
+static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_ulong8 __x);
+#define vector_ulong simd_ulong
+#define vector_ulong_sat simd_ulong_sat
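+
+/* Editor's note, not part of the upstream header: unlike simd_ulong, which
+ * converts negative lanes modulo 2^64 as in C, simd_ulong_sat clamps them
+ * to zero:
+ *
+ *     simd_int2 i = { -1, 42 };
+ *     simd_ulong2 w = simd_ulong(i);      // wraps:  { 0xffffffffffffffff, 42 }
+ *     simd_ulong2 s = simd_ulong_sat(i);  // clamps: { 0, 42 }
+ */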
+
+static simd_double2 SIMD_CFUNC simd_double(simd_char2 __x);
+static simd_double3 SIMD_CFUNC simd_double(simd_char3 __x);
+static simd_double4 SIMD_CFUNC simd_double(simd_char4 __x);
+static simd_double8 SIMD_CFUNC simd_double(simd_char8 __x);
+static simd_double2 SIMD_CFUNC simd_double(simd_uchar2 __x);
+static simd_double3 SIMD_CFUNC simd_double(simd_uchar3 __x);
+static simd_double4 SIMD_CFUNC simd_double(simd_uchar4 __x);
+static simd_double8 SIMD_CFUNC simd_double(simd_uchar8 __x);
+static simd_double2 SIMD_CFUNC simd_double(simd_short2 __x);
+static simd_double3 SIMD_CFUNC simd_double(simd_short3 __x);
+static simd_double4 SIMD_CFUNC simd_double(simd_short4 __x);
+static simd_double8 SIMD_CFUNC simd_double(simd_short8 __x);
+static simd_double2 SIMD_CFUNC simd_double(simd_ushort2 __x);
+static simd_double3 SIMD_CFUNC simd_double(simd_ushort3 __x);
+static simd_double4 SIMD_CFUNC simd_double(simd_ushort4 __x);
+static simd_double8 SIMD_CFUNC simd_double(simd_ushort8 __x);
+static simd_double2 SIMD_CFUNC simd_double(simd_int2 __x);
+static simd_double3 SIMD_CFUNC simd_double(simd_int3 __x);
+static simd_double4 SIMD_CFUNC simd_double(simd_int4 __x);
+static simd_double8 SIMD_CFUNC simd_double(simd_int8 __x);
+static simd_double2 SIMD_CFUNC simd_double(simd_uint2 __x);
+static simd_double3 SIMD_CFUNC simd_double(simd_uint3 __x);
+static simd_double4 SIMD_CFUNC simd_double(simd_uint4 __x);
+static simd_double8 SIMD_CFUNC simd_double(simd_uint8 __x);
+static simd_double2 SIMD_CFUNC simd_double(simd_float2 __x);
+static simd_double3 SIMD_CFUNC simd_double(simd_float3 __x);
+static simd_double4 SIMD_CFUNC simd_double(simd_float4 __x);
+static simd_double8 SIMD_CFUNC simd_double(simd_float8 __x);
+static simd_double2 SIMD_CFUNC simd_double(simd_long2 __x);
+static simd_double3 SIMD_CFUNC simd_double(simd_long3 __x);
+static simd_double4 SIMD_CFUNC simd_double(simd_long4 __x);
+static simd_double8 SIMD_CFUNC simd_double(simd_long8 __x);
+static simd_double2 SIMD_CFUNC simd_double(simd_ulong2 __x);
+static simd_double3 SIMD_CFUNC simd_double(simd_ulong3 __x);
+static simd_double4 SIMD_CFUNC simd_double(simd_ulong4 __x);
+static simd_double8 SIMD_CFUNC simd_double(simd_ulong8 __x);
+static simd_double2 SIMD_CFUNC simd_double(simd_double2 __x);
+static simd_double3 SIMD_CFUNC simd_double(simd_double3 __x);
+static simd_double4 SIMD_CFUNC simd_double(simd_double4 __x);
+static simd_double8 SIMD_CFUNC simd_double(simd_double8 __x);
+#define vector_double simd_double
+
+static simd_char2 SIMD_CFUNC vector2(char __x, char __y) { return ( simd_char2){__x, __y}; }
+static simd_uchar2 SIMD_CFUNC vector2(unsigned char __x, unsigned char __y) { return ( simd_uchar2){__x, __y}; }
+static simd_short2 SIMD_CFUNC vector2(short __x, short __y) { return ( simd_short2){__x, __y}; }
+static simd_ushort2 SIMD_CFUNC vector2(unsigned short __x, unsigned short __y) { return (simd_ushort2){__x, __y}; }
+static simd_int2 SIMD_CFUNC vector2(int __x, int __y) { return ( simd_int2){__x, __y}; }
+static simd_uint2 SIMD_CFUNC vector2(unsigned int __x, unsigned int __y) { return ( simd_uint2){__x, __y}; }
+static simd_float2 SIMD_CFUNC vector2(float __x, float __y) { return ( simd_float2){__x, __y}; }
+static simd_long2 SIMD_CFUNC vector2(simd_long1 __x, simd_long1 __y) { return ( simd_long2){__x, __y}; }
+static simd_ulong2 SIMD_CFUNC vector2(simd_ulong1 __x, simd_ulong1 __y) { return ( simd_ulong2){__x, __y}; }
+static simd_double2 SIMD_CFUNC vector2(double __x, double __y) { return (simd_double2){__x, __y}; }
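+
+/* Editor's note, not part of the upstream header: SIMD_CFUNC presumably
+ * carries clang's overloadable attribute, so the right constructor above is
+ * selected from the argument types:
+ *
+ *     simd_float2  p = vector2(1.0f, 2.0f);   // simd_float2 overload
+ *     simd_double2 q = vector2(3.0, 4.0);     // simd_double2 overload
+ */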
+
+static simd_char3 SIMD_CFUNC vector3(char __x, char __y, char __z) { return ( simd_char3){__x, __y, __z}; }
+static simd_uchar3 SIMD_CFUNC vector3(unsigned char __x, unsigned char __y, unsigned char __z) { return ( simd_uchar3){__x, __y, __z}; }
+static simd_short3 SIMD_CFUNC vector3(short __x, short __y, short __z) { return ( simd_short3){__x, __y, __z}; }
+static simd_ushort3 SIMD_CFUNC vector3(unsigned short __x, unsigned short __y, unsigned short __z) { return (simd_ushort3){__x, __y, __z}; }
+static simd_int3 SIMD_CFUNC vector3(int __x, int __y, int __z) { return ( simd_int3){__x, __y, __z}; }
+static simd_uint3 SIMD_CFUNC vector3(unsigned int __x, unsigned int __y, unsigned int __z) { return ( simd_uint3){__x, __y, __z}; }
+static simd_float3 SIMD_CFUNC vector3(float __x, float __y, float __z) { return ( simd_float3){__x, __y, __z}; }
+static simd_long3 SIMD_CFUNC vector3(simd_long1 __x, simd_long1 __y, simd_long1 __z) { return ( simd_long3){__x, __y, __z}; }
+static simd_ulong3 SIMD_CFUNC vector3(simd_ulong1 __x, simd_ulong1 __y, simd_ulong1 __z) { return ( simd_ulong3){__x, __y, __z}; }
+static simd_double3 SIMD_CFUNC vector3(double __x, double __y, double __z) { return (simd_double3){__x, __y, __z}; }
+
+static simd_char3 SIMD_CFUNC vector3(simd_char2 __xy, char __z) { simd_char3 __r; __r.xy = __xy; __r.z = __z; return __r; }
+static simd_uchar3 SIMD_CFUNC vector3(simd_uchar2 __xy, unsigned char __z) { simd_uchar3 __r; __r.xy = __xy; __r.z = __z; return __r; }
+static simd_short3 SIMD_CFUNC vector3(simd_short2 __xy, short __z) { simd_short3 __r; __r.xy = __xy; __r.z = __z; return __r; }
+static simd_ushort3 SIMD_CFUNC vector3(simd_ushort2 __xy, unsigned short __z) { simd_ushort3 __r; __r.xy = __xy; __r.z = __z; return __r; }
+static simd_int3 SIMD_CFUNC vector3(simd_int2 __xy, int __z) { simd_int3 __r; __r.xy = __xy; __r.z = __z; return __r; }
+static simd_uint3 SIMD_CFUNC vector3(simd_uint2 __xy, unsigned int __z) { simd_uint3 __r; __r.xy = __xy; __r.z = __z; return __r; }
+static simd_float3 SIMD_CFUNC vector3(simd_float2 __xy, float __z) { simd_float3 __r; __r.xy = __xy; __r.z = __z; return __r; }
+static simd_long3 SIMD_CFUNC vector3(simd_long2 __xy, simd_long1 __z) { simd_long3 __r; __r.xy = __xy; __r.z = __z; return __r; }
+static simd_ulong3 SIMD_CFUNC vector3(simd_ulong2 __xy, simd_ulong1 __z) { simd_ulong3 __r; __r.xy = __xy; __r.z = __z; return __r; }
+static simd_double3 SIMD_CFUNC vector3(simd_double2 __xy, double __z) { simd_double3 __r; __r.xy = __xy; __r.z = __z; return __r; }
+
+static simd_char4 SIMD_CFUNC vector4(char __x, char __y, char __z, char __w) { return ( simd_char4){__x, __y, __z, __w}; }
+static simd_uchar4 SIMD_CFUNC vector4(unsigned char __x, unsigned char __y, unsigned char __z, unsigned char __w) { return ( simd_uchar4){__x, __y, __z, __w}; }
+static simd_short4 SIMD_CFUNC vector4(short __x, short __y, short __z, short __w) { return ( simd_short4){__x, __y, __z, __w}; }
+static simd_ushort4 SIMD_CFUNC vector4(unsigned short __x, unsigned short __y, unsigned short __z, unsigned short __w) { return (simd_ushort4){__x, __y, __z, __w}; }
+static simd_int4 SIMD_CFUNC vector4(int __x, int __y, int __z, int __w) { return ( simd_int4){__x, __y, __z, __w}; }
+static simd_uint4 SIMD_CFUNC vector4(unsigned int __x, unsigned int __y, unsigned int __z, unsigned int __w) { return ( simd_uint4){__x, __y, __z, __w}; }
+static simd_float4 SIMD_CFUNC vector4(float __x, float __y, float __z, float __w) { return ( simd_float4){__x, __y, __z, __w}; }
+static simd_long4 SIMD_CFUNC vector4(simd_long1 __x, simd_long1 __y, simd_long1 __z, simd_long1 __w) { return ( simd_long4){__x, __y, __z, __w}; }
+static simd_ulong4 SIMD_CFUNC vector4(simd_ulong1 __x, simd_ulong1 __y, simd_ulong1 __z, simd_ulong1 __w) { return ( simd_ulong4){__x, __y, __z, __w}; }
+static simd_double4 SIMD_CFUNC vector4(double __x, double __y, double __z, double __w) { return (simd_double4){__x, __y, __z, __w}; }
+
+static simd_char4 SIMD_CFUNC vector4(simd_char2 __xy, simd_char2 __zw) { simd_char4 __r; __r.xy = __xy; __r.zw = __zw; return __r; }
+static simd_uchar4 SIMD_CFUNC vector4(simd_uchar2 __xy, simd_uchar2 __zw) { simd_uchar4 __r; __r.xy = __xy; __r.zw = __zw; return __r; }
+static simd_short4 SIMD_CFUNC vector4(simd_short2 __xy, simd_short2 __zw) { simd_short4 __r; __r.xy = __xy; __r.zw = __zw; return __r; }
+static simd_ushort4 SIMD_CFUNC vector4(simd_ushort2 __xy, simd_ushort2 __zw) { simd_ushort4 __r; __r.xy = __xy; __r.zw = __zw; return __r; }
+static simd_int4 SIMD_CFUNC vector4(simd_int2 __xy, simd_int2 __zw) { simd_int4 __r; __r.xy = __xy; __r.zw = __zw; return __r; }
+static simd_uint4 SIMD_CFUNC vector4(simd_uint2 __xy, simd_uint2 __zw) { simd_uint4 __r; __r.xy = __xy; __r.zw = __zw; return __r; }
+static simd_float4 SIMD_CFUNC vector4(simd_float2 __xy, simd_float2 __zw) { simd_float4 __r; __r.xy = __xy; __r.zw = __zw; return __r; }
+static simd_long4 SIMD_CFUNC vector4(simd_long2 __xy, simd_long2 __zw) { simd_long4 __r; __r.xy = __xy; __r.zw = __zw; return __r; }
+static simd_ulong4 SIMD_CFUNC vector4(simd_ulong2 __xy, simd_ulong2 __zw) { simd_ulong4 __r; __r.xy = __xy; __r.zw = __zw; return __r; }
+static simd_double4 SIMD_CFUNC vector4(simd_double2 __xy, simd_double2 __zw) { simd_double4 __r; __r.xy = __xy; __r.zw = __zw; return __r; }
+
+static simd_char4 SIMD_CFUNC vector4(simd_char3 __xyz, char __w) { simd_char4 __r; __r.xyz = __xyz; __r.w = __w; return __r; }
+static simd_uchar4 SIMD_CFUNC vector4(simd_uchar3 __xyz, unsigned char __w) { simd_uchar4 __r; __r.xyz = __xyz; __r.w = __w; return __r; }
+static simd_short4 SIMD_CFUNC vector4(simd_short3 __xyz, short __w) { simd_short4 __r; __r.xyz = __xyz; __r.w = __w; return __r; }
+static simd_ushort4 SIMD_CFUNC vector4(simd_ushort3 __xyz, unsigned short __w) { simd_ushort4 __r; __r.xyz = __xyz; __r.w = __w; return __r; }
+static simd_int4 SIMD_CFUNC vector4(simd_int3 __xyz, int __w) { simd_int4 __r; __r.xyz = __xyz; __r.w = __w; return __r; }
+static simd_uint4 SIMD_CFUNC vector4(simd_uint3 __xyz, unsigned int __w) { simd_uint4 __r; __r.xyz = __xyz; __r.w = __w; return __r; }
+static simd_float4 SIMD_CFUNC vector4(simd_float3 __xyz, float __w) { simd_float4 __r; __r.xyz = __xyz; __r.w = __w; return __r; }
+static simd_long4 SIMD_CFUNC vector4(simd_long3 __xyz, simd_long1 __w) { simd_long4 __r; __r.xyz = __xyz; __r.w = __w; return __r; }
+static simd_ulong4 SIMD_CFUNC vector4(simd_ulong3 __xyz, simd_ulong1 __w) { simd_ulong4 __r; __r.xyz = __xyz; __r.w = __w; return __r; }
+static simd_double4 SIMD_CFUNC vector4(simd_double3 __xyz, double __w) { simd_double4 __r; __r.xyz = __xyz; __r.w = __w; return __r; }
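+
+/* Editor's note, not part of the upstream header: the vector4 overloads
+ * above build a 4-lane vector from four scalars, two 2-lane halves, or a
+ * 3-lane vector plus a scalar, e.g.:
+ *
+ *     simd_float3 rgb  = vector3(0.2f, 0.4f, 0.6f);
+ *     simd_float4 rgba = vector4(rgb, 1.0f);   // .xyz = rgb, .w = 1.0f
+ */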
+
+static simd_char8 SIMD_CFUNC vector8(simd_char4 __lo, simd_char4 __hi) { simd_char8 __r; __r.lo = __lo; __r.hi = __hi; return __r; }
+static simd_uchar8 SIMD_CFUNC vector8(simd_uchar4 __lo, simd_uchar4 __hi) { simd_uchar8 __r; __r.lo = __lo; __r.hi = __hi; return __r; }
+static simd_short8 SIMD_CFUNC vector8(simd_short4 __lo, simd_short4 __hi) { simd_short8 __r; __r.lo = __lo; __r.hi = __hi; return __r; }
+static simd_ushort8 SIMD_CFUNC vector8(simd_ushort4 __lo, simd_ushort4 __hi) { simd_ushort8 __r; __r.lo = __lo; __r.hi = __hi; return __r; }
+static simd_int8 SIMD_CFUNC vector8(simd_int4 __lo, simd_int4 __hi) { simd_int8 __r; __r.lo = __lo; __r.hi = __hi; return __r; }
+static simd_uint8 SIMD_CFUNC vector8(simd_uint4 __lo, simd_uint4 __hi) { simd_uint8 __r; __r.lo = __lo; __r.hi = __hi; return __r; }
+static simd_float8 SIMD_CFUNC vector8(simd_float4 __lo, simd_float4 __hi) { simd_float8 __r; __r.lo = __lo; __r.hi = __hi; return __r; }
+static simd_long8 SIMD_CFUNC vector8(simd_long4 __lo, simd_long4 __hi) { simd_long8 __r; __r.lo = __lo; __r.hi = __hi; return __r; }
+static simd_ulong8 SIMD_CFUNC vector8(simd_ulong4 __lo, simd_ulong4 __hi) { simd_ulong8 __r; __r.lo = __lo; __r.hi = __hi; return __r; }
+static simd_double8 SIMD_CFUNC vector8(simd_double4 __lo, simd_double4 __hi) { simd_double8 __r; __r.lo = __lo; __r.hi = __hi; return __r; }
+
+static simd_char16 SIMD_CFUNC vector16(simd_char8 __lo, simd_char8 __hi) { simd_char16 __r; __r.lo = __lo; __r.hi = __hi; return __r; }
+static simd_uchar16 SIMD_CFUNC vector16(simd_uchar8 __lo, simd_uchar8 __hi) { simd_uchar16 __r; __r.lo = __lo; __r.hi = __hi; return __r; }
+static simd_short16 SIMD_CFUNC vector16(simd_short8 __lo, simd_short8 __hi) { simd_short16 __r; __r.lo = __lo; __r.hi = __hi; return __r; }
+static simd_ushort16 SIMD_CFUNC vector16(simd_ushort8 __lo, simd_ushort8 __hi) { simd_ushort16 __r; __r.lo = __lo; __r.hi = __hi; return __r; }
+static simd_int16 SIMD_CFUNC vector16(simd_int8 __lo, simd_int8 __hi) { simd_int16 __r; __r.lo = __lo; __r.hi = __hi; return __r; }
+static simd_uint16 SIMD_CFUNC vector16(simd_uint8 __lo, simd_uint8 __hi) { simd_uint16 __r; __r.lo = __lo; __r.hi = __hi; return __r; }
+static simd_float16 SIMD_CFUNC vector16(simd_float8 __lo, simd_float8 __hi) { simd_float16 __r; __r.lo = __lo; __r.hi = __hi; return __r; }
+
+static simd_char32 SIMD_CFUNC vector32(simd_char16 __lo, simd_char16 __hi) { simd_char32 __r; __r.lo = __lo; __r.hi = __hi; return __r; }
+static simd_uchar32 SIMD_CFUNC vector32(simd_uchar16 __lo, simd_uchar16 __hi) { simd_uchar32 __r; __r.lo = __lo; __r.hi = __hi; return __r; }
+static simd_short32 SIMD_CFUNC vector32(simd_short16 __lo, simd_short16 __hi) { simd_short32 __r; __r.lo = __lo; __r.hi = __hi; return __r; }
+static simd_ushort32 SIMD_CFUNC vector32(simd_ushort16 __lo, simd_ushort16 __hi) { simd_ushort32 __r; __r.lo = __lo; __r.hi = __hi; return __r; }
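+
+/* Editor's note, not part of the upstream header: vector8, vector16, and
+ * vector32 concatenate two halves into the .lo and .hi lanes:
+ *
+ *     simd_float4 lo = vector4(0.0f, 1.0f, 2.0f, 3.0f);
+ *     simd_float4 hi = vector4(4.0f, 5.0f, 6.0f, 7.0f);
+ *     simd_float8 v  = vector8(lo, hi);   // v.lo == lo, v.hi == hi
+ */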
+
+#pragma mark - Implementation
+
+static simd_char2 SIMD_CFUNC simd_char(simd_char2 __x) { return __x; }
+static simd_char3 SIMD_CFUNC simd_char(simd_char3 __x) { return __x; }
+static simd_char4 SIMD_CFUNC simd_char(simd_char4 __x) { return __x; }
+static simd_char8 SIMD_CFUNC simd_char(simd_char8 __x) { return __x; }
+static simd_char16 SIMD_CFUNC simd_char(simd_char16 __x) { return __x; }
+static simd_char32 SIMD_CFUNC simd_char(simd_char32 __x) { return __x; }
+static simd_char2 SIMD_CFUNC simd_char(simd_uchar2 __x) { return (simd_char2)__x; }
+static simd_char3 SIMD_CFUNC simd_char(simd_uchar3 __x) { return (simd_char3)__x; }
+static simd_char4 SIMD_CFUNC simd_char(simd_uchar4 __x) { return (simd_char4)__x; }
+static simd_char8 SIMD_CFUNC simd_char(simd_uchar8 __x) { return (simd_char8)__x; }
+static simd_char16 SIMD_CFUNC simd_char(simd_uchar16 __x) { return (simd_char16)__x; }
+static simd_char32 SIMD_CFUNC simd_char(simd_uchar32 __x) { return (simd_char32)__x; }
+static simd_char2 SIMD_CFUNC simd_char(simd_short2 __x) { return __builtin_convertvector(__x & 0xff, simd_char2); }
+static simd_char3 SIMD_CFUNC simd_char(simd_short3 __x) { return __builtin_convertvector(__x & 0xff, simd_char3); }
+static simd_char4 SIMD_CFUNC simd_char(simd_short4 __x) { return __builtin_convertvector(__x & 0xff, simd_char4); }
+static simd_char8 SIMD_CFUNC simd_char(simd_short8 __x) { return __builtin_convertvector(__x & 0xff, simd_char8); }
+static simd_char16 SIMD_CFUNC simd_char(simd_short16 __x) { return __builtin_convertvector(__x & 0xff, simd_char16); }
+static simd_char32 SIMD_CFUNC simd_char(simd_short32 __x) { return __builtin_convertvector(__x & 0xff, simd_char32); }
+static simd_char2 SIMD_CFUNC simd_char(simd_ushort2 __x) { return simd_char(simd_short(__x)); }
+static simd_char3 SIMD_CFUNC simd_char(simd_ushort3 __x) { return simd_char(simd_short(__x)); }
+static simd_char4 SIMD_CFUNC simd_char(simd_ushort4 __x) { return simd_char(simd_short(__x)); }
+static simd_char8 SIMD_CFUNC simd_char(simd_ushort8 __x) { return simd_char(simd_short(__x)); }
+static simd_char16 SIMD_CFUNC simd_char(simd_ushort16 __x) { return simd_char(simd_short(__x)); }
+static simd_char32 SIMD_CFUNC simd_char(simd_ushort32 __x) { return simd_char(simd_short(__x)); }
+static simd_char2 SIMD_CFUNC simd_char(simd_int2 __x) { return simd_char(simd_short(__x)); }
+static simd_char3 SIMD_CFUNC simd_char(simd_int3 __x) { return simd_char(simd_short(__x)); }
+static simd_char4 SIMD_CFUNC simd_char(simd_int4 __x) { return simd_char(simd_short(__x)); }
+static simd_char8 SIMD_CFUNC simd_char(simd_int8 __x) { return simd_char(simd_short(__x)); }
+static simd_char16 SIMD_CFUNC simd_char(simd_int16 __x) { return simd_char(simd_short(__x)); }
+static simd_char2 SIMD_CFUNC simd_char(simd_uint2 __x) { return simd_char(simd_short(__x)); }
+static simd_char3 SIMD_CFUNC simd_char(simd_uint3 __x) { return simd_char(simd_short(__x)); }
+static simd_char4 SIMD_CFUNC simd_char(simd_uint4 __x) { return simd_char(simd_short(__x)); }
+static simd_char8 SIMD_CFUNC simd_char(simd_uint8 __x) { return simd_char(simd_short(__x)); }
+static simd_char16 SIMD_CFUNC simd_char(simd_uint16 __x) { return simd_char(simd_short(__x)); }
+static simd_char2 SIMD_CFUNC simd_char(simd_float2 __x) { return simd_char(simd_short(__x)); }
+static simd_char3 SIMD_CFUNC simd_char(simd_float3 __x) { return simd_char(simd_short(__x)); }
+static simd_char4 SIMD_CFUNC simd_char(simd_float4 __x) { return simd_char(simd_short(__x)); }
+static simd_char8 SIMD_CFUNC simd_char(simd_float8 __x) { return simd_char(simd_short(__x)); }
+static simd_char16 SIMD_CFUNC simd_char(simd_float16 __x) { return simd_char(simd_short(__x)); }
+static simd_char2 SIMD_CFUNC simd_char(simd_long2 __x) { return simd_char(simd_short(__x)); }
+static simd_char3 SIMD_CFUNC simd_char(simd_long3 __x) { return simd_char(simd_short(__x)); }
+static simd_char4 SIMD_CFUNC simd_char(simd_long4 __x) { return simd_char(simd_short(__x)); }
+static simd_char8 SIMD_CFUNC simd_char(simd_long8 __x) { return simd_char(simd_short(__x)); }
+static simd_char2 SIMD_CFUNC simd_char(simd_ulong2 __x) { return simd_char(simd_short(__x)); }
+static simd_char3 SIMD_CFUNC simd_char(simd_ulong3 __x) { return simd_char(simd_short(__x)); }
+static simd_char4 SIMD_CFUNC simd_char(simd_ulong4 __x) { return simd_char(simd_short(__x)); }
+static simd_char8 SIMD_CFUNC simd_char(simd_ulong8 __x) { return simd_char(simd_short(__x)); }
+static simd_char2 SIMD_CFUNC simd_char(simd_double2 __x) { return simd_char(simd_short(__x)); }
+static simd_char3 SIMD_CFUNC simd_char(simd_double3 __x) { return simd_char(simd_short(__x)); }
+static simd_char4 SIMD_CFUNC simd_char(simd_double4 __x) { return simd_char(simd_short(__x)); }
+static simd_char8 SIMD_CFUNC simd_char(simd_double8 __x) { return simd_char(simd_short(__x)); }
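+
+/* Editor's note, not part of the upstream header: the & 0xff mask above
+ * means simd_char keeps only the low byte of each lane, relying on clang's
+ * two's-complement narrowing; floating-point sources go through simd_short
+ * first, so out-of-range values are undefined as in ordinary C conversions:
+ *
+ *     simd_short2 s = { 0x0134, -1 };
+ *     simd_char2  c = simd_char(s);   // low bytes only: { 0x34, -1 }
+ */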
+
+static simd_char2 SIMD_CFUNC simd_char_sat(simd_char2 __x) { return __x; }
+static simd_char3 SIMD_CFUNC simd_char_sat(simd_char3 __x) { return __x; }
+static simd_char4 SIMD_CFUNC simd_char_sat(simd_char4 __x) { return __x; }
+static simd_char8 SIMD_CFUNC simd_char_sat(simd_char8 __x) { return __x; }
+static simd_char16 SIMD_CFUNC simd_char_sat(simd_char16 __x) { return __x; }
+static simd_char32 SIMD_CFUNC simd_char_sat(simd_char32 __x) { return __x; }
+static simd_char2 SIMD_CFUNC simd_char_sat(simd_short2 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); }
+static simd_char3 SIMD_CFUNC simd_char_sat(simd_short3 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); }
+static simd_char4 SIMD_CFUNC simd_char_sat(simd_short4 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); }
+static simd_char8 SIMD_CFUNC simd_char_sat(simd_short8 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); }
+static simd_char16 SIMD_CFUNC simd_char_sat(simd_short16 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); }
+static simd_char32 SIMD_CFUNC simd_char_sat(simd_short32 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); }
+static simd_char2 SIMD_CFUNC simd_char_sat(simd_int2 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); }
+static simd_char3 SIMD_CFUNC simd_char_sat(simd_int3 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); }
+static simd_char4 SIMD_CFUNC simd_char_sat(simd_int4 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); }
+static simd_char8 SIMD_CFUNC simd_char_sat(simd_int8 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); }
+static simd_char16 SIMD_CFUNC simd_char_sat(simd_int16 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); }
+static simd_char2 SIMD_CFUNC simd_char_sat(simd_float2 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); }
+static simd_char3 SIMD_CFUNC simd_char_sat(simd_float3 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); }
+static simd_char4 SIMD_CFUNC simd_char_sat(simd_float4 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); }
+static simd_char8 SIMD_CFUNC simd_char_sat(simd_float8 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); }
+static simd_char16 SIMD_CFUNC simd_char_sat(simd_float16 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); }
+static simd_char2 SIMD_CFUNC simd_char_sat(simd_long2 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); }
+static simd_char3 SIMD_CFUNC simd_char_sat(simd_long3 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); }
+static simd_char4 SIMD_CFUNC simd_char_sat(simd_long4 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); }
+static simd_char8 SIMD_CFUNC simd_char_sat(simd_long8 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); }
+static simd_char2 SIMD_CFUNC simd_char_sat(simd_double2 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); }
+static simd_char3 SIMD_CFUNC simd_char_sat(simd_double3 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); }
+static simd_char4 SIMD_CFUNC simd_char_sat(simd_double4 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); }
+static simd_char8 SIMD_CFUNC simd_char_sat(simd_double8 __x) { return simd_char(simd_clamp(__x,-0x80,0x7f)); }
+static simd_char2 SIMD_CFUNC simd_char_sat(simd_uchar2 __x) { return simd_char(simd_min(__x,0x7f)); }
+static simd_char3 SIMD_CFUNC simd_char_sat(simd_uchar3 __x) { return simd_char(simd_min(__x,0x7f)); }
+static simd_char4 SIMD_CFUNC simd_char_sat(simd_uchar4 __x) { return simd_char(simd_min(__x,0x7f)); }
+static simd_char8 SIMD_CFUNC simd_char_sat(simd_uchar8 __x) { return simd_char(simd_min(__x,0x7f)); }
+static simd_char16 SIMD_CFUNC simd_char_sat(simd_uchar16 __x) { return simd_char(simd_min(__x,0x7f)); }
+static simd_char32 SIMD_CFUNC simd_char_sat(simd_uchar32 __x) { return simd_char(simd_min(__x,0x7f)); }
+static simd_char2 SIMD_CFUNC simd_char_sat(simd_ushort2 __x) { return simd_char(simd_min(__x,0x7f)); }
+static simd_char3 SIMD_CFUNC simd_char_sat(simd_ushort3 __x) { return simd_char(simd_min(__x,0x7f)); }
+static simd_char4 SIMD_CFUNC simd_char_sat(simd_ushort4 __x) { return simd_char(simd_min(__x,0x7f)); }
+static simd_char8 SIMD_CFUNC simd_char_sat(simd_ushort8 __x) { return simd_char(simd_min(__x,0x7f)); }
+static simd_char16 SIMD_CFUNC simd_char_sat(simd_ushort16 __x) { return simd_char(simd_min(__x,0x7f)); }
+static simd_char32 SIMD_CFUNC simd_char_sat(simd_ushort32 __x) { return simd_char(simd_min(__x,0x7f)); }
+static simd_char2 SIMD_CFUNC simd_char_sat(simd_uint2 __x) { return simd_char(simd_min(__x,0x7f)); }
+static simd_char3 SIMD_CFUNC simd_char_sat(simd_uint3 __x) { return simd_char(simd_min(__x,0x7f)); }
+static simd_char4 SIMD_CFUNC simd_char_sat(simd_uint4 __x) { return simd_char(simd_min(__x,0x7f)); }
+static simd_char8 SIMD_CFUNC simd_char_sat(simd_uint8 __x) { return simd_char(simd_min(__x,0x7f)); }
+static simd_char16 SIMD_CFUNC simd_char_sat(simd_uint16 __x) { return simd_char(simd_min(__x,0x7f)); }
+static simd_char2 SIMD_CFUNC simd_char_sat(simd_ulong2 __x) { return simd_char(simd_min(__x,0x7f)); }
+static simd_char3 SIMD_CFUNC simd_char_sat(simd_ulong3 __x) { return simd_char(simd_min(__x,0x7f)); }
+static simd_char4 SIMD_CFUNC simd_char_sat(simd_ulong4 __x) { return simd_char(simd_min(__x,0x7f)); }
+static simd_char8 SIMD_CFUNC simd_char_sat(simd_ulong8 __x) { return simd_char(simd_min(__x,0x7f)); }
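+
+/* Editor's note, not part of the upstream header: simd_char_sat clamps into
+ * the signed char range instead of truncating:
+ *
+ *     simd_short2 s = { 300, -300 };
+ *     simd_char2  c = simd_char_sat(s);   // { 127, -128 }
+ *
+ *     simd_uchar2 u = { 200, 7 };
+ *     simd_char2  d = simd_char_sat(u);   // unsigned sources cap at 0x7f: { 127, 7 }
+ */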
+
+
+static simd_uchar2 SIMD_CFUNC simd_uchar(simd_char2 __x) { return (simd_uchar2)__x; }
+static simd_uchar3 SIMD_CFUNC simd_uchar(simd_char3 __x) { return (simd_uchar3)__x; }
+static simd_uchar4 SIMD_CFUNC simd_uchar(simd_char4 __x) { return (simd_uchar4)__x; }
+static simd_uchar8 SIMD_CFUNC simd_uchar(simd_char8 __x) { return (simd_uchar8)__x; }
+static simd_uchar16 SIMD_CFUNC simd_uchar(simd_char16 __x) { return (simd_uchar16)__x; }
+static simd_uchar32 SIMD_CFUNC simd_uchar(simd_char32 __x) { return (simd_uchar32)__x; }
+static simd_uchar2 SIMD_CFUNC simd_uchar(simd_uchar2 __x) { return __x; }
+static simd_uchar3 SIMD_CFUNC simd_uchar(simd_uchar3 __x) { return __x; }
+static simd_uchar4 SIMD_CFUNC simd_uchar(simd_uchar4 __x) { return __x; }
+static simd_uchar8 SIMD_CFUNC simd_uchar(simd_uchar8 __x) { return __x; }
+static simd_uchar16 SIMD_CFUNC simd_uchar(simd_uchar16 __x) { return __x; }
+static simd_uchar32 SIMD_CFUNC simd_uchar(simd_uchar32 __x) { return __x; }
+static simd_uchar2 SIMD_CFUNC simd_uchar(simd_short2 __x) { return simd_uchar(simd_char(__x)); }
+static simd_uchar3 SIMD_CFUNC simd_uchar(simd_short3 __x) { return simd_uchar(simd_char(__x)); }
+static simd_uchar4 SIMD_CFUNC simd_uchar(simd_short4 __x) { return simd_uchar(simd_char(__x)); }
+static simd_uchar8 SIMD_CFUNC simd_uchar(simd_short8 __x) { return simd_uchar(simd_char(__x)); }
+static simd_uchar16 SIMD_CFUNC simd_uchar(simd_short16 __x) { return simd_uchar(simd_char(__x)); }
+static simd_uchar32 SIMD_CFUNC simd_uchar(simd_short32 __x) { return simd_uchar(simd_char(__x)); }
+static simd_uchar2 SIMD_CFUNC simd_uchar(simd_ushort2 __x) { return simd_uchar(simd_char(__x)); }
+static simd_uchar3 SIMD_CFUNC simd_uchar(simd_ushort3 __x) { return simd_uchar(simd_char(__x)); }
+static simd_uchar4 SIMD_CFUNC simd_uchar(simd_ushort4 __x) { return simd_uchar(simd_char(__x)); }
+static simd_uchar8 SIMD_CFUNC simd_uchar(simd_ushort8 __x) { return simd_uchar(simd_char(__x)); }
+static simd_uchar16 SIMD_CFUNC simd_uchar(simd_ushort16 __x) { return simd_uchar(simd_char(__x)); }
+static simd_uchar32 SIMD_CFUNC simd_uchar(simd_ushort32 __x) { return simd_uchar(simd_char(__x)); }
+static simd_uchar2 SIMD_CFUNC simd_uchar(simd_int2 __x) { return simd_uchar(simd_char(__x)); }
+static simd_uchar3 SIMD_CFUNC simd_uchar(simd_int3 __x) { return simd_uchar(simd_char(__x)); }
+static simd_uchar4 SIMD_CFUNC simd_uchar(simd_int4 __x) { return simd_uchar(simd_char(__x)); }
+static simd_uchar8 SIMD_CFUNC simd_uchar(simd_int8 __x) { return simd_uchar(simd_char(__x)); }
+static simd_uchar16 SIMD_CFUNC simd_uchar(simd_int16 __x) { return simd_uchar(simd_char(__x)); }
+static simd_uchar2 SIMD_CFUNC simd_uchar(simd_uint2 __x) { return simd_uchar(simd_char(__x)); }
+static simd_uchar3 SIMD_CFUNC simd_uchar(simd_uint3 __x) { return simd_uchar(simd_char(__x)); }
+static simd_uchar4 SIMD_CFUNC simd_uchar(simd_uint4 __x) { return simd_uchar(simd_char(__x)); }
+static simd_uchar8 SIMD_CFUNC simd_uchar(simd_uint8 __x) { return simd_uchar(simd_char(__x)); }
+static simd_uchar16 SIMD_CFUNC simd_uchar(simd_uint16 __x) { return simd_uchar(simd_char(__x)); }
+static simd_uchar2 SIMD_CFUNC simd_uchar(simd_float2 __x) { return simd_uchar(simd_char(__x)); }
+static simd_uchar3 SIMD_CFUNC simd_uchar(simd_float3 __x) { return simd_uchar(simd_char(__x)); }
+static simd_uchar4 SIMD_CFUNC simd_uchar(simd_float4 __x) { return simd_uchar(simd_char(__x)); }
+static simd_uchar8 SIMD_CFUNC simd_uchar(simd_float8 __x) { return simd_uchar(simd_char(__x)); }
+static simd_uchar16 SIMD_CFUNC simd_uchar(simd_float16 __x) { return simd_uchar(simd_char(__x)); }
+static simd_uchar2 SIMD_CFUNC simd_uchar(simd_long2 __x) { return simd_uchar(simd_char(__x)); }
+static simd_uchar3 SIMD_CFUNC simd_uchar(simd_long3 __x) { return simd_uchar(simd_char(__x)); }
+static simd_uchar4 SIMD_CFUNC simd_uchar(simd_long4 __x) { return simd_uchar(simd_char(__x)); }
+static simd_uchar8 SIMD_CFUNC simd_uchar(simd_long8 __x) { return simd_uchar(simd_char(__x)); }
+static simd_uchar2 SIMD_CFUNC simd_uchar(simd_ulong2 __x) { return simd_uchar(simd_char(__x)); }
+static simd_uchar3 SIMD_CFUNC simd_uchar(simd_ulong3 __x) { return simd_uchar(simd_char(__x)); }
+static simd_uchar4 SIMD_CFUNC simd_uchar(simd_ulong4 __x) { return simd_uchar(simd_char(__x)); }
+static simd_uchar8 SIMD_CFUNC simd_uchar(simd_ulong8 __x) { return simd_uchar(simd_char(__x)); }
+static simd_uchar2 SIMD_CFUNC simd_uchar(simd_double2 __x) { return simd_uchar(simd_char(__x)); }
+static simd_uchar3 SIMD_CFUNC simd_uchar(simd_double3 __x) { return simd_uchar(simd_char(__x)); }
+static simd_uchar4 SIMD_CFUNC simd_uchar(simd_double4 __x) { return simd_uchar(simd_char(__x)); }
+static simd_uchar8 SIMD_CFUNC simd_uchar(simd_double8 __x) { return simd_uchar(simd_char(__x)); }
+
+static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_char2 __x) { return simd_uchar(simd_max(0,__x)); }
+static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_char3 __x) { return simd_uchar(simd_max(0,__x)); }
+static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_char4 __x) { return simd_uchar(simd_max(0,__x)); }
+static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_char8 __x) { return simd_uchar(simd_max(0,__x)); }
+static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_char16 __x) { return simd_uchar(simd_max(0,__x)); }
+static simd_uchar32 SIMD_CFUNC simd_uchar_sat(simd_char32 __x) { return simd_uchar(simd_max(0,__x)); }
+static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_short2 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); }
+static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_short3 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); }
+static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_short4 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); }
+static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_short8 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); }
+static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_short16 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); }
+static simd_uchar32 SIMD_CFUNC simd_uchar_sat(simd_short32 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); }
+static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_int2 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); }
+static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_int3 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); }
+static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_int4 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); }
+static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_int8 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); }
+static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_int16 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); }
+static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_float2 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); }
+static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_float3 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); }
+static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_float4 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); }
+static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_float8 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); }
+static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_float16 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); }
+static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_long2 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); }
+static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_long3 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); }
+static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_long4 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); }
+static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_long8 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); }
+static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_double2 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); }
+static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_double3 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); }
+static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_double4 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); }
+static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_double8 __x) { return simd_uchar(simd_clamp(__x,0,0xff)); }
+static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_uchar2 __x) { return __x; }
+static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_uchar3 __x) { return __x; }
+static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_uchar4 __x) { return __x; }
+static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_uchar8 __x) { return __x; }
+static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_uchar16 __x) { return __x; }
+static simd_uchar32 SIMD_CFUNC simd_uchar_sat(simd_uchar32 __x) { return __x; }
+static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_ushort2 __x) { return simd_uchar(simd_min(__x,0xff)); }
+static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_ushort3 __x) { return simd_uchar(simd_min(__x,0xff)); }
+static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_ushort4 __x) { return simd_uchar(simd_min(__x,0xff)); }
+static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_ushort8 __x) { return simd_uchar(simd_min(__x,0xff)); }
+static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_ushort16 __x) { return simd_uchar(simd_min(__x,0xff)); }
+static simd_uchar32 SIMD_CFUNC simd_uchar_sat(simd_ushort32 __x) { return simd_uchar(simd_min(__x,0xff)); }
+static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_uint2 __x) { return simd_uchar(simd_min(__x,0xff)); }
+static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_uint3 __x) { return simd_uchar(simd_min(__x,0xff)); }
+static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_uint4 __x) { return simd_uchar(simd_min(__x,0xff)); }
+static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_uint8 __x) { return simd_uchar(simd_min(__x,0xff)); }
+static simd_uchar16 SIMD_CFUNC simd_uchar_sat(simd_uint16 __x) { return simd_uchar(simd_min(__x,0xff)); }
+static simd_uchar2 SIMD_CFUNC simd_uchar_sat(simd_ulong2 __x) { return simd_uchar(simd_min(__x,0xff)); }
+static simd_uchar3 SIMD_CFUNC simd_uchar_sat(simd_ulong3 __x) { return simd_uchar(simd_min(__x,0xff)); }
+static simd_uchar4 SIMD_CFUNC simd_uchar_sat(simd_ulong4 __x) { return simd_uchar(simd_min(__x,0xff)); }
+static simd_uchar8 SIMD_CFUNC simd_uchar_sat(simd_ulong8 __x) { return simd_uchar(simd_min(__x,0xff)); }
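+
+/* Editor's note, not part of the upstream header: for signed sources,
+ * simd_uchar_sat clamps negative lanes to zero where plain simd_uchar
+ * would wrap:
+ *
+ *     simd_char2  c = { -5, 100 };
+ *     simd_uchar2 u = simd_uchar(c);      // wraps:  { 251, 100 }
+ *     simd_uchar2 s = simd_uchar_sat(c);  // clamps: {   0, 100 }
+ */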
+
+
+static simd_short2 SIMD_CFUNC simd_short(simd_char2 __x) { return __builtin_convertvector(__x, simd_short2); }
+static simd_short3 SIMD_CFUNC simd_short(simd_char3 __x) { return __builtin_convertvector(__x, simd_short3); }
+static simd_short4 SIMD_CFUNC simd_short(simd_char4 __x) { return __builtin_convertvector(__x, simd_short4); }
+static simd_short8 SIMD_CFUNC simd_short(simd_char8 __x) { return __builtin_convertvector(__x, simd_short8); }
+static simd_short16 SIMD_CFUNC simd_short(simd_char16 __x) { return __builtin_convertvector(__x, simd_short16); }
+static simd_short32 SIMD_CFUNC simd_short(simd_char32 __x) { return __builtin_convertvector(__x, simd_short32); }
+static simd_short2 SIMD_CFUNC simd_short(simd_uchar2 __x) { return __builtin_convertvector(__x, simd_short2); }
+static simd_short3 SIMD_CFUNC simd_short(simd_uchar3 __x) { return __builtin_convertvector(__x, simd_short3); }
+static simd_short4 SIMD_CFUNC simd_short(simd_uchar4 __x) { return __builtin_convertvector(__x, simd_short4); }
+static simd_short8 SIMD_CFUNC simd_short(simd_uchar8 __x) { return __builtin_convertvector(__x, simd_short8); }
+static simd_short16 SIMD_CFUNC simd_short(simd_uchar16 __x) { return __builtin_convertvector(__x, simd_short16); }
+static simd_short32 SIMD_CFUNC simd_short(simd_uchar32 __x) { return __builtin_convertvector(__x, simd_short32); }
+static simd_short2 SIMD_CFUNC simd_short(simd_short2 __x) { return __x; }
+static simd_short3 SIMD_CFUNC simd_short(simd_short3 __x) { return __x; }
+static simd_short4 SIMD_CFUNC simd_short(simd_short4 __x) { return __x; }
+static simd_short8 SIMD_CFUNC simd_short(simd_short8 __x) { return __x; }
+static simd_short16 SIMD_CFUNC simd_short(simd_short16 __x) { return __x; }
+static simd_short32 SIMD_CFUNC simd_short(simd_short32 __x) { return __x; }
+static simd_short2 SIMD_CFUNC simd_short(simd_ushort2 __x) { return (simd_short2)__x; }
+static simd_short3 SIMD_CFUNC simd_short(simd_ushort3 __x) { return (simd_short3)__x; }
+static simd_short4 SIMD_CFUNC simd_short(simd_ushort4 __x) { return (simd_short4)__x; }
+static simd_short8 SIMD_CFUNC simd_short(simd_ushort8 __x) { return (simd_short8)__x; }
+static simd_short16 SIMD_CFUNC simd_short(simd_ushort16 __x) { return (simd_short16)__x; }
+static simd_short32 SIMD_CFUNC simd_short(simd_ushort32 __x) { return (simd_short32)__x; }
+static simd_short2 SIMD_CFUNC simd_short(simd_int2 __x) { return __builtin_convertvector(__x & 0xffff, simd_short2); }
+static simd_short3 SIMD_CFUNC simd_short(simd_int3 __x) { return __builtin_convertvector(__x & 0xffff, simd_short3); }
+static simd_short4 SIMD_CFUNC simd_short(simd_int4 __x) { return __builtin_convertvector(__x & 0xffff, simd_short4); }
+static simd_short8 SIMD_CFUNC simd_short(simd_int8 __x) { return __builtin_convertvector(__x & 0xffff, simd_short8); }
+static simd_short16 SIMD_CFUNC simd_short(simd_int16 __x) { return __builtin_convertvector(__x & 0xffff, simd_short16); }
+static simd_short2 SIMD_CFUNC simd_short(simd_uint2 __x) { return simd_short(simd_int(__x)); }
+static simd_short3 SIMD_CFUNC simd_short(simd_uint3 __x) { return simd_short(simd_int(__x)); }
+static simd_short4 SIMD_CFUNC simd_short(simd_uint4 __x) { return simd_short(simd_int(__x)); }
+static simd_short8 SIMD_CFUNC simd_short(simd_uint8 __x) { return simd_short(simd_int(__x)); }
+static simd_short16 SIMD_CFUNC simd_short(simd_uint16 __x) { return simd_short(simd_int(__x)); }
+static simd_short2 SIMD_CFUNC simd_short(simd_float2 __x) { return simd_short(simd_int(__x)); }
+static simd_short3 SIMD_CFUNC simd_short(simd_float3 __x) { return simd_short(simd_int(__x)); }
+static simd_short4 SIMD_CFUNC simd_short(simd_float4 __x) { return simd_short(simd_int(__x)); }
+static simd_short8 SIMD_CFUNC simd_short(simd_float8 __x) { return simd_short(simd_int(__x)); }
+static simd_short16 SIMD_CFUNC simd_short(simd_float16 __x) { return simd_short(simd_int(__x)); }
+static simd_short2 SIMD_CFUNC simd_short(simd_long2 __x) { return simd_short(simd_int(__x)); }
+static simd_short3 SIMD_CFUNC simd_short(simd_long3 __x) { return simd_short(simd_int(__x)); }
+static simd_short4 SIMD_CFUNC simd_short(simd_long4 __x) { return simd_short(simd_int(__x)); }
+static simd_short8 SIMD_CFUNC simd_short(simd_long8 __x) { return simd_short(simd_int(__x)); }
+static simd_short2 SIMD_CFUNC simd_short(simd_ulong2 __x) { return simd_short(simd_int(__x)); }
+static simd_short3 SIMD_CFUNC simd_short(simd_ulong3 __x) { return simd_short(simd_int(__x)); }
+static simd_short4 SIMD_CFUNC simd_short(simd_ulong4 __x) { return simd_short(simd_int(__x)); }
+static simd_short8 SIMD_CFUNC simd_short(simd_ulong8 __x) { return simd_short(simd_int(__x)); }
+static simd_short2 SIMD_CFUNC simd_short(simd_double2 __x) { return simd_short(simd_int(__x)); }
+static simd_short3 SIMD_CFUNC simd_short(simd_double3 __x) { return simd_short(simd_int(__x)); }
+static simd_short4 SIMD_CFUNC simd_short(simd_double4 __x) { return simd_short(simd_int(__x)); }
+static simd_short8 SIMD_CFUNC simd_short(simd_double8 __x) { return simd_short(simd_int(__x)); }
+
+static simd_short2 SIMD_CFUNC simd_short_sat(simd_char2 __x) { return simd_short(__x); }
+static simd_short3 SIMD_CFUNC simd_short_sat(simd_char3 __x) { return simd_short(__x); }
+static simd_short4 SIMD_CFUNC simd_short_sat(simd_char4 __x) { return simd_short(__x); }
+static simd_short8 SIMD_CFUNC simd_short_sat(simd_char8 __x) { return simd_short(__x); }
+static simd_short16 SIMD_CFUNC simd_short_sat(simd_char16 __x) { return simd_short(__x); }
+static simd_short32 SIMD_CFUNC simd_short_sat(simd_char32 __x) { return simd_short(__x); }
+static simd_short2 SIMD_CFUNC simd_short_sat(simd_short2 __x) { return __x; }
+static simd_short3 SIMD_CFUNC simd_short_sat(simd_short3 __x) { return __x; }
+static simd_short4 SIMD_CFUNC simd_short_sat(simd_short4 __x) { return __x; }
+static simd_short8 SIMD_CFUNC simd_short_sat(simd_short8 __x) { return __x; }
+static simd_short16 SIMD_CFUNC simd_short_sat(simd_short16 __x) { return __x; }
+static simd_short32 SIMD_CFUNC simd_short_sat(simd_short32 __x) { return __x; }
+static simd_short2 SIMD_CFUNC simd_short_sat(simd_int2 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); }
+static simd_short3 SIMD_CFUNC simd_short_sat(simd_int3 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); }
+static simd_short4 SIMD_CFUNC simd_short_sat(simd_int4 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); }
+static simd_short8 SIMD_CFUNC simd_short_sat(simd_int8 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); }
+static simd_short16 SIMD_CFUNC simd_short_sat(simd_int16 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); }
+static simd_short2 SIMD_CFUNC simd_short_sat(simd_float2 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); }
+static simd_short3 SIMD_CFUNC simd_short_sat(simd_float3 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); }
+static simd_short4 SIMD_CFUNC simd_short_sat(simd_float4 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); }
+static simd_short8 SIMD_CFUNC simd_short_sat(simd_float8 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); }
+static simd_short16 SIMD_CFUNC simd_short_sat(simd_float16 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); }
+static simd_short2 SIMD_CFUNC simd_short_sat(simd_long2 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); }
+static simd_short3 SIMD_CFUNC simd_short_sat(simd_long3 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); }
+static simd_short4 SIMD_CFUNC simd_short_sat(simd_long4 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); }
+static simd_short8 SIMD_CFUNC simd_short_sat(simd_long8 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); }
+static simd_short2 SIMD_CFUNC simd_short_sat(simd_double2 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); }
+static simd_short3 SIMD_CFUNC simd_short_sat(simd_double3 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); }
+static simd_short4 SIMD_CFUNC simd_short_sat(simd_double4 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); }
+static simd_short8 SIMD_CFUNC simd_short_sat(simd_double8 __x) { return simd_short(simd_clamp(__x,-0x8000,0x7fff)); }
+static simd_short2 SIMD_CFUNC simd_short_sat(simd_uchar2 __x) { return simd_short(__x); }
+static simd_short3 SIMD_CFUNC simd_short_sat(simd_uchar3 __x) { return simd_short(__x); }
+static simd_short4 SIMD_CFUNC simd_short_sat(simd_uchar4 __x) { return simd_short(__x); }
+static simd_short8 SIMD_CFUNC simd_short_sat(simd_uchar8 __x) { return simd_short(__x); }
+static simd_short16 SIMD_CFUNC simd_short_sat(simd_uchar16 __x) { return simd_short(__x); }
+static simd_short32 SIMD_CFUNC simd_short_sat(simd_uchar32 __x) { return simd_short(__x); }
+static simd_short2 SIMD_CFUNC simd_short_sat(simd_ushort2 __x) { return simd_short(simd_min(__x,0x7fff)); }
+static simd_short3 SIMD_CFUNC simd_short_sat(simd_ushort3 __x) { return simd_short(simd_min(__x,0x7fff)); }
+static simd_short4 SIMD_CFUNC simd_short_sat(simd_ushort4 __x) { return simd_short(simd_min(__x,0x7fff)); }
+static simd_short8 SIMD_CFUNC simd_short_sat(simd_ushort8 __x) { return simd_short(simd_min(__x,0x7fff)); }
+static simd_short16 SIMD_CFUNC simd_short_sat(simd_ushort16 __x) { return simd_short(simd_min(__x,0x7fff)); }
+static simd_short32 SIMD_CFUNC simd_short_sat(simd_ushort32 __x) { return simd_short(simd_min(__x,0x7fff)); }
+static simd_short2 SIMD_CFUNC simd_short_sat(simd_uint2 __x) { return simd_short(simd_min(__x,0x7fff)); }
+static simd_short3 SIMD_CFUNC simd_short_sat(simd_uint3 __x) { return simd_short(simd_min(__x,0x7fff)); }
+static simd_short4 SIMD_CFUNC simd_short_sat(simd_uint4 __x) { return simd_short(simd_min(__x,0x7fff)); }
+static simd_short8 SIMD_CFUNC simd_short_sat(simd_uint8 __x) { return simd_short(simd_min(__x,0x7fff)); }
+static simd_short16 SIMD_CFUNC simd_short_sat(simd_uint16 __x) { return simd_short(simd_min(__x,0x7fff)); }
+static simd_short2 SIMD_CFUNC simd_short_sat(simd_ulong2 __x) { return simd_short(simd_min(__x,0x7fff)); }
+static simd_short3 SIMD_CFUNC simd_short_sat(simd_ulong3 __x) { return simd_short(simd_min(__x,0x7fff)); }
+static simd_short4 SIMD_CFUNC simd_short_sat(simd_ulong4 __x) { return simd_short(simd_min(__x,0x7fff)); }
+static simd_short8 SIMD_CFUNC simd_short_sat(simd_ulong8 __x) { return simd_short(simd_min(__x,0x7fff)); }
+
+
+static simd_ushort2 SIMD_CFUNC simd_ushort(simd_char2 __x) { return simd_ushort(simd_short(__x)); }
+static simd_ushort3 SIMD_CFUNC simd_ushort(simd_char3 __x) { return simd_ushort(simd_short(__x)); }
+static simd_ushort4 SIMD_CFUNC simd_ushort(simd_char4 __x) { return simd_ushort(simd_short(__x)); }
+static simd_ushort8 SIMD_CFUNC simd_ushort(simd_char8 __x) { return simd_ushort(simd_short(__x)); }
+static simd_ushort16 SIMD_CFUNC simd_ushort(simd_char16 __x) { return simd_ushort(simd_short(__x)); }
+static simd_ushort32 SIMD_CFUNC simd_ushort(simd_char32 __x) { return simd_ushort(simd_short(__x)); }
+static simd_ushort2 SIMD_CFUNC simd_ushort(simd_uchar2 __x) { return simd_ushort(simd_short(__x)); }
+static simd_ushort3 SIMD_CFUNC simd_ushort(simd_uchar3 __x) { return simd_ushort(simd_short(__x)); }
+static simd_ushort4 SIMD_CFUNC simd_ushort(simd_uchar4 __x) { return simd_ushort(simd_short(__x)); }
+static simd_ushort8 SIMD_CFUNC simd_ushort(simd_uchar8 __x) { return simd_ushort(simd_short(__x)); }
+static simd_ushort16 SIMD_CFUNC simd_ushort(simd_uchar16 __x) { return simd_ushort(simd_short(__x)); }
+static simd_ushort32 SIMD_CFUNC simd_ushort(simd_uchar32 __x) { return simd_ushort(simd_short(__x)); }
+static simd_ushort2 SIMD_CFUNC simd_ushort(simd_short2 __x) { return (simd_ushort2)__x; }
+static simd_ushort3 SIMD_CFUNC simd_ushort(simd_short3 __x) { return (simd_ushort3)__x; }
+static simd_ushort4 SIMD_CFUNC simd_ushort(simd_short4 __x) { return (simd_ushort4)__x; }
+static simd_ushort8 SIMD_CFUNC simd_ushort(simd_short8 __x) { return (simd_ushort8)__x; }
+static simd_ushort16 SIMD_CFUNC simd_ushort(simd_short16 __x) { return (simd_ushort16)__x; }
+static simd_ushort32 SIMD_CFUNC simd_ushort(simd_short32 __x) { return (simd_ushort32)__x; }
+static simd_ushort2 SIMD_CFUNC simd_ushort(simd_ushort2 __x) { return __x; }
+static simd_ushort3 SIMD_CFUNC simd_ushort(simd_ushort3 __x) { return __x; }
+static simd_ushort4 SIMD_CFUNC simd_ushort(simd_ushort4 __x) { return __x; }
+static simd_ushort8 SIMD_CFUNC simd_ushort(simd_ushort8 __x) { return __x; }
+static simd_ushort16 SIMD_CFUNC simd_ushort(simd_ushort16 __x) { return __x; }
+static simd_ushort32 SIMD_CFUNC simd_ushort(simd_ushort32 __x) { return __x; }
+static simd_ushort2 SIMD_CFUNC simd_ushort(simd_int2 __x) { return simd_ushort(simd_short(__x)); }
+static simd_ushort3 SIMD_CFUNC simd_ushort(simd_int3 __x) { return simd_ushort(simd_short(__x)); }
+static simd_ushort4 SIMD_CFUNC simd_ushort(simd_int4 __x) { return simd_ushort(simd_short(__x)); }
+static simd_ushort8 SIMD_CFUNC simd_ushort(simd_int8 __x) { return simd_ushort(simd_short(__x)); }
+static simd_ushort16 SIMD_CFUNC simd_ushort(simd_int16 __x) { return simd_ushort(simd_short(__x)); }
+static simd_ushort2 SIMD_CFUNC simd_ushort(simd_uint2 __x) { return simd_ushort(simd_short(__x)); }
+static simd_ushort3 SIMD_CFUNC simd_ushort(simd_uint3 __x) { return simd_ushort(simd_short(__x)); }
+static simd_ushort4 SIMD_CFUNC simd_ushort(simd_uint4 __x) { return simd_ushort(simd_short(__x)); }
+static simd_ushort8 SIMD_CFUNC simd_ushort(simd_uint8 __x) { return simd_ushort(simd_short(__x)); }
+static simd_ushort16 SIMD_CFUNC simd_ushort(simd_uint16 __x) { return simd_ushort(simd_short(__x)); }
+static simd_ushort2 SIMD_CFUNC simd_ushort(simd_float2 __x) { return simd_ushort(simd_short(__x)); }
+static simd_ushort3 SIMD_CFUNC simd_ushort(simd_float3 __x) { return simd_ushort(simd_short(__x)); }
+static simd_ushort4 SIMD_CFUNC simd_ushort(simd_float4 __x) { return simd_ushort(simd_short(__x)); }
+static simd_ushort8 SIMD_CFUNC simd_ushort(simd_float8 __x) { return simd_ushort(simd_short(__x)); }
+static simd_ushort16 SIMD_CFUNC simd_ushort(simd_float16 __x) { return simd_ushort(simd_short(__x)); }
+static simd_ushort2 SIMD_CFUNC simd_ushort(simd_long2 __x) { return simd_ushort(simd_short(__x)); }
+static simd_ushort3 SIMD_CFUNC simd_ushort(simd_long3 __x) { return simd_ushort(simd_short(__x)); }
+static simd_ushort4 SIMD_CFUNC simd_ushort(simd_long4 __x) { return simd_ushort(simd_short(__x)); }
+static simd_ushort8 SIMD_CFUNC simd_ushort(simd_long8 __x) { return simd_ushort(simd_short(__x)); }
+static simd_ushort2 SIMD_CFUNC simd_ushort(simd_ulong2 __x) { return simd_ushort(simd_short(__x)); }
+static simd_ushort3 SIMD_CFUNC simd_ushort(simd_ulong3 __x) { return simd_ushort(simd_short(__x)); }
+static simd_ushort4 SIMD_CFUNC simd_ushort(simd_ulong4 __x) { return simd_ushort(simd_short(__x)); }
+static simd_ushort8 SIMD_CFUNC simd_ushort(simd_ulong8 __x) { return simd_ushort(simd_short(__x)); }
+static simd_ushort2 SIMD_CFUNC simd_ushort(simd_double2 __x) { return simd_ushort(simd_short(__x)); }
+static simd_ushort3 SIMD_CFUNC simd_ushort(simd_double3 __x) { return simd_ushort(simd_short(__x)); }
+static simd_ushort4 SIMD_CFUNC simd_ushort(simd_double4 __x) { return simd_ushort(simd_short(__x)); }
+static simd_ushort8 SIMD_CFUNC simd_ushort(simd_double8 __x) { return simd_ushort(simd_short(__x)); }
+
+static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_char2 __x) { return simd_ushort(simd_max(__x, 0)); }
+static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_char3 __x) { return simd_ushort(simd_max(__x, 0)); }
+static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_char4 __x) { return simd_ushort(simd_max(__x, 0)); }
+static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_char8 __x) { return simd_ushort(simd_max(__x, 0)); }
+static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_char16 __x) { return simd_ushort(simd_max(__x, 0)); }
+static simd_ushort32 SIMD_CFUNC simd_ushort_sat(simd_char32 __x) { return simd_ushort(simd_max(__x, 0)); }
+static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_short2 __x) { return simd_ushort(simd_max(__x, 0)); }
+static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_short3 __x) { return simd_ushort(simd_max(__x, 0)); }
+static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_short4 __x) { return simd_ushort(simd_max(__x, 0)); }
+static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_short8 __x) { return simd_ushort(simd_max(__x, 0)); }
+static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_short16 __x) { return simd_ushort(simd_max(__x, 0)); }
+static simd_ushort32 SIMD_CFUNC simd_ushort_sat(simd_short32 __x) { return simd_ushort(simd_max(__x, 0)); }
+static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_int2 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); }
+static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_int3 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); }
+static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_int4 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); }
+static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_int8 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); }
+static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_int16 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); }
+static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_float2 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); }
+static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_float3 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); }
+static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_float4 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); }
+static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_float8 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); }
+static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_float16 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); }
+static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_long2 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); }
+static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_long3 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); }
+static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_long4 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); }
+static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_long8 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); }
+static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_double2 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); }
+static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_double3 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); }
+static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_double4 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); }
+static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_double8 __x) { return simd_ushort(simd_clamp(__x, 0, 0xffff)); }
+static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_uchar2 __x) { return simd_ushort(__x); }
+static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_uchar3 __x) { return simd_ushort(__x); }
+static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_uchar4 __x) { return simd_ushort(__x); }
+static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_uchar8 __x) { return simd_ushort(__x); }
+static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_uchar16 __x) { return simd_ushort(__x); }
+static simd_ushort32 SIMD_CFUNC simd_ushort_sat(simd_uchar32 __x) { return simd_ushort(__x); }
+static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_ushort2 __x) { return __x; }
+static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_ushort3 __x) { return __x; }
+static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_ushort4 __x) { return __x; }
+static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_ushort8 __x) { return __x; }
+static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_ushort16 __x) { return __x; }
+static simd_ushort32 SIMD_CFUNC simd_ushort_sat(simd_ushort32 __x) { return __x; }
+static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_uint2 __x) { return simd_ushort(simd_min(__x, 0xffff)); }
+static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_uint3 __x) { return simd_ushort(simd_min(__x, 0xffff)); }
+static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_uint4 __x) { return simd_ushort(simd_min(__x, 0xffff)); }
+static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_uint8 __x) { return simd_ushort(simd_min(__x, 0xffff)); }
+static simd_ushort16 SIMD_CFUNC simd_ushort_sat(simd_uint16 __x) { return simd_ushort(simd_min(__x, 0xffff)); }
+static simd_ushort2 SIMD_CFUNC simd_ushort_sat(simd_ulong2 __x) { return simd_ushort(simd_min(__x, 0xffff)); }
+static simd_ushort3 SIMD_CFUNC simd_ushort_sat(simd_ulong3 __x) { return simd_ushort(simd_min(__x, 0xffff)); }
+static simd_ushort4 SIMD_CFUNC simd_ushort_sat(simd_ulong4 __x) { return simd_ushort(simd_min(__x, 0xffff)); }
+static simd_ushort8 SIMD_CFUNC simd_ushort_sat(simd_ulong8 __x) { return simd_ushort(simd_min(__x, 0xffff)); }
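+
+/* Note on the patterns above: each _sat conversion applies only the
+ * clamp its source type actually needs. A signed source no wider than
+ * 16 bits can only underflow the ushort range, so simd_max(__x, 0)
+ * suffices; wider signed and floating-point sources can miss in both
+ * directions and need simd_clamp(__x, 0, 0xffff); wider unsigned
+ * sources can only overflow, so simd_min(__x, 0xffff) suffices. A
+ * scalar sketch of the int case (hypothetical helper, illustration
+ * only):
+ *
+ *     unsigned short ushort_sat_from_int(int x) {
+ *       if (x < 0)      return 0;        // saturate low
+ *       if (x > 0xffff) return 0xffff;   // saturate high
+ *       return (unsigned short)x;        // in range: exact
+ *     }
+ */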
+
+
+static simd_int2 SIMD_CFUNC simd_int(simd_char2 __x) { return __builtin_convertvector(__x, simd_int2); }
+static simd_int3 SIMD_CFUNC simd_int(simd_char3 __x) { return __builtin_convertvector(__x, simd_int3); }
+static simd_int4 SIMD_CFUNC simd_int(simd_char4 __x) { return __builtin_convertvector(__x, simd_int4); }
+static simd_int8 SIMD_CFUNC simd_int(simd_char8 __x) { return __builtin_convertvector(__x, simd_int8); }
+static simd_int16 SIMD_CFUNC simd_int(simd_char16 __x) { return __builtin_convertvector(__x, simd_int16); }
+static simd_int2 SIMD_CFUNC simd_int(simd_uchar2 __x) { return __builtin_convertvector(__x, simd_int2); }
+static simd_int3 SIMD_CFUNC simd_int(simd_uchar3 __x) { return __builtin_convertvector(__x, simd_int3); }
+static simd_int4 SIMD_CFUNC simd_int(simd_uchar4 __x) { return __builtin_convertvector(__x, simd_int4); }
+static simd_int8 SIMD_CFUNC simd_int(simd_uchar8 __x) { return __builtin_convertvector(__x, simd_int8); }
+static simd_int16 SIMD_CFUNC simd_int(simd_uchar16 __x) { return __builtin_convertvector(__x, simd_int16); }
+static simd_int2 SIMD_CFUNC simd_int(simd_short2 __x) { return __builtin_convertvector(__x, simd_int2); }
+static simd_int3 SIMD_CFUNC simd_int(simd_short3 __x) { return __builtin_convertvector(__x, simd_int3); }
+static simd_int4 SIMD_CFUNC simd_int(simd_short4 __x) { return __builtin_convertvector(__x, simd_int4); }
+static simd_int8 SIMD_CFUNC simd_int(simd_short8 __x) { return __builtin_convertvector(__x, simd_int8); }
+static simd_int16 SIMD_CFUNC simd_int(simd_short16 __x) { return __builtin_convertvector(__x, simd_int16); }
+static simd_int2 SIMD_CFUNC simd_int(simd_ushort2 __x) { return __builtin_convertvector(__x, simd_int2); }
+static simd_int3 SIMD_CFUNC simd_int(simd_ushort3 __x) { return __builtin_convertvector(__x, simd_int3); }
+static simd_int4 SIMD_CFUNC simd_int(simd_ushort4 __x) { return __builtin_convertvector(__x, simd_int4); }
+static simd_int8 SIMD_CFUNC simd_int(simd_ushort8 __x) { return __builtin_convertvector(__x, simd_int8); }
+static simd_int16 SIMD_CFUNC simd_int(simd_ushort16 __x) { return __builtin_convertvector(__x, simd_int16); }
+static simd_int2 SIMD_CFUNC simd_int(simd_int2 __x) { return __x; }
+static simd_int3 SIMD_CFUNC simd_int(simd_int3 __x) { return __x; }
+static simd_int4 SIMD_CFUNC simd_int(simd_int4 __x) { return __x; }
+static simd_int8 SIMD_CFUNC simd_int(simd_int8 __x) { return __x; }
+static simd_int16 SIMD_CFUNC simd_int(simd_int16 __x) { return __x; }
+static simd_int2 SIMD_CFUNC simd_int(simd_uint2 __x) { return (simd_int2)__x; }
+static simd_int3 SIMD_CFUNC simd_int(simd_uint3 __x) { return (simd_int3)__x; }
+static simd_int4 SIMD_CFUNC simd_int(simd_uint4 __x) { return (simd_int4)__x; }
+static simd_int8 SIMD_CFUNC simd_int(simd_uint8 __x) { return (simd_int8)__x; }
+static simd_int16 SIMD_CFUNC simd_int(simd_uint16 __x) { return (simd_int16)__x; }
+static simd_int2 SIMD_CFUNC simd_int(simd_float2 __x) { return __builtin_convertvector(__x, simd_int2); }
+static simd_int3 SIMD_CFUNC simd_int(simd_float3 __x) { return __builtin_convertvector(__x, simd_int3); }
+static simd_int4 SIMD_CFUNC simd_int(simd_float4 __x) { return __builtin_convertvector(__x, simd_int4); }
+static simd_int8 SIMD_CFUNC simd_int(simd_float8 __x) { return __builtin_convertvector(__x, simd_int8); }
+static simd_int16 SIMD_CFUNC simd_int(simd_float16 __x) { return __builtin_convertvector(__x, simd_int16); }
+static simd_int2 SIMD_CFUNC simd_int(simd_long2 __x) { return __builtin_convertvector(__x & 0xffffffff, simd_int2); }
+static simd_int3 SIMD_CFUNC simd_int(simd_long3 __x) { return __builtin_convertvector(__x & 0xffffffff, simd_int3); }
+static simd_int4 SIMD_CFUNC simd_int(simd_long4 __x) { return __builtin_convertvector(__x & 0xffffffff, simd_int4); }
+static simd_int8 SIMD_CFUNC simd_int(simd_long8 __x) { return __builtin_convertvector(__x & 0xffffffff, simd_int8); }
+static simd_int2 SIMD_CFUNC simd_int(simd_ulong2 __x) { return simd_int(simd_long(__x)); }
+static simd_int3 SIMD_CFUNC simd_int(simd_ulong3 __x) { return simd_int(simd_long(__x)); }
+static simd_int4 SIMD_CFUNC simd_int(simd_ulong4 __x) { return simd_int(simd_long(__x)); }
+static simd_int8 SIMD_CFUNC simd_int(simd_ulong8 __x) { return simd_int(simd_long(__x)); }
+static simd_int2 SIMD_CFUNC simd_int(simd_double2 __x) { return __builtin_convertvector(__x, simd_int2); }
+static simd_int3 SIMD_CFUNC simd_int(simd_double3 __x) { return __builtin_convertvector(__x, simd_int3); }
+static simd_int4 SIMD_CFUNC simd_int(simd_double4 __x) { return __builtin_convertvector(__x, simd_int4); }
+static simd_int8 SIMD_CFUNC simd_int(simd_double8 __x) { return __builtin_convertvector(__x, simd_int8); }
+
+static simd_int2 SIMD_CFUNC simd_int_sat(simd_char2 __x) { return simd_int(__x); }
+static simd_int3 SIMD_CFUNC simd_int_sat(simd_char3 __x) { return simd_int(__x); }
+static simd_int4 SIMD_CFUNC simd_int_sat(simd_char4 __x) { return simd_int(__x); }
+static simd_int8 SIMD_CFUNC simd_int_sat(simd_char8 __x) { return simd_int(__x); }
+static simd_int16 SIMD_CFUNC simd_int_sat(simd_char16 __x) { return simd_int(__x); }
+static simd_int2 SIMD_CFUNC simd_int_sat(simd_short2 __x) { return simd_int(__x); }
+static simd_int3 SIMD_CFUNC simd_int_sat(simd_short3 __x) { return simd_int(__x); }
+static simd_int4 SIMD_CFUNC simd_int_sat(simd_short4 __x) { return simd_int(__x); }
+static simd_int8 SIMD_CFUNC simd_int_sat(simd_short8 __x) { return simd_int(__x); }
+static simd_int16 SIMD_CFUNC simd_int_sat(simd_short16 __x) { return simd_int(__x); }
+static simd_int2 SIMD_CFUNC simd_int_sat(simd_int2 __x) { return __x; }
+static simd_int3 SIMD_CFUNC simd_int_sat(simd_int3 __x) { return __x; }
+static simd_int4 SIMD_CFUNC simd_int_sat(simd_int4 __x) { return __x; }
+static simd_int8 SIMD_CFUNC simd_int_sat(simd_int8 __x) { return __x; }
+static simd_int16 SIMD_CFUNC simd_int_sat(simd_int16 __x) { return __x; }
+static simd_int2 SIMD_CFUNC simd_int_sat(simd_float2 __x) { return simd_bitselect(simd_int(simd_max(__x,-0x1.0p31f)), 0x7fffffff, __x >= 0x1.0p31f); }
+static simd_int3 SIMD_CFUNC simd_int_sat(simd_float3 __x) { return simd_bitselect(simd_int(simd_max(__x,-0x1.0p31f)), 0x7fffffff, __x >= 0x1.0p31f); }
+static simd_int4 SIMD_CFUNC simd_int_sat(simd_float4 __x) { return simd_bitselect(simd_int(simd_max(__x,-0x1.0p31f)), 0x7fffffff, __x >= 0x1.0p31f); }
+static simd_int8 SIMD_CFUNC simd_int_sat(simd_float8 __x) { return simd_bitselect(simd_int(simd_max(__x,-0x1.0p31f)), 0x7fffffff, __x >= 0x1.0p31f); }
+static simd_int16 SIMD_CFUNC simd_int_sat(simd_float16 __x) { return simd_bitselect(simd_int(simd_max(__x,-0x1.0p31f)), 0x7fffffff, __x >= 0x1.0p31f); }
+static simd_int2 SIMD_CFUNC simd_int_sat(simd_long2 __x) { return simd_int(simd_clamp(__x,-0x80000000LL,0x7fffffffLL)); }
+static simd_int3 SIMD_CFUNC simd_int_sat(simd_long3 __x) { return simd_int(simd_clamp(__x,-0x80000000LL,0x7fffffffLL)); }
+static simd_int4 SIMD_CFUNC simd_int_sat(simd_long4 __x) { return simd_int(simd_clamp(__x,-0x80000000LL,0x7fffffffLL)); }
+static simd_int8 SIMD_CFUNC simd_int_sat(simd_long8 __x) { return simd_int(simd_clamp(__x,-0x80000000LL,0x7fffffffLL)); }
+static simd_int2 SIMD_CFUNC simd_int_sat(simd_double2 __x) { return simd_int(simd_clamp(__x,-0x1.0p31,0x1.fffffffcp30)); }
+static simd_int3 SIMD_CFUNC simd_int_sat(simd_double3 __x) { return simd_int(simd_clamp(__x,-0x1.0p31,0x1.fffffffcp30)); }
+static simd_int4 SIMD_CFUNC simd_int_sat(simd_double4 __x) { return simd_int(simd_clamp(__x,-0x1.0p31,0x1.fffffffcp30)); }
+static simd_int8 SIMD_CFUNC simd_int_sat(simd_double8 __x) { return simd_int(simd_clamp(__x,-0x1.0p31,0x1.fffffffcp30)); }
+static simd_int2 SIMD_CFUNC simd_int_sat(simd_uchar2 __x) { return simd_int(__x); }
+static simd_int3 SIMD_CFUNC simd_int_sat(simd_uchar3 __x) { return simd_int(__x); }
+static simd_int4 SIMD_CFUNC simd_int_sat(simd_uchar4 __x) { return simd_int(__x); }
+static simd_int8 SIMD_CFUNC simd_int_sat(simd_uchar8 __x) { return simd_int(__x); }
+static simd_int16 SIMD_CFUNC simd_int_sat(simd_uchar16 __x) { return simd_int(__x); }
+static simd_int2 SIMD_CFUNC simd_int_sat(simd_ushort2 __x) { return simd_int(__x); }
+static simd_int3 SIMD_CFUNC simd_int_sat(simd_ushort3 __x) { return simd_int(__x); }
+static simd_int4 SIMD_CFUNC simd_int_sat(simd_ushort4 __x) { return simd_int(__x); }
+static simd_int8 SIMD_CFUNC simd_int_sat(simd_ushort8 __x) { return simd_int(__x); }
+static simd_int16 SIMD_CFUNC simd_int_sat(simd_ushort16 __x) { return simd_int(__x); }
+static simd_int2 SIMD_CFUNC simd_int_sat(simd_uint2 __x) { return simd_int(simd_min(__x,0x7fffffff)); }
+static simd_int3 SIMD_CFUNC simd_int_sat(simd_uint3 __x) { return simd_int(simd_min(__x,0x7fffffff)); }
+static simd_int4 SIMD_CFUNC simd_int_sat(simd_uint4 __x) { return simd_int(simd_min(__x,0x7fffffff)); }
+static simd_int8 SIMD_CFUNC simd_int_sat(simd_uint8 __x) { return simd_int(simd_min(__x,0x7fffffff)); }
+static simd_int16 SIMD_CFUNC simd_int_sat(simd_uint16 __x) { return simd_int(simd_min(__x,0x7fffffff)); }
+static simd_int2 SIMD_CFUNC simd_int_sat(simd_ulong2 __x) { return simd_int(simd_min(__x,0x7fffffff)); }
+static simd_int3 SIMD_CFUNC simd_int_sat(simd_ulong3 __x) { return simd_int(simd_min(__x,0x7fffffff)); }
+static simd_int4 SIMD_CFUNC simd_int_sat(simd_ulong4 __x) { return simd_int(simd_min(__x,0x7fffffff)); }
+static simd_int8 SIMD_CFUNC simd_int_sat(simd_ulong8 __x) { return simd_int(simd_min(__x,0x7fffffff)); }
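+
+/* For the floating-point sources above, the saturation bound
+ * 0x7fffffff is not exactly representable in float, so the upper
+ * limit cannot be applied with simd_clamp. Instead the code clamps
+ * below with simd_max, converts, and then forces every lane with
+ * __x >= 0x1.0p31f to INT_MAX via simd_bitselect; the comparison
+ * produces an all-ones mask in exactly those lanes. A scalar sketch,
+ * ignoring NaN lanes (hypothetical helper, illustration only):
+ *
+ *     int int_sat_from_float(float x) {
+ *       if (x >= 0x1.0p31f) return 0x7fffffff;  // saturate high
+ *       if (x < -0x1.0p31f) x = -0x1.0p31f;     // -2^31 is exact in float
+ *       return (int)x;                          // now in range
+ *     }
+ */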
+
+static simd_int2 SIMD_CFUNC simd_int_rte(simd_float2 __x) {
+#if defined __arm64__
+ return vcvtn_s32_f32(__x);
+#else
+ return simd_make_int2(simd_int_rte(simd_make_float4_undef(__x)));
+#endif
+}
+
+static simd_int3 SIMD_CFUNC simd_int_rte(simd_float3 __x) {
+ return simd_make_int3(simd_int_rte(simd_make_float4_undef(__x)));
+}
+
+static simd_int4 SIMD_CFUNC simd_int_rte(simd_float4 __x) {
+#if defined __SSE2__
+ return _mm_cvtps_epi32(__x);
+#elif defined __arm64__
+ return vcvtnq_s32_f32(__x);
+#else
+ simd_float4 magic = __tg_copysign(0x1.0p23f, __x);
+ simd_int4 x_is_small = __tg_fabs(__x) < 0x1.0p23f;
+ return __builtin_convertvector(simd_bitselect(__x, (__x + magic) - magic, x_is_small & 0x7fffffff), simd_int4);
+#endif
+}
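+
+/* The portable branch above uses the classic magic-number rounding
+ * trick: for |x| < 0x1.0p23f, x + copysign(0x1.0p23f, x) lands in a
+ * binade where floats have no fraction bits, so the addition itself
+ * rounds to nearest, ties to even; subtracting the magic value back
+ * leaves x rounded to an integer. Masking the selector with
+ * 0x7fffffff preserves the sign bit of __x, so large lanes and NaNs
+ * pass through unchanged and -0.25f rounds toward -0.0f. A scalar
+ * sketch, assuming IEEE-754 round-to-nearest and <math.h>
+ * (illustration only):
+ *
+ *     float round_ties_to_even(float x) {
+ *       if (fabsf(x) >= 0x1.0p23f) return x;   // already integral
+ *       float magic = copysignf(0x1.0p23f, x);
+ *       return (x + magic) - magic;            // hardware rounds the add
+ *     }
+ */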
+
+static simd_int8 SIMD_CFUNC simd_int_rte(simd_float8 __x) {
+#if defined __AVX__
+ return _mm256_cvtps_epi32(__x);
+#else
+ return simd_make_int8(simd_int_rte(__x.lo), simd_int_rte(__x.hi));
+#endif
+}
+
+static simd_int16 SIMD_CFUNC simd_int_rte(simd_float16 __x) {
+#if defined __AVX512F__
+ return _mm512_cvt_roundps_epi32(__x, _MM_FROUND_RINT);
+#else
+ return simd_make_int16(simd_int_rte(__x.lo), simd_int_rte(__x.hi));
+#endif
+}
+
+static simd_uint2 SIMD_CFUNC simd_uint(simd_char2 __x) { return simd_uint(simd_int(__x)); }
+static simd_uint3 SIMD_CFUNC simd_uint(simd_char3 __x) { return simd_uint(simd_int(__x)); }
+static simd_uint4 SIMD_CFUNC simd_uint(simd_char4 __x) { return simd_uint(simd_int(__x)); }
+static simd_uint8 SIMD_CFUNC simd_uint(simd_char8 __x) { return simd_uint(simd_int(__x)); }
+static simd_uint16 SIMD_CFUNC simd_uint(simd_char16 __x) { return simd_uint(simd_int(__x)); }
+static simd_uint2 SIMD_CFUNC simd_uint(simd_uchar2 __x) { return simd_uint(simd_int(__x)); }
+static simd_uint3 SIMD_CFUNC simd_uint(simd_uchar3 __x) { return simd_uint(simd_int(__x)); }
+static simd_uint4 SIMD_CFUNC simd_uint(simd_uchar4 __x) { return simd_uint(simd_int(__x)); }
+static simd_uint8 SIMD_CFUNC simd_uint(simd_uchar8 __x) { return simd_uint(simd_int(__x)); }
+static simd_uint16 SIMD_CFUNC simd_uint(simd_uchar16 __x) { return simd_uint(simd_int(__x)); }
+static simd_uint2 SIMD_CFUNC simd_uint(simd_short2 __x) { return simd_uint(simd_int(__x)); }
+static simd_uint3 SIMD_CFUNC simd_uint(simd_short3 __x) { return simd_uint(simd_int(__x)); }
+static simd_uint4 SIMD_CFUNC simd_uint(simd_short4 __x) { return simd_uint(simd_int(__x)); }
+static simd_uint8 SIMD_CFUNC simd_uint(simd_short8 __x) { return simd_uint(simd_int(__x)); }
+static simd_uint16 SIMD_CFUNC simd_uint(simd_short16 __x) { return simd_uint(simd_int(__x)); }
+static simd_uint2 SIMD_CFUNC simd_uint(simd_ushort2 __x) { return simd_uint(simd_int(__x)); }
+static simd_uint3 SIMD_CFUNC simd_uint(simd_ushort3 __x) { return simd_uint(simd_int(__x)); }
+static simd_uint4 SIMD_CFUNC simd_uint(simd_ushort4 __x) { return simd_uint(simd_int(__x)); }
+static simd_uint8 SIMD_CFUNC simd_uint(simd_ushort8 __x) { return simd_uint(simd_int(__x)); }
+static simd_uint16 SIMD_CFUNC simd_uint(simd_ushort16 __x) { return simd_uint(simd_int(__x)); }
+static simd_uint2 SIMD_CFUNC simd_uint(simd_int2 __x) { return (simd_uint2)__x; }
+static simd_uint3 SIMD_CFUNC simd_uint(simd_int3 __x) { return (simd_uint3)__x; }
+static simd_uint4 SIMD_CFUNC simd_uint(simd_int4 __x) { return (simd_uint4)__x; }
+static simd_uint8 SIMD_CFUNC simd_uint(simd_int8 __x) { return (simd_uint8)__x; }
+static simd_uint16 SIMD_CFUNC simd_uint(simd_int16 __x) { return (simd_uint16)__x; }
+static simd_uint2 SIMD_CFUNC simd_uint(simd_uint2 __x) { return __x; }
+static simd_uint3 SIMD_CFUNC simd_uint(simd_uint3 __x) { return __x; }
+static simd_uint4 SIMD_CFUNC simd_uint(simd_uint4 __x) { return __x; }
+static simd_uint8 SIMD_CFUNC simd_uint(simd_uint8 __x) { return __x; }
+static simd_uint16 SIMD_CFUNC simd_uint(simd_uint16 __x) { return __x; }
+static simd_uint2 SIMD_CFUNC simd_uint(simd_float2 __x) { simd_int2 __big = __x >= 0x1.0p31f; return simd_uint(simd_int(__x - simd_bitselect((simd_float2)0,0x1.0p31f,__big))) + simd_bitselect((simd_uint2)0,0x80000000,__big); }
+static simd_uint3 SIMD_CFUNC simd_uint(simd_float3 __x) { simd_int3 __big = __x >= 0x1.0p31f; return simd_uint(simd_int(__x - simd_bitselect((simd_float3)0,0x1.0p31f,__big))) + simd_bitselect((simd_uint3)0,0x80000000,__big); }
+static simd_uint4 SIMD_CFUNC simd_uint(simd_float4 __x) { simd_int4 __big = __x >= 0x1.0p31f; return simd_uint(simd_int(__x - simd_bitselect((simd_float4)0,0x1.0p31f,__big))) + simd_bitselect((simd_uint4)0,0x80000000,__big); }
+static simd_uint8 SIMD_CFUNC simd_uint(simd_float8 __x) { simd_int8 __big = __x >= 0x1.0p31f; return simd_uint(simd_int(__x - simd_bitselect((simd_float8)0,0x1.0p31f,__big))) + simd_bitselect((simd_uint8)0,0x80000000,__big); }
+static simd_uint16 SIMD_CFUNC simd_uint(simd_float16 __x) { simd_int16 __big = __x >= 0x1.0p31f; return simd_uint(simd_int(__x - simd_bitselect((simd_float16)0,0x1.0p31f,__big))) + simd_bitselect((simd_uint16)0,0x80000000,__big); }
+static simd_uint2 SIMD_CFUNC simd_uint(simd_long2 __x) { return simd_uint(simd_int(__x)); }
+static simd_uint3 SIMD_CFUNC simd_uint(simd_long3 __x) { return simd_uint(simd_int(__x)); }
+static simd_uint4 SIMD_CFUNC simd_uint(simd_long4 __x) { return simd_uint(simd_int(__x)); }
+static simd_uint8 SIMD_CFUNC simd_uint(simd_long8 __x) { return simd_uint(simd_int(__x)); }
+static simd_uint2 SIMD_CFUNC simd_uint(simd_ulong2 __x) { return simd_uint(simd_int(__x)); }
+static simd_uint3 SIMD_CFUNC simd_uint(simd_ulong3 __x) { return simd_uint(simd_int(__x)); }
+static simd_uint4 SIMD_CFUNC simd_uint(simd_ulong4 __x) { return simd_uint(simd_int(__x)); }
+static simd_uint8 SIMD_CFUNC simd_uint(simd_ulong8 __x) { return simd_uint(simd_int(__x)); }
+static simd_uint2 SIMD_CFUNC simd_uint(simd_double2 __x) { simd_long2 __big = __x > 0x1.fffffffcp30; return simd_uint(simd_int(__x - simd_bitselect((simd_double2)0,0x1.0p31,__big))) + simd_bitselect((simd_uint2)0,0x80000000,simd_int(__big)); }
+static simd_uint3 SIMD_CFUNC simd_uint(simd_double3 __x) { simd_long3 __big = __x > 0x1.fffffffcp30; return simd_uint(simd_int(__x - simd_bitselect((simd_double3)0,0x1.0p31,__big))) + simd_bitselect((simd_uint3)0,0x80000000,simd_int(__big)); }
+static simd_uint4 SIMD_CFUNC simd_uint(simd_double4 __x) { simd_long4 __big = __x > 0x1.fffffffcp30; return simd_uint(simd_int(__x - simd_bitselect((simd_double4)0,0x1.0p31,__big))) + simd_bitselect((simd_uint4)0,0x80000000,simd_int(__big)); }
+static simd_uint8 SIMD_CFUNC simd_uint(simd_double8 __x) { simd_long8 __big = __x > 0x1.fffffffcp30; return simd_uint(simd_int(__x - simd_bitselect((simd_double8)0,0x1.0p31,__big))) + simd_bitselect((simd_uint8)0,0x80000000,simd_int(__big)); }
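+
+/* The float and double conversions above lean on the signed path,
+ * which only covers [-2^31, 2^31). Lanes at or above the cutoff have
+ * 2^31 (exactly representable in both formats) subtracted before the
+ * signed conversion, and 0x80000000 added back afterwards in the
+ * integer domain; __big is an all-ones mask in exactly those lanes,
+ * so the simd_bitselect calls splat the two offsets lane-wise. A
+ * scalar sketch for 0 <= x < 0x1.0p32 (hypothetical helper,
+ * illustration only):
+ *
+ *     unsigned uint_from_double(double x) {
+ *       if (x >= 0x1.0p31)                     // beyond the signed range
+ *         return (unsigned)(int)(x - 0x1.0p31) + 0x80000000u;
+ *       return (unsigned)(int)x;               // plain signed convert
+ *     }
+ */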
+
+static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_char2 __x) { return simd_uint(simd_max(__x,0)); }
+static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_char3 __x) { return simd_uint(simd_max(__x,0)); }
+static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_char4 __x) { return simd_uint(simd_max(__x,0)); }
+static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_char8 __x) { return simd_uint(simd_max(__x,0)); }
+static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_char16 __x) { return simd_uint(simd_max(__x,0)); }
+static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_short2 __x) { return simd_uint(simd_max(__x,0)); }
+static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_short3 __x) { return simd_uint(simd_max(__x,0)); }
+static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_short4 __x) { return simd_uint(simd_max(__x,0)); }
+static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_short8 __x) { return simd_uint(simd_max(__x,0)); }
+static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_short16 __x) { return simd_uint(simd_max(__x,0)); }
+static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_int2 __x) { return simd_uint(simd_max(__x,0)); }
+static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_int3 __x) { return simd_uint(simd_max(__x,0)); }
+static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_int4 __x) { return simd_uint(simd_max(__x,0)); }
+static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_int8 __x) { return simd_uint(simd_max(__x,0)); }
+static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_int16 __x) { return simd_uint(simd_max(__x,0)); }
+static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_float2 __x) { return simd_bitselect(simd_uint(simd_max(__x,0)), 0xffffffff, __x >= 0x1.0p32f); }
+static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_float3 __x) { return simd_bitselect(simd_uint(simd_max(__x,0)), 0xffffffff, __x >= 0x1.0p32f); }
+static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_float4 __x) { return simd_bitselect(simd_uint(simd_max(__x,0)), 0xffffffff, __x >= 0x1.0p32f); }
+static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_float8 __x) { return simd_bitselect(simd_uint(simd_max(__x,0)), 0xffffffff, __x >= 0x1.0p32f); }
+static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_float16 __x) { return simd_bitselect(simd_uint(simd_max(__x,0)), 0xffffffff, __x >= 0x1.0p32f); }
+static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_long2 __x) { return simd_uint(simd_clamp(__x,0,0xffffffff)); }
+static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_long3 __x) { return simd_uint(simd_clamp(__x,0,0xffffffff)); }
+static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_long4 __x) { return simd_uint(simd_clamp(__x,0,0xffffffff)); }
+static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_long8 __x) { return simd_uint(simd_clamp(__x,0,0xffffffff)); }
+static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_double2 __x) { return simd_uint(simd_clamp(__x,0,0xffffffff)); }
+static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_double3 __x) { return simd_uint(simd_clamp(__x,0,0xffffffff)); }
+static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_double4 __x) { return simd_uint(simd_clamp(__x,0,0xffffffff)); }
+static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_double8 __x) { return simd_uint(simd_clamp(__x,0,0xffffffff)); }
+static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_uchar2 __x) { return simd_uint(__x); }
+static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_uchar3 __x) { return simd_uint(__x); }
+static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_uchar4 __x) { return simd_uint(__x); }
+static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_uchar8 __x) { return simd_uint(__x); }
+static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_uchar16 __x) { return simd_uint(__x); }
+static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_ushort2 __x) { return simd_uint(__x); }
+static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_ushort3 __x) { return simd_uint(__x); }
+static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_ushort4 __x) { return simd_uint(__x); }
+static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_ushort8 __x) { return simd_uint(__x); }
+static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_ushort16 __x) { return simd_uint(__x); }
+static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_uint2 __x) { return __x; }
+static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_uint3 __x) { return __x; }
+static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_uint4 __x) { return __x; }
+static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_uint8 __x) { return __x; }
+static simd_uint16 SIMD_CFUNC simd_uint_sat(simd_uint16 __x) { return __x; }
+static simd_uint2 SIMD_CFUNC simd_uint_sat(simd_ulong2 __x) { return simd_uint(simd_clamp(__x,0,0xffffffff)); }
+static simd_uint3 SIMD_CFUNC simd_uint_sat(simd_ulong3 __x) { return simd_uint(simd_clamp(__x,0,0xffffffff)); }
+static simd_uint4 SIMD_CFUNC simd_uint_sat(simd_ulong4 __x) { return simd_uint(simd_clamp(__x,0,0xffffffff)); }
+static simd_uint8 SIMD_CFUNC simd_uint_sat(simd_ulong8 __x) { return simd_uint(simd_clamp(__x,0,0xffffffff)); }
+
+
+static simd_float2 SIMD_CFUNC simd_float(simd_char2 __x) { return (simd_float2)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; }
+static simd_float3 SIMD_CFUNC simd_float(simd_char3 __x) { return (simd_float3)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; }
+static simd_float4 SIMD_CFUNC simd_float(simd_char4 __x) { return (simd_float4)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; }
+static simd_float8 SIMD_CFUNC simd_float(simd_char8 __x) { return (simd_float8)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; }
+static simd_float16 SIMD_CFUNC simd_float(simd_char16 __x) { return (simd_float16)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; }
+static simd_float2 SIMD_CFUNC simd_float(simd_uchar2 __x) { return (simd_float2)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; }
+static simd_float3 SIMD_CFUNC simd_float(simd_uchar3 __x) { return (simd_float3)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; }
+static simd_float4 SIMD_CFUNC simd_float(simd_uchar4 __x) { return (simd_float4)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; }
+static simd_float8 SIMD_CFUNC simd_float(simd_uchar8 __x) { return (simd_float8)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; }
+static simd_float16 SIMD_CFUNC simd_float(simd_uchar16 __x) { return (simd_float16)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; }
+static simd_float2 SIMD_CFUNC simd_float(simd_short2 __x) { return (simd_float2)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; }
+static simd_float3 SIMD_CFUNC simd_float(simd_short3 __x) { return (simd_float3)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; }
+static simd_float4 SIMD_CFUNC simd_float(simd_short4 __x) { return (simd_float4)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; }
+static simd_float8 SIMD_CFUNC simd_float(simd_short8 __x) { return (simd_float8)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; }
+static simd_float16 SIMD_CFUNC simd_float(simd_short16 __x) { return (simd_float16)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; }
+static simd_float2 SIMD_CFUNC simd_float(simd_ushort2 __x) { return (simd_float2)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; }
+static simd_float3 SIMD_CFUNC simd_float(simd_ushort3 __x) { return (simd_float3)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; }
+static simd_float4 SIMD_CFUNC simd_float(simd_ushort4 __x) { return (simd_float4)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; }
+static simd_float8 SIMD_CFUNC simd_float(simd_ushort8 __x) { return (simd_float8)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; }
+static simd_float16 SIMD_CFUNC simd_float(simd_ushort16 __x) { return (simd_float16)(simd_int(__x) + 0x4b400000) - 0x1.8p23f; }
+static simd_float2 SIMD_CFUNC simd_float(simd_int2 __x) { return __builtin_convertvector(__x,simd_float2); }
+static simd_float3 SIMD_CFUNC simd_float(simd_int3 __x) { return __builtin_convertvector(__x,simd_float3); }
+static simd_float4 SIMD_CFUNC simd_float(simd_int4 __x) { return __builtin_convertvector(__x,simd_float4); }
+static simd_float8 SIMD_CFUNC simd_float(simd_int8 __x) { return __builtin_convertvector(__x,simd_float8); }
+static simd_float16 SIMD_CFUNC simd_float(simd_int16 __x) { return __builtin_convertvector(__x,simd_float16); }
+static simd_float2 SIMD_CFUNC simd_float(simd_uint2 __x) { return __builtin_convertvector(__x,simd_float2); }
+static simd_float3 SIMD_CFUNC simd_float(simd_uint3 __x) { return __builtin_convertvector(__x,simd_float3); }
+static simd_float4 SIMD_CFUNC simd_float(simd_uint4 __x) { return __builtin_convertvector(__x,simd_float4); }
+static simd_float8 SIMD_CFUNC simd_float(simd_uint8 __x) { return __builtin_convertvector(__x,simd_float8); }
+static simd_float16 SIMD_CFUNC simd_float(simd_uint16 __x) { return __builtin_convertvector(__x,simd_float16); }
+static simd_float2 SIMD_CFUNC simd_float(simd_float2 __x) { return __x; }
+static simd_float3 SIMD_CFUNC simd_float(simd_float3 __x) { return __x; }
+static simd_float4 SIMD_CFUNC simd_float(simd_float4 __x) { return __x; }
+static simd_float8 SIMD_CFUNC simd_float(simd_float8 __x) { return __x; }
+static simd_float16 SIMD_CFUNC simd_float(simd_float16 __x) { return __x; }
+static simd_float2 SIMD_CFUNC simd_float(simd_long2 __x) { return __builtin_convertvector(__x,simd_float2); }
+static simd_float3 SIMD_CFUNC simd_float(simd_long3 __x) { return __builtin_convertvector(__x,simd_float3); }
+static simd_float4 SIMD_CFUNC simd_float(simd_long4 __x) { return __builtin_convertvector(__x,simd_float4); }
+static simd_float8 SIMD_CFUNC simd_float(simd_long8 __x) { return __builtin_convertvector(__x,simd_float8); }
+static simd_float2 SIMD_CFUNC simd_float(simd_ulong2 __x) { return __builtin_convertvector(__x,simd_float2); }
+static simd_float3 SIMD_CFUNC simd_float(simd_ulong3 __x) { return __builtin_convertvector(__x,simd_float3); }
+static simd_float4 SIMD_CFUNC simd_float(simd_ulong4 __x) { return __builtin_convertvector(__x,simd_float4); }
+static simd_float8 SIMD_CFUNC simd_float(simd_ulong8 __x) { return __builtin_convertvector(__x,simd_float8); }
+static simd_float2 SIMD_CFUNC simd_float(simd_double2 __x) { return __builtin_convertvector(__x,simd_float2); }
+static simd_float3 SIMD_CFUNC simd_float(simd_double3 __x) { return __builtin_convertvector(__x,simd_float3); }
+static simd_float4 SIMD_CFUNC simd_float(simd_double4 __x) { return __builtin_convertvector(__x,simd_float4); }
+static simd_float8 SIMD_CFUNC simd_float(simd_double8 __x) { return __builtin_convertvector(__x,simd_float8); }
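+
+/* The 8- and 16-bit integer cases above avoid a widening
+ * int-to-float convert with a bit trick: 0x4b400000 is the bit
+ * pattern of 0x1.8p23f (12582912.0f), and the cast to a float vector
+ * reinterprets bits rather than converting values. For |n| <= 2^22,
+ * adding n to that pattern only perturbs the mantissa, producing a
+ * float exactly equal to 0x1.8p23f + n, so subtracting 0x1.8p23f
+ * recovers n with no rounding. A scalar sketch of the same punning
+ * (hypothetical helper, illustration only):
+ *
+ *     float float_from_short(short n) {
+ *       union { int i; float f; } u = { .i = (int)n + 0x4b400000 };
+ *       return u.f - 0x1.8p23f;   // exact for |n| <= 0x400000
+ *     }
+ */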
+
+
+static simd_long2 SIMD_CFUNC simd_long(simd_char2 __x) { return __builtin_convertvector(__x,simd_long2); }
+static simd_long3 SIMD_CFUNC simd_long(simd_char3 __x) { return __builtin_convertvector(__x,simd_long3); }
+static simd_long4 SIMD_CFUNC simd_long(simd_char4 __x) { return __builtin_convertvector(__x,simd_long4); }
+static simd_long8 SIMD_CFUNC simd_long(simd_char8 __x) { return __builtin_convertvector(__x,simd_long8); }
+static simd_long2 SIMD_CFUNC simd_long(simd_uchar2 __x) { return __builtin_convertvector(__x,simd_long2); }
+static simd_long3 SIMD_CFUNC simd_long(simd_uchar3 __x) { return __builtin_convertvector(__x,simd_long3); }
+static simd_long4 SIMD_CFUNC simd_long(simd_uchar4 __x) { return __builtin_convertvector(__x,simd_long4); }
+static simd_long8 SIMD_CFUNC simd_long(simd_uchar8 __x) { return __builtin_convertvector(__x,simd_long8); }
+static simd_long2 SIMD_CFUNC simd_long(simd_short2 __x) { return __builtin_convertvector(__x,simd_long2); }
+static simd_long3 SIMD_CFUNC simd_long(simd_short3 __x) { return __builtin_convertvector(__x,simd_long3); }
+static simd_long4 SIMD_CFUNC simd_long(simd_short4 __x) { return __builtin_convertvector(__x,simd_long4); }
+static simd_long8 SIMD_CFUNC simd_long(simd_short8 __x) { return __builtin_convertvector(__x,simd_long8); }
+static simd_long2 SIMD_CFUNC simd_long(simd_ushort2 __x) { return __builtin_convertvector(__x,simd_long2); }
+static simd_long3 SIMD_CFUNC simd_long(simd_ushort3 __x) { return __builtin_convertvector(__x,simd_long3); }
+static simd_long4 SIMD_CFUNC simd_long(simd_ushort4 __x) { return __builtin_convertvector(__x,simd_long4); }
+static simd_long8 SIMD_CFUNC simd_long(simd_ushort8 __x) { return __builtin_convertvector(__x,simd_long8); }
+static simd_long2 SIMD_CFUNC simd_long(simd_int2 __x) { return __builtin_convertvector(__x,simd_long2); }
+static simd_long3 SIMD_CFUNC simd_long(simd_int3 __x) { return __builtin_convertvector(__x,simd_long3); }
+static simd_long4 SIMD_CFUNC simd_long(simd_int4 __x) { return __builtin_convertvector(__x,simd_long4); }
+static simd_long8 SIMD_CFUNC simd_long(simd_int8 __x) { return __builtin_convertvector(__x,simd_long8); }
+static simd_long2 SIMD_CFUNC simd_long(simd_uint2 __x) { return __builtin_convertvector(__x,simd_long2); }
+static simd_long3 SIMD_CFUNC simd_long(simd_uint3 __x) { return __builtin_convertvector(__x,simd_long3); }
+static simd_long4 SIMD_CFUNC simd_long(simd_uint4 __x) { return __builtin_convertvector(__x,simd_long4); }
+static simd_long8 SIMD_CFUNC simd_long(simd_uint8 __x) { return __builtin_convertvector(__x,simd_long8); }
+static simd_long2 SIMD_CFUNC simd_long(simd_float2 __x) { return __builtin_convertvector(__x,simd_long2); }
+static simd_long3 SIMD_CFUNC simd_long(simd_float3 __x) { return __builtin_convertvector(__x,simd_long3); }
+static simd_long4 SIMD_CFUNC simd_long(simd_float4 __x) { return __builtin_convertvector(__x,simd_long4); }
+static simd_long8 SIMD_CFUNC simd_long(simd_float8 __x) { return __builtin_convertvector(__x,simd_long8); }
+static simd_long2 SIMD_CFUNC simd_long(simd_long2 __x) { return __x; }
+static simd_long3 SIMD_CFUNC simd_long(simd_long3 __x) { return __x; }
+static simd_long4 SIMD_CFUNC simd_long(simd_long4 __x) { return __x; }
+static simd_long8 SIMD_CFUNC simd_long(simd_long8 __x) { return __x; }
+static simd_long2 SIMD_CFUNC simd_long(simd_ulong2 __x) { return (simd_long2)__x; }
+static simd_long3 SIMD_CFUNC simd_long(simd_ulong3 __x) { return (simd_long3)__x; }
+static simd_long4 SIMD_CFUNC simd_long(simd_ulong4 __x) { return (simd_long4)__x; }
+static simd_long8 SIMD_CFUNC simd_long(simd_ulong8 __x) { return (simd_long8)__x; }
+static simd_long2 SIMD_CFUNC simd_long(simd_double2 __x) { return __builtin_convertvector(__x,simd_long2); }
+static simd_long3 SIMD_CFUNC simd_long(simd_double3 __x) { return __builtin_convertvector(__x,simd_long3); }
+static simd_long4 SIMD_CFUNC simd_long(simd_double4 __x) { return __builtin_convertvector(__x,simd_long4); }
+static simd_long8 SIMD_CFUNC simd_long(simd_double8 __x) { return __builtin_convertvector(__x,simd_long8); }
+
+static simd_long2 SIMD_CFUNC simd_long_sat(simd_char2 __x) { return simd_long(__x); }
+static simd_long3 SIMD_CFUNC simd_long_sat(simd_char3 __x) { return simd_long(__x); }
+static simd_long4 SIMD_CFUNC simd_long_sat(simd_char4 __x) { return simd_long(__x); }
+static simd_long8 SIMD_CFUNC simd_long_sat(simd_char8 __x) { return simd_long(__x); }
+static simd_long2 SIMD_CFUNC simd_long_sat(simd_short2 __x) { return simd_long(__x); }
+static simd_long3 SIMD_CFUNC simd_long_sat(simd_short3 __x) { return simd_long(__x); }
+static simd_long4 SIMD_CFUNC simd_long_sat(simd_short4 __x) { return simd_long(__x); }
+static simd_long8 SIMD_CFUNC simd_long_sat(simd_short8 __x) { return simd_long(__x); }
+static simd_long2 SIMD_CFUNC simd_long_sat(simd_int2 __x) { return simd_long(__x); }
+static simd_long3 SIMD_CFUNC simd_long_sat(simd_int3 __x) { return simd_long(__x); }
+static simd_long4 SIMD_CFUNC simd_long_sat(simd_int4 __x) { return simd_long(__x); }
+static simd_long8 SIMD_CFUNC simd_long_sat(simd_int8 __x) { return simd_long(__x); }
+static simd_long2 SIMD_CFUNC simd_long_sat(simd_float2 __x) { return simd_bitselect(simd_long(simd_max(__x,-0x1.0p63f)), 0x7fffffffffffffff, simd_long(__x >= 0x1.0p63f)); }
+static simd_long3 SIMD_CFUNC simd_long_sat(simd_float3 __x) { return simd_bitselect(simd_long(simd_max(__x,-0x1.0p63f)), 0x7fffffffffffffff, simd_long(__x >= 0x1.0p63f)); }
+static simd_long4 SIMD_CFUNC simd_long_sat(simd_float4 __x) { return simd_bitselect(simd_long(simd_max(__x,-0x1.0p63f)), 0x7fffffffffffffff, simd_long(__x >= 0x1.0p63f)); }
+static simd_long8 SIMD_CFUNC simd_long_sat(simd_float8 __x) { return simd_bitselect(simd_long(simd_max(__x,-0x1.0p63f)), 0x7fffffffffffffff, simd_long(__x >= 0x1.0p63f)); }
+static simd_long2 SIMD_CFUNC simd_long_sat(simd_long2 __x) { return __x; }
+static simd_long3 SIMD_CFUNC simd_long_sat(simd_long3 __x) { return __x; }
+static simd_long4 SIMD_CFUNC simd_long_sat(simd_long4 __x) { return __x; }
+static simd_long8 SIMD_CFUNC simd_long_sat(simd_long8 __x) { return __x; }
+static simd_long2 SIMD_CFUNC simd_long_sat(simd_double2 __x) { return simd_bitselect(simd_long(simd_max(__x,-0x1.0p63)), 0x7fffffffffffffff, __x >= 0x1.0p63); }
+static simd_long3 SIMD_CFUNC simd_long_sat(simd_double3 __x) { return simd_bitselect(simd_long(simd_max(__x,-0x1.0p63)), 0x7fffffffffffffff, __x >= 0x1.0p63); }
+static simd_long4 SIMD_CFUNC simd_long_sat(simd_double4 __x) { return simd_bitselect(simd_long(simd_max(__x,-0x1.0p63)), 0x7fffffffffffffff, __x >= 0x1.0p63); }
+static simd_long8 SIMD_CFUNC simd_long_sat(simd_double8 __x) { return simd_bitselect(simd_long(simd_max(__x,-0x1.0p63)), 0x7fffffffffffffff, __x >= 0x1.0p63); }
+static simd_long2 SIMD_CFUNC simd_long_sat(simd_uchar2 __x) { return simd_long(__x); }
+static simd_long3 SIMD_CFUNC simd_long_sat(simd_uchar3 __x) { return simd_long(__x); }
+static simd_long4 SIMD_CFUNC simd_long_sat(simd_uchar4 __x) { return simd_long(__x); }
+static simd_long8 SIMD_CFUNC simd_long_sat(simd_uchar8 __x) { return simd_long(__x); }
+static simd_long2 SIMD_CFUNC simd_long_sat(simd_ushort2 __x) { return simd_long(__x); }
+static simd_long3 SIMD_CFUNC simd_long_sat(simd_ushort3 __x) { return simd_long(__x); }
+static simd_long4 SIMD_CFUNC simd_long_sat(simd_ushort4 __x) { return simd_long(__x); }
+static simd_long8 SIMD_CFUNC simd_long_sat(simd_ushort8 __x) { return simd_long(__x); }
+static simd_long2 SIMD_CFUNC simd_long_sat(simd_uint2 __x) { return simd_long(__x); }
+static simd_long3 SIMD_CFUNC simd_long_sat(simd_uint3 __x) { return simd_long(__x); }
+static simd_long4 SIMD_CFUNC simd_long_sat(simd_uint4 __x) { return simd_long(__x); }
+static simd_long8 SIMD_CFUNC simd_long_sat(simd_uint8 __x) { return simd_long(__x); }
+static simd_long2 SIMD_CFUNC simd_long_sat(simd_ulong2 __x) { return simd_long(simd_min(__x,0x7fffffffffffffff)); }
+static simd_long3 SIMD_CFUNC simd_long_sat(simd_ulong3 __x) { return simd_long(simd_min(__x,0x7fffffffffffffff)); }
+static simd_long4 SIMD_CFUNC simd_long_sat(simd_ulong4 __x) { return simd_long(simd_min(__x,0x7fffffffffffffff)); }
+static simd_long8 SIMD_CFUNC simd_long_sat(simd_ulong8 __x) { return simd_long(simd_min(__x,0x7fffffffffffffff)); }
+
+static simd_long2 SIMD_CFUNC simd_long_rte(simd_double2 __x) {
+#if defined __AVX512F__
+ return _mm_cvtpd_epi64(__x);
+#elif defined __arm64__
+ return vcvtnq_s64_f64(__x);
+#else
+ simd_double2 magic = __tg_copysign(0x1.0p52, __x);
+ simd_long2 x_is_small = __tg_fabs(__x) < 0x1.0p52;
+ return __builtin_convertvector(simd_bitselect(__x, (__x + magic) - magic, x_is_small & 0x7fffffffffffffff), simd_long2);
+#endif
+}
+
+static simd_long3 SIMD_CFUNC simd_long_rte(simd_double3 __x) {
+ return simd_make_long3(simd_long_rte(simd_make_double4_undef(__x)));
+}
+
+static simd_long4 SIMD_CFUNC simd_long_rte(simd_double4 __x) {
+#if defined __AVX512F__
+ return _mm256_cvtpd_epi64(__x);
+#else
+ return simd_make_long4(simd_long_rte(__x.lo), simd_long_rte(__x.hi));
+#endif
+}
+
+static simd_long8 SIMD_CFUNC simd_long_rte(simd_double8 __x) {
+#if defined __AVX512F__
+ return _mm512_cvt_roundpd_epi64(__x, _MM_FROUND_RINT);
+#else
+ return simd_make_long8(simd_long_rte(__x.lo), simd_long_rte(__x.hi));
+#endif
+}
+
+
+static simd_ulong2 SIMD_CFUNC simd_ulong(simd_char2 __x) { return simd_ulong(simd_long(__x)); }
+static simd_ulong3 SIMD_CFUNC simd_ulong(simd_char3 __x) { return simd_ulong(simd_long(__x)); }
+static simd_ulong4 SIMD_CFUNC simd_ulong(simd_char4 __x) { return simd_ulong(simd_long(__x)); }
+static simd_ulong8 SIMD_CFUNC simd_ulong(simd_char8 __x) { return simd_ulong(simd_long(__x)); }
+static simd_ulong2 SIMD_CFUNC simd_ulong(simd_uchar2 __x) { return simd_ulong(simd_long(__x)); }
+static simd_ulong3 SIMD_CFUNC simd_ulong(simd_uchar3 __x) { return simd_ulong(simd_long(__x)); }
+static simd_ulong4 SIMD_CFUNC simd_ulong(simd_uchar4 __x) { return simd_ulong(simd_long(__x)); }
+static simd_ulong8 SIMD_CFUNC simd_ulong(simd_uchar8 __x) { return simd_ulong(simd_long(__x)); }
+static simd_ulong2 SIMD_CFUNC simd_ulong(simd_short2 __x) { return simd_ulong(simd_long(__x)); }
+static simd_ulong3 SIMD_CFUNC simd_ulong(simd_short3 __x) { return simd_ulong(simd_long(__x)); }
+static simd_ulong4 SIMD_CFUNC simd_ulong(simd_short4 __x) { return simd_ulong(simd_long(__x)); }
+static simd_ulong8 SIMD_CFUNC simd_ulong(simd_short8 __x) { return simd_ulong(simd_long(__x)); }
+static simd_ulong2 SIMD_CFUNC simd_ulong(simd_ushort2 __x) { return simd_ulong(simd_long(__x)); }
+static simd_ulong3 SIMD_CFUNC simd_ulong(simd_ushort3 __x) { return simd_ulong(simd_long(__x)); }
+static simd_ulong4 SIMD_CFUNC simd_ulong(simd_ushort4 __x) { return simd_ulong(simd_long(__x)); }
+static simd_ulong8 SIMD_CFUNC simd_ulong(simd_ushort8 __x) { return simd_ulong(simd_long(__x)); }
+static simd_ulong2 SIMD_CFUNC simd_ulong(simd_int2 __x) { return simd_ulong(simd_long(__x)); }
+static simd_ulong3 SIMD_CFUNC simd_ulong(simd_int3 __x) { return simd_ulong(simd_long(__x)); }
+static simd_ulong4 SIMD_CFUNC simd_ulong(simd_int4 __x) { return simd_ulong(simd_long(__x)); }
+static simd_ulong8 SIMD_CFUNC simd_ulong(simd_int8 __x) { return simd_ulong(simd_long(__x)); }
+static simd_ulong2 SIMD_CFUNC simd_ulong(simd_uint2 __x) { return simd_ulong(simd_long(__x)); }
+static simd_ulong3 SIMD_CFUNC simd_ulong(simd_uint3 __x) { return simd_ulong(simd_long(__x)); }
+static simd_ulong4 SIMD_CFUNC simd_ulong(simd_uint4 __x) { return simd_ulong(simd_long(__x)); }
+static simd_ulong8 SIMD_CFUNC simd_ulong(simd_uint8 __x) { return simd_ulong(simd_long(__x)); }
+static simd_ulong2 SIMD_CFUNC simd_ulong(simd_float2 __x) { simd_int2 __big = __x >= 0x1.0p63f; return simd_ulong(simd_long(__x - simd_bitselect((simd_float2)0,0x1.0p63f,__big))) + simd_bitselect((simd_ulong2)0,0x8000000000000000,simd_long(__big)); }
+static simd_ulong3 SIMD_CFUNC simd_ulong(simd_float3 __x) { simd_int3 __big = __x >= 0x1.0p63f; return simd_ulong(simd_long(__x - simd_bitselect((simd_float3)0,0x1.0p63f,__big))) + simd_bitselect((simd_ulong3)0,0x8000000000000000,simd_long(__big)); }
+static simd_ulong4 SIMD_CFUNC simd_ulong(simd_float4 __x) { simd_int4 __big = __x >= 0x1.0p63f; return simd_ulong(simd_long(__x - simd_bitselect((simd_float4)0,0x1.0p63f,__big))) + simd_bitselect((simd_ulong4)0,0x8000000000000000,simd_long(__big)); }
+static simd_ulong8 SIMD_CFUNC simd_ulong(simd_float8 __x) { simd_int8 __big = __x >= 0x1.0p63f; return simd_ulong(simd_long(__x - simd_bitselect((simd_float8)0,0x1.0p63f,__big))) + simd_bitselect((simd_ulong8)0,0x8000000000000000,simd_long(__big)); }
+static simd_ulong2 SIMD_CFUNC simd_ulong(simd_long2 __x) { return (simd_ulong2)__x; }
+static simd_ulong3 SIMD_CFUNC simd_ulong(simd_long3 __x) { return (simd_ulong3)__x; }
+static simd_ulong4 SIMD_CFUNC simd_ulong(simd_long4 __x) { return (simd_ulong4)__x; }
+static simd_ulong8 SIMD_CFUNC simd_ulong(simd_long8 __x) { return (simd_ulong8)__x; }
+static simd_ulong2 SIMD_CFUNC simd_ulong(simd_ulong2 __x) { return __x; }
+static simd_ulong3 SIMD_CFUNC simd_ulong(simd_ulong3 __x) { return __x; }
+static simd_ulong4 SIMD_CFUNC simd_ulong(simd_ulong4 __x) { return __x; }
+static simd_ulong8 SIMD_CFUNC simd_ulong(simd_ulong8 __x) { return __x; }
+static simd_ulong2 SIMD_CFUNC simd_ulong(simd_double2 __x) { simd_long2 __big = __x >= 0x1.0p63; return simd_ulong(simd_long(__x - simd_bitselect((simd_double2)0,0x1.0p63,__big))) + simd_bitselect((simd_ulong2)0,0x8000000000000000,__big); }
+static simd_ulong3 SIMD_CFUNC simd_ulong(simd_double3 __x) { simd_long3 __big = __x >= 0x1.0p63; return simd_ulong(simd_long(__x - simd_bitselect((simd_double3)0,0x1.0p63,__big))) + simd_bitselect((simd_ulong3)0,0x8000000000000000,__big); }
+static simd_ulong4 SIMD_CFUNC simd_ulong(simd_double4 __x) { simd_long4 __big = __x >= 0x1.0p63; return simd_ulong(simd_long(__x - simd_bitselect((simd_double4)0,0x1.0p63,__big))) + simd_bitselect((simd_ulong4)0,0x8000000000000000,__big); }
+static simd_ulong8 SIMD_CFUNC simd_ulong(simd_double8 __x) { simd_long8 __big = __x >= 0x1.0p63; return simd_ulong(simd_long(__x - simd_bitselect((simd_double8)0,0x1.0p63,__big))) + simd_bitselect((simd_ulong8)0,0x8000000000000000,__big); }
+
+static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_char2 __x) { return simd_ulong(simd_max(__x,0)); }
+static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_char3 __x) { return simd_ulong(simd_max(__x,0)); }
+static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_char4 __x) { return simd_ulong(simd_max(__x,0)); }
+static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_char8 __x) { return simd_ulong(simd_max(__x,0)); }
+static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_short2 __x) { return simd_ulong(simd_max(__x,0)); }
+static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_short3 __x) { return simd_ulong(simd_max(__x,0)); }
+static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_short4 __x) { return simd_ulong(simd_max(__x,0)); }
+static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_short8 __x) { return simd_ulong(simd_max(__x,0)); }
+static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_int2 __x) { return simd_ulong(simd_max(__x,0)); }
+static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_int3 __x) { return simd_ulong(simd_max(__x,0)); }
+static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_int4 __x) { return simd_ulong(simd_max(__x,0)); }
+static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_int8 __x) { return simd_ulong(simd_max(__x,0)); }
+static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_float2 __x) { return simd_bitselect(simd_ulong(simd_max(__x,0.f)), 0xffffffffffffffff, simd_long(__x >= 0x1.0p64f)); }
+static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_float3 __x) { return simd_bitselect(simd_ulong(simd_max(__x,0.f)), 0xffffffffffffffff, simd_long(__x >= 0x1.0p64f)); }
+static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_float4 __x) { return simd_bitselect(simd_ulong(simd_max(__x,0.f)), 0xffffffffffffffff, simd_long(__x >= 0x1.0p64f)); }
+static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_float8 __x) { return simd_bitselect(simd_ulong(simd_max(__x,0.f)), 0xffffffffffffffff, simd_long(__x >= 0x1.0p64f)); }
+static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_long2 __x) { return simd_ulong(simd_max(__x,0)); }
+static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_long3 __x) { return simd_ulong(simd_max(__x,0)); }
+static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_long4 __x) { return simd_ulong(simd_max(__x,0)); }
+static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_long8 __x) { return simd_ulong(simd_max(__x,0)); }
+static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_double2 __x) { return simd_bitselect(simd_ulong(simd_max(__x,0.0)), 0xffffffffffffffff, __x >= 0x1.0p64); }
+static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_double3 __x) { return simd_bitselect(simd_ulong(simd_max(__x,0.0)), 0xffffffffffffffff, __x >= 0x1.0p64); }
+static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_double4 __x) { return simd_bitselect(simd_ulong(simd_max(__x,0.0)), 0xffffffffffffffff, __x >= 0x1.0p64); }
+static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_double8 __x) { return simd_bitselect(simd_ulong(simd_max(__x,0.0)), 0xffffffffffffffff, __x >= 0x1.0p64); }
+static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_uchar2 __x) { return simd_ulong(__x); }
+static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_uchar3 __x) { return simd_ulong(__x); }
+static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_uchar4 __x) { return simd_ulong(__x); }
+static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_uchar8 __x) { return simd_ulong(__x); }
+static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_ushort2 __x) { return simd_ulong(__x); }
+static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_ushort3 __x) { return simd_ulong(__x); }
+static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_ushort4 __x) { return simd_ulong(__x); }
+static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_ushort8 __x) { return simd_ulong(__x); }
+static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_uint2 __x) { return simd_ulong(__x); }
+static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_uint3 __x) { return simd_ulong(__x); }
+static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_uint4 __x) { return simd_ulong(__x); }
+static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_uint8 __x) { return simd_ulong(__x); }
+static simd_ulong2 SIMD_CFUNC simd_ulong_sat(simd_ulong2 __x) { return __x; }
+static simd_ulong3 SIMD_CFUNC simd_ulong_sat(simd_ulong3 __x) { return __x; }
+static simd_ulong4 SIMD_CFUNC simd_ulong_sat(simd_ulong4 __x) { return __x; }
+static simd_ulong8 SIMD_CFUNC simd_ulong_sat(simd_ulong8 __x) { return __x; }
+
+
+static simd_double2 SIMD_CFUNC simd_double(simd_char2 __x) { return simd_double(simd_int(__x)); }
+static simd_double3 SIMD_CFUNC simd_double(simd_char3 __x) { return simd_double(simd_int(__x)); }
+static simd_double4 SIMD_CFUNC simd_double(simd_char4 __x) { return simd_double(simd_int(__x)); }
+static simd_double8 SIMD_CFUNC simd_double(simd_char8 __x) { return simd_double(simd_int(__x)); }
+static simd_double2 SIMD_CFUNC simd_double(simd_uchar2 __x) { return simd_double(simd_int(__x)); }
+static simd_double3 SIMD_CFUNC simd_double(simd_uchar3 __x) { return simd_double(simd_int(__x)); }
+static simd_double4 SIMD_CFUNC simd_double(simd_uchar4 __x) { return simd_double(simd_int(__x)); }
+static simd_double8 SIMD_CFUNC simd_double(simd_uchar8 __x) { return simd_double(simd_int(__x)); }
+static simd_double2 SIMD_CFUNC simd_double(simd_short2 __x) { return simd_double(simd_int(__x)); }
+static simd_double3 SIMD_CFUNC simd_double(simd_short3 __x) { return simd_double(simd_int(__x)); }
+static simd_double4 SIMD_CFUNC simd_double(simd_short4 __x) { return simd_double(simd_int(__x)); }
+static simd_double8 SIMD_CFUNC simd_double(simd_short8 __x) { return simd_double(simd_int(__x)); }
+static simd_double2 SIMD_CFUNC simd_double(simd_ushort2 __x) { return simd_double(simd_int(__x)); }
+static simd_double3 SIMD_CFUNC simd_double(simd_ushort3 __x) { return simd_double(simd_int(__x)); }
+static simd_double4 SIMD_CFUNC simd_double(simd_ushort4 __x) { return simd_double(simd_int(__x)); }
+static simd_double8 SIMD_CFUNC simd_double(simd_ushort8 __x) { return simd_double(simd_int(__x)); }
+static simd_double2 SIMD_CFUNC simd_double(simd_int2 __x) { return __builtin_convertvector(__x, simd_double2); }
+static simd_double3 SIMD_CFUNC simd_double(simd_int3 __x) { return __builtin_convertvector(__x, simd_double3); }
+static simd_double4 SIMD_CFUNC simd_double(simd_int4 __x) { return __builtin_convertvector(__x, simd_double4); }
+static simd_double8 SIMD_CFUNC simd_double(simd_int8 __x) { return __builtin_convertvector(__x, simd_double8); }
+static simd_double2 SIMD_CFUNC simd_double(simd_uint2 __x) { return __builtin_convertvector(__x, simd_double2); }
+static simd_double3 SIMD_CFUNC simd_double(simd_uint3 __x) { return __builtin_convertvector(__x, simd_double3); }
+static simd_double4 SIMD_CFUNC simd_double(simd_uint4 __x) { return __builtin_convertvector(__x, simd_double4); }
+static simd_double8 SIMD_CFUNC simd_double(simd_uint8 __x) { return __builtin_convertvector(__x, simd_double8); }
+static simd_double2 SIMD_CFUNC simd_double(simd_float2 __x) { return __builtin_convertvector(__x, simd_double2); }
+static simd_double3 SIMD_CFUNC simd_double(simd_float3 __x) { return __builtin_convertvector(__x, simd_double3); }
+static simd_double4 SIMD_CFUNC simd_double(simd_float4 __x) { return __builtin_convertvector(__x, simd_double4); }
+static simd_double8 SIMD_CFUNC simd_double(simd_float8 __x) { return __builtin_convertvector(__x, simd_double8); }
+static simd_double2 SIMD_CFUNC simd_double(simd_long2 __x) { return __builtin_convertvector(__x, simd_double2); }
+static simd_double3 SIMD_CFUNC simd_double(simd_long3 __x) { return __builtin_convertvector(__x, simd_double3); }
+static simd_double4 SIMD_CFUNC simd_double(simd_long4 __x) { return __builtin_convertvector(__x, simd_double4); }
+static simd_double8 SIMD_CFUNC simd_double(simd_long8 __x) { return __builtin_convertvector(__x, simd_double8); }
+static simd_double2 SIMD_CFUNC simd_double(simd_ulong2 __x) { return __builtin_convertvector(__x, simd_double2); }
+static simd_double3 SIMD_CFUNC simd_double(simd_ulong3 __x) { return __builtin_convertvector(__x, simd_double3); }
+static simd_double4 SIMD_CFUNC simd_double(simd_ulong4 __x) { return __builtin_convertvector(__x, simd_double4); }
+static simd_double8 SIMD_CFUNC simd_double(simd_ulong8 __x) { return __builtin_convertvector(__x, simd_double8); }
+static simd_double2 SIMD_CFUNC simd_double(simd_double2 __x) { return __x; }
+static simd_double3 SIMD_CFUNC simd_double(simd_double3 __x) { return __x; }
+static simd_double4 SIMD_CFUNC simd_double(simd_double4 __x) { return __x; }
+static simd_double8 SIMD_CFUNC simd_double(simd_double8 __x) { return __x; }
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif // SIMD_COMPILER_HAS_REQUIRED_FEATURES
+#endif // __SIMD_CONVERSION_HEADER__
\ No newline at end of file
diff --git a/lib/libc/include/aarch64-macos-gnu/simd/logic.h b/lib/libc/include/aarch64-macos-gnu/simd/logic.h
new file mode 100644
index 0000000000..fdefcb632d
--- /dev/null
+++ b/lib/libc/include/aarch64-macos-gnu/simd/logic.h
@@ -0,0 +1,1315 @@
+/*! @header
+ * The interfaces declared in this header provide logical and bitwise
+ * operations on vectors. Some of these functions operate elementwise,
+ * and some produce a scalar result that depends on all lanes of the input.
+ *
+ * For functions returning a boolean value, the return type in C and
+ * Objective-C is _Bool; for C++ it is bool.
+ *
+ * Function Result
+ * ------------------------------------------------------------------
+ * simd_all(comparison) True if and only if the comparison is true
+ * in every vector lane. e.g.:
+ *
+ * if (simd_all(x == 0.0f)) {
+ * // executed if every lane of x
+ * // contains zero.
+ * }
+ *
+ * The precise function of simd_all is to
+ * return the high-order bit of the result
+ * of a horizontal bitwise AND of all vector
+ * lanes.
+ *
+ * simd_any(comparison) True if and only if the comparison is true
+ * in at least one vector lane. e.g.:
+ *
+ * if (simd_any(x < 0.0f)) {
+ * // executed if any lane of x
+ * // contains a negative value.
+ * }
+ *
 * The precise function of simd_any is to
+ * return the high-order bit of the result
+ * of a horizontal bitwise OR of all vector
+ * lanes.
+ *
+ * simd_select(x,y,mask) For each lane in the result, selects the
+ * corresponding element of x if the high-
+ * order bit of the corresponding element of
+ * mask is 0, and the corresponding element
+ * of y otherwise.
+ *
+ * simd_bitselect(x,y,mask) For each bit in the result, selects the
+ * corresponding bit of x if the corresponding
+ * bit of mask is clear, and the corresponding
 * bit of y otherwise.
+ *
+ * In C++, these functions are available under the simd:: namespace:
+ *
+ * C++ Function Equivalent C Function
+ * --------------------------------------------------------------------
+ * simd::all(comparison) simd_all(comparison)
+ * simd::any(comparison) simd_any(comparison)
+ * simd::select(x,y,mask) simd_select(x,y,mask)
+ * simd::bitselect(x,y,mask) simd_bitselect(x,y,mask)
+ *
+ * @copyright 2014-2017 Apple, Inc. All rights reserved.
+ * @unsorted */
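+
+/* Editorial sketch (not part of the original header) of how the functions
+ * documented above compose. Vector comparisons yield integer vectors whose
+ * lanes are all-ones where true and all-zeros where false, so they feed
+ * simd_any/simd_all/simd_select directly:
+ *
+ *     simd_float4 x = simd_make_float4(-1.0f, 2.0f, -3.0f, 4.0f);
+ *     if (simd_any(x < 0.0f)) {
+ *       simd_float4 zero = { 0, 0, 0, 0 };
+ *       x = simd_select(x, zero, x < 0.0f);  // zero out negative lanes
+ *     }
+ */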
+
+#ifndef SIMD_LOGIC_HEADER
+#define SIMD_LOGIC_HEADER
+
+#include <simd/base.h>
+#if SIMD_COMPILER_HAS_REQUIRED_FEATURES
+#include <simd/vector_make.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_char2 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_char3 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_char4 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_char8 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_char16 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_char32 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_char64 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_uchar2 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_uchar3 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_uchar4 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_uchar8 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_uchar16 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_uchar32 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_uchar64 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_short2 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_short3 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_short4 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_short8 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_short16 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_short32 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_ushort2 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_ushort3 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_ushort4 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_ushort8 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_ushort16 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_ushort32 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_int2 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_int3 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_int4 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_int8 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_int16 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_uint2 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_uint3 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_uint4 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_uint8 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_uint16 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_long2 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_long3 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_long4 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_long8 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_ulong2 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_ulong3 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_ulong4 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_any(simd_ulong8 x);
+/*! @abstract True if and only if the high-order bit of any lane of the
+ * vector is set.
+ * @discussion Deprecated. Use simd_any instead. */
+#define vector_any simd_any
+
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_char2 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_char3 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_char4 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_char8 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_char16 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_char32 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_char64 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_uchar2 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_uchar3 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_uchar4 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_uchar8 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_uchar16 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_uchar32 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_uchar64 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_short2 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_short3 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_short4 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_short8 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_short16 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_short32 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_ushort2 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_ushort3 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_ushort4 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_ushort8 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_ushort16 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_ushort32 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_int2 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_int3 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_int4 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_int8 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_int16 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_uint2 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_uint3 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_uint4 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_uint8 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_uint16 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_long2 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_long3 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_long4 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_long8 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_ulong2 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_ulong3 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_ulong4 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set. */
+static inline SIMD_CFUNC simd_bool simd_all(simd_ulong8 x);
+/*! @abstract True if and only if the high-order bit of every lane of the
+ * vector is set.
+ * @discussion Deprecated. Use simd_all instead. */
+#define vector_all simd_all
+
+/*! @abstract For each lane in the result, selects the corresponding element
+ * of x or y according to whether the high-order bit of the corresponding
+ * lane of mask is 0 or 1, respectively. */
+static inline SIMD_CFUNC simd_float2 simd_select(simd_float2 x, simd_float2 y, simd_int2 mask);
+/*! @abstract For each lane in the result, selects the corresponding element
+ * of x or y according to whether the high-order bit of the corresponding
+ * lane of mask is 0 or 1, respectively. */
+static inline SIMD_CFUNC simd_float3 simd_select(simd_float3 x, simd_float3 y, simd_int3 mask);
+/*! @abstract For each lane in the result, selects the corresponding element
+ * of x or y according to whether the high-order bit of the corresponding
+ * lane of mask is 0 or 1, respectively. */
+static inline SIMD_CFUNC simd_float4 simd_select(simd_float4 x, simd_float4 y, simd_int4 mask);
+/*! @abstract For each lane in the result, selects the corresponding element
+ * of x or y according to whether the high-order bit of the corresponding
+ * lane of mask is 0 or 1, respectively. */
+static inline SIMD_CFUNC simd_float8 simd_select(simd_float8 x, simd_float8 y, simd_int8 mask);
+/*! @abstract For each lane in the result, selects the corresponding element
+ * of x or y according to whether the high-order bit of the corresponding
+ * lane of mask is 0 or 1, respectively. */
+static inline SIMD_CFUNC simd_float16 simd_select(simd_float16 x, simd_float16 y, simd_int16 mask);
+/*! @abstract For each lane in the result, selects the corresponding element
+ * of x or y according to whether the high-order bit of the corresponding
+ * lane of mask is 0 or 1, respectively. */
+static inline SIMD_CFUNC simd_double2 simd_select(simd_double2 x, simd_double2 y, simd_long2 mask);
+/*! @abstract For each lane in the result, selects the corresponding element
+ * of x or y according to whether the high-order bit of the corresponding
+ * lane of mask is 0 or 1, respectively. */
+static inline SIMD_CFUNC simd_double3 simd_select(simd_double3 x, simd_double3 y, simd_long3 mask);
+/*! @abstract For each lane in the result, selects the corresponding element
+ * of x or y according to whether the high-order bit of the corresponding
+ * lane of mask is 0 or 1, respectively. */
+static inline SIMD_CFUNC simd_double4 simd_select(simd_double4 x, simd_double4 y, simd_long4 mask);
+/*! @abstract For each lane in the result, selects the corresponding element
+ * of x or y according to whether the high-order bit of the corresponding
+ * lane of mask is 0 or 1, respectively. */
+static inline SIMD_CFUNC simd_double8 simd_select(simd_double8 x, simd_double8 y, simd_long8 mask);
+/*! @abstract For each lane in the result, selects the corresponding element
+ * of x or y according to whether the high-order bit of the corresponding
+ * lane of mask is 0 or 1, respectively.
+ * @discussion Deprecated. Use simd_select instead. */
+#define vector_select simd_select
+
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_char2 simd_bitselect(simd_char2 x, simd_char2 y, simd_char2 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_char3 simd_bitselect(simd_char3 x, simd_char3 y, simd_char3 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_char4 simd_bitselect(simd_char4 x, simd_char4 y, simd_char4 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_char8 simd_bitselect(simd_char8 x, simd_char8 y, simd_char8 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_char16 simd_bitselect(simd_char16 x, simd_char16 y, simd_char16 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_char32 simd_bitselect(simd_char32 x, simd_char32 y, simd_char32 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_char64 simd_bitselect(simd_char64 x, simd_char64 y, simd_char64 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_uchar2 simd_bitselect(simd_uchar2 x, simd_uchar2 y, simd_char2 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_uchar3 simd_bitselect(simd_uchar3 x, simd_uchar3 y, simd_char3 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_uchar4 simd_bitselect(simd_uchar4 x, simd_uchar4 y, simd_char4 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_uchar8 simd_bitselect(simd_uchar8 x, simd_uchar8 y, simd_char8 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_uchar16 simd_bitselect(simd_uchar16 x, simd_uchar16 y, simd_char16 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_uchar32 simd_bitselect(simd_uchar32 x, simd_uchar32 y, simd_char32 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_uchar64 simd_bitselect(simd_uchar64 x, simd_uchar64 y, simd_char64 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_short2 simd_bitselect(simd_short2 x, simd_short2 y, simd_short2 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_short3 simd_bitselect(simd_short3 x, simd_short3 y, simd_short3 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_short4 simd_bitselect(simd_short4 x, simd_short4 y, simd_short4 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_short8 simd_bitselect(simd_short8 x, simd_short8 y, simd_short8 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_short16 simd_bitselect(simd_short16 x, simd_short16 y, simd_short16 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_short32 simd_bitselect(simd_short32 x, simd_short32 y, simd_short32 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_ushort2 simd_bitselect(simd_ushort2 x, simd_ushort2 y, simd_short2 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_ushort3 simd_bitselect(simd_ushort3 x, simd_ushort3 y, simd_short3 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_ushort4 simd_bitselect(simd_ushort4 x, simd_ushort4 y, simd_short4 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_ushort8 simd_bitselect(simd_ushort8 x, simd_ushort8 y, simd_short8 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_ushort16 simd_bitselect(simd_ushort16 x, simd_ushort16 y, simd_short16 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_ushort32 simd_bitselect(simd_ushort32 x, simd_ushort32 y, simd_short32 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_int2 simd_bitselect(simd_int2 x, simd_int2 y, simd_int2 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_int3 simd_bitselect(simd_int3 x, simd_int3 y, simd_int3 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_int4 simd_bitselect(simd_int4 x, simd_int4 y, simd_int4 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_int8 simd_bitselect(simd_int8 x, simd_int8 y, simd_int8 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_int16 simd_bitselect(simd_int16 x, simd_int16 y, simd_int16 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_uint2 simd_bitselect(simd_uint2 x, simd_uint2 y, simd_int2 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_uint3 simd_bitselect(simd_uint3 x, simd_uint3 y, simd_int3 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_uint4 simd_bitselect(simd_uint4 x, simd_uint4 y, simd_int4 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_uint8 simd_bitselect(simd_uint8 x, simd_uint8 y, simd_int8 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_uint16 simd_bitselect(simd_uint16 x, simd_uint16 y, simd_int16 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_float2 simd_bitselect(simd_float2 x, simd_float2 y, simd_int2 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_float3 simd_bitselect(simd_float3 x, simd_float3 y, simd_int3 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_float4 simd_bitselect(simd_float4 x, simd_float4 y, simd_int4 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_float8 simd_bitselect(simd_float8 x, simd_float8 y, simd_int8 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_float16 simd_bitselect(simd_float16 x, simd_float16 y, simd_int16 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_long2 simd_bitselect(simd_long2 x, simd_long2 y, simd_long2 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_long3 simd_bitselect(simd_long3 x, simd_long3 y, simd_long3 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_long4 simd_bitselect(simd_long4 x, simd_long4 y, simd_long4 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_long8 simd_bitselect(simd_long8 x, simd_long8 y, simd_long8 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_ulong2 simd_bitselect(simd_ulong2 x, simd_ulong2 y, simd_long2 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_ulong3 simd_bitselect(simd_ulong3 x, simd_ulong3 y, simd_long3 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_ulong4 simd_bitselect(simd_ulong4 x, simd_ulong4 y, simd_long4 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_ulong8 simd_bitselect(simd_ulong8 x, simd_ulong8 y, simd_long8 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_double2 simd_bitselect(simd_double2 x, simd_double2 y, simd_long2 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_double3 simd_bitselect(simd_double3 x, simd_double3 y, simd_long3 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_double4 simd_bitselect(simd_double4 x, simd_double4 y, simd_long4 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+static inline SIMD_CFUNC simd_double8 simd_bitselect(simd_double8 x, simd_double8 y, simd_long8 mask);
+/*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively.
+ * @discussion Deprecated. Use simd_bitselect instead. */
+#define vector_bitselect simd_bitselect
+
+#ifdef __cplusplus
+} /* extern "C" */
+
+namespace simd {
+ /*! @abstract True if and only if the high-order bit of every lane is set. */
+ template <typename inttypeN> static SIMD_CPPFUNC simd_bool all(const inttypeN predicate) { return ::simd_all(predicate); }
+ /*! @abstract True if and only if the high-order bit of any lane is set. */
+ template <typename inttypeN> static SIMD_CPPFUNC simd_bool any(const inttypeN predicate) { return ::simd_any(predicate); }
+ /*! @abstract Each lane of the result is selected from the corresponding lane
+ * of x or y according to whether the high-order bit of the corresponding
+ * lane of mask is 0 or 1, respectively. */
+ template <typename inttypeN, typename fptypeN> static SIMD_CPPFUNC fptypeN select(const fptypeN x, const fptypeN y, const inttypeN predicate) { return ::simd_select(x,y,predicate); }
+ /*! @abstract For each bit in the result, selects the corresponding bit of x
+ * or y according to whether the corresponding bit of mask is 0 or 1,
+ * respectively. */
+ template <typename inttypeN, typename typeN> static SIMD_CPPFUNC typeN bitselect(const typeN x, const typeN y, const inttypeN mask) { return ::simd_bitselect(x,y,mask); }
+}
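+
+/* Editorial sketch (not part of the original header): in C++ the wrappers
+ * above permit the shorter spellings, e.g. with a hypothetical v:
+ *
+ *     simd_float4 v = { -1.0f, 2.0f, -3.0f, 4.0f };
+ *     bool anyNegative = simd::any(v < 0.0f);  // same as simd_any(v < 0.0f)
+ */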
+
+extern "C" {
+#endif /* __cplusplus */
+
+#pragma mark - Implementations
+
+static inline SIMD_CFUNC simd_bool simd_any(simd_char2 x) {
+#if defined __SSE2__
+ return (_mm_movemask_epi8((__m128i)simd_make_char16_undef(x)) & 0x3);
+#elif defined __arm64__
+ return simd_any(x.xyxy);
+#else
+ union { uint16_t i; simd_char2 v; } u = { .v = x };
+ return (u.i & 0x8080);
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_char3 x) {
+#if defined __SSE2__
+ return (_mm_movemask_epi8((__m128i)simd_make_char16_undef(x)) & 0x7);
+#elif defined __arm64__
+ return simd_any(x.xyzz);
+#else
+ union { uint32_t i; simd_char3 v; } u = { .v = x };
+ return (u.i & 0x808080);
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_char4 x) {
+#if defined __SSE2__
+ return (_mm_movemask_epi8((__m128i)simd_make_char16_undef(x)) & 0xf);
+#elif defined __arm64__
+ return simd_any(x.xyzwxyzw);
+#else
+ union { uint32_t i; simd_char4 v; } u = { .v = x };
+ return (u.i & 0x80808080);
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_char8 x) {
+#if defined __SSE2__
+ return (_mm_movemask_epi8((__m128i)simd_make_char16_undef(x)) & 0xff);
+#elif defined __arm64__
+ return vmaxv_u8(x) & 0x80;
+#else
+ union { uint64_t i; simd_char8 v; } u = { .v = x };
+ return (u.i & 0x8080808080808080);
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_char16 x) {
+#if defined __SSE2__
+ return _mm_movemask_epi8((__m128i)x);
+#elif defined __arm64__
+ return vmaxvq_u8(x) & 0x80;
+#else
+ return simd_any(x.lo | x.hi);
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_char32 x) {
+#if defined __AVX2__
+ return _mm256_movemask_epi8(x);
+#else
+ return simd_any(x.lo | x.hi);
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_char64 x) {
+ return simd_any(x.lo | x.hi);
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_uchar2 x) {
+ return simd_any((simd_char2)x);
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_uchar3 x) {
+ return simd_any((simd_char3)x);
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_uchar4 x) {
+ return simd_any((simd_char4)x);
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_uchar8 x) {
+ return simd_any((simd_char8)x);
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_uchar16 x) {
+ return simd_any((simd_char16)x);
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_uchar32 x) {
+ return simd_any((simd_char32)x);
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_uchar64 x) {
+ return simd_any((simd_char64)x);
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_short2 x) {
+#if defined __SSE2__
+ return (_mm_movemask_epi8((__m128i)simd_make_short8_undef(x)) & 0xa);
+#elif defined __arm64__
+ return simd_any(x.xyxy);
+#else
+ union { uint32_t i; simd_short2 v; } u = { .v = x };
+ return (u.i & 0x80008000);
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_short3 x) {
+#if defined __SSE2__
+ return (_mm_movemask_epi8((__m128i)simd_make_short8_undef(x)) & 0x2a);
+#elif defined __arm64__
+ return simd_any(x.xyzz);
+#else
+ union { uint64_t i; simd_short3 v; } u = { .v = x };
+ return (u.i & 0x800080008000);
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_short4 x) {
+#if defined __SSE2__
+ return (_mm_movemask_epi8((__m128i)simd_make_short8_undef(x)) & 0xaa);
+#elif defined __arm64__
+ return vmaxv_u16(x) & 0x8000;
+#else
+ union { uint64_t i; simd_short4 v; } u = { .v = x };
+ return (u.i & 0x8000800080008000);
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_short8 x) {
+#if defined __SSE2__
+ return (_mm_movemask_epi8((__m128i)x) & 0xaaaa);
+#elif defined __arm64__
+ return vmaxvq_u16(x) & 0x8000;
+#else
+ return simd_any(x.lo | x.hi);
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_short16 x) {
+#if defined __AVX2__
+ return (_mm256_movemask_epi8(x) & 0xaaaaaaaa);
+#else
+ return simd_any(x.lo | x.hi);
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_short32 x) {
+ return simd_any(x.lo | x.hi);
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_ushort2 x) {
+ return simd_any((simd_short2)x);
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_ushort3 x) {
+ return simd_any((simd_short3)x);
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_ushort4 x) {
+ return simd_any((simd_short4)x);
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_ushort8 x) {
+ return simd_any((simd_short8)x);
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_ushort16 x) {
+ return simd_any((simd_short16)x);
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_ushort32 x) {
+ return simd_any((simd_short32)x);
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_int2 x) {
+#if defined __SSE2__
+ return (_mm_movemask_ps((__m128)simd_make_int4_undef(x)) & 0x3);
+#elif defined __arm64__
+ return vmaxv_u32(x) & 0x80000000;
+#else
+ union { uint64_t i; simd_int2 v; } u = { .v = x };
+ return (u.i & 0x8000000080000000);
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_int3 x) {
+#if defined __SSE2__
+ return (_mm_movemask_ps((__m128)simd_make_int4_undef(x)) & 0x7);
+#elif defined __arm64__
+ return simd_any(x.xyzz);
+#else
+ return (x.x | x.y | x.z) & 0x80000000;
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_int4 x) {
+#if defined __SSE2__
+ return _mm_movemask_ps((__m128)x);
+#elif defined __arm64__
+ return vmaxvq_u32(x) & 0x80000000;
+#else
+ return simd_any(x.lo | x.hi);
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_int8 x) {
+#if defined __AVX__
+ return _mm256_movemask_ps(x);
+#else
+ return simd_any(x.lo | x.hi);
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_int16 x) {
+ return simd_any(x.lo | x.hi);
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_uint2 x) {
+ return simd_any((simd_int2)x);
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_uint3 x) {
+ return simd_any((simd_int3)x);
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_uint4 x) {
+ return simd_any((simd_int4)x);
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_uint8 x) {
+ return simd_any((simd_int8)x);
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_uint16 x) {
+ return simd_any((simd_int16)x);
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_long2 x) {
+#if defined __SSE2__
+ return _mm_movemask_pd((__m128d)x);
+#elif defined __arm64__
+ return (x.x | x.y) & 0x8000000000000000U;
+#else
+ return (x.x | x.y) & 0x8000000000000000U;
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_long3 x) {
+#if defined __AVX__
+ return (_mm256_movemask_pd(simd_make_long4_undef(x)) & 0x7);
+#else
+ return (x.x | x.y | x.z) & 0x8000000000000000U;
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_long4 x) {
+#if defined __AVX__
+ return _mm256_movemask_pd(x);
+#else
+ return simd_any(x.lo | x.hi);
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_long8 x) {
+ return simd_any(x.lo | x.hi);
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_ulong2 x) {
+ return simd_any((simd_long2)x);
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_ulong3 x) {
+ return simd_any((simd_long3)x);
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_ulong4 x) {
+ return simd_any((simd_long4)x);
+}
+static inline SIMD_CFUNC simd_bool simd_any(simd_ulong8 x) {
+ return simd_any((simd_long8)x);
+}
+
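+/* Editorial note (not part of the original header): the branches above are
+ * three spellings of one reduction. On SSE, _mm_movemask_epi8 packs the
+ * high bit of every byte, so 16-bit lanes are tested via their high bytes
+ * at odd positions -- hence masks like 0xaa, 0xaaaa (binary ...10101010).
+ * On arm64, vmaxv/vmaxvq reduce to the maximum unsigned lane, whose high
+ * bit is set iff some lane's high bit is. The generic fallback ORs lanes
+ * (or vector halves) together and tests the packed sign bits against a
+ * constant such as 0x8080808080808080. */
+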
+static inline SIMD_CFUNC simd_bool simd_all(simd_char2 x) {
+#if defined __SSE2__
+ return (_mm_movemask_epi8((__m128i)simd_make_char16_undef(x)) & 0x3) == 0x3;
+#elif defined __arm64__
+ return simd_all(x.xyxy);
+#else
+ union { uint16_t i; simd_char2 v; } u = { .v = x };
+ return (u.i & 0x8080) == 0x8080;
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_char3 x) {
+#if defined __SSE2__
+ return (_mm_movemask_epi8((__m128i)simd_make_char16_undef(x)) & 0x7) == 0x7;
+#elif defined __arm64__
+ return simd_all(x.xyzz);
+#else
+ union { uint32_t i; simd_char3 v; } u = { .v = x };
+ return (u.i & 0x808080) == 0x808080;
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_char4 x) {
+#if defined __SSE2__
+ return (_mm_movemask_epi8((__m128i)simd_make_char16_undef(x)) & 0xf) == 0xf;
+#elif defined __arm64__
+ return simd_all(x.xyzwxyzw);
+#else
+ union { uint32_t i; simd_char4 v; } u = { .v = x };
+ return (u.i & 0x80808080) == 0x80808080;
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_char8 x) {
+#if defined __SSE2__
+ return (_mm_movemask_epi8((__m128i)simd_make_char16_undef(x)) & 0xff) == 0xff;
+#elif defined __arm64__
+ return vminv_u8(x) & 0x80;
+#else
+ union { uint64_t i; simd_char8 v; } u = { .v = x };
+ return (u.i & 0x8080808080808080) == 0x8080808080808080;
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_char16 x) {
+#if defined __SSE2__
+ return _mm_movemask_epi8((__m128i)x) == 0xffff;
+#elif defined __arm64__
+ return vminvq_u8(x) & 0x80;
+#else
+ return simd_all(x.lo & x.hi);
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_char32 x) {
+#if defined __AVX2__
+ return _mm256_movemask_epi8(x) == 0xffffffff;
+#else
+ return simd_all(x.lo & x.hi);
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_char64 x) {
+ return simd_all(x.lo & x.hi);
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_uchar2 x) {
+ return simd_all((simd_char2)x);
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_uchar3 x) {
+ return simd_all((simd_char3)x);
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_uchar4 x) {
+ return simd_all((simd_char4)x);
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_uchar8 x) {
+ return simd_all((simd_char8)x);
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_uchar16 x) {
+ return simd_all((simd_char16)x);
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_uchar32 x) {
+ return simd_all((simd_char32)x);
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_uchar64 x) {
+ return simd_all((simd_char64)x);
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_short2 x) {
+#if defined __SSE2__
+ return (_mm_movemask_epi8((__m128i)simd_make_short8_undef(x)) & 0xa) == 0xa;
+#elif defined __arm64__
+ return simd_all(x.xyxy);
+#else
+ union { uint32_t i; simd_short2 v; } u = { .v = x };
+ return (u.i & 0x80008000) == 0x80008000;
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_short3 x) {
+#if defined __SSE2__
+ return (_mm_movemask_epi8((__m128i)simd_make_short8_undef(x)) & 0x2a) == 0x2a;
+#elif defined __arm64__
+ return simd_all(x.xyzz);
+#else
+ union { uint64_t i; simd_short3 v; } u = { .v = x };
+ return (u.i & 0x800080008000) == 0x800080008000;
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_short4 x) {
+#if defined __SSE2__
+ return (_mm_movemask_epi8((__m128i)simd_make_short8_undef(x)) & 0xaa) == 0xaa;
+#elif defined __arm64__
+ return vminv_u16(x) & 0x8000;
+#else
+ union { uint64_t i; simd_short4 v; } u = { .v = x };
+ return (u.i & 0x8000800080008000) == 0x8000800080008000;
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_short8 x) {
+#if defined __SSE2__
+ return (_mm_movemask_epi8((__m128i)x) & 0xaaaa) == 0xaaaa;
+#elif defined __arm64__
+ return vminvq_u16(x) & 0x8000;
+#else
+ return simd_all(x.lo & x.hi);
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_short16 x) {
+#if defined __AVX2__
+ return (_mm256_movemask_epi8(x) & 0xaaaaaaaa) == 0xaaaaaaaa;
+#else
+ return simd_all(x.lo & x.hi);
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_short32 x) {
+ return simd_all(x.lo & x.hi);
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_ushort2 x) {
+ return simd_all((simd_short2)x);
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_ushort3 x) {
+ return simd_all((simd_short3)x);
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_ushort4 x) {
+ return simd_all((simd_short4)x);
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_ushort8 x) {
+ return simd_all((simd_short8)x);
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_ushort16 x) {
+ return simd_all((simd_short16)x);
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_ushort32 x) {
+ return simd_all((simd_short32)x);
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_int2 x) {
+#if defined __SSE2__
+ return (_mm_movemask_ps((__m128)simd_make_int4_undef(x)) & 0x3) == 0x3;
+#elif defined __arm64__
+ return vminv_u32(x) & 0x80000000;
+#else
+ union { uint64_t i; simd_int2 v; } u = { .v = x };
+ return (u.i & 0x8000000080000000) == 0x8000000080000000;
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_int3 x) {
+#if defined __SSE2__
+ return (_mm_movemask_ps((__m128)simd_make_int4_undef(x)) & 0x7) == 0x7;
+#elif defined __arm64__
+ return simd_all(x.xyzz);
+#else
+ return (x.x & x.y & x.z) & 0x80000000;
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_int4 x) {
+#if defined __SSE2__
+ return _mm_movemask_ps((__m128)x) == 0xf;
+#elif defined __arm64__
+ return vminvq_u32(x) & 0x80000000;
+#else
+ return simd_all(x.lo & x.hi);
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_int8 x) {
+#if defined __AVX__
+ return _mm256_movemask_ps(x) == 0xff;
+#else
+ return simd_all(x.lo & x.hi);
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_int16 x) {
+ return simd_all(x.lo & x.hi);
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_uint2 x) {
+ return simd_all((simd_int2)x);
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_uint3 x) {
+ return simd_all((simd_int3)x);
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_uint4 x) {
+ return simd_all((simd_int4)x);
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_uint8 x) {
+ return simd_all((simd_int8)x);
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_uint16 x) {
+ return simd_all((simd_int16)x);
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_long2 x) {
+#if defined __SSE2__
+ return _mm_movemask_pd((__m128d)x) == 0x3;
+#elif defined __arm64__
+ return (x.x & x.y) & 0x8000000000000000U;
+#else
+ return (x.x & x.y) & 0x8000000000000000U;
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_long3 x) {
+#if defined __AVX__
+ return (_mm256_movemask_pd(simd_make_long4_undef(x)) & 0x7) == 0x7;
+#else
+ return (x.x & x.y & x.z) & 0x8000000000000000U;
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_long4 x) {
+#if defined __AVX__
+ return _mm256_movemask_pd(x) == 0xf;
+#else
+ return simd_all(x.lo & x.hi);
+#endif
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_long8 x) {
+ return simd_all(x.lo & x.hi);
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_ulong2 x) {
+ return simd_all((simd_long2)x);
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_ulong3 x) {
+ return simd_all((simd_long3)x);
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_ulong4 x) {
+ return simd_all((simd_long4)x);
+}
+static inline SIMD_CFUNC simd_bool simd_all(simd_ulong8 x) {
+ return simd_all((simd_long8)x);
+}
+
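+/* Editorial note (not part of the original header): simd_all mirrors
+ * simd_any with the dual reduction -- the wide fallbacks AND the low and
+ * high halves (a sign bit survives only if both halves have it) where
+ * simd_any ORs them, and arm64 uses vminv/vminvq where simd_any uses
+ * vmaxv/vmaxvq. */
+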
+static inline SIMD_CFUNC simd_float2 simd_select(simd_float2 x, simd_float2 y, simd_int2 mask) {
+ return simd_make_float2(simd_select(simd_make_float4_undef(x), simd_make_float4_undef(y), simd_make_int4_undef(mask)));
+}
+static inline SIMD_CFUNC simd_float3 simd_select(simd_float3 x, simd_float3 y, simd_int3 mask) {
+ return simd_make_float3(simd_select(simd_make_float4_undef(x), simd_make_float4_undef(y), simd_make_int4_undef(mask)));
+}
+static inline SIMD_CFUNC simd_float4 simd_select(simd_float4 x, simd_float4 y, simd_int4 mask) {
+#if defined __SSE4_1__
+ return _mm_blendv_ps(x, y, (__m128)mask);
+#else
+ return simd_bitselect(x, y, mask >> 31);
+#endif
+}
+static inline SIMD_CFUNC simd_float8 simd_select(simd_float8 x, simd_float8 y, simd_int8 mask) {
+#if defined __AVX__
+ return _mm256_blendv_ps(x, y, mask);
+#else
+ return simd_bitselect(x, y, mask >> 31);
+#endif
+}
+static inline SIMD_CFUNC simd_float16 simd_select(simd_float16 x, simd_float16 y, simd_int16 mask) {
+ return simd_bitselect(x, y, mask >> 31);
+}
+static inline SIMD_CFUNC simd_double2 simd_select(simd_double2 x, simd_double2 y, simd_long2 mask) {
+#if defined __SSE4_1__
+ return _mm_blendv_pd(x, y, (__m128d)mask);
+#else
+ return simd_bitselect(x, y, mask >> 63);
+#endif
+}
+static inline SIMD_CFUNC simd_double3 simd_select(simd_double3 x, simd_double3 y, simd_long3 mask) {
+ return simd_make_double3(simd_select(simd_make_double4_undef(x), simd_make_double4_undef(y), simd_make_long4_undef(mask)));
+}
+static inline SIMD_CFUNC simd_double4 simd_select(simd_double4 x, simd_double4 y, simd_long4 mask) {
+#if defined __AVX__
+ return _mm256_blendv_pd(x, y, mask);
+#else
+ return simd_bitselect(x, y, mask >> 63);
+#endif
+}
+static inline SIMD_CFUNC simd_double8 simd_select(simd_double8 x, simd_double8 y, simd_long8 mask) {
+ return simd_bitselect(x, y, mask >> 63);
+}
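+
+/* Editorial note (not part of the original header): where no hardware
+ * blend instruction is available, simd_select lowers to simd_bitselect
+ * with `mask >> 31` (or `>> 63`). The shift is arithmetic on these signed
+ * lane types, so it broadcasts each lane's high-order bit across the lane:
+ *
+ *     0x80000000 >> 31  ==  0xffffffff   (take y)
+ *     0x7fffffff >> 31  ==  0x00000000   (take x)
+ *
+ * turning a per-lane selection into a per-bit one. */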
+
+static inline SIMD_CFUNC simd_char2 simd_bitselect(simd_char2 x, simd_char2 y, simd_char2 mask) {
+ return (x & ~mask) | (y & mask);
+}
+static inline SIMD_CFUNC simd_char3 simd_bitselect(simd_char3 x, simd_char3 y, simd_char3 mask) {
+ return (x & ~mask) | (y & mask);
+}
+static inline SIMD_CFUNC simd_char4 simd_bitselect(simd_char4 x, simd_char4 y, simd_char4 mask) {
+ return (x & ~mask) | (y & mask);
+}
+static inline SIMD_CFUNC simd_char8 simd_bitselect(simd_char8 x, simd_char8 y, simd_char8 mask) {
+ return (x & ~mask) | (y & mask);
+}
+static inline SIMD_CFUNC simd_char16 simd_bitselect(simd_char16 x, simd_char16 y, simd_char16 mask) {
+ return (x & ~mask) | (y & mask);
+}
+static inline SIMD_CFUNC simd_char32 simd_bitselect(simd_char32 x, simd_char32 y, simd_char32 mask) {
+ return (x & ~mask) | (y & mask);
+}
+static inline SIMD_CFUNC simd_char64 simd_bitselect(simd_char64 x, simd_char64 y, simd_char64 mask) {
+ return (x & ~mask) | (y & mask);
+}
+static inline SIMD_CFUNC simd_uchar2 simd_bitselect(simd_uchar2 x, simd_uchar2 y, simd_char2 mask) {
+ return (simd_uchar2)simd_bitselect((simd_char2)x, (simd_char2)y, mask);
+}
+static inline SIMD_CFUNC simd_uchar3 simd_bitselect(simd_uchar3 x, simd_uchar3 y, simd_char3 mask) {
+ return (simd_uchar3)simd_bitselect((simd_char3)x, (simd_char3)y, mask);
+}
+static inline SIMD_CFUNC simd_uchar4 simd_bitselect(simd_uchar4 x, simd_uchar4 y, simd_char4 mask) {
+ return (simd_uchar4)simd_bitselect((simd_char4)x, (simd_char4)y, mask);
+}
+static inline SIMD_CFUNC simd_uchar8 simd_bitselect(simd_uchar8 x, simd_uchar8 y, simd_char8 mask) {
+ return (simd_uchar8)simd_bitselect((simd_char8)x, (simd_char8)y, mask);
+}
+static inline SIMD_CFUNC simd_uchar16 simd_bitselect(simd_uchar16 x, simd_uchar16 y, simd_char16 mask) {
+ return (simd_uchar16)simd_bitselect((simd_char16)x, (simd_char16)y, mask);
+}
+static inline SIMD_CFUNC simd_uchar32 simd_bitselect(simd_uchar32 x, simd_uchar32 y, simd_char32 mask) {
+ return (simd_uchar32)simd_bitselect((simd_char32)x, (simd_char32)y, mask);
+}
+static inline SIMD_CFUNC simd_uchar64 simd_bitselect(simd_uchar64 x, simd_uchar64 y, simd_char64 mask) {
+ return (simd_uchar64)simd_bitselect((simd_char64)x, (simd_char64)y, mask);
+}
+static inline SIMD_CFUNC simd_short2 simd_bitselect(simd_short2 x, simd_short2 y, simd_short2 mask) {
+ return (x & ~mask) | (y & mask);
+}
+static inline SIMD_CFUNC simd_short3 simd_bitselect(simd_short3 x, simd_short3 y, simd_short3 mask) {
+ return (x & ~mask) | (y & mask);
+}
+static inline SIMD_CFUNC simd_short4 simd_bitselect(simd_short4 x, simd_short4 y, simd_short4 mask) {
+ return (x & ~mask) | (y & mask);
+}
+static inline SIMD_CFUNC simd_short8 simd_bitselect(simd_short8 x, simd_short8 y, simd_short8 mask) {
+ return (x & ~mask) | (y & mask);
+}
+static inline SIMD_CFUNC simd_short16 simd_bitselect(simd_short16 x, simd_short16 y, simd_short16 mask) {
+ return (x & ~mask) | (y & mask);
+}
+static inline SIMD_CFUNC simd_short32 simd_bitselect(simd_short32 x, simd_short32 y, simd_short32 mask) {
+ return (x & ~mask) | (y & mask);
+}
+static inline SIMD_CFUNC simd_ushort2 simd_bitselect(simd_ushort2 x, simd_ushort2 y, simd_short2 mask) {
+ return (simd_ushort2)simd_bitselect((simd_short2)x, (simd_short2)y, mask);
+}
+static inline SIMD_CFUNC simd_ushort3 simd_bitselect(simd_ushort3 x, simd_ushort3 y, simd_short3 mask) {
+ return (simd_ushort3)simd_bitselect((simd_short3)x, (simd_short3)y, mask);
+}
+static inline SIMD_CFUNC simd_ushort4 simd_bitselect(simd_ushort4 x, simd_ushort4 y, simd_short4 mask) {
+ return (simd_ushort4)simd_bitselect((simd_short4)x, (simd_short4)y, mask);
+}
+static inline SIMD_CFUNC simd_ushort8 simd_bitselect(simd_ushort8 x, simd_ushort8 y, simd_short8 mask) {
+ return (simd_ushort8)simd_bitselect((simd_short8)x, (simd_short8)y, mask);
+}
+static inline SIMD_CFUNC simd_ushort16 simd_bitselect(simd_ushort16 x, simd_ushort16 y, simd_short16 mask) {
+ return (simd_ushort16)simd_bitselect((simd_short16)x, (simd_short16)y, mask);
+}
+static inline SIMD_CFUNC simd_ushort32 simd_bitselect(simd_ushort32 x, simd_ushort32 y, simd_short32 mask) {
+ return (simd_ushort32)simd_bitselect((simd_short32)x, (simd_short32)y, mask);
+}
+static inline SIMD_CFUNC simd_int2 simd_bitselect(simd_int2 x, simd_int2 y, simd_int2 mask) {
+ return (x & ~mask) | (y & mask);
+}
+static inline SIMD_CFUNC simd_int3 simd_bitselect(simd_int3 x, simd_int3 y, simd_int3 mask) {
+ return (x & ~mask) | (y & mask);
+}
+static inline SIMD_CFUNC simd_int4 simd_bitselect(simd_int4 x, simd_int4 y, simd_int4 mask) {
+ return (x & ~mask) | (y & mask);
+}
+static inline SIMD_CFUNC simd_int8 simd_bitselect(simd_int8 x, simd_int8 y, simd_int8 mask) {
+ return (x & ~mask) | (y & mask);
+}
+static inline SIMD_CFUNC simd_int16 simd_bitselect(simd_int16 x, simd_int16 y, simd_int16 mask) {
+ return (x & ~mask) | (y & mask);
+}
+static inline SIMD_CFUNC simd_uint2 simd_bitselect(simd_uint2 x, simd_uint2 y, simd_int2 mask) {
+ return (simd_uint2)simd_bitselect((simd_int2)x, (simd_int2)y, mask);
+}
+static inline SIMD_CFUNC simd_uint3 simd_bitselect(simd_uint3 x, simd_uint3 y, simd_int3 mask) {
+ return (simd_uint3)simd_bitselect((simd_int3)x, (simd_int3)y, mask);
+}
+static inline SIMD_CFUNC simd_uint4 simd_bitselect(simd_uint4 x, simd_uint4 y, simd_int4 mask) {
+ return (simd_uint4)simd_bitselect((simd_int4)x, (simd_int4)y, mask);
+}
+static inline SIMD_CFUNC simd_uint8 simd_bitselect(simd_uint8 x, simd_uint8 y, simd_int8 mask) {
+ return (simd_uint8)simd_bitselect((simd_int8)x, (simd_int8)y, mask);
+}
+static inline SIMD_CFUNC simd_uint16 simd_bitselect(simd_uint16 x, simd_uint16 y, simd_int16 mask) {
+ return (simd_uint16)simd_bitselect((simd_int16)x, (simd_int16)y, mask);
+}
+static inline SIMD_CFUNC simd_float2 simd_bitselect(simd_float2 x, simd_float2 y, simd_int2 mask) {
+ return (simd_float2)simd_bitselect((simd_int2)x, (simd_int2)y, mask);
+}
+static inline SIMD_CFUNC simd_float3 simd_bitselect(simd_float3 x, simd_float3 y, simd_int3 mask) {
+ return (simd_float3)simd_bitselect((simd_int3)x, (simd_int3)y, mask);
+}
+static inline SIMD_CFUNC simd_float4 simd_bitselect(simd_float4 x, simd_float4 y, simd_int4 mask) {
+ return (simd_float4)simd_bitselect((simd_int4)x, (simd_int4)y, mask);
+}
+static inline SIMD_CFUNC simd_float8 simd_bitselect(simd_float8 x, simd_float8 y, simd_int8 mask) {
+ return (simd_float8)simd_bitselect((simd_int8)x, (simd_int8)y, mask);
+}
+static inline SIMD_CFUNC simd_float16 simd_bitselect(simd_float16 x, simd_float16 y, simd_int16 mask) {
+ return (simd_float16)simd_bitselect((simd_int16)x, (simd_int16)y, mask);
+}
+static inline SIMD_CFUNC simd_long2 simd_bitselect(simd_long2 x, simd_long2 y, simd_long2 mask) {
+ return (x & ~mask) | (y & mask);
+}
+static inline SIMD_CFUNC simd_long3 simd_bitselect(simd_long3 x, simd_long3 y, simd_long3 mask) {
+ return (x & ~mask) | (y & mask);
+}
+static inline SIMD_CFUNC simd_long4 simd_bitselect(simd_long4 x, simd_long4 y, simd_long4 mask) {
+ return (x & ~mask) | (y & mask);
+}
+static inline SIMD_CFUNC simd_long8 simd_bitselect(simd_long8 x, simd_long8 y, simd_long8 mask) {
+ return (x & ~mask) | (y & mask);
+}
+static inline SIMD_CFUNC simd_ulong2 simd_bitselect(simd_ulong2 x, simd_ulong2 y, simd_long2 mask) {
+ return (simd_ulong2)simd_bitselect((simd_long2)x, (simd_long2)y, mask);
+}
+static inline SIMD_CFUNC simd_ulong3 simd_bitselect(simd_ulong3 x, simd_ulong3 y, simd_long3 mask) {
+ return (simd_ulong3)simd_bitselect((simd_long3)x, (simd_long3)y, mask);
+}
+static inline SIMD_CFUNC simd_ulong4 simd_bitselect(simd_ulong4 x, simd_ulong4 y, simd_long4 mask) {
+ return (simd_ulong4)simd_bitselect((simd_long4)x, (simd_long4)y, mask);
+}
+static inline SIMD_CFUNC simd_ulong8 simd_bitselect(simd_ulong8 x, simd_ulong8 y, simd_long8 mask) {
+ return (simd_ulong8)simd_bitselect((simd_long8)x, (simd_long8)y, mask);
+}
+static inline SIMD_CFUNC simd_double2 simd_bitselect(simd_double2 x, simd_double2 y, simd_long2 mask) {
+ return (simd_double2)simd_bitselect((simd_long2)x, (simd_long2)y, mask);
+}
+static inline SIMD_CFUNC simd_double3 simd_bitselect(simd_double3 x, simd_double3 y, simd_long3 mask) {
+ return (simd_double3)simd_bitselect((simd_long3)x, (simd_long3)y, mask);
+}
+static inline SIMD_CFUNC simd_double4 simd_bitselect(simd_double4 x, simd_double4 y, simd_long4 mask) {
+ return (simd_double4)simd_bitselect((simd_long4)x, (simd_long4)y, mask);
+}
+static inline SIMD_CFUNC simd_double8 simd_bitselect(simd_double8 x, simd_double8 y, simd_long8 mask) {
+ return (simd_double8)simd_bitselect((simd_long8)x, (simd_long8)y, mask);
+}
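+
+/* Usage sketch (an illustrative comment, not part of the original
+ * header): simd_bitselect assembles its result bit by bit, taking bits
+ * of y where mask is set and bits of x where it is clear; the unsigned
+ * and floating-point overloads above simply cast to the same-sized
+ * signed integer vector and reuse that implementation. Because vector
+ * comparisons produce all-ones/all-zero lanes, a comparison result can
+ * serve directly as a lanewise select mask. Variable names here are
+ * assumptions for the example only:
+ *
+ *   simd_float4 x = simd_make_float4(1.0f, 4.0f, 2.0f, 8.0f);
+ *   simd_float4 y = simd_make_float4(3.0f, 2.0f, 5.0f, 7.0f);
+ *   simd_int4 mask = x < y;                      // -1 where x < y, else 0
+ *   simd_float4 hi = simd_bitselect(x, y, mask); // lanewise max(x, y)
+ */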
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* SIMD_COMPILER_HAS_REQUIRED_FEATURES */
+#endif /* __SIMD_LOGIC_HEADER__ */
\ No newline at end of file
diff --git a/lib/libc/include/aarch64-macos-gnu/simd/math.h b/lib/libc/include/aarch64-macos-gnu/simd/math.h
new file mode 100644
index 0000000000..4d5c654f69
--- /dev/null
+++ b/lib/libc/include/aarch64-macos-gnu/simd/math.h
@@ -0,0 +1,5380 @@
+/*! @header
+ * The interfaces declared in this header provide elementwise math operations
+ * on vectors; each lane of the result vector depends only on the data in the
+ * corresponding lane of the argument(s) to the function.
+ *
+ * You should not use the C functions declared in this header directly (these
+ * are functions with names like `__tg_cos(x)`). These are merely
+ * implementation details of <tgmath.h> overloading; instead of calling
+ * `__tg_cos(x)`, call `cos(x)`. If you are writing C++, use `simd::cos(x)`.
+ *
+ * Note that while these vector functions are relatively recent additions,
+ * scalar fallback is provided for all of them, so they are available even
+ * when targeting older OS versions.
+ *
+ * The following functions are available:
+ *
+ *      C name              C++ name               Notes
+ *  ----------------------------------------------------------------------
+ *      acos(x)             simd::acos(x)
+ *      asin(x)             simd::asin(x)
+ *      atan(x)             simd::atan(x)
+ *      atan2(y,x)          simd::atan2(y,x)       The argument order matches
+ *                                                 the scalar atan2 function,
+ *                                                 which gives the angle of a
+ *                                                 line with slope y/x.
+ *      cos(x)              simd::cos(x)
+ *      sin(x)              simd::sin(x)
+ *      tan(x)              simd::tan(x)
+ *
+ *      cospi(x)            simd::cospi(x)         Returns cos(pi*x), sin(pi*x),
+ *      sinpi(x)            simd::sinpi(x)         and tan(pi*x) more efficiently
+ *      tanpi(x)            simd::tanpi(x)         and accurately than would
+ *                                                 otherwise be possible.
+ *
+ *      acosh(x)            simd::acosh(x)
+ *      asinh(x)            simd::asinh(x)
+ *      atanh(x)            simd::atanh(x)
+ *
+ *      cosh(x)             simd::cosh(x)
+ *      sinh(x)             simd::sinh(x)
+ *      tanh(x)             simd::tanh(x)
+ *
+ *      exp(x)              simd::exp(x)
+ *      exp2(x)             simd::exp2(x)
+ *      exp10(x)            simd::exp10(x)         More efficient than pow(10,x).
+ *      expm1(x)            simd::expm1(x)         exp(x)-1, accurate even for
+ *                                                 tiny x.
+ *
+ *      log(x)              simd::log(x)
+ *      log2(x)             simd::log2(x)
+ *      log10(x)            simd::log10(x)
+ *      log1p(x)            simd::log1p(x)         log(1+x), accurate even for
+ *                                                 tiny x.
+ *
+ *      fabs(x)             simd::fabs(x)
+ *      cbrt(x)             simd::cbrt(x)
+ *      sqrt(x)             simd::sqrt(x)
+ *      pow(x,y)            simd::pow(x,y)
+ *      copysign(x,y)       simd::copysign(x,y)
+ *      hypot(x,y)          simd::hypot(x,y)       sqrt(x*x + y*y), computed
+ *                                                 without overflow.
+ *      erf(x)              simd::erf(x)
+ *      erfc(x)             simd::erfc(x)
+ *      tgamma(x)           simd::tgamma(x)
+ *
+ *      fmod(x,y)           simd::fmod(x,y)
+ *      remainder(x,y)      simd::remainder(x,y)
+ *
+ *      ceil(x)             simd::ceil(x)
+ *      floor(x)            simd::floor(x)
+ *      rint(x)             simd::rint(x)
+ *      round(x)            simd::round(x)
+ *      trunc(x)            simd::trunc(x)
+ *
+ *      fdim(x,y)           simd::fdim(x,y)
+ *      fmax(x,y)           simd::fmax(x,y)        When one argument to fmin or
+ *      fmin(x,y)           simd::fmin(x,y)        fmax is constant, use it as
+ *                                                 the *second* (y) argument to
+ *                                                 get better codegen on some
+ *                                                 architectures; e.g., write
+ *                                                 fmin(x,2) instead of fmin(2,x).
+ *      fma(x,y,z)          simd::fma(x,y,z)       Fast on arm64 and when
+ *                                                 targeting AVX2 and later; may
+ *                                                 be quite expensive on older
+ *                                                 hardware.
+ *      simd_muladd(x,y,z)  simd::muladd(x,y,z)
+ *
+ * @copyright 2014-2017 Apple, Inc. All rights reserved.
+ * @unsorted */
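+
+/* Usage sketch (an illustrative comment, not part of the original
+ * header): with the <tgmath.h>-style dispatch described above, a plain
+ * call such as cos(x) resolves to the matching __tg_cos overload for
+ * the argument's vector type (simd::cos(x) in C++). The variable names
+ * here are assumptions for the example only:
+ *
+ *   simd_float4 v = simd_make_float4(0.0f, 0.5f, 1.0f, 2.0f);
+ *   simd_float4 c = cos(v);        // elementwise cosine, one per lane
+ *   simd_float4 m = fmin(v, 2.0f); // constant as the second argument,
+ *                                  // per the fmin/fmax note above
+ */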
+
+#ifndef SIMD_MATH_HEADER
+#define SIMD_MATH_HEADER
+
+#include <simd/base.h>
+#if SIMD_COMPILER_HAS_REQUIRED_FEATURES
+#include <simd/vector_make.h>
+#include <simd/logic.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*! @abstract Do not call this function; instead use `acos` in C and
+ * Objective-C, and `simd::acos` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_acos(simd_float2 x);
+/*! @abstract Do not call this function; instead use `acos` in C and
+ * Objective-C, and `simd::acos` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_acos(simd_float3 x);
+/*! @abstract Do not call this function; instead use `acos` in C and
+ * Objective-C, and `simd::acos` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_acos(simd_float4 x);
+/*! @abstract Do not call this function; instead use `acos` in C and
+ * Objective-C, and `simd::acos` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_acos(simd_float8 x);
+/*! @abstract Do not call this function; instead use `acos` in C and
+ * Objective-C, and `simd::acos` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_acos(simd_float16 x);
+/*! @abstract Do not call this function; instead use `acos` in C and
+ * Objective-C, and `simd::acos` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_acos(simd_double2 x);
+/*! @abstract Do not call this function; instead use `acos` in C and
+ * Objective-C, and `simd::acos` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_acos(simd_double3 x);
+/*! @abstract Do not call this function; instead use `acos` in C and
+ * Objective-C, and `simd::acos` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_acos(simd_double4 x);
+/*! @abstract Do not call this function; instead use `acos` in C and
+ * Objective-C, and `simd::acos` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_acos(simd_double8 x);
+
+/*! @abstract Do not call this function; instead use `asin` in C and
+ * Objective-C, and `simd::asin` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_asin(simd_float2 x);
+/*! @abstract Do not call this function; instead use `asin` in C and
+ * Objective-C, and `simd::asin` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_asin(simd_float3 x);
+/*! @abstract Do not call this function; instead use `asin` in C and
+ * Objective-C, and `simd::asin` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_asin(simd_float4 x);
+/*! @abstract Do not call this function; instead use `asin` in C and
+ * Objective-C, and `simd::asin` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_asin(simd_float8 x);
+/*! @abstract Do not call this function; instead use `asin` in C and
+ * Objective-C, and `simd::asin` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_asin(simd_float16 x);
+/*! @abstract Do not call this function; instead use `asin` in C and
+ * Objective-C, and `simd::asin` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_asin(simd_double2 x);
+/*! @abstract Do not call this function; instead use `asin` in C and
+ * Objective-C, and `simd::asin` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_asin(simd_double3 x);
+/*! @abstract Do not call this function; instead use `asin` in C and
+ * Objective-C, and `simd::asin` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_asin(simd_double4 x);
+/*! @abstract Do not call this function; instead use `asin` in C and
+ * Objective-C, and `simd::asin` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_asin(simd_double8 x);
+
+/*! @abstract Do not call this function; instead use `atan` in C and
+ * Objective-C, and `simd::atan` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_atan(simd_float2 x);
+/*! @abstract Do not call this function; instead use `atan` in C and
+ * Objective-C, and `simd::atan` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_atan(simd_float3 x);
+/*! @abstract Do not call this function; instead use `atan` in C and
+ * Objective-C, and `simd::atan` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_atan(simd_float4 x);
+/*! @abstract Do not call this function; instead use `atan` in C and
+ * Objective-C, and `simd::atan` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_atan(simd_float8 x);
+/*! @abstract Do not call this function; instead use `atan` in C and
+ * Objective-C, and `simd::atan` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_atan(simd_float16 x);
+/*! @abstract Do not call this function; instead use `atan` in C and
+ * Objective-C, and `simd::atan` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_atan(simd_double2 x);
+/*! @abstract Do not call this function; instead use `atan` in C and
+ * Objective-C, and `simd::atan` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_atan(simd_double3 x);
+/*! @abstract Do not call this function; instead use `atan` in C and
+ * Objective-C, and `simd::atan` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_atan(simd_double4 x);
+/*! @abstract Do not call this function; instead use `atan` in C and
+ * Objective-C, and `simd::atan` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_atan(simd_double8 x);
+
+/*! @abstract Do not call this function; instead use `cos` in C and
+ * Objective-C, and `simd::cos` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_cos(simd_float2 x);
+/*! @abstract Do not call this function; instead use `cos` in C and
+ * Objective-C, and `simd::cos` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_cos(simd_float3 x);
+/*! @abstract Do not call this function; instead use `cos` in C and
+ * Objective-C, and `simd::cos` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_cos(simd_float4 x);
+/*! @abstract Do not call this function; instead use `cos` in C and
+ * Objective-C, and `simd::cos` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_cos(simd_float8 x);
+/*! @abstract Do not call this function; instead use `cos` in C and
+ * Objective-C, and `simd::cos` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_cos(simd_float16 x);
+/*! @abstract Do not call this function; instead use `cos` in C and
+ * Objective-C, and `simd::cos` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_cos(simd_double2 x);
+/*! @abstract Do not call this function; instead use `cos` in C and
+ * Objective-C, and `simd::cos` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_cos(simd_double3 x);
+/*! @abstract Do not call this function; instead use `cos` in C and
+ * Objective-C, and `simd::cos` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_cos(simd_double4 x);
+/*! @abstract Do not call this function; instead use `cos` in C and
+ * Objective-C, and `simd::cos` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_cos(simd_double8 x);
+
+/*! @abstract Do not call this function; instead use `sin` in C and
+ * Objective-C, and `simd::sin` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_sin(simd_float2 x);
+/*! @abstract Do not call this function; instead use `sin` in C and
+ * Objective-C, and `simd::sin` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_sin(simd_float3 x);
+/*! @abstract Do not call this function; instead use `sin` in C and
+ * Objective-C, and `simd::sin` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_sin(simd_float4 x);
+/*! @abstract Do not call this function; instead use `sin` in C and
+ * Objective-C, and `simd::sin` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_sin(simd_float8 x);
+/*! @abstract Do not call this function; instead use `sin` in C and
+ * Objective-C, and `simd::sin` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_sin(simd_float16 x);
+/*! @abstract Do not call this function; instead use `sin` in C and
+ * Objective-C, and `simd::sin` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_sin(simd_double2 x);
+/*! @abstract Do not call this function; instead use `sin` in C and
+ * Objective-C, and `simd::sin` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_sin(simd_double3 x);
+/*! @abstract Do not call this function; instead use `sin` in C and
+ * Objective-C, and `simd::sin` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_sin(simd_double4 x);
+/*! @abstract Do not call this function; instead use `sin` in C and
+ * Objective-C, and `simd::sin` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_sin(simd_double8 x);
+
+/*! @abstract Do not call this function; instead use `tan` in C and
+ * Objective-C, and `simd::tan` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_tan(simd_float2 x);
+/*! @abstract Do not call this function; instead use `tan` in C and
+ * Objective-C, and `simd::tan` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_tan(simd_float3 x);
+/*! @abstract Do not call this function; instead use `tan` in C and
+ * Objective-C, and `simd::tan` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_tan(simd_float4 x);
+/*! @abstract Do not call this function; instead use `tan` in C and
+ * Objective-C, and `simd::tan` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_tan(simd_float8 x);
+/*! @abstract Do not call this function; instead use `tan` in C and
+ * Objective-C, and `simd::tan` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_tan(simd_float16 x);
+/*! @abstract Do not call this function; instead use `tan` in C and
+ * Objective-C, and `simd::tan` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_tan(simd_double2 x);
+/*! @abstract Do not call this function; instead use `tan` in C and
+ * Objective-C, and `simd::tan` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_tan(simd_double3 x);
+/*! @abstract Do not call this function; instead use `tan` in C and
+ * Objective-C, and `simd::tan` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_tan(simd_double4 x);
+/*! @abstract Do not call this function; instead use `tan` in C and
+ * Objective-C, and `simd::tan` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_tan(simd_double8 x);
+
+#if SIMD_LIBRARY_VERSION >= 1
+/*! @abstract Do not call this function; instead use `cospi` in C and
+ * Objective-C, and `simd::cospi` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_cospi(simd_float2 x);
+/*! @abstract Do not call this function; instead use `cospi` in C and
+ * Objective-C, and `simd::cospi` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_cospi(simd_float3 x);
+/*! @abstract Do not call this function; instead use `cospi` in C and
+ * Objective-C, and `simd::cospi` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_cospi(simd_float4 x);
+/*! @abstract Do not call this function; instead use `cospi` in C and
+ * Objective-C, and `simd::cospi` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_cospi(simd_float8 x);
+/*! @abstract Do not call this function; instead use `cospi` in C and
+ * Objective-C, and `simd::cospi` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_cospi(simd_float16 x);
+/*! @abstract Do not call this function; instead use `cospi` in C and
+ * Objective-C, and `simd::cospi` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_cospi(simd_double2 x);
+/*! @abstract Do not call this function; instead use `cospi` in C and
+ * Objective-C, and `simd::cospi` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_cospi(simd_double3 x);
+/*! @abstract Do not call this function; instead use `cospi` in C and
+ * Objective-C, and `simd::cospi` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_cospi(simd_double4 x);
+/*! @abstract Do not call this function; instead use `cospi` in C and
+ * Objective-C, and `simd::cospi` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_cospi(simd_double8 x);
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 1
+/*! @abstract Do not call this function; instead use `sinpi` in C and
+ * Objective-C, and `simd::sinpi` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_sinpi(simd_float2 x);
+/*! @abstract Do not call this function; instead use `sinpi` in C and
+ * Objective-C, and `simd::sinpi` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_sinpi(simd_float3 x);
+/*! @abstract Do not call this function; instead use `sinpi` in C and
+ * Objective-C, and `simd::sinpi` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_sinpi(simd_float4 x);
+/*! @abstract Do not call this function; instead use `sinpi` in C and
+ * Objective-C, and `simd::sinpi` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_sinpi(simd_float8 x);
+/*! @abstract Do not call this function; instead use `sinpi` in C and
+ * Objective-C, and `simd::sinpi` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_sinpi(simd_float16 x);
+/*! @abstract Do not call this function; instead use `sinpi` in C and
+ * Objective-C, and `simd::sinpi` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_sinpi(simd_double2 x);
+/*! @abstract Do not call this function; instead use `sinpi` in C and
+ * Objective-C, and `simd::sinpi` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_sinpi(simd_double3 x);
+/*! @abstract Do not call this function; instead use `sinpi` in C and
+ * Objective-C, and `simd::sinpi` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_sinpi(simd_double4 x);
+/*! @abstract Do not call this function; instead use `sinpi` in C and
+ * Objective-C, and `simd::sinpi` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_sinpi(simd_double8 x);
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 1
+/*! @abstract Do not call this function; instead use `tanpi` in C and
+ * Objective-C, and `simd::tanpi` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_tanpi(simd_float2 x);
+/*! @abstract Do not call this function; instead use `tanpi` in C and
+ * Objective-C, and `simd::tanpi` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_tanpi(simd_float3 x);
+/*! @abstract Do not call this function; instead use `tanpi` in C and
+ * Objective-C, and `simd::tanpi` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_tanpi(simd_float4 x);
+/*! @abstract Do not call this function; instead use `tanpi` in C and
+ * Objective-C, and `simd::tanpi` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_tanpi(simd_float8 x);
+/*! @abstract Do not call this function; instead use `tanpi` in C and
+ * Objective-C, and `simd::tanpi` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_tanpi(simd_float16 x);
+/*! @abstract Do not call this function; instead use `tanpi` in C and
+ * Objective-C, and `simd::tanpi` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_tanpi(simd_double2 x);
+/*! @abstract Do not call this function; instead use `tanpi` in C and
+ * Objective-C, and `simd::tanpi` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_tanpi(simd_double3 x);
+/*! @abstract Do not call this function; instead use `tanpi` in C and
+ * Objective-C, and `simd::tanpi` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_tanpi(simd_double4 x);
+/*! @abstract Do not call this function; instead use `tanpi` in C and
+ * Objective-C, and `simd::tanpi` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_tanpi(simd_double8 x);
+#endif
+
+/*! @abstract Do not call this function; instead use `acosh` in C and
+ * Objective-C, and `simd::acosh` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_acosh(simd_float2 x);
+/*! @abstract Do not call this function; instead use `acosh` in C and
+ * Objective-C, and `simd::acosh` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_acosh(simd_float3 x);
+/*! @abstract Do not call this function; instead use `acosh` in C and
+ * Objective-C, and `simd::acosh` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_acosh(simd_float4 x);
+/*! @abstract Do not call this function; instead use `acosh` in C and
+ * Objective-C, and `simd::acosh` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_acosh(simd_float8 x);
+/*! @abstract Do not call this function; instead use `acosh` in C and
+ * Objective-C, and `simd::acosh` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_acosh(simd_float16 x);
+/*! @abstract Do not call this function; instead use `acosh` in C and
+ * Objective-C, and `simd::acosh` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_acosh(simd_double2 x);
+/*! @abstract Do not call this function; instead use `acosh` in C and
+ * Objective-C, and `simd::acosh` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_acosh(simd_double3 x);
+/*! @abstract Do not call this function; instead use `acosh` in C and
+ * Objective-C, and `simd::acosh` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_acosh(simd_double4 x);
+/*! @abstract Do not call this function; instead use `acosh` in C and
+ * Objective-C, and `simd::acosh` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_acosh(simd_double8 x);
+
+/*! @abstract Do not call this function; instead use `asinh` in C and
+ * Objective-C, and `simd::asinh` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_asinh(simd_float2 x);
+/*! @abstract Do not call this function; instead use `asinh` in C and
+ * Objective-C, and `simd::asinh` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_asinh(simd_float3 x);
+/*! @abstract Do not call this function; instead use `asinh` in C and
+ * Objective-C, and `simd::asinh` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_asinh(simd_float4 x);
+/*! @abstract Do not call this function; instead use `asinh` in C and
+ * Objective-C, and `simd::asinh` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_asinh(simd_float8 x);
+/*! @abstract Do not call this function; instead use `asinh` in C and
+ * Objective-C, and `simd::asinh` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_asinh(simd_float16 x);
+/*! @abstract Do not call this function; instead use `asinh` in C and
+ * Objective-C, and `simd::asinh` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_asinh(simd_double2 x);
+/*! @abstract Do not call this function; instead use `asinh` in C and
+ * Objective-C, and `simd::asinh` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_asinh(simd_double3 x);
+/*! @abstract Do not call this function; instead use `asinh` in C and
+ * Objective-C, and `simd::asinh` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_asinh(simd_double4 x);
+/*! @abstract Do not call this function; instead use `asinh` in C and
+ * Objective-C, and `simd::asinh` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_asinh(simd_double8 x);
+
+/*! @abstract Do not call this function; instead use `atanh` in C and
+ * Objective-C, and `simd::atanh` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_atanh(simd_float2 x);
+/*! @abstract Do not call this function; instead use `atanh` in C and
+ * Objective-C, and `simd::atanh` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_atanh(simd_float3 x);
+/*! @abstract Do not call this function; instead use `atanh` in C and
+ * Objective-C, and `simd::atanh` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_atanh(simd_float4 x);
+/*! @abstract Do not call this function; instead use `atanh` in C and
+ * Objective-C, and `simd::atanh` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_atanh(simd_float8 x);
+/*! @abstract Do not call this function; instead use `atanh` in C and
+ * Objective-C, and `simd::atanh` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_atanh(simd_float16 x);
+/*! @abstract Do not call this function; instead use `atanh` in C and
+ * Objective-C, and `simd::atanh` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_atanh(simd_double2 x);
+/*! @abstract Do not call this function; instead use `atanh` in C and
+ * Objective-C, and `simd::atanh` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_atanh(simd_double3 x);
+/*! @abstract Do not call this function; instead use `atanh` in C and
+ * Objective-C, and `simd::atanh` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_atanh(simd_double4 x);
+/*! @abstract Do not call this function; instead use `atanh` in C and
+ * Objective-C, and `simd::atanh` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_atanh(simd_double8 x);
+
+/*! @abstract Do not call this function; instead use `cosh` in C and
+ * Objective-C, and `simd::cosh` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_cosh(simd_float2 x);
+/*! @abstract Do not call this function; instead use `cosh` in C and
+ * Objective-C, and `simd::cosh` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_cosh(simd_float3 x);
+/*! @abstract Do not call this function; instead use `cosh` in C and
+ * Objective-C, and `simd::cosh` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_cosh(simd_float4 x);
+/*! @abstract Do not call this function; instead use `cosh` in C and
+ * Objective-C, and `simd::cosh` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_cosh(simd_float8 x);
+/*! @abstract Do not call this function; instead use `cosh` in C and
+ * Objective-C, and `simd::cosh` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_cosh(simd_float16 x);
+/*! @abstract Do not call this function; instead use `cosh` in C and
+ * Objective-C, and `simd::cosh` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_cosh(simd_double2 x);
+/*! @abstract Do not call this function; instead use `cosh` in C and
+ * Objective-C, and `simd::cosh` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_cosh(simd_double3 x);
+/*! @abstract Do not call this function; instead use `cosh` in C and
+ * Objective-C, and `simd::cosh` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_cosh(simd_double4 x);
+/*! @abstract Do not call this function; instead use `cosh` in C and
+ * Objective-C, and `simd::cosh` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_cosh(simd_double8 x);
+
+/*! @abstract Do not call this function; instead use `sinh` in C and
+ * Objective-C, and `simd::sinh` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_sinh(simd_float2 x);
+/*! @abstract Do not call this function; instead use `sinh` in C and
+ * Objective-C, and `simd::sinh` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_sinh(simd_float3 x);
+/*! @abstract Do not call this function; instead use `sinh` in C and
+ * Objective-C, and `simd::sinh` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_sinh(simd_float4 x);
+/*! @abstract Do not call this function; instead use `sinh` in C and
+ * Objective-C, and `simd::sinh` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_sinh(simd_float8 x);
+/*! @abstract Do not call this function; instead use `sinh` in C and
+ * Objective-C, and `simd::sinh` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_sinh(simd_float16 x);
+/*! @abstract Do not call this function; instead use `sinh` in C and
+ * Objective-C, and `simd::sinh` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_sinh(simd_double2 x);
+/*! @abstract Do not call this function; instead use `sinh` in C and
+ * Objective-C, and `simd::sinh` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_sinh(simd_double3 x);
+/*! @abstract Do not call this function; instead use `sinh` in C and
+ * Objective-C, and `simd::sinh` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_sinh(simd_double4 x);
+/*! @abstract Do not call this function; instead use `sinh` in C and
+ * Objective-C, and `simd::sinh` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_sinh(simd_double8 x);
+
+/*! @abstract Do not call this function; instead use `tanh` in C and
+ * Objective-C, and `simd::tanh` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_tanh(simd_float2 x);
+/*! @abstract Do not call this function; instead use `tanh` in C and
+ * Objective-C, and `simd::tanh` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_tanh(simd_float3 x);
+/*! @abstract Do not call this function; instead use `tanh` in C and
+ * Objective-C, and `simd::tanh` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_tanh(simd_float4 x);
+/*! @abstract Do not call this function; instead use `tanh` in C and
+ * Objective-C, and `simd::tanh` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_tanh(simd_float8 x);
+/*! @abstract Do not call this function; instead use `tanh` in C and
+ * Objective-C, and `simd::tanh` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_tanh(simd_float16 x);
+/*! @abstract Do not call this function; instead use `tanh` in C and
+ * Objective-C, and `simd::tanh` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_tanh(simd_double2 x);
+/*! @abstract Do not call this function; instead use `tanh` in C and
+ * Objective-C, and `simd::tanh` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_tanh(simd_double3 x);
+/*! @abstract Do not call this function; instead use `tanh` in C and
+ * Objective-C, and `simd::tanh` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_tanh(simd_double4 x);
+/*! @abstract Do not call this function; instead use `tanh` in C and
+ * Objective-C, and `simd::tanh` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_tanh(simd_double8 x);
+
+/*! @abstract Do not call this function; instead use `exp` in C and
+ * Objective-C, and `simd::exp` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_exp(simd_float2 x);
+/*! @abstract Do not call this function; instead use `exp` in C and
+ * Objective-C, and `simd::exp` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_exp(simd_float3 x);
+/*! @abstract Do not call this function; instead use `exp` in C and
+ * Objective-C, and `simd::exp` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_exp(simd_float4 x);
+/*! @abstract Do not call this function; instead use `exp` in C and
+ * Objective-C, and `simd::exp` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_exp(simd_float8 x);
+/*! @abstract Do not call this function; instead use `exp` in C and
+ * Objective-C, and `simd::exp` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_exp(simd_float16 x);
+/*! @abstract Do not call this function; instead use `exp` in C and
+ * Objective-C, and `simd::exp` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_exp(simd_double2 x);
+/*! @abstract Do not call this function; instead use `exp` in C and
+ * Objective-C, and `simd::exp` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_exp(simd_double3 x);
+/*! @abstract Do not call this function; instead use `exp` in C and
+ * Objective-C, and `simd::exp` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_exp(simd_double4 x);
+/*! @abstract Do not call this function; instead use `exp` in C and
+ * Objective-C, and `simd::exp` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_exp(simd_double8 x);
+
+/*! @abstract Do not call this function; instead use `exp2` in C and
+ * Objective-C, and `simd::exp2` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_exp2(simd_float2 x);
+/*! @abstract Do not call this function; instead use `exp2` in C and
+ * Objective-C, and `simd::exp2` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_exp2(simd_float3 x);
+/*! @abstract Do not call this function; instead use `exp2` in C and
+ * Objective-C, and `simd::exp2` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_exp2(simd_float4 x);
+/*! @abstract Do not call this function; instead use `exp2` in C and
+ * Objective-C, and `simd::exp2` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_exp2(simd_float8 x);
+/*! @abstract Do not call this function; instead use `exp2` in C and
+ * Objective-C, and `simd::exp2` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_exp2(simd_float16 x);
+/*! @abstract Do not call this function; instead use `exp2` in C and
+ * Objective-C, and `simd::exp2` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_exp2(simd_double2 x);
+/*! @abstract Do not call this function; instead use `exp2` in C and
+ * Objective-C, and `simd::exp2` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_exp2(simd_double3 x);
+/*! @abstract Do not call this function; instead use `exp2` in C and
+ * Objective-C, and `simd::exp2` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_exp2(simd_double4 x);
+/*! @abstract Do not call this function; instead use `exp2` in C and
+ * Objective-C, and `simd::exp2` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_exp2(simd_double8 x);
+
+#if SIMD_LIBRARY_VERSION >= 1
+/*! @abstract Do not call this function; instead use `exp10` in C and
+ * Objective-C, and `simd::exp10` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_exp10(simd_float2 x);
+/*! @abstract Do not call this function; instead use `exp10` in C and
+ * Objective-C, and `simd::exp10` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_exp10(simd_float3 x);
+/*! @abstract Do not call this function; instead use `exp10` in C and
+ * Objective-C, and `simd::exp10` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_exp10(simd_float4 x);
+/*! @abstract Do not call this function; instead use `exp10` in C and
+ * Objective-C, and `simd::exp10` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_exp10(simd_float8 x);
+/*! @abstract Do not call this function; instead use `exp10` in C and
+ * Objective-C, and `simd::exp10` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_exp10(simd_float16 x);
+/*! @abstract Do not call this function; instead use `exp10` in C and
+ * Objective-C, and `simd::exp10` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_exp10(simd_double2 x);
+/*! @abstract Do not call this function; instead use `exp10` in C and
+ * Objective-C, and `simd::exp10` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_exp10(simd_double3 x);
+/*! @abstract Do not call this function; instead use `exp10` in C and
+ * Objective-C, and `simd::exp10` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_exp10(simd_double4 x);
+/*! @abstract Do not call this function; instead use `exp10` in C and
+ * Objective-C, and `simd::exp10` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_exp10(simd_double8 x);
+#endif
+
+/*! @abstract Do not call this function; instead use `expm1` in C and
+ * Objective-C, and `simd::expm1` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_expm1(simd_float2 x);
+/*! @abstract Do not call this function; instead use `expm1` in C and
+ * Objective-C, and `simd::expm1` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_expm1(simd_float3 x);
+/*! @abstract Do not call this function; instead use `expm1` in C and
+ * Objective-C, and `simd::expm1` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_expm1(simd_float4 x);
+/*! @abstract Do not call this function; instead use `expm1` in C and
+ * Objective-C, and `simd::expm1` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_expm1(simd_float8 x);
+/*! @abstract Do not call this function; instead use `expm1` in C and
+ * Objective-C, and `simd::expm1` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_expm1(simd_float16 x);
+/*! @abstract Do not call this function; instead use `expm1` in C and
+ * Objective-C, and `simd::expm1` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_expm1(simd_double2 x);
+/*! @abstract Do not call this function; instead use `expm1` in C and
+ * Objective-C, and `simd::expm1` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_expm1(simd_double3 x);
+/*! @abstract Do not call this function; instead use `expm1` in C and
+ * Objective-C, and `simd::expm1` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_expm1(simd_double4 x);
+/*! @abstract Do not call this function; instead use `expm1` in C and
+ * Objective-C, and `simd::expm1` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_expm1(simd_double8 x);
+
+/*! @abstract Do not call this function; instead use `log` in C and
+ * Objective-C, and `simd::log` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_log(simd_float2 x);
+/*! @abstract Do not call this function; instead use `log` in C and
+ * Objective-C, and `simd::log` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_log(simd_float3 x);
+/*! @abstract Do not call this function; instead use `log` in C and
+ * Objective-C, and `simd::log` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_log(simd_float4 x);
+/*! @abstract Do not call this function; instead use `log` in C and
+ * Objective-C, and `simd::log` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_log(simd_float8 x);
+/*! @abstract Do not call this function; instead use `log` in C and
+ * Objective-C, and `simd::log` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_log(simd_float16 x);
+/*! @abstract Do not call this function; instead use `log` in C and
+ * Objective-C, and `simd::log` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_log(simd_double2 x);
+/*! @abstract Do not call this function; instead use `log` in C and
+ * Objective-C, and `simd::log` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_log(simd_double3 x);
+/*! @abstract Do not call this function; instead use `log` in C and
+ * Objective-C, and `simd::log` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_log(simd_double4 x);
+/*! @abstract Do not call this function; instead use `log` in C and
+ * Objective-C, and `simd::log` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_log(simd_double8 x);
+
+/*! @abstract Do not call this function; instead use `log2` in C and
+ * Objective-C, and `simd::log2` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_log2(simd_float2 x);
+/*! @abstract Do not call this function; instead use `log2` in C and
+ * Objective-C, and `simd::log2` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_log2(simd_float3 x);
+/*! @abstract Do not call this function; instead use `log2` in C and
+ * Objective-C, and `simd::log2` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_log2(simd_float4 x);
+/*! @abstract Do not call this function; instead use `log2` in C and
+ * Objective-C, and `simd::log2` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_log2(simd_float8 x);
+/*! @abstract Do not call this function; instead use `log2` in C and
+ * Objective-C, and `simd::log2` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_log2(simd_float16 x);
+/*! @abstract Do not call this function; instead use `log2` in C and
+ * Objective-C, and `simd::log2` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_log2(simd_double2 x);
+/*! @abstract Do not call this function; instead use `log2` in C and
+ * Objective-C, and `simd::log2` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_log2(simd_double3 x);
+/*! @abstract Do not call this function; instead use `log2` in C and
+ * Objective-C, and `simd::log2` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_log2(simd_double4 x);
+/*! @abstract Do not call this function; instead use `log2` in C and
+ * Objective-C, and `simd::log2` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_log2(simd_double8 x);
+
+/*! @abstract Do not call this function; instead use `log10` in C and
+ * Objective-C, and `simd::log10` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_log10(simd_float2 x);
+/*! @abstract Do not call this function; instead use `log10` in C and
+ * Objective-C, and `simd::log10` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_log10(simd_float3 x);
+/*! @abstract Do not call this function; instead use `log10` in C and
+ * Objective-C, and `simd::log10` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_log10(simd_float4 x);
+/*! @abstract Do not call this function; instead use `log10` in C and
+ * Objective-C, and `simd::log10` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_log10(simd_float8 x);
+/*! @abstract Do not call this function; instead use `log10` in C and
+ * Objective-C, and `simd::log10` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_log10(simd_float16 x);
+/*! @abstract Do not call this function; instead use `log10` in C and
+ * Objective-C, and `simd::log10` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_log10(simd_double2 x);
+/*! @abstract Do not call this function; instead use `log10` in C and
+ * Objective-C, and `simd::log10` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_log10(simd_double3 x);
+/*! @abstract Do not call this function; instead use `log10` in C and
+ * Objective-C, and `simd::log10` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_log10(simd_double4 x);
+/*! @abstract Do not call this function; instead use `log10` in C and
+ * Objective-C, and `simd::log10` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_log10(simd_double8 x);
+
+/*! @abstract Do not call this function; instead use `log1p` in C and
+ * Objective-C, and `simd::log1p` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_log1p(simd_float2 x);
+/*! @abstract Do not call this function; instead use `log1p` in C and
+ * Objective-C, and `simd::log1p` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_log1p(simd_float3 x);
+/*! @abstract Do not call this function; instead use `log1p` in C and
+ * Objective-C, and `simd::log1p` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_log1p(simd_float4 x);
+/*! @abstract Do not call this function; instead use `log1p` in C and
+ * Objective-C, and `simd::log1p` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_log1p(simd_float8 x);
+/*! @abstract Do not call this function; instead use `log1p` in C and
+ * Objective-C, and `simd::log1p` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_log1p(simd_float16 x);
+/*! @abstract Do not call this function; instead use `log1p` in C and
+ * Objective-C, and `simd::log1p` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_log1p(simd_double2 x);
+/*! @abstract Do not call this function; instead use `log1p` in C and
+ * Objective-C, and `simd::log1p` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_log1p(simd_double3 x);
+/*! @abstract Do not call this function; instead use `log1p` in C and
+ * Objective-C, and `simd::log1p` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_log1p(simd_double4 x);
+/*! @abstract Do not call this function; instead use `log1p` in C and
+ * Objective-C, and `simd::log1p` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_log1p(simd_double8 x);
+
+/*! @abstract Do not call this function; instead use `fabs` in C and
+ * Objective-C, and `simd::fabs` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_fabs(simd_float2 x);
+/*! @abstract Do not call this function; instead use `fabs` in C and
+ * Objective-C, and `simd::fabs` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_fabs(simd_float3 x);
+/*! @abstract Do not call this function; instead use `fabs` in C and
+ * Objective-C, and `simd::fabs` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_fabs(simd_float4 x);
+/*! @abstract Do not call this function; instead use `fabs` in C and
+ * Objective-C, and `simd::fabs` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_fabs(simd_float8 x);
+/*! @abstract Do not call this function; instead use `fabs` in C and
+ * Objective-C, and `simd::fabs` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_fabs(simd_float16 x);
+/*! @abstract Do not call this function; instead use `fabs` in C and
+ * Objective-C, and `simd::fabs` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_fabs(simd_double2 x);
+/*! @abstract Do not call this function; instead use `fabs` in C and
+ * Objective-C, and `simd::fabs` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_fabs(simd_double3 x);
+/*! @abstract Do not call this function; instead use `fabs` in C and
+ * Objective-C, and `simd::fabs` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_fabs(simd_double4 x);
+/*! @abstract Do not call this function; instead use `fabs` in C and
+ * Objective-C, and `simd::fabs` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_fabs(simd_double8 x);
+
+/*! @abstract Do not call this function; instead use `cbrt` in C and
+ * Objective-C, and `simd::cbrt` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_cbrt(simd_float2 x);
+/*! @abstract Do not call this function; instead use `cbrt` in C and
+ * Objective-C, and `simd::cbrt` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_cbrt(simd_float3 x);
+/*! @abstract Do not call this function; instead use `cbrt` in C and
+ * Objective-C, and `simd::cbrt` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_cbrt(simd_float4 x);
+/*! @abstract Do not call this function; instead use `cbrt` in C and
+ * Objective-C, and `simd::cbrt` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_cbrt(simd_float8 x);
+/*! @abstract Do not call this function; instead use `cbrt` in C and
+ * Objective-C, and `simd::cbrt` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_cbrt(simd_float16 x);
+/*! @abstract Do not call this function; instead use `cbrt` in C and
+ * Objective-C, and `simd::cbrt` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_cbrt(simd_double2 x);
+/*! @abstract Do not call this function; instead use `cbrt` in C and
+ * Objective-C, and `simd::cbrt` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_cbrt(simd_double3 x);
+/*! @abstract Do not call this function; instead use `cbrt` in C and
+ * Objective-C, and `simd::cbrt` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_cbrt(simd_double4 x);
+/*! @abstract Do not call this function; instead use `cbrt` in C and
+ * Objective-C, and `simd::cbrt` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_cbrt(simd_double8 x);
+
+/*! @abstract Do not call this function; instead use `sqrt` in C and
+ * Objective-C, and `simd::sqrt` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_sqrt(simd_float2 x);
+/*! @abstract Do not call this function; instead use `sqrt` in C and
+ * Objective-C, and `simd::sqrt` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_sqrt(simd_float3 x);
+/*! @abstract Do not call this function; instead use `sqrt` in C and
+ * Objective-C, and `simd::sqrt` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_sqrt(simd_float4 x);
+/*! @abstract Do not call this function; instead use `sqrt` in C and
+ * Objective-C, and `simd::sqrt` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_sqrt(simd_float8 x);
+/*! @abstract Do not call this function; instead use `sqrt` in C and
+ * Objective-C, and `simd::sqrt` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_sqrt(simd_float16 x);
+/*! @abstract Do not call this function; instead use `sqrt` in C and
+ * Objective-C, and `simd::sqrt` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_sqrt(simd_double2 x);
+/*! @abstract Do not call this function; instead use `sqrt` in C and
+ * Objective-C, and `simd::sqrt` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_sqrt(simd_double3 x);
+/*! @abstract Do not call this function; instead use `sqrt` in C and
+ * Objective-C, and `simd::sqrt` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_sqrt(simd_double4 x);
+/*! @abstract Do not call this function; instead use `sqrt` in C and
+ * Objective-C, and `simd::sqrt` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_sqrt(simd_double8 x);
+
+/*! @abstract Do not call this function; instead use `erf` in C and
+ * Objective-C, and `simd::erf` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_erf(simd_float2 x);
+/*! @abstract Do not call this function; instead use `erf` in C and
+ * Objective-C, and `simd::erf` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_erf(simd_float3 x);
+/*! @abstract Do not call this function; instead use `erf` in C and
+ * Objective-C, and `simd::erf` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_erf(simd_float4 x);
+/*! @abstract Do not call this function; instead use `erf` in C and
+ * Objective-C, and `simd::erf` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_erf(simd_float8 x);
+/*! @abstract Do not call this function; instead use `erf` in C and
+ * Objective-C, and `simd::erf` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_erf(simd_float16 x);
+/*! @abstract Do not call this function; instead use `erf` in C and
+ * Objective-C, and `simd::erf` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_erf(simd_double2 x);
+/*! @abstract Do not call this function; instead use `erf` in C and
+ * Objective-C, and `simd::erf` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_erf(simd_double3 x);
+/*! @abstract Do not call this function; instead use `erf` in C and
+ * Objective-C, and `simd::erf` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_erf(simd_double4 x);
+/*! @abstract Do not call this function; instead use `erf` in C and
+ * Objective-C, and `simd::erf` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_erf(simd_double8 x);
+
+/*! @abstract Do not call this function; instead use `erfc` in C and
+ * Objective-C, and `simd::erfc` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_erfc(simd_float2 x);
+/*! @abstract Do not call this function; instead use `erfc` in C and
+ * Objective-C, and `simd::erfc` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_erfc(simd_float3 x);
+/*! @abstract Do not call this function; instead use `erfc` in C and
+ * Objective-C, and `simd::erfc` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_erfc(simd_float4 x);
+/*! @abstract Do not call this function; instead use `erfc` in C and
+ * Objective-C, and `simd::erfc` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_erfc(simd_float8 x);
+/*! @abstract Do not call this function; instead use `erfc` in C and
+ * Objective-C, and `simd::erfc` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_erfc(simd_float16 x);
+/*! @abstract Do not call this function; instead use `erfc` in C and
+ * Objective-C, and `simd::erfc` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_erfc(simd_double2 x);
+/*! @abstract Do not call this function; instead use `erfc` in C and
+ * Objective-C, and `simd::erfc` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_erfc(simd_double3 x);
+/*! @abstract Do not call this function; instead use `erfc` in C and
+ * Objective-C, and `simd::erfc` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_erfc(simd_double4 x);
+/*! @abstract Do not call this function; instead use `erfc` in C and
+ * Objective-C, and `simd::erfc` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_erfc(simd_double8 x);
+
+/*! @abstract Do not call this function; instead use `tgamma` in C and
+ * Objective-C, and `simd::tgamma` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_tgamma(simd_float2 x);
+/*! @abstract Do not call this function; instead use `tgamma` in C and
+ * Objective-C, and `simd::tgamma` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_tgamma(simd_float3 x);
+/*! @abstract Do not call this function; instead use `tgamma` in C and
+ * Objective-C, and `simd::tgamma` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_tgamma(simd_float4 x);
+/*! @abstract Do not call this function; instead use `tgamma` in C and
+ * Objective-C, and `simd::tgamma` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_tgamma(simd_float8 x);
+/*! @abstract Do not call this function; instead use `tgamma` in C and
+ * Objective-C, and `simd::tgamma` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_tgamma(simd_float16 x);
+/*! @abstract Do not call this function; instead use `tgamma` in C and
+ * Objective-C, and `simd::tgamma` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_tgamma(simd_double2 x);
+/*! @abstract Do not call this function; instead use `tgamma` in C and
+ * Objective-C, and `simd::tgamma` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_tgamma(simd_double3 x);
+/*! @abstract Do not call this function; instead use `tgamma` in C and
+ * Objective-C, and `simd::tgamma` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_tgamma(simd_double4 x);
+/*! @abstract Do not call this function; instead use `tgamma` in C and
+ * Objective-C, and `simd::tgamma` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_tgamma(simd_double8 x);
+
+/*! @abstract Do not call this function; instead use `ceil` in C and
+ * Objective-C, and `simd::ceil` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_ceil(simd_float2 x);
+/*! @abstract Do not call this function; instead use `ceil` in C and
+ * Objective-C, and `simd::ceil` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_ceil(simd_float3 x);
+/*! @abstract Do not call this function; instead use `ceil` in C and
+ * Objective-C, and `simd::ceil` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_ceil(simd_float4 x);
+/*! @abstract Do not call this function; instead use `ceil` in C and
+ * Objective-C, and `simd::ceil` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_ceil(simd_float8 x);
+/*! @abstract Do not call this function; instead use `ceil` in C and
+ * Objective-C, and `simd::ceil` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_ceil(simd_float16 x);
+/*! @abstract Do not call this function; instead use `ceil` in C and
+ * Objective-C, and `simd::ceil` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_ceil(simd_double2 x);
+/*! @abstract Do not call this function; instead use `ceil` in C and
+ * Objective-C, and `simd::ceil` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_ceil(simd_double3 x);
+/*! @abstract Do not call this function; instead use `ceil` in C and
+ * Objective-C, and `simd::ceil` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_ceil(simd_double4 x);
+/*! @abstract Do not call this function; instead use `ceil` in C and
+ * Objective-C, and `simd::ceil` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_ceil(simd_double8 x);
+
+/*! @abstract Do not call this function; instead use `floor` in C and
+ * Objective-C, and `simd::floor` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_floor(simd_float2 x);
+/*! @abstract Do not call this function; instead use `floor` in C and
+ * Objective-C, and `simd::floor` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_floor(simd_float3 x);
+/*! @abstract Do not call this function; instead use `floor` in C and
+ * Objective-C, and `simd::floor` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_floor(simd_float4 x);
+/*! @abstract Do not call this function; instead use `floor` in C and
+ * Objective-C, and `simd::floor` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_floor(simd_float8 x);
+/*! @abstract Do not call this function; instead use `floor` in C and
+ * Objective-C, and `simd::floor` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_floor(simd_float16 x);
+/*! @abstract Do not call this function; instead use `floor` in C and
+ * Objective-C, and `simd::floor` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_floor(simd_double2 x);
+/*! @abstract Do not call this function; instead use `floor` in C and
+ * Objective-C, and `simd::floor` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_floor(simd_double3 x);
+/*! @abstract Do not call this function; instead use `floor` in C and
+ * Objective-C, and `simd::floor` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_floor(simd_double4 x);
+/*! @abstract Do not call this function; instead use `floor` in C and
+ * Objective-C, and `simd::floor` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_floor(simd_double8 x);
+
+/*! @abstract Do not call this function; instead use `rint` in C and
+ * Objective-C, and `simd::rint` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_rint(simd_float2 x);
+/*! @abstract Do not call this function; instead use `rint` in C and
+ * Objective-C, and `simd::rint` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_rint(simd_float3 x);
+/*! @abstract Do not call this function; instead use `rint` in C and
+ * Objective-C, and `simd::rint` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_rint(simd_float4 x);
+/*! @abstract Do not call this function; instead use `rint` in C and
+ * Objective-C, and `simd::rint` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_rint(simd_float8 x);
+/*! @abstract Do not call this function; instead use `rint` in C and
+ * Objective-C, and `simd::rint` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_rint(simd_float16 x);
+/*! @abstract Do not call this function; instead use `rint` in C and
+ * Objective-C, and `simd::rint` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_rint(simd_double2 x);
+/*! @abstract Do not call this function; instead use `rint` in C and
+ * Objective-C, and `simd::rint` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_rint(simd_double3 x);
+/*! @abstract Do not call this function; instead use `rint` in C and
+ * Objective-C, and `simd::rint` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_rint(simd_double4 x);
+/*! @abstract Do not call this function; instead use `rint` in C and
+ * Objective-C, and `simd::rint` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_rint(simd_double8 x);
+
+/*! @abstract Do not call this function; instead use `round` in C and
+ * Objective-C, and `simd::round` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_round(simd_float2 x);
+/*! @abstract Do not call this function; instead use `round` in C and
+ * Objective-C, and `simd::round` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_round(simd_float3 x);
+/*! @abstract Do not call this function; instead use `round` in C and
+ * Objective-C, and `simd::round` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_round(simd_float4 x);
+/*! @abstract Do not call this function; instead use `round` in C and
+ * Objective-C, and `simd::round` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_round(simd_float8 x);
+/*! @abstract Do not call this function; instead use `round` in C and
+ * Objective-C, and `simd::round` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_round(simd_float16 x);
+/*! @abstract Do not call this function; instead use `round` in C and
+ * Objective-C, and `simd::round` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_round(simd_double2 x);
+/*! @abstract Do not call this function; instead use `round` in C and
+ * Objective-C, and `simd::round` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_round(simd_double3 x);
+/*! @abstract Do not call this function; instead use `round` in C and
+ * Objective-C, and `simd::round` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_round(simd_double4 x);
+/*! @abstract Do not call this function; instead use `round` in C and
+ * Objective-C, and `simd::round` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_round(simd_double8 x);
+
+/*! @abstract Do not call this function; instead use `trunc` in C and
+ * Objective-C, and `simd::trunc` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_trunc(simd_float2 x);
+/*! @abstract Do not call this function; instead use `trunc` in C and
+ * Objective-C, and `simd::trunc` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_trunc(simd_float3 x);
+/*! @abstract Do not call this function; instead use `trunc` in C and
+ * Objective-C, and `simd::trunc` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_trunc(simd_float4 x);
+/*! @abstract Do not call this function; instead use `trunc` in C and
+ * Objective-C, and `simd::trunc` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_trunc(simd_float8 x);
+/*! @abstract Do not call this function; instead use `trunc` in C and
+ * Objective-C, and `simd::trunc` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_trunc(simd_float16 x);
+/*! @abstract Do not call this function; instead use `trunc` in C and
+ * Objective-C, and `simd::trunc` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_trunc(simd_double2 x);
+/*! @abstract Do not call this function; instead use `trunc` in C and
+ * Objective-C, and `simd::trunc` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_trunc(simd_double3 x);
+/*! @abstract Do not call this function; instead use `trunc` in C and
+ * Objective-C, and `simd::trunc` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_trunc(simd_double4 x);
+/*! @abstract Do not call this function; instead use `trunc` in C and
+ * Objective-C, and `simd::trunc` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_trunc(simd_double8 x);
+
+
+/*! @abstract Do not call this function; instead use `atan2` in C and
+ * Objective-C, and `simd::atan2` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_atan2(simd_float2 y, simd_float2 x);
+/*! @abstract Do not call this function; instead use `atan2` in C and
+ * Objective-C, and `simd::atan2` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_atan2(simd_float3 y, simd_float3 x);
+/*! @abstract Do not call this function; instead use `atan2` in C and
+ * Objective-C, and `simd::atan2` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_atan2(simd_float4 y, simd_float4 x);
+/*! @abstract Do not call this function; instead use `atan2` in C and
+ * Objective-C, and `simd::atan2` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_atan2(simd_float8 y, simd_float8 x);
+/*! @abstract Do not call this function; instead use `atan2` in C and
+ * Objective-C, and `simd::atan2` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_atan2(simd_float16 y, simd_float16 x);
+/*! @abstract Do not call this function; instead use `atan2` in C and
+ * Objective-C, and `simd::atan2` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_atan2(simd_double2 y, simd_double2 x);
+/*! @abstract Do not call this function; instead use `atan2` in C and
+ * Objective-C, and `simd::atan2` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_atan2(simd_double3 y, simd_double3 x);
+/*! @abstract Do not call this function; instead use `atan2` in C and
+ * Objective-C, and `simd::atan2` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_atan2(simd_double4 y, simd_double4 x);
+/*! @abstract Do not call this function; instead use `atan2` in C and
+ * Objective-C, and `simd::atan2` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_atan2(simd_double8 y, simd_double8 x);
+
+/*! @abstract Do not call this function; instead use `hypot` in C and
+ * Objective-C, and `simd::hypot` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_hypot(simd_float2 x, simd_float2 y);
+/*! @abstract Do not call this function; instead use `hypot` in C and
+ * Objective-C, and `simd::hypot` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_hypot(simd_float3 x, simd_float3 y);
+/*! @abstract Do not call this function; instead use `hypot` in C and
+ * Objective-C, and `simd::hypot` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_hypot(simd_float4 x, simd_float4 y);
+/*! @abstract Do not call this function; instead use `hypot` in C and
+ * Objective-C, and `simd::hypot` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_hypot(simd_float8 x, simd_float8 y);
+/*! @abstract Do not call this function; instead use `hypot` in C and
+ * Objective-C, and `simd::hypot` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_hypot(simd_float16 x, simd_float16 y);
+/*! @abstract Do not call this function; instead use `hypot` in C and
+ * Objective-C, and `simd::hypot` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_hypot(simd_double2 x, simd_double2 y);
+/*! @abstract Do not call this function; instead use `hypot` in C and
+ * Objective-C, and `simd::hypot` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_hypot(simd_double3 x, simd_double3 y);
+/*! @abstract Do not call this function; instead use `hypot` in C and
+ * Objective-C, and `simd::hypot` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_hypot(simd_double4 x, simd_double4 y);
+/*! @abstract Do not call this function; instead use `hypot` in C and
+ * Objective-C, and `simd::hypot` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_hypot(simd_double8 x, simd_double8 y);
+
+/*! @abstract Do not call this function; instead use `pow` in C and
+ * Objective-C, and `simd::pow` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_pow(simd_float2 x, simd_float2 y);
+/*! @abstract Do not call this function; instead use `pow` in C and
+ * Objective-C, and `simd::pow` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_pow(simd_float3 x, simd_float3 y);
+/*! @abstract Do not call this function; instead use `pow` in C and
+ * Objective-C, and `simd::pow` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_pow(simd_float4 x, simd_float4 y);
+/*! @abstract Do not call this function; instead use `pow` in C and
+ * Objective-C, and `simd::pow` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_pow(simd_float8 x, simd_float8 y);
+/*! @abstract Do not call this function; instead use `pow` in C and
+ * Objective-C, and `simd::pow` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_pow(simd_float16 x, simd_float16 y);
+/*! @abstract Do not call this function; instead use `pow` in C and
+ * Objective-C, and `simd::pow` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_pow(simd_double2 x, simd_double2 y);
+/*! @abstract Do not call this function; instead use `pow` in C and
+ * Objective-C, and `simd::pow` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_pow(simd_double3 x, simd_double3 y);
+/*! @abstract Do not call this function; instead use `pow` in C and
+ * Objective-C, and `simd::pow` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_pow(simd_double4 x, simd_double4 y);
+/*! @abstract Do not call this function; instead use `pow` in C and
+ * Objective-C, and `simd::pow` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_pow(simd_double8 x, simd_double8 y);
+
+/*! @abstract Do not call this function; instead use `fmod` in C and
+ * Objective-C, and `simd::fmod` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_fmod(simd_float2 x, simd_float2 y);
+/*! @abstract Do not call this function; instead use `fmod` in C and
+ * Objective-C, and `simd::fmod` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_fmod(simd_float3 x, simd_float3 y);
+/*! @abstract Do not call this function; instead use `fmod` in C and
+ * Objective-C, and `simd::fmod` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_fmod(simd_float4 x, simd_float4 y);
+/*! @abstract Do not call this function; instead use `fmod` in C and
+ * Objective-C, and `simd::fmod` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_fmod(simd_float8 x, simd_float8 y);
+/*! @abstract Do not call this function; instead use `fmod` in C and
+ * Objective-C, and `simd::fmod` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_fmod(simd_float16 x, simd_float16 y);
+/*! @abstract Do not call this function; instead use `fmod` in C and
+ * Objective-C, and `simd::fmod` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_fmod(simd_double2 x, simd_double2 y);
+/*! @abstract Do not call this function; instead use `fmod` in C and
+ * Objective-C, and `simd::fmod` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_fmod(simd_double3 x, simd_double3 y);
+/*! @abstract Do not call this function; instead use `fmod` in C and
+ * Objective-C, and `simd::fmod` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_fmod(simd_double4 x, simd_double4 y);
+/*! @abstract Do not call this function; instead use `fmod` in C and
+ * Objective-C, and `simd::fmod` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_fmod(simd_double8 x, simd_double8 y);
+
+/*! @abstract Do not call this function; instead use `remainder` in C and
+ * Objective-C, and `simd::remainder` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_remainder(simd_float2 x, simd_float2 y);
+/*! @abstract Do not call this function; instead use `remainder` in C and
+ * Objective-C, and `simd::remainder` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_remainder(simd_float3 x, simd_float3 y);
+/*! @abstract Do not call this function; instead use `remainder` in C and
+ * Objective-C, and `simd::remainder` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_remainder(simd_float4 x, simd_float4 y);
+/*! @abstract Do not call this function; instead use `remainder` in C and
+ * Objective-C, and `simd::remainder` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_remainder(simd_float8 x, simd_float8 y);
+/*! @abstract Do not call this function; instead use `remainder` in C and
+ * Objective-C, and `simd::remainder` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_remainder(simd_float16 x, simd_float16 y);
+/*! @abstract Do not call this function; instead use `remainder` in C and
+ * Objective-C, and `simd::remainder` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_remainder(simd_double2 x, simd_double2 y);
+/*! @abstract Do not call this function; instead use `remainder` in C and
+ * Objective-C, and `simd::remainder` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_remainder(simd_double3 x, simd_double3 y);
+/*! @abstract Do not call this function; instead use `remainder` in C and
+ * Objective-C, and `simd::remainder` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_remainder(simd_double4 x, simd_double4 y);
+/*! @abstract Do not call this function; instead use `remainder` in C and
+ * Objective-C, and `simd::remainder` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_remainder(simd_double8 x, simd_double8 y);
+
+/*! @abstract Do not call this function; instead use `copysign` in C and
+ * Objective-C, and `simd::copysign` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_copysign(simd_float2 x, simd_float2 y);
+/*! @abstract Do not call this function; instead use `copysign` in C and
+ * Objective-C, and `simd::copysign` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_copysign(simd_float3 x, simd_float3 y);
+/*! @abstract Do not call this function; instead use `copysign` in C and
+ * Objective-C, and `simd::copysign` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_copysign(simd_float4 x, simd_float4 y);
+/*! @abstract Do not call this function; instead use `copysign` in C and
+ * Objective-C, and `simd::copysign` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_copysign(simd_float8 x, simd_float8 y);
+/*! @abstract Do not call this function; instead use `copysign` in C and
+ * Objective-C, and `simd::copysign` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_copysign(simd_float16 x, simd_float16 y);
+/*! @abstract Do not call this function; instead use `copysign` in C and
+ * Objective-C, and `simd::copysign` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_copysign(simd_double2 x, simd_double2 y);
+/*! @abstract Do not call this function; instead use `copysign` in C and
+ * Objective-C, and `simd::copysign` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_copysign(simd_double3 x, simd_double3 y);
+/*! @abstract Do not call this function; instead use `copysign` in C and
+ * Objective-C, and `simd::copysign` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_copysign(simd_double4 x, simd_double4 y);
+/*! @abstract Do not call this function; instead use `copysign` in C and
+ * Objective-C, and `simd::copysign` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_copysign(simd_double8 x, simd_double8 y);
+
+/*! @abstract Do not call this function; instead use `nextafter` in C and
+ * Objective-C, and `simd::nextafter` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_nextafter(simd_float2 x, simd_float2 y);
+/*! @abstract Do not call this function; instead use `nextafter` in C and
+ * Objective-C, and `simd::nextafter` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_nextafter(simd_float3 x, simd_float3 y);
+/*! @abstract Do not call this function; instead use `nextafter` in C and
+ * Objective-C, and `simd::nextafter` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_nextafter(simd_float4 x, simd_float4 y);
+/*! @abstract Do not call this function; instead use `nextafter` in C and
+ * Objective-C, and `simd::nextafter` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_nextafter(simd_float8 x, simd_float8 y);
+/*! @abstract Do not call this function; instead use `nextafter` in C and
+ * Objective-C, and `simd::nextafter` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_nextafter(simd_float16 x, simd_float16 y);
+/*! @abstract Do not call this function; instead use `nextafter` in C and
+ * Objective-C, and `simd::nextafter` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_nextafter(simd_double2 x, simd_double2 y);
+/*! @abstract Do not call this function; instead use `nextafter` in C and
+ * Objective-C, and `simd::nextafter` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_nextafter(simd_double3 x, simd_double3 y);
+/*! @abstract Do not call this function; instead use `nextafter` in C and
+ * Objective-C, and `simd::nextafter` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_nextafter(simd_double4 x, simd_double4 y);
+/*! @abstract Do not call this function; instead use `nextafter` in C and
+ * Objective-C, and `simd::nextafter` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_nextafter(simd_double8 x, simd_double8 y);
+
+/*! @abstract Do not call this function; instead use `fdim` in C and
+ * Objective-C, and `simd::fdim` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_fdim(simd_float2 x, simd_float2 y);
+/*! @abstract Do not call this function; instead use `fdim` in C and
+ * Objective-C, and `simd::fdim` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_fdim(simd_float3 x, simd_float3 y);
+/*! @abstract Do not call this function; instead use `fdim` in C and
+ * Objective-C, and `simd::fdim` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_fdim(simd_float4 x, simd_float4 y);
+/*! @abstract Do not call this function; instead use `fdim` in C and
+ * Objective-C, and `simd::fdim` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_fdim(simd_float8 x, simd_float8 y);
+/*! @abstract Do not call this function; instead use `fdim` in C and
+ * Objective-C, and `simd::fdim` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_fdim(simd_float16 x, simd_float16 y);
+/*! @abstract Do not call this function; instead use `fdim` in C and
+ * Objective-C, and `simd::fdim` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_fdim(simd_double2 x, simd_double2 y);
+/*! @abstract Do not call this function; instead use `fdim` in C and
+ * Objective-C, and `simd::fdim` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_fdim(simd_double3 x, simd_double3 y);
+/*! @abstract Do not call this function; instead use `fdim` in C and
+ * Objective-C, and `simd::fdim` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_fdim(simd_double4 x, simd_double4 y);
+/*! @abstract Do not call this function; instead use `fdim` in C and
+ * Objective-C, and `simd::fdim` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_fdim(simd_double8 x, simd_double8 y);
+
+/*! @abstract Do not call this function; instead use `fmax` in C and
+ * Objective-C, and `simd::fmax` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_fmax(simd_float2 x, simd_float2 y);
+/*! @abstract Do not call this function; instead use `fmax` in C and
+ * Objective-C, and `simd::fmax` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_fmax(simd_float3 x, simd_float3 y);
+/*! @abstract Do not call this function; instead use `fmax` in C and
+ * Objective-C, and `simd::fmax` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_fmax(simd_float4 x, simd_float4 y);
+/*! @abstract Do not call this function; instead use `fmax` in C and
+ * Objective-C, and `simd::fmax` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_fmax(simd_float8 x, simd_float8 y);
+/*! @abstract Do not call this function; instead use `fmax` in C and
+ * Objective-C, and `simd::fmax` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_fmax(simd_float16 x, simd_float16 y);
+/*! @abstract Do not call this function; instead use `fmax` in C and
+ * Objective-C, and `simd::fmax` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_fmax(simd_double2 x, simd_double2 y);
+/*! @abstract Do not call this function; instead use `fmax` in C and
+ * Objective-C, and `simd::fmax` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_fmax(simd_double3 x, simd_double3 y);
+/*! @abstract Do not call this function; instead use `fmax` in C and
+ * Objective-C, and `simd::fmax` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_fmax(simd_double4 x, simd_double4 y);
+/*! @abstract Do not call this function; instead use `fmax` in C and
+ * Objective-C, and `simd::fmax` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_fmax(simd_double8 x, simd_double8 y);
+
+/*! @abstract Do not call this function; instead use `fmin` in C and
+ * Objective-C, and `simd::fmin` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_fmin(simd_float2 x, simd_float2 y);
+/*! @abstract Do not call this function; instead use `fmin` in C and
+ * Objective-C, and `simd::fmin` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_fmin(simd_float3 x, simd_float3 y);
+/*! @abstract Do not call this function; instead use `fmin` in C and
+ * Objective-C, and `simd::fmin` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_fmin(simd_float4 x, simd_float4 y);
+/*! @abstract Do not call this function; instead use `fmin` in C and
+ * Objective-C, and `simd::fmin` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_fmin(simd_float8 x, simd_float8 y);
+/*! @abstract Do not call this function; instead use `fmin` in C and
+ * Objective-C, and `simd::fmin` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_fmin(simd_float16 x, simd_float16 y);
+/*! @abstract Do not call this function; instead use `fmin` in C and
+ * Objective-C, and `simd::fmin` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_fmin(simd_double2 x, simd_double2 y);
+/*! @abstract Do not call this function; instead use `fmin` in C and
+ * Objective-C, and `simd::fmin` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_fmin(simd_double3 x, simd_double3 y);
+/*! @abstract Do not call this function; instead use `fmin` in C and
+ * Objective-C, and `simd::fmin` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_fmin(simd_double4 x, simd_double4 y);
+/*! @abstract Do not call this function; instead use `fmin` in C and
+ * Objective-C, and `simd::fmin` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_fmin(simd_double8 x, simd_double8 y);
+
+
+/*! @abstract Do not call this function; instead use `fma` in C and Objective-C,
+ * and `simd::fma` in C++. */
+static inline SIMD_CFUNC simd_float2 __tg_fma(simd_float2 x, simd_float2 y, simd_float2 z);
+/*! @abstract Do not call this function; instead use `fma` in C and Objective-C,
+ * and `simd::fma` in C++. */
+static inline SIMD_CFUNC simd_float3 __tg_fma(simd_float3 x, simd_float3 y, simd_float3 z);
+/*! @abstract Do not call this function; instead use `fma` in C and Objective-C,
+ * and `simd::fma` in C++. */
+static inline SIMD_CFUNC simd_float4 __tg_fma(simd_float4 x, simd_float4 y, simd_float4 z);
+/*! @abstract Do not call this function; instead use `fma` in C and Objective-C,
+ * and `simd::fma` in C++. */
+static inline SIMD_CFUNC simd_float8 __tg_fma(simd_float8 x, simd_float8 y, simd_float8 z);
+/*! @abstract Do not call this function; instead use `fma` in C and Objective-C,
+ * and `simd::fma` in C++. */
+static inline SIMD_CFUNC simd_float16 __tg_fma(simd_float16 x, simd_float16 y, simd_float16 z);
+/*! @abstract Do not call this function; instead use `fma` in C and Objective-C,
+ * and `simd::fma` in C++. */
+static inline SIMD_CFUNC simd_double2 __tg_fma(simd_double2 x, simd_double2 y, simd_double2 z);
+/*! @abstract Do not call this function; instead use `fma` in C and Objective-C,
+ * and `simd::fma` in C++. */
+static inline SIMD_CFUNC simd_double3 __tg_fma(simd_double3 x, simd_double3 y, simd_double3 z);
+/*! @abstract Do not call this function; instead use `fma` in C and Objective-C,
+ * and `simd::fma` in C++. */
+static inline SIMD_CFUNC simd_double4 __tg_fma(simd_double4 x, simd_double4 y, simd_double4 z);
+/*! @abstract Do not call this function; instead use `fma` in C and Objective-C,
+ * and `simd::fma` in C++. */
+static inline SIMD_CFUNC simd_double8 __tg_fma(simd_double8 x, simd_double8 y, simd_double8 z);
+
+/*! @abstract Computes x*y + z by the most efficient means available;
+ * either a fused multiply add or separate multiply and add instructions. */
+static inline SIMD_CFUNC float simd_muladd(float x, float y, float z);
+/*! @abstract Computes x*y + z by the most efficient means available;
+ * either a fused multiply add or separate multiply and add instructions. */
+static inline SIMD_CFUNC simd_float2 simd_muladd(simd_float2 x, simd_float2 y, simd_float2 z);
+/*! @abstract Computes x*y + z by the most efficient means available;
+ * either a fused multiply add or separate multiply and add instructions. */
+static inline SIMD_CFUNC simd_float3 simd_muladd(simd_float3 x, simd_float3 y, simd_float3 z);
+/*! @abstract Computes x*y + z by the most efficient means available;
+ * either a fused multiply add or separate multiply and add instructions. */
+static inline SIMD_CFUNC simd_float4 simd_muladd(simd_float4 x, simd_float4 y, simd_float4 z);
+/*! @abstract Computes x*y + z by the most efficient means available;
+ * either a fused multiply add or separate multiply and add instructions. */
+static inline SIMD_CFUNC simd_float8 simd_muladd(simd_float8 x, simd_float8 y, simd_float8 z);
+/*! @abstract Computes x*y + z by the most efficient means available;
+ * either a fused multiply add or separate multiply and add instructions. */
+static inline SIMD_CFUNC simd_float16 simd_muladd(simd_float16 x, simd_float16 y, simd_float16 z);
+/*! @abstract Computes x*y + z by the most efficient means available;
+ * either a fused multiply add or separate multiply and add instructions. */
+static inline SIMD_CFUNC double simd_muladd(double x, double y, double z);
+/*! @abstract Computes x*y + z by the most efficient means available;
+ * either a fused multiply add or separate multiply and add instructions. */
+static inline SIMD_CFUNC simd_double2 simd_muladd(simd_double2 x, simd_double2 y, simd_double2 z);
+/*! @abstract Computes x*y + z by the most efficient means available;
+ * either a fused multiply add or separate multiply and add instructions. */
+static inline SIMD_CFUNC simd_double3 simd_muladd(simd_double3 x, simd_double3 y, simd_double3 z);
+/*! @abstract Computes x*y + z by the most efficient means available;
+ * either a fused multiply add or separate multiply and add instructions. */
+static inline SIMD_CFUNC simd_double4 simd_muladd(simd_double4 x, simd_double4 y, simd_double4 z);
+/*! @abstract Computes x*y + z by the most efficient means available;
+ * either a fused multiply add or separate multiply and add instructions. */
+static inline SIMD_CFUNC simd_double8 simd_muladd(simd_double8 x, simd_double8 y, simd_double8 z);
+
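+/* Illustrative sketch, not part of the header: simd_muladd is the natural
+ * building block for Horner-style polynomial evaluation, since each step
+ * contracts to a single fused multiply-add on targets where that is the
+ * fastest option, and to a multiply plus an add everywhere else.
+ *
+ *     simd_float4 eval_cubic(simd_float4 x) {
+ *       const simd_float4 c3 = { 0.5f, 0.5f, 0.5f, 0.5f };
+ *       const simd_float4 c2 = { -1.0f, -1.0f, -1.0f, -1.0f };
+ *       const simd_float4 c1 = { 2.0f, 2.0f, 2.0f, 2.0f };
+ *       const simd_float4 c0 = { 0.25f, 0.25f, 0.25f, 0.25f };
+ *       simd_float4 r = simd_muladd(c3, x, c2);  // c3*x + c2
+ *       r = simd_muladd(r, x, c1);               // (c3*x + c2)*x + c1
+ *       return simd_muladd(r, x, c0);            // ((c3*x + c2)*x + c1)*x + c0
+ *     }
+ */
+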
+#ifdef __cplusplus
+} /* extern "C" */
+
+#include <cmath>
+/*! @abstract Do not call this function directly; use simd::acos instead. */
+static SIMD_CPPFUNC float __tg_acos(float x) { return ::acos(x); }
+/*! @abstract Do not call this function directly; use simd::acos instead. */
+static SIMD_CPPFUNC double __tg_acos(double x) { return ::acos(x); }
+/*! @abstract Do not call this function directly; use simd::asin instead. */
+static SIMD_CPPFUNC float __tg_asin(float x) { return ::asin(x); }
+/*! @abstract Do not call this function directly; use simd::asin instead. */
+static SIMD_CPPFUNC double __tg_asin(double x) { return ::asin(x); }
+/*! @abstract Do not call this function directly; use simd::atan instead. */
+static SIMD_CPPFUNC float __tg_atan(float x) { return ::atan(x); }
+/*! @abstract Do not call this function directly; use simd::atan instead. */
+static SIMD_CPPFUNC double __tg_atan(double x) { return ::atan(x); }
+/*! @abstract Do not call this function directly; use simd::cos instead. */
+static SIMD_CPPFUNC float __tg_cos(float x) { return ::cos(x); }
+/*! @abstract Do not call this function directly; use simd::cos instead. */
+static SIMD_CPPFUNC double __tg_cos(double x) { return ::cos(x); }
+/*! @abstract Do not call this function directly; use simd::sin instead. */
+static SIMD_CPPFUNC float __tg_sin(float x) { return ::sin(x); }
+/*! @abstract Do not call this function directly; use simd::sin instead. */
+static SIMD_CPPFUNC double __tg_sin(double x) { return ::sin(x); }
+/*! @abstract Do not call this function directly; use simd::tan instead. */
+static SIMD_CPPFUNC float __tg_tan(float x) { return ::tan(x); }
+/*! @abstract Do not call this function directly; use simd::tan instead. */
+static SIMD_CPPFUNC double __tg_tan(double x) { return ::tan(x); }
+/*! @abstract Do not call this function directly; use simd::cospi instead. */
+static SIMD_CPPFUNC float __tg_cospi(float x) { return ::__cospif(x); }
+/*! @abstract Do not call this function directly; use simd::cospi instead. */
+static SIMD_CPPFUNC double __tg_cospi(double x) { return ::__cospi(x); }
+/*! @abstract Do not call this function directly; use simd::sinpi instead. */
+static SIMD_CPPFUNC float __tg_sinpi(float x) { return ::__sinpif(x); }
+/*! @abstract Do not call this function directly; use simd::sinpi instead. */
+static SIMD_CPPFUNC double __tg_sinpi(double x) { return ::__sinpi(x); }
+/*! @abstract Do not call this function directly; use simd::tanpi instead. */
+static SIMD_CPPFUNC float __tg_tanpi(float x) { return ::__tanpif(x); }
+/*! @abstract Do not call this function directly; use simd::tanpi instead. */
+static SIMD_CPPFUNC double __tg_tanpi(double x) { return ::__tanpi(x); }
+/*! @abstract Do not call this function directly; use simd::acosh instead. */
+static SIMD_CPPFUNC float __tg_acosh(float x) { return ::acosh(x); }
+/*! @abstract Do not call this function directly; use simd::acosh instead. */
+static SIMD_CPPFUNC double __tg_acosh(double x) { return ::acosh(x); }
+/*! @abstract Do not call this function directly; use simd::asinh instead. */
+static SIMD_CPPFUNC float __tg_asinh(float x) { return ::asinh(x); }
+/*! @abstract Do not call this function directly; use simd::asinh instead. */
+static SIMD_CPPFUNC double __tg_asinh(double x) { return ::asinh(x); }
+/*! @abstract Do not call this function directly; use simd::atanh instead. */
+static SIMD_CPPFUNC float __tg_atanh(float x) { return ::atanh(x); }
+/*! @abstract Do not call this function directly; use simd::atanh instead. */
+static SIMD_CPPFUNC double __tg_atanh(double x) { return ::atanh(x); }
+/*! @abstract Do not call this function directly; use simd::cosh instead. */
+static SIMD_CPPFUNC float __tg_cosh(float x) { return ::cosh(x); }
+/*! @abstract Do not call this function directly; use simd::cosh instead. */
+static SIMD_CPPFUNC double __tg_cosh(double x) { return ::cosh(x); }
+/*! @abstract Do not call this function directly; use simd::sinh instead. */
+static SIMD_CPPFUNC float __tg_sinh(float x) { return ::sinh(x); }
+/*! @abstract Do not call this function directly; use simd::sinh instead. */
+static SIMD_CPPFUNC double __tg_sinh(double x) { return ::sinh(x); }
+/*! @abstract Do not call this function directly; use simd::tanh instead. */
+static SIMD_CPPFUNC float __tg_tanh(float x) { return ::tanh(x); }
+/*! @abstract Do not call this function directly; use simd::tanh instead. */
+static SIMD_CPPFUNC double __tg_tanh(double x) { return ::tanh(x); }
+/*! @abstract Do not call this function directly; use simd::exp instead. */
+static SIMD_CPPFUNC float __tg_exp(float x) { return ::exp(x); }
+/*! @abstract Do not call this function directly; use simd::exp instead. */
+static SIMD_CPPFUNC double __tg_exp(double x) { return ::exp(x); }
+/*! @abstract Do not call this function directly; use simd::exp2 instead. */
+static SIMD_CPPFUNC float __tg_exp2(float x) { return ::exp2(x); }
+/*! @abstract Do not call this function directly; use simd::exp2 instead. */
+static SIMD_CPPFUNC double __tg_exp2(double x) { return ::exp2(x); }
+/*! @abstract Do not call this function directly; use simd::exp10 instead. */
+static SIMD_CPPFUNC float __tg_exp10(float x) { return ::__exp10f(x); }
+/*! @abstract Do not call this function directly; use simd::exp10 instead. */
+static SIMD_CPPFUNC double __tg_exp10(double x) { return ::__exp10(x); }
+/*! @abstract Do not call this function directly; use simd::expm1 instead. */
+static SIMD_CPPFUNC float __tg_expm1(float x) { return ::expm1(x); }
+/*! @abstract Do not call this function directly; use simd::expm1 instead. */
+static SIMD_CPPFUNC double __tg_expm1(double x) { return ::expm1(x); }
+/*! @abstract Do not call this function directly; use simd::log instead. */
+static SIMD_CPPFUNC float __tg_log(float x) { return ::log(x); }
+/*! @abstract Do not call this function directly; use simd::log instead. */
+static SIMD_CPPFUNC double __tg_log(double x) { return ::log(x); }
+/*! @abstract Do not call this function directly; use simd::log2 instead. */
+static SIMD_CPPFUNC float __tg_log2(float x) { return ::log2(x); }
+/*! @abstract Do not call this function directly; use simd::log2 instead. */
+static SIMD_CPPFUNC double __tg_log2(double x) { return ::log2(x); }
+/*! @abstract Do not call this function directly; use simd::log10 instead. */
+static SIMD_CPPFUNC float __tg_log10(float x) { return ::log10(x); }
+/*! @abstract Do not call this function directly; use simd::log10 instead. */
+static SIMD_CPPFUNC double __tg_log10(double x) { return ::log10(x); }
+/*! @abstract Do not call this function directly; use simd::log1p instead. */
+static SIMD_CPPFUNC float __tg_log1p(float x) { return ::log1p(x); }
+/*! @abstract Do not call this function directly; use simd::log1p instead. */
+static SIMD_CPPFUNC double __tg_log1p(double x) { return ::log1p(x); }
+/*! @abstract Do not call this function directly; use simd::fabs instead. */
+static SIMD_CPPFUNC float __tg_fabs(float x) { return ::fabs(x); }
+/*! @abstract Do not call this function directly; use simd::fabs instead. */
+static SIMD_CPPFUNC double __tg_fabs(double x) { return ::fabs(x); }
+/*! @abstract Do not call this function directly; use simd::cbrt instead. */
+static SIMD_CPPFUNC float __tg_cbrt(float x) { return ::cbrt(x); }
+/*! @abstract Do not call this function directly; use simd::cbrt instead. */
+static SIMD_CPPFUNC double __tg_cbrt(double x) { return ::cbrt(x); }
+/*! @abstract Do not call this function directly; use simd::sqrt instead. */
+static SIMD_CPPFUNC float __tg_sqrt(float x) { return ::sqrt(x); }
+/*! @abstract Do not call this function directly; use simd::sqrt instead. */
+static SIMD_CPPFUNC double __tg_sqrt(double x) { return ::sqrt(x); }
+/*! @abstract Do not call this function directly; use simd::erf instead. */
+static SIMD_CPPFUNC float __tg_erf(float x) { return ::erf(x); }
+/*! @abstract Do not call this function directly; use simd::erf instead. */
+static SIMD_CPPFUNC double __tg_erf(double x) { return ::erf(x); }
+/*! @abstract Do not call this function directly; use simd::erfc instead. */
+static SIMD_CPPFUNC float __tg_erfc(float x) { return ::erfc(x); }
+/*! @abstract Do not call this function directly; use simd::erfc instead. */
+static SIMD_CPPFUNC double __tg_erfc(double x) { return ::erfc(x); }
+/*! @abstract Do not call this function directly; use simd::tgamma instead. */
+static SIMD_CPPFUNC float __tg_tgamma(float x) { return ::tgamma(x); }
+/*! @abstract Do not call this function directly; use simd::tgamma instead. */
+static SIMD_CPPFUNC double __tg_tgamma(double x) { return ::tgamma(x); }
+/*! @abstract Do not call this function directly; use simd::ceil instead. */
+static SIMD_CPPFUNC float __tg_ceil(float x) { return ::ceil(x); }
+/*! @abstract Do not call this function directly; use simd::ceil instead. */
+static SIMD_CPPFUNC double __tg_ceil(double x) { return ::ceil(x); }
+/*! @abstract Do not call this function directly; use simd::floor instead. */
+static SIMD_CPPFUNC float __tg_floor(float x) { return ::floor(x); }
+/*! @abstract Do not call this function directly; use simd::floor instead. */
+static SIMD_CPPFUNC double __tg_floor(double x) { return ::floor(x); }
+/*! @abstract Do not call this function directly; use simd::rint instead. */
+static SIMD_CPPFUNC float __tg_rint(float x) { return ::rint(x); }
+/*! @abstract Do not call this function directly; use simd::rint instead. */
+static SIMD_CPPFUNC double __tg_rint(double x) { return ::rint(x); }
+/*! @abstract Do not call this function directly; use simd::round instead. */
+static SIMD_CPPFUNC float __tg_round(float x) { return ::round(x); }
+/*! @abstract Do not call this function directly; use simd::round instead. */
+static SIMD_CPPFUNC double __tg_round(double x) { return ::round(x); }
+/*! @abstract Do not call this function directly; use simd::trunc instead. */
+static SIMD_CPPFUNC float __tg_trunc(float x) { return ::trunc(x); }
+/*! @abstract Do not call this function directly; use simd::trunc instead. */
+static SIMD_CPPFUNC double __tg_trunc(double x) { return ::trunc(x); }
+/*! @abstract Do not call this function directly; use simd::atan2 instead. */
+static SIMD_CPPFUNC float __tg_atan2(float y, float x) { return ::atan2(y, x); }
+/*! @abstract Do not call this function directly; use simd::atan2 instead. */
+static SIMD_CPPFUNC double __tg_atan2(double y, double x) { return ::atan2(y, x); }
+/*! @abstract Do not call this function directly; use simd::hypot instead. */
+static SIMD_CPPFUNC float __tg_hypot(float x, float y) { return ::hypot(x, y); }
+/*! @abstract Do not call this function directly; use simd::hypot instead. */
+static SIMD_CPPFUNC double __tg_hypot(double x, double y) { return ::hypot(x, y); }
+/*! @abstract Do not call this function directly; use simd::pow instead. */
+static SIMD_CPPFUNC float __tg_pow(float x, float y) { return ::pow(x, y); }
+/*! @abstract Do not call this function directly; use simd::pow instead. */
+static SIMD_CPPFUNC double __tg_pow(double x, double y) { return ::pow(x, y); }
+/*! @abstract Do not call this function directly; use simd::fmod instead. */
+static SIMD_CPPFUNC float __tg_fmod(float x, float y) { return ::fmod(x, y); }
+/*! @abstract Do not call this function directly; use simd::fmod instead. */
+static SIMD_CPPFUNC double __tg_fmod(double x, double y) { return ::fmod(x, y); }
+/*! @abstract Do not call this function directly; use simd::remainder
+ * instead. */
+static SIMD_CPPFUNC float __tg_remainder(float x, float y) { return ::remainder(x, y); }
+/*! @abstract Do not call this function directly; use simd::remainder
+ * instead. */
+static SIMD_CPPFUNC double __tg_remainder(double x, double y) { return ::remainder(x, y); }
+/*! @abstract Do not call this function directly; use simd::copysign
+ * instead. */
+static SIMD_CPPFUNC float __tg_copysign(float x, float y) { return ::copysign(x, y); }
+/*! @abstract Do not call this function directly; use simd::copysign
+ * instead. */
+static SIMD_CPPFUNC double __tg_copysign(double x, double y) { return ::copysign(x, y); }
+/*! @abstract Do not call this function directly; use simd::nextafter
+ * instead. */
+static SIMD_CPPFUNC float __tg_nextafter(float x, float y) { return ::nextafter(x, y); }
+/*! @abstract Do not call this function directly; use simd::nextafter
+ * instead. */
+static SIMD_CPPFUNC double __tg_nextafter(double x, double y) { return ::nextafter(x, y); }
+/*! @abstract Do not call this function directly; use simd::fdim instead. */
+static SIMD_CPPFUNC float __tg_fdim(float x, float y) { return ::fdim(x, y); }
+/*! @abstract Do not call this function directly; use simd::fdim instead. */
+static SIMD_CPPFUNC double __tg_fdim(double x, double y) { return ::fdim(x, y); }
+/*! @abstract Do not call this function directly; use simd::fmax instead. */
+static SIMD_CPPFUNC float __tg_fmax(float x, float y) { return ::fmax(x, y); }
+/*! @abstract Do not call this function directly; use simd::fmax instead. */
+static SIMD_CPPFUNC double __tg_fmax(double x, double y) { return ::fmax(x, y); }
+/*! @abstract Do not call this function directly; use simd::fmin instead. */
+static SIMD_CPPFUNC float __tg_fmin(float x, float y) { return ::fmin(x, y); }
+/*! @abstract Do not call this function directly; use simd::fmin instead. */
+static SIMD_CPPFUNC double __tg_fmin(double x, double y) { return ::fmin(x, y); }
+/*! @abstract Do not call this function directly; use simd::fma instead. */
+static SIMD_CPPFUNC float __tg_fma(float x, float y, float z) { return ::fma(x, y, z); }
+/*! @abstract Do not call this function directly; use simd::fma instead. */
+static SIMD_CPPFUNC double __tg_fma(double x, double y, double z) { return ::fma(x, y, z); }
+
+namespace simd {
+/*! @abstract Generalizes the <cmath> function acos to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN acos(fptypeN x) { return ::__tg_acos(x); }
+
+/*! @abstract Generalizes the <cmath> function asin to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN asin(fptypeN x) { return ::__tg_asin(x); }
+
+/*! @abstract Generalizes the <cmath> function atan to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN atan(fptypeN x) { return ::__tg_atan(x); }
+
+/*! @abstract Generalizes the <cmath> function cos to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN cos(fptypeN x) { return ::__tg_cos(x); }
+
+/*! @abstract Generalizes the <cmath> function sin to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN sin(fptypeN x) { return ::__tg_sin(x); }
+
+/*! @abstract Generalizes the <cmath> function tan to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN tan(fptypeN x) { return ::__tg_tan(x); }
+
+#if SIMD_LIBRARY_VERSION >= 1
+/*! @abstract Generalizes the <cmath> function cospi to operate on vectors
+ * of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN cospi(fptypeN x) { return ::__tg_cospi(x); }
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 1
+/*! @abstract Generalizes the <cmath> function sinpi to operate on vectors
+ * of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN sinpi(fptypeN x) { return ::__tg_sinpi(x); }
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 1
+/*! @abstract Generalizes the <cmath> function tanpi to operate on vectors
+ * of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN tanpi(fptypeN x) { return ::__tg_tanpi(x); }
+#endif
+
+/*! @abstract Generalizes the <cmath> function acosh to operate on vectors
+ * of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN acosh(fptypeN x) { return ::__tg_acosh(x); }
+
+/*! @abstract Generalizes the <cmath> function asinh to operate on vectors
+ * of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN asinh(fptypeN x) { return ::__tg_asinh(x); }
+
+/*! @abstract Generalizes the <cmath> function atanh to operate on vectors
+ * of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN atanh(fptypeN x) { return ::__tg_atanh(x); }
+
+/*! @abstract Generalizes the <cmath> function cosh to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN cosh(fptypeN x) { return ::__tg_cosh(x); }
+
+/*! @abstract Generalizes the <cmath> function sinh to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN sinh(fptypeN x) { return ::__tg_sinh(x); }
+
+/*! @abstract Generalizes the <cmath> function tanh to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN tanh(fptypeN x) { return ::__tg_tanh(x); }
+
+/*! @abstract Generalizes the <cmath> function exp to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN exp(fptypeN x) { return ::__tg_exp(x); }
+
+/*! @abstract Generalizes the <cmath> function exp2 to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN exp2(fptypeN x) { return ::__tg_exp2(x); }
+
+#if SIMD_LIBRARY_VERSION >= 1
+/*! @abstract Generalizes the <cmath> function exp10 to operate on vectors
+ * of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN exp10(fptypeN x) { return ::__tg_exp10(x); }
+#endif
+
+/*! @abstract Generalizes the <cmath> function expm1 to operate on vectors
+ * of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN expm1(fptypeN x) { return ::__tg_expm1(x); }
+
+/*! @abstract Generalizes the <cmath> function log to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN log(fptypeN x) { return ::__tg_log(x); }
+
+/*! @abstract Generalizes the <cmath> function log2 to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN log2(fptypeN x) { return ::__tg_log2(x); }
+
+/*! @abstract Generalizes the <cmath> function log10 to operate on vectors
+ * of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN log10(fptypeN x) { return ::__tg_log10(x); }
+
+/*! @abstract Generalizes the <cmath> function log1p to operate on vectors
+ * of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN log1p(fptypeN x) { return ::__tg_log1p(x); }
+
+/*! @abstract Generalizes the <cmath> function fabs to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN fabs(fptypeN x) { return ::__tg_fabs(x); }
+
+/*! @abstract Generalizes the <cmath> function cbrt to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN cbrt(fptypeN x) { return ::__tg_cbrt(x); }
+
+/*! @abstract Generalizes the <cmath> function sqrt to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN sqrt(fptypeN x) { return ::__tg_sqrt(x); }
+
+/*! @abstract Generalizes the <cmath> function erf to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN erf(fptypeN x) { return ::__tg_erf(x); }
+
+/*! @abstract Generalizes the <cmath> function erfc to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN erfc(fptypeN x) { return ::__tg_erfc(x); }
+
+/*! @abstract Generalizes the <cmath> function tgamma to operate on vectors
+ * of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN tgamma(fptypeN x) { return ::__tg_tgamma(x); }
+
+/*! @abstract Generalizes the <cmath> function ceil to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN ceil(fptypeN x) { return ::__tg_ceil(x); }
+
+/*! @abstract Generalizes the <cmath> function floor to operate on vectors
+ * of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN floor(fptypeN x) { return ::__tg_floor(x); }
+
+/*! @abstract Generalizes the <cmath> function rint to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN rint(fptypeN x) { return ::__tg_rint(x); }
+
+/*! @abstract Generalizes the <cmath> function round to operate on vectors
+ * of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN round(fptypeN x) { return ::__tg_round(x); }
+
+/*! @abstract Generalizes the <cmath> function trunc to operate on vectors
+ * of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN trunc(fptypeN x) { return ::__tg_trunc(x); }
+
+/*! @abstract Generalizes the <cmath> function atan2 to operate on vectors
+ * of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN atan2(fptypeN y, fptypeN x) { return ::__tg_atan2(y, x); }
+
+/*! @abstract Generalizes the <cmath> function hypot to operate on vectors
+ * of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN hypot(fptypeN x, fptypeN y) { return ::__tg_hypot(x, y); }
+
+/*! @abstract Generalizes the <cmath> function pow to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN pow(fptypeN x, fptypeN y) { return ::__tg_pow(x, y); }
+
+/*! @abstract Generalizes the <cmath> function fmod to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN fmod(fptypeN x, fptypeN y) { return ::__tg_fmod(x, y); }
+
+/*! @abstract Generalizes the <cmath> function remainder to operate on
+ * vectors of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN remainder(fptypeN x, fptypeN y) { return ::__tg_remainder(x, y); }
+
+/*! @abstract Generalizes the <cmath> function copysign to operate on
+ * vectors of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN copysign(fptypeN x, fptypeN y) { return ::__tg_copysign(x, y); }
+
+/*! @abstract Generalizes the <cmath> function nextafter to operate on
+ * vectors of floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN nextafter(fptypeN x, fptypeN y) { return ::__tg_nextafter(x, y); }
+
+/*! @abstract Generalizes the <cmath> function fdim to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN fdim(fptypeN x, fptypeN y) { return ::__tg_fdim(x, y); }
+
+/*! @abstract Generalizes the <cmath> function fmax to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN fmax(fptypeN x, fptypeN y) { return ::__tg_fmax(x, y); }
+
+/*! @abstract Generalizes the <cmath> function fmin to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN fmin(fptypeN x, fptypeN y) { return ::__tg_fmin(x, y); }
+
+/*! @abstract Generalizes the <cmath> function fma to operate on vectors of
+ * floats and doubles. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN fma(fptypeN x, fptypeN y, fptypeN z) { return ::__tg_fma(x, y, z); }
+
+/*! @abstract Computes x*y + z by the most efficient means available; either
+ * a fused multiply add or separate multiply and add. */
+ template <typename fptypeN>
+ static SIMD_CPPFUNC fptypeN muladd(fptypeN x, fptypeN y, fptypeN z) { return ::simd_muladd(x, y, z); }
+};
+
+extern "C" {
+#else
+#include <tgmath.h>
+/* In C and Objective-C, we need some infrastructure to piggyback on tgmath.h. */
+static SIMD_OVERLOAD simd_float2 __tg_promote(simd_float2);
+static SIMD_OVERLOAD simd_float3 __tg_promote(simd_float3);
+static SIMD_OVERLOAD simd_float4 __tg_promote(simd_float4);
+static SIMD_OVERLOAD simd_float8 __tg_promote(simd_float8);
+static SIMD_OVERLOAD simd_float16 __tg_promote(simd_float16);
+static SIMD_OVERLOAD simd_double2 __tg_promote(simd_double2);
+static SIMD_OVERLOAD simd_double3 __tg_promote(simd_double3);
+static SIMD_OVERLOAD simd_double4 __tg_promote(simd_double4);
+static SIMD_OVERLOAD simd_double8 __tg_promote(simd_double8);
+
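+/* Illustrative note, not part of the header: the __tg_promote overloads
+ * above are never called at run time. They exist so that a tgmath-style
+ * macro can recover the promoted type of its argument inside __typeof__
+ * (via the __tg_promote1 helper that Darwin's <tgmath.h> conventions
+ * assume) and cast through it, e.g.
+ *
+ *     simd_float4 v = { 0.25f, 0.5f, 0.75f, 1.0f };
+ *     cospi(v);   // expands to __tg_cospi((simd_float4)(v)), so overload
+ *                 // resolution happens entirely at compile time
+ */
+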
+/* Apple extensions to <math.h>, added in macOS 10.9 and iOS 7.0 */
+#if __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_9 || \
+ __IPHONE_OS_VERSION_MIN_REQUIRED >= __IPHONE_7_0 || \
+ __DRIVERKIT_VERSION_MIN_REQUIRED >= __DRIVERKIT_19_0
+static inline SIMD_CFUNC float __tg_cospi(float x) { return __cospif(x); }
+static inline SIMD_CFUNC double __tg_cospi(double x) { return __cospi(x); }
+#undef cospi
+/*! @abstract `cospi(x)` computes `cos(pi * x)` without intermediate rounding.
+ *
+ * @discussion Both faster and more accurate than multiplying by `pi` and then
+ * calling `cos`. Defined for `float` and `double` as well as vectors of
+ * floats and doubles as provided by `<simd/simd.h>`. */
+#define cospi(__x) __tg_cospi(__tg_promote1((__x))(__x))
+
+static inline SIMD_CFUNC float __tg_sinpi(float x) { return __sinpif(x); }
+static inline SIMD_CFUNC double __tg_sinpi(double x) { return __sinpi(x); }
+#undef sinpi
+/*! @abstract `sinpi(x)` computes `sin(pi * x)` without intermediate rounding.
+ *
+ * @discussion Both faster and more accurate than multiplying by `pi` and then
+ * calling `sin`. Defined for `float` and `double` as well as vectors
+ * of floats and doubles as provided by `<simd/simd.h>`. */
+#define sinpi(__x) __tg_sinpi(__tg_promote1((__x))(__x))
+
+static inline SIMD_CFUNC float __tg_tanpi(float x) { return __tanpif(x); }
+static inline SIMD_CFUNC double __tg_tanpi(double x) { return __tanpi(x); }
+#undef tanpi
+/*! @abstract `tanpi(x)` computes `tan(pi * x)` without intermediate rounding.
+ *
+ * @discussion Both faster and more accurate than multiplying by `pi` and then
+ * calling `tan`. Defined for `float` and `double` as well as vectors of
+ * floats and doubles as provided by `<simd/simd.h>`. */
+#define tanpi(__x) __tg_tanpi(__tg_promote1((__x))(__x))
+
+static inline SIMD_CFUNC float __tg_exp10(float x) { return __exp10f(x); }
+static inline SIMD_CFUNC double __tg_exp10(double x) { return __exp10(x); }
+#undef exp10
+/*! @abstract `exp10(x)` computes `10**x` more efficiently and accurately
+ * than `pow(10, x)`.
+ *
+ * @discussion Defined for `float` and `double` as well as vectors of floats
+ * and doubles as provided by `<simd/simd.h>`. */
+#define exp10(__x) __tg_exp10(__tg_promote1((__x))(__x))
+#endif
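+
+/* Illustrative sketch, not part of the header: because these macros
+ * dispatch on the promoted type of their argument, the same spelling
+ * covers scalars and vectors alike.
+ *
+ *     double s = cospi(0.5);                     // exactly zero; pi is never
+ *                                                // rounded before the cosine
+ *     simd_float4 v = { 0.0f, 1.0f, 2.0f, 3.0f };
+ *     simd_float4 p = exp10(v);                  // { 1, 10, 100, 1000 }
+ */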
+
+
+#endif /* !__cplusplus */
+
+#pragma mark - fabs implementation
+static inline SIMD_CFUNC simd_float2 __tg_fabs(simd_float2 x) { return simd_bitselect(0.0, x, 0x7fffffff); }
+static inline SIMD_CFUNC simd_float3 __tg_fabs(simd_float3 x) { return simd_bitselect(0.0, x, 0x7fffffff); }
+static inline SIMD_CFUNC simd_float4 __tg_fabs(simd_float4 x) { return simd_bitselect(0.0, x, 0x7fffffff); }
+static inline SIMD_CFUNC simd_float8 __tg_fabs(simd_float8 x) { return simd_bitselect(0.0, x, 0x7fffffff); }
+static inline SIMD_CFUNC simd_float16 __tg_fabs(simd_float16 x) { return simd_bitselect(0.0, x, 0x7fffffff); }
+static inline SIMD_CFUNC simd_double2 __tg_fabs(simd_double2 x) { return simd_bitselect(0.0, x, 0x7fffffffffffffffL); }
+static inline SIMD_CFUNC simd_double3 __tg_fabs(simd_double3 x) { return simd_bitselect(0.0, x, 0x7fffffffffffffffL); }
+static inline SIMD_CFUNC simd_double4 __tg_fabs(simd_double4 x) { return simd_bitselect(0.0, x, 0x7fffffffffffffffL); }
+static inline SIMD_CFUNC simd_double8 __tg_fabs(simd_double8 x) { return simd_bitselect(0.0, x, 0x7fffffffffffffffL); }
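+
+/* The masks above clear only the sign bit: simd_bitselect takes each
+ * result bit from x where the mask bit is set and from 0.0 where it is
+ * clear, so the exponent and significand pass through untouched. A scalar
+ * analogue of the same trick, shown for illustration only:
+ *
+ *     union { float f; uint32_t u; } b = { .f = x };
+ *     b.u &= 0x7fffffff;   // fabsf(x), branch-free and trap-free
+ */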
+
+#pragma mark - fmin, fmax implementation
+static SIMD_CFUNC simd_float2 __tg_fmin(simd_float2 x, simd_float2 y) {
+#if defined __SSE2__
+ return simd_make_float2(__tg_fmin(simd_make_float4_undef(x), simd_make_float4_undef(y)));
+#elif defined __arm64__
+ return vminnm_f32(x, y);
+#elif defined __arm__ && __FINITE_MATH_ONLY__
+ return vmin_f32(x, y);
+#else
+ return simd_bitselect(y, x, (x <= y) | (y != y));
+#endif
+}
+
+static SIMD_CFUNC simd_float3 __tg_fmin(simd_float3 x, simd_float3 y) {
+ return simd_make_float3(__tg_fmin(simd_make_float4_undef(x), simd_make_float4_undef(y)));
+}
+
+static SIMD_CFUNC simd_float4 __tg_fmin(simd_float4 x, simd_float4 y) {
+#if defined __AVX512DQ__ && defined __AVX512VL__ && !__FINITE_MATH_ONLY__
+ return _mm_range_ps(x, y, 4);
+#elif defined __SSE2__ && __FINITE_MATH_ONLY__
+ return _mm_min_ps(x, y);
+#elif defined __SSE2__
+ return simd_bitselect(_mm_min_ps(x, y), x, y != y);
+#elif defined __arm64__
+ return vminnmq_f32(x, y);
+#elif defined __arm__ && __FINITE_MATH_ONLY__
+ return vminq_f32(x, y);
+#else
+ return simd_bitselect(y, x, (x <= y) | (y != y));
+#endif
+}
+
+static SIMD_CFUNC simd_float8 __tg_fmin(simd_float8 x, simd_float8 y) {
+#if defined __AVX512DQ__ && defined __AVX512VL__ && !__FINITE_MATH_ONLY__
+ return _mm256_range_ps(x, y, 4);
+#elif defined __AVX__ && __FINITE_MATH_ONLY__
+ return _mm256_min_ps(x, y);
+#elif defined __AVX__
+ return simd_bitselect(_mm256_min_ps(x, y), x, y != y);
+#else
+ return simd_make_float8(__tg_fmin(x.lo, y.lo), __tg_fmin(x.hi, y.hi));
+#endif
+}
+
+static SIMD_CFUNC simd_float16 __tg_fmin(simd_float16 x, simd_float16 y) {
+#if defined __x86_64__ && defined __AVX512DQ__ && !__FINITE_MATH_ONLY__
+ return _mm512_range_ps(x, y, 4);
+#elif defined __x86_64__ && defined __AVX512F__ && __FINITE_MATH_ONLY__
+ return _mm512_min_ps(x, y);
+#elif defined __x86_64__ && defined __AVX512F__
+ return simd_bitselect(_mm512_min_ps(x, y), x, y != y);
+#else
+ return simd_make_float16(__tg_fmin(x.lo, y.lo), __tg_fmin(x.hi, y.hi));
+#endif
+}
+
+static SIMD_CFUNC simd_double2 __tg_fmin(simd_double2 x, simd_double2 y) {
+#if defined __AVX512DQ__ && defined __AVX512VL__
+ return _mm_range_pd(x, y, 4);
+#elif defined __SSE2__ && __FINITE_MATH_ONLY__
+ return _mm_min_pd(x, y);
+#elif defined __SSE2__
+ return simd_bitselect(_mm_min_pd(x, y), x, y != y);
+#elif defined __arm64__
+ return vminnmq_f64(x, y);
+#else
+ return simd_bitselect(y, x, (x <= y) | (y != y));
+#endif
+}
+
+static SIMD_CFUNC simd_double3 __tg_fmin(simd_double3 x, simd_double3 y) {
+ return simd_make_double3(__tg_fmin(simd_make_double4_undef(x), simd_make_double4_undef(y)));
+}
+
+static SIMD_CFUNC simd_double4 __tg_fmin(simd_double4 x, simd_double4 y) {
+#if defined __AVX512DQ__ && defined __AVX512VL__
+ return _mm256_range_pd(x, y, 4);
+#elif defined __AVX__ && __FINITE_MATH_ONLY__
+ return _mm256_min_pd(x, y);
+#elif defined __AVX__
+ return simd_bitselect(_mm256_min_pd(x, y), x, y != y);
+#else
+ return simd_make_double4(__tg_fmin(x.lo, y.lo), __tg_fmin(x.hi, y.hi));
+#endif
+}
+
+static SIMD_CFUNC simd_double8 __tg_fmin(simd_double8 x, simd_double8 y) {
+#if defined __x86_64__ && defined __AVX512DQ__
+ return _mm512_range_pd(x, y, 4);
+#elif defined __x86_64__ && defined __AVX512F__ && __FINITE_MATH_ONLY__
+ return _mm512_min_pd(x, y);
+#elif defined __x86_64__ && defined __AVX512F__
+ return simd_bitselect(_mm512_min_pd(x, y), x, y != y);
+#else
+ return simd_make_double8(__tg_fmin(x.lo, y.lo), __tg_fmin(x.hi, y.hi));
+#endif
+}
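+
+/* Note on the x86 branches above: the AVX-512DQ range instruction computes a
+ * minimum with immediate 4 (the fmax variants below use 5) and, as used
+ * here, already provides fmin's NaN handling. The plain SSE/AVX min/max
+ * instructions instead return the second operand whenever an input is NaN,
+ * so outside of finite-math mode the result is blended with
+ * simd_bitselect(..., x, y != y) to return x in lanes where y is NaN. */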
+
+static SIMD_CFUNC simd_float2 __tg_fmax(simd_float2 x, simd_float2 y) {
+#if defined __SSE2__
+ return simd_make_float2(__tg_fmax(simd_make_float4_undef(x), simd_make_float4_undef(y)));
+#elif defined __arm64__
+ return vmaxnm_f32(x, y);
+#elif defined __arm__ && __FINITE_MATH_ONLY__
+ return vmax_f32(x, y);
+#else
+ return simd_bitselect(y, x, (x >= y) | (y != y));
+#endif
+}
+
+static SIMD_CFUNC simd_float3 __tg_fmax(simd_float3 x, simd_float3 y) {
+ return simd_make_float3(__tg_fmax(simd_make_float4_undef(x), simd_make_float4_undef(y)));
+}
+
+static SIMD_CFUNC simd_float4 __tg_fmax(simd_float4 x, simd_float4 y) {
+#if defined __AVX512DQ__ && defined __AVX512VL__ && !__FINITE_MATH_ONLY__
+ return _mm_range_ps(x, y, 5);
+#elif defined __SSE2__ && __FINITE_MATH_ONLY__
+ return _mm_max_ps(x, y);
+#elif defined __SSE2__
+ return simd_bitselect(_mm_max_ps(x, y), x, y != y);
+#elif defined __arm64__
+ return vmaxnmq_f32(x, y);
+#elif defined __arm__ && __FINITE_MATH_ONLY__
+ return vmaxq_f32(x, y);
+#else
+ return simd_bitselect(y, x, (x >= y) | (y != y));
+#endif
+}
+
+static SIMD_CFUNC simd_float8 __tg_fmax(simd_float8 x, simd_float8 y) {
+#if defined __AVX512DQ__ && defined __AVX512VL__ && !__FINITE_MATH_ONLY__
+ return _mm256_range_ps(x, y, 5);
+#elif defined __AVX__ && __FINITE_MATH_ONLY__
+ return _mm256_max_ps(x, y);
+#elif defined __AVX__
+ return simd_bitselect(_mm256_max_ps(x, y), x, y != y);
+#else
+ return simd_make_float8(__tg_fmax(x.lo, y.lo), __tg_fmax(x.hi, y.hi));
+#endif
+}
+
+static SIMD_CFUNC simd_float16 __tg_fmax(simd_float16 x, simd_float16 y) {
+#if defined __x86_64__ && defined __AVX512DQ__ && !__FINITE_MATH_ONLY__
+ return _mm512_range_ps(x, y, 5);
+#elif defined __x86_64__ && defined __AVX512F__ && __FINITE_MATH_ONLY__
+ return _mm512_max_ps(x, y);
+#elif defined __x86_64__ && defined __AVX512F__
+ return simd_bitselect(_mm512_max_ps(x, y), x, y != y);
+#else
+ return simd_make_float16(__tg_fmax(x.lo, y.lo), __tg_fmax(x.hi, y.hi));
+#endif
+}
+
+static SIMD_CFUNC simd_double2 __tg_fmax(simd_double2 x, simd_double2 y) {
+#if defined __AVX512DQ__ && defined __AVX512VL__
+ return _mm_range_pd(x, y, 5);
+#elif defined __SSE2__ && __FINITE_MATH_ONLY__
+ return _mm_max_pd(x, y);
+#elif defined __SSE2__
+ return simd_bitselect(_mm_max_pd(x, y), x, y != y);
+#elif defined __arm64__
+ return vmaxnmq_f64(x, y);
+#else
+ return simd_bitselect(y, x, (x >= y) | (y != y));
+#endif
+}
+
+static SIMD_CFUNC simd_double3 __tg_fmax(simd_double3 x, simd_double3 y) {
+ return simd_make_double3(__tg_fmax(simd_make_double4_undef(x), simd_make_double4_undef(y)));
+}
+
+static SIMD_CFUNC simd_double4 __tg_fmax(simd_double4 x, simd_double4 y) {
+#if defined __AVX512DQ__ && defined __AVX512VL__
+ return _mm256_range_pd(x, y, 5);
+#elif defined __AVX__ && __FINITE_MATH_ONLY__
+ return _mm256_max_pd(x, y);
+#elif defined __AVX__
+ return simd_bitselect(_mm256_max_pd(x, y), x, y != y);
+#else
+ return simd_make_double4(__tg_fmax(x.lo, y.lo), __tg_fmax(x.hi, y.hi));
+#endif
+}
+
+static SIMD_CFUNC simd_double8 __tg_fmax(simd_double8 x, simd_double8 y) {
+#if defined __x86_64__ && defined __AVX512DQ__
+ return _mm512_range_pd(x, y, 5);
+#elif defined __x86_64__ && defined __AVX512F__ && __FINITE_MATH_ONLY__
+ return _mm512_max_pd(x, y);
+#elif defined __x86_64__ && defined __AVX512F__
+ return simd_bitselect(_mm512_max_pd(x, y), x, y != y);
+#else
+ return simd_make_double8(__tg_fmax(x.lo, y.lo), __tg_fmax(x.hi, y.hi));
+#endif
+}
+
+#pragma mark - copysign implementation
+static inline SIMD_CFUNC simd_float2 __tg_copysign(simd_float2 x, simd_float2 y) { return simd_bitselect(y, x, 0x7fffffff); }
+static inline SIMD_CFUNC simd_float3 __tg_copysign(simd_float3 x, simd_float3 y) { return simd_bitselect(y, x, 0x7fffffff); }
+static inline SIMD_CFUNC simd_float4 __tg_copysign(simd_float4 x, simd_float4 y) { return simd_bitselect(y, x, 0x7fffffff); }
+static inline SIMD_CFUNC simd_float8 __tg_copysign(simd_float8 x, simd_float8 y) { return simd_bitselect(y, x, 0x7fffffff); }
+static inline SIMD_CFUNC simd_float16 __tg_copysign(simd_float16 x, simd_float16 y) { return simd_bitselect(y, x, 0x7fffffff); }
+static inline SIMD_CFUNC simd_double2 __tg_copysign(simd_double2 x, simd_double2 y) { return simd_bitselect(y, x, 0x7fffffffffffffffL); }
+static inline SIMD_CFUNC simd_double3 __tg_copysign(simd_double3 x, simd_double3 y) { return simd_bitselect(y, x, 0x7fffffffffffffffL); }
+static inline SIMD_CFUNC simd_double4 __tg_copysign(simd_double4 x, simd_double4 y) { return simd_bitselect(y, x, 0x7fffffffffffffffL); }
+static inline SIMD_CFUNC simd_double8 __tg_copysign(simd_double8 x, simd_double8 y) { return simd_bitselect(y, x, 0x7fffffffffffffffL); }
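+/* Here the mask keeps the exponent and mantissa bits of x and takes the sign
+ * bit from y, which is exactly copysign(x, y); for example, copysign applied
+ * to {2, -3} and {-1, 1} yields {-2, 3}. */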
+
+#pragma mark - sqrt implementation
+static SIMD_CFUNC simd_float2 __tg_sqrt(simd_float2 x) {
+#if defined __SSE2__
+ return simd_make_float2(__tg_sqrt(simd_make_float4_undef(x)));
+#elif defined __arm64__
+ return vsqrt_f32(x);
+#else
+ return simd_make_float2(sqrt(x.x), sqrt(x.y));
+#endif
+}
+
+static SIMD_CFUNC simd_float3 __tg_sqrt(simd_float3 x) {
+ return simd_make_float3(__tg_sqrt(simd_make_float4_undef(x)));
+}
+
+static SIMD_CFUNC simd_float4 __tg_sqrt(simd_float4 x) {
+#if defined __SSE2__
+ return _mm_sqrt_ps(x);
+#elif defined __arm64__
+ return vsqrtq_f32(x);
+#else
+ return simd_make_float4(__tg_sqrt(x.lo), __tg_sqrt(x.hi));
+#endif
+}
+
+static SIMD_CFUNC simd_float8 __tg_sqrt(simd_float8 x) {
+#if defined __AVX__
+ return _mm256_sqrt_ps(x);
+#else
+ return simd_make_float8(__tg_sqrt(x.lo), __tg_sqrt(x.hi));
+#endif
+}
+
+static SIMD_CFUNC simd_float16 __tg_sqrt(simd_float16 x) {
+#if defined __x86_64__ && defined __AVX512F__
+ return _mm512_sqrt_ps(x);
+#else
+ return simd_make_float16(__tg_sqrt(x.lo), __tg_sqrt(x.hi));
+#endif
+}
+
+static SIMD_CFUNC simd_double2 __tg_sqrt(simd_double2 x) {
+#if defined __SSE2__
+ return _mm_sqrt_pd(x);
+#elif defined __arm64__
+ return vsqrtq_f64(x);
+#else
+ return simd_make_double2(sqrt(x.x), sqrt(x.y));
+#endif
+}
+
+static SIMD_CFUNC simd_double3 __tg_sqrt(simd_double3 x) {
+ return simd_make_double3(__tg_sqrt(simd_make_double4_undef(x)));
+}
+
+static SIMD_CFUNC simd_double4 __tg_sqrt(simd_double4 x) {
+#if defined __AVX__
+ return _mm256_sqrt_pd(x);
+#else
+ return simd_make_double4(__tg_sqrt(x.lo), __tg_sqrt(x.hi));
+#endif
+}
+
+static SIMD_CFUNC simd_double8 __tg_sqrt(simd_double8 x) {
+#if defined __x86_64__ && defined __AVX512F__
+ return _mm512_sqrt_pd(x);
+#else
+ return simd_make_double8(__tg_sqrt(x.lo), __tg_sqrt(x.hi));
+#endif
+}
+
+#pragma mark - ceil, floor, rint, trunc implementation
+static SIMD_CFUNC simd_float2 __tg_ceil(simd_float2 x) {
+#if defined __arm64__
+ return vrndp_f32(x);
+#else
+ return simd_make_float2(__tg_ceil(simd_make_float4_undef(x)));
+#endif
+}
+
+static SIMD_CFUNC simd_float3 __tg_ceil(simd_float3 x) {
+ return simd_make_float3(__tg_ceil(simd_make_float4_undef(x)));
+}
+
+#if defined __arm__ && SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_ceil_f4(simd_float4 x);
+#endif
+
+static SIMD_CFUNC simd_float4 __tg_ceil(simd_float4 x) {
+#if defined __SSE4_1__
+ return _mm_round_ps(x, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+#elif defined __arm64__
+ return vrndpq_f32(x);
+#elif defined __arm__ && SIMD_LIBRARY_VERSION >= 3
+ return _simd_ceil_f4(x);
+#else
+ simd_float4 truncated = __tg_trunc(x);
+ simd_float4 adjust = simd_bitselect((simd_float4)0, 1, truncated < x);
+ return __tg_copysign(truncated + adjust, x);
+#endif
+}
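+
+/* The fallback above truncates toward zero, then adds 1 in lanes where
+ * truncation moved the value down (truncated < x, true only for positive
+ * non-integers). The final copysign restores x's sign bit so that ceil of
+ * -0.5 is -0.0 rather than +0.0. */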
+
+static SIMD_CFUNC simd_float8 __tg_ceil(simd_float8 x) {
+#if defined __AVX__
+ return _mm256_round_ps(x, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+#else
+ return simd_make_float8(__tg_ceil(x.lo), __tg_ceil(x.hi));
+#endif
+}
+
+static SIMD_CFUNC simd_float16 __tg_ceil(simd_float16 x) {
+#if defined __x86_64__ && defined __AVX512F__
+ return _mm512_roundscale_ps(x, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+#else
+ return simd_make_float16(__tg_ceil(x.lo), __tg_ceil(x.hi));
+#endif
+}
+
+#if defined __arm__ && SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_ceil_d2(simd_double2 x);
+#endif
+
+static SIMD_CFUNC simd_double2 __tg_ceil(simd_double2 x) {
+#if defined __SSE4_1__
+ return _mm_round_pd(x, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+#elif defined __arm64__
+ return vrndpq_f64(x);
+#elif defined __arm__ && SIMD_LIBRARY_VERSION >= 3
+ return _simd_ceil_d2(x);
+#else
+ simd_double2 truncated = __tg_trunc(x);
+ simd_double2 adjust = simd_bitselect((simd_double2)0, 1, truncated < x);
+ return __tg_copysign(truncated + adjust, x);
+#endif
+}
+
+static SIMD_CFUNC simd_double3 __tg_ceil(simd_double3 x) {
+ return simd_make_double3(__tg_ceil(simd_make_double4_undef(x)));
+}
+
+static SIMD_CFUNC simd_double4 __tg_ceil(simd_double4 x) {
+#if defined __AVX__
+ return _mm256_round_pd(x, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+#else
+ return simd_make_double4(__tg_ceil(x.lo), __tg_ceil(x.hi));
+#endif
+}
+
+static SIMD_CFUNC simd_double8 __tg_ceil(simd_double8 x) {
+#if defined __x86_64__ && defined __AVX512F__
+ return _mm512_roundscale_pd(x, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+#else
+ return simd_make_double8(__tg_ceil(x.lo), __tg_ceil(x.hi));
+#endif
+}
+
+static SIMD_CFUNC simd_float2 __tg_floor(simd_float2 x) {
+#if defined __arm64__
+ return vrndm_f32(x);
+#else
+ return simd_make_float2(__tg_floor(simd_make_float4_undef(x)));
+#endif
+}
+
+static SIMD_CFUNC simd_float3 __tg_floor(simd_float3 x) {
+ return simd_make_float3(__tg_floor(simd_make_float4_undef(x)));
+}
+
+#if defined __arm__ && SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_floor_f4(simd_float4 x);
+#endif
+
+static SIMD_CFUNC simd_float4 __tg_floor(simd_float4 x) {
+#if defined __SSE4_1__
+ return _mm_round_ps(x, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+#elif defined __arm64__
+ return vrndmq_f32(x);
+#elif defined __arm__ && SIMD_LIBRARY_VERSION >= 3
+ return _simd_floor_f4(x);
+#else
+ simd_float4 truncated = __tg_trunc(x);
+ simd_float4 adjust = simd_bitselect((simd_float4)0, 1, truncated > x);
+ return truncated - adjust;
+#endif
+}
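+
+/* Unlike the ceil fallback, no copysign fix-up is needed here: truncation
+ * already preserves the sign of zero, and the adjustment only fires for
+ * negative non-integers, whose floors are nonzero. */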
+
+static SIMD_CFUNC simd_float8 __tg_floor(simd_float8 x) {
+#if defined __AVX__
+ return _mm256_round_ps(x, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+#else
+ return simd_make_float8(__tg_floor(x.lo), __tg_floor(x.hi));
+#endif
+}
+
+static SIMD_CFUNC simd_float16 __tg_floor(simd_float16 x) {
+#if defined __x86_64__ && defined __AVX512F__
+ return _mm512_roundscale_ps(x, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+#else
+ return simd_make_float16(__tg_floor(x.lo), __tg_floor(x.hi));
+#endif
+}
+
+#if defined __arm__ && SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_floor_d2(simd_double2 x);
+#endif
+
+static SIMD_CFUNC simd_double2 __tg_floor(simd_double2 x) {
+#if defined __SSE4_1__
+ return _mm_round_pd(x, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+#elif defined __arm64__
+ return vrndmq_f64(x);
+#elif defined __arm__ && SIMD_LIBRARY_VERSION >= 3
+ return _simd_floor_d2(x);
+#else
+ simd_double2 truncated = __tg_trunc(x);
+ simd_double2 adjust = simd_bitselect((simd_double2)0, 1, truncated > x);
+ return truncated - adjust;
+#endif
+}
+
+static SIMD_CFUNC simd_double3 __tg_floor(simd_double3 x) {
+ return simd_make_double3(__tg_floor(simd_make_double4_undef(x)));
+}
+
+static SIMD_CFUNC simd_double4 __tg_floor(simd_double4 x) {
+#if defined __AVX__
+ return _mm256_round_pd(x, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+#else
+ return simd_make_double4(__tg_floor(x.lo), __tg_floor(x.hi));
+#endif
+}
+
+static SIMD_CFUNC simd_double8 __tg_floor(simd_double8 x) {
+#if defined __x86_64__ && defined __AVX512F__
+ return _mm512_roundscale_pd(x, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+#else
+ return simd_make_double8(__tg_floor(x.lo), __tg_floor(x.hi));
+#endif
+}
+
+static SIMD_CFUNC simd_float2 __tg_rint(simd_float2 x) {
+#if defined __arm64__
+ return vrndx_f32(x);
+#else
+ return simd_make_float2(__tg_rint(simd_make_float4_undef(x)));
+#endif
+}
+
+static SIMD_CFUNC simd_float3 __tg_rint(simd_float3 x) {
+ return simd_make_float3(__tg_rint(simd_make_float4_undef(x)));
+}
+
+#if defined __arm__ && SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_rint_f4(simd_float4 x);
+#endif
+
+static SIMD_CFUNC simd_float4 __tg_rint(simd_float4 x) {
+#if defined __SSE4_1__
+ return _mm_round_ps(x, _MM_FROUND_RINT);
+#elif defined __arm64__
+ return vrndxq_f32(x);
+#elif defined __arm__ && SIMD_LIBRARY_VERSION >= 3
+ return _simd_rint_f4(x);
+#else
+ simd_float4 magic = __tg_copysign(0x1.0p23, x);
+ simd_int4 x_is_small = __tg_fabs(x) < 0x1.0p23;
+ return simd_bitselect(x, (x + magic) - magic, x_is_small & 0x7fffffff);
+#endif
+}
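+
+/* The fallback uses the classic magic-number trick: when |x| < 0x1.0p23,
+ * (x + copysign(0x1.0p23, x)) - copysign(0x1.0p23, x) rounds x to an integer
+ * in the current rounding mode, because the addition shifts the fraction
+ * bits out of the significand. Masking x_is_small with 0x7fffffff makes
+ * bitselect keep x's own sign bit, so -0 and the signs of results that
+ * round to zero are preserved; larger values are already integral and pass
+ * through unchanged. */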
+
+static SIMD_CFUNC simd_float8 __tg_rint(simd_float8 x) {
+#if defined __AVX__
+ return _mm256_round_ps(x, _MM_FROUND_RINT);
+#else
+ return simd_make_float8(__tg_rint(x.lo), __tg_rint(x.hi));
+#endif
+}
+
+static SIMD_CFUNC simd_float16 __tg_rint(simd_float16 x) {
+#if defined __x86_64__ && defined __AVX512F__
+ return _mm512_roundscale_ps(x, _MM_FROUND_RINT);
+#else
+ return simd_make_float16(__tg_rint(x.lo), __tg_rint(x.hi));
+#endif
+}
+
+#if defined __arm__ && SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_rint_d2(simd_double2 x);
+#endif
+
+static SIMD_CFUNC simd_double2 __tg_rint(simd_double2 x) {
+#if defined __SSE4_1__
+ return _mm_round_pd(x, _MM_FROUND_RINT);
+#elif defined __arm64__
+ return vrndxq_f64(x);
+#elif defined __arm__ && SIMD_LIBRARY_VERSION >= 3
+ return _simd_rint_d2(x);
+#else
+ simd_double2 magic = __tg_copysign(0x1.0p52, x);
+ simd_long2 x_is_small = __tg_fabs(x) < 0x1.0p52;
+ return simd_bitselect(x, (x + magic) - magic, x_is_small & 0x7fffffffffffffff);
+#endif
+}
+
+static SIMD_CFUNC simd_double3 __tg_rint(simd_double3 x) {
+ return simd_make_double3(__tg_rint(simd_make_double4_undef(x)));
+}
+
+static SIMD_CFUNC simd_double4 __tg_rint(simd_double4 x) {
+#if defined __AVX__
+ return _mm256_round_pd(x, _MM_FROUND_RINT);
+#else
+ return simd_make_double4(__tg_rint(x.lo), __tg_rint(x.hi));
+#endif
+}
+
+static SIMD_CFUNC simd_double8 __tg_rint(simd_double8 x) {
+#if defined __x86_64__ && defined __AVX512F__
+ return _mm512_roundscale_pd(x, _MM_FROUND_RINT);
+#else
+ return simd_make_double8(__tg_rint(x.lo), __tg_rint(x.hi));
+#endif
+}
+
+static SIMD_CFUNC simd_float2 __tg_trunc(simd_float2 x) {
+#if defined __arm64__
+ return vrnd_f32(x);
+#else
+ return simd_make_float2(__tg_trunc(simd_make_float4_undef(x)));
+#endif
+}
+
+static SIMD_CFUNC simd_float3 __tg_trunc(simd_float3 x) {
+ return simd_make_float3(__tg_trunc(simd_make_float4_undef(x)));
+}
+
+#if defined __arm__ && SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_trunc_f4(simd_float4 x);
+#endif
+
+static SIMD_CFUNC simd_float4 __tg_trunc(simd_float4 x) {
+#if defined __SSE4_1__
+ return _mm_round_ps(x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+#elif defined __arm64__
+ return vrndq_f32(x);
+#elif defined __arm__ && SIMD_LIBRARY_VERSION >= 3
+ return _simd_trunc_f4(x);
+#else
+ simd_float4 binade = simd_bitselect(0, x, 0x7f800000);
+  simd_int4 mask = (simd_int4)__tg_fmin(-2*binade + 1, -0.);
+ simd_float4 result = simd_bitselect(0, x, mask);
+ return simd_bitselect(x, result, binade < 0x1.0p23);
+#endif
+}
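+
+/* The fallback computes 1 - 2*binade, a value whose bit pattern has ones in
+ * the sign bit, the exponent field, and exactly the mantissa bits that sit
+ * above the binary point for x's binade; reinterpreted as a mask, it keeps
+ * the integer-part bits of x and zeroes the fraction, which is truncation.
+ * The fmin against -0. clamps the mask to the bare sign bit when binade is
+ * zero (|x| < 1), so trunc of -0.5 stays -0.0. Lanes with |x| >= 0x1.0p23
+ * are already integral and pass through via the final bitselect. */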
+
+static SIMD_CFUNC simd_float8 __tg_trunc(simd_float8 x) {
+#if defined __AVX__
+ return _mm256_round_ps(x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+#else
+ return simd_make_float8(__tg_trunc(x.lo), __tg_trunc(x.hi));
+#endif
+}
+
+static SIMD_CFUNC simd_float16 __tg_trunc(simd_float16 x) {
+#if defined __x86_64__ && defined __AVX512F__
+ return _mm512_roundscale_ps(x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+#else
+ return simd_make_float16(__tg_trunc(x.lo), __tg_trunc(x.hi));
+#endif
+}
+
+#if defined __arm__ && SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_trunc_d2(simd_double2 x);
+#endif
+
+static SIMD_CFUNC simd_double2 __tg_trunc(simd_double2 x) {
+#if defined __SSE4_1__
+ return _mm_round_pd(x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+#elif defined __arm64__
+ return vrndq_f64(x);
+#elif defined __arm__ && SIMD_LIBRARY_VERSION >= 3
+ return _simd_trunc_d2(x);
+#else
+ simd_double2 binade = simd_bitselect(0, x, 0x7ff0000000000000);
+  simd_long2 mask = (simd_long2)__tg_fmin(-2*binade + 1, -0.);
+ simd_double2 result = simd_bitselect(0, x, mask);
+ return simd_bitselect(x, result, binade < 0x1.0p52);
+#endif
+}
+
+static SIMD_CFUNC simd_double3 __tg_trunc(simd_double3 x) {
+ return simd_make_double3(__tg_trunc(simd_make_double4_undef(x)));
+}
+
+static SIMD_CFUNC simd_double4 __tg_trunc(simd_double4 x) {
+#if defined __AVX__
+ return _mm256_round_pd(x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+#else
+ return simd_make_double4(__tg_trunc(x.lo), __tg_trunc(x.hi));
+#endif
+}
+
+static SIMD_CFUNC simd_double8 __tg_trunc(simd_double8 x) {
+#if defined __x86_64__ && defined __AVX512F__
+ return _mm512_roundscale_pd(x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+#else
+ return simd_make_double8(__tg_trunc(x.lo), __tg_trunc(x.hi));
+#endif
+}
+
+#pragma mark - sine, cosine implementation
+static inline SIMD_CFUNC simd_float2 __tg_sin(simd_float2 x) {
+ return simd_make_float2(__tg_sin(simd_make_float4(x)));
+}
+
+static inline SIMD_CFUNC simd_float3 __tg_sin(simd_float3 x) {
+ return simd_make_float3(__tg_sin(simd_make_float4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_sin_f4(simd_float4 x);
+static inline SIMD_CFUNC simd_float4 __tg_sin(simd_float4 x) {
+ return _simd_sin_f4(x);
+}
+#elif SIMD_LIBRARY_VERSION == 1
+extern simd_float4 __sin_f4(simd_float4 x);
+static inline SIMD_CFUNC simd_float4 __tg_sin(simd_float4 x) {
+ return __sin_f4(x);
+}
+#else
+static inline SIMD_CFUNC simd_float4 __tg_sin(simd_float4 x) {
+ return simd_make_float4(sin(x.x), sin(x.y), sin(x.z), sin(x.w));
+}
+#endif
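+
+/* The pattern above repeats for each transcendental in this header: with
+ * SIMD_LIBRARY_VERSION >= 3 the 4-float/2-double widths call vector kernels
+ * exported by the system math library (_simd_sin_f4 and friends), version 1
+ * falls back to the older __sin_f4/__sin_d2 exports, and otherwise lanes are
+ * evaluated one at a time with scalar libm. Wider vectors use AVX2 or
+ * AVX-512 kernels when available and split into halves (x.lo, x.hi)
+ * otherwise. Note that the 2- and 3-element wrappers widen with
+ * simd_make_float4 rather than the _undef variant used earlier, presumably
+ * so the padding lanes hold zeros rather than garbage that could be slow or
+ * invalid for argument reduction. */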
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_float8 _simd_sin_f8(simd_float8 x);
+static inline SIMD_CFUNC simd_float8 __tg_sin(simd_float8 x) {
+ return _simd_sin_f8(x);
+}
+#else
+static inline SIMD_CFUNC simd_float8 __tg_sin(simd_float8 x) {
+ return simd_make_float8(__tg_sin(x.lo), __tg_sin(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_float16 _simd_sin_f16(simd_float16 x);
+static inline SIMD_CFUNC simd_float16 __tg_sin(simd_float16 x) {
+ return _simd_sin_f16(x);
+}
+#else
+static inline SIMD_CFUNC simd_float16 __tg_sin(simd_float16 x) {
+ return simd_make_float16(__tg_sin(x.lo), __tg_sin(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_sin_d2(simd_double2 x);
+static inline SIMD_CFUNC simd_double2 __tg_sin(simd_double2 x) {
+ return _simd_sin_d2(x);
+}
+#elif SIMD_LIBRARY_VERSION == 1
+extern simd_double2 __sin_d2(simd_double2 x);
+static inline SIMD_CFUNC simd_double2 __tg_sin(simd_double2 x) {
+ return __sin_d2(x);
+}
+#else
+static inline SIMD_CFUNC simd_double2 __tg_sin(simd_double2 x) {
+ return simd_make_double2(sin(x.x), sin(x.y));
+}
+#endif
+
+static inline SIMD_CFUNC simd_double3 __tg_sin(simd_double3 x) {
+ return simd_make_double3(__tg_sin(simd_make_double4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_double4 _simd_sin_d4(simd_double4 x);
+static inline SIMD_CFUNC simd_double4 __tg_sin(simd_double4 x) {
+ return _simd_sin_d4(x);
+}
+#else
+static inline SIMD_CFUNC simd_double4 __tg_sin(simd_double4 x) {
+ return simd_make_double4(__tg_sin(x.lo), __tg_sin(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_double8 _simd_sin_d8(simd_double8 x);
+static inline SIMD_CFUNC simd_double8 __tg_sin(simd_double8 x) {
+ return _simd_sin_d8(x);
+}
+#else
+static inline SIMD_CFUNC simd_double8 __tg_sin(simd_double8 x) {
+ return simd_make_double8(__tg_sin(x.lo), __tg_sin(x.hi));
+}
+#endif
+
+static inline SIMD_CFUNC simd_float2 __tg_cos(simd_float2 x) {
+ return simd_make_float2(__tg_cos(simd_make_float4(x)));
+}
+
+static inline SIMD_CFUNC simd_float3 __tg_cos(simd_float3 x) {
+ return simd_make_float3(__tg_cos(simd_make_float4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_cos_f4(simd_float4 x);
+static inline SIMD_CFUNC simd_float4 __tg_cos(simd_float4 x) {
+ return _simd_cos_f4(x);
+}
+#elif SIMD_LIBRARY_VERSION == 1
+extern simd_float4 __cos_f4(simd_float4 x);
+static inline SIMD_CFUNC simd_float4 __tg_cos(simd_float4 x) {
+ return __cos_f4(x);
+}
+#else
+static inline SIMD_CFUNC simd_float4 __tg_cos(simd_float4 x) {
+ return simd_make_float4(cos(x.x), cos(x.y), cos(x.z), cos(x.w));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_float8 _simd_cos_f8(simd_float8 x);
+static inline SIMD_CFUNC simd_float8 __tg_cos(simd_float8 x) {
+ return _simd_cos_f8(x);
+}
+#else
+static inline SIMD_CFUNC simd_float8 __tg_cos(simd_float8 x) {
+ return simd_make_float8(__tg_cos(x.lo), __tg_cos(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_float16 _simd_cos_f16(simd_float16 x);
+static inline SIMD_CFUNC simd_float16 __tg_cos(simd_float16 x) {
+ return _simd_cos_f16(x);
+}
+#else
+static inline SIMD_CFUNC simd_float16 __tg_cos(simd_float16 x) {
+ return simd_make_float16(__tg_cos(x.lo), __tg_cos(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_cos_d2(simd_double2 x);
+static inline SIMD_CFUNC simd_double2 __tg_cos(simd_double2 x) {
+ return _simd_cos_d2(x);
+}
+#elif SIMD_LIBRARY_VERSION == 1
+extern simd_double2 __cos_d2(simd_double2 x);
+static inline SIMD_CFUNC simd_double2 __tg_cos(simd_double2 x) {
+ return __cos_d2(x);
+}
+#else
+static inline SIMD_CFUNC simd_double2 __tg_cos(simd_double2 x) {
+ return simd_make_double2(cos(x.x), cos(x.y));
+}
+#endif
+
+static inline SIMD_CFUNC simd_double3 __tg_cos(simd_double3 x) {
+ return simd_make_double3(__tg_cos(simd_make_double4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_double4 _simd_cos_d4(simd_double4 x);
+static inline SIMD_CFUNC simd_double4 __tg_cos(simd_double4 x) {
+ return _simd_cos_d4(x);
+}
+#else
+static inline SIMD_CFUNC simd_double4 __tg_cos(simd_double4 x) {
+ return simd_make_double4(__tg_cos(x.lo), __tg_cos(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_double8 _simd_cos_d8(simd_double8 x);
+static inline SIMD_CFUNC simd_double8 __tg_cos(simd_double8 x) {
+ return _simd_cos_d8(x);
+}
+#else
+static inline SIMD_CFUNC simd_double8 __tg_cos(simd_double8 x) {
+ return simd_make_double8(__tg_cos(x.lo), __tg_cos(x.hi));
+}
+#endif
+
+
+#pragma mark - acos implementation
+static inline SIMD_CFUNC simd_float2 __tg_acos(simd_float2 x) {
+ return simd_make_float2(__tg_acos(simd_make_float4(x)));
+}
+
+static inline SIMD_CFUNC simd_float3 __tg_acos(simd_float3 x) {
+ return simd_make_float3(__tg_acos(simd_make_float4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_acos_f4(simd_float4 x);
+static inline SIMD_CFUNC simd_float4 __tg_acos(simd_float4 x) {
+ return _simd_acos_f4(x);
+}
+#else
+static inline SIMD_CFUNC simd_float4 __tg_acos(simd_float4 x) {
+ return simd_make_float4(acos(x.x), acos(x.y), acos(x.z), acos(x.w));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_float8 _simd_acos_f8(simd_float8 x);
+static inline SIMD_CFUNC simd_float8 __tg_acos(simd_float8 x) {
+ return _simd_acos_f8(x);
+}
+#else
+static inline SIMD_CFUNC simd_float8 __tg_acos(simd_float8 x) {
+ return simd_make_float8(__tg_acos(x.lo), __tg_acos(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_float16 _simd_acos_f16(simd_float16 x);
+static inline SIMD_CFUNC simd_float16 __tg_acos(simd_float16 x) {
+ return _simd_acos_f16(x);
+}
+#else
+static inline SIMD_CFUNC simd_float16 __tg_acos(simd_float16 x) {
+ return simd_make_float16(__tg_acos(x.lo), __tg_acos(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_acos_d2(simd_double2 x);
+static inline SIMD_CFUNC simd_double2 __tg_acos(simd_double2 x) {
+ return _simd_acos_d2(x);
+}
+#else
+static inline SIMD_CFUNC simd_double2 __tg_acos(simd_double2 x) {
+ return simd_make_double2(acos(x.x), acos(x.y));
+}
+#endif
+
+static inline SIMD_CFUNC simd_double3 __tg_acos(simd_double3 x) {
+ return simd_make_double3(__tg_acos(simd_make_double4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_double4 _simd_acos_d4(simd_double4 x);
+static inline SIMD_CFUNC simd_double4 __tg_acos(simd_double4 x) {
+ return _simd_acos_d4(x);
+}
+#else
+static inline SIMD_CFUNC simd_double4 __tg_acos(simd_double4 x) {
+ return simd_make_double4(__tg_acos(x.lo), __tg_acos(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_double8 _simd_acos_d8(simd_double8 x);
+static inline SIMD_CFUNC simd_double8 __tg_acos(simd_double8 x) {
+ return _simd_acos_d8(x);
+}
+#else
+static inline SIMD_CFUNC simd_double8 __tg_acos(simd_double8 x) {
+ return simd_make_double8(__tg_acos(x.lo), __tg_acos(x.hi));
+}
+#endif
+
+#pragma mark - asin implementation
+static inline SIMD_CFUNC simd_float2 __tg_asin(simd_float2 x) {
+ return simd_make_float2(__tg_asin(simd_make_float4(x)));
+}
+
+static inline SIMD_CFUNC simd_float3 __tg_asin(simd_float3 x) {
+ return simd_make_float3(__tg_asin(simd_make_float4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_asin_f4(simd_float4 x);
+static inline SIMD_CFUNC simd_float4 __tg_asin(simd_float4 x) {
+ return _simd_asin_f4(x);
+}
+#else
+static inline SIMD_CFUNC simd_float4 __tg_asin(simd_float4 x) {
+ return simd_make_float4(asin(x.x), asin(x.y), asin(x.z), asin(x.w));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_float8 _simd_asin_f8(simd_float8 x);
+static inline SIMD_CFUNC simd_float8 __tg_asin(simd_float8 x) {
+ return _simd_asin_f8(x);
+}
+#else
+static inline SIMD_CFUNC simd_float8 __tg_asin(simd_float8 x) {
+ return simd_make_float8(__tg_asin(x.lo), __tg_asin(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_float16 _simd_asin_f16(simd_float16 x);
+static inline SIMD_CFUNC simd_float16 __tg_asin(simd_float16 x) {
+ return _simd_asin_f16(x);
+}
+#else
+static inline SIMD_CFUNC simd_float16 __tg_asin(simd_float16 x) {
+ return simd_make_float16(__tg_asin(x.lo), __tg_asin(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_asin_d2(simd_double2 x);
+static inline SIMD_CFUNC simd_double2 __tg_asin(simd_double2 x) {
+ return _simd_asin_d2(x);
+}
+#else
+static inline SIMD_CFUNC simd_double2 __tg_asin(simd_double2 x) {
+ return simd_make_double2(asin(x.x), asin(x.y));
+}
+#endif
+
+static inline SIMD_CFUNC simd_double3 __tg_asin(simd_double3 x) {
+ return simd_make_double3(__tg_asin(simd_make_double4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_double4 _simd_asin_d4(simd_double4 x);
+static inline SIMD_CFUNC simd_double4 __tg_asin(simd_double4 x) {
+ return _simd_asin_d4(x);
+}
+#else
+static inline SIMD_CFUNC simd_double4 __tg_asin(simd_double4 x) {
+ return simd_make_double4(__tg_asin(x.lo), __tg_asin(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_double8 _simd_asin_d8(simd_double8 x);
+static inline SIMD_CFUNC simd_double8 __tg_asin(simd_double8 x) {
+ return _simd_asin_d8(x);
+}
+#else
+static inline SIMD_CFUNC simd_double8 __tg_asin(simd_double8 x) {
+ return simd_make_double8(__tg_asin(x.lo), __tg_asin(x.hi));
+}
+#endif
+
+#pragma mark - atan implementation
+static inline SIMD_CFUNC simd_float2 __tg_atan(simd_float2 x) {
+ return simd_make_float2(__tg_atan(simd_make_float4(x)));
+}
+
+static inline SIMD_CFUNC simd_float3 __tg_atan(simd_float3 x) {
+ return simd_make_float3(__tg_atan(simd_make_float4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_atan_f4(simd_float4 x);
+static inline SIMD_CFUNC simd_float4 __tg_atan(simd_float4 x) {
+ return _simd_atan_f4(x);
+}
+#else
+static inline SIMD_CFUNC simd_float4 __tg_atan(simd_float4 x) {
+ return simd_make_float4(atan(x.x), atan(x.y), atan(x.z), atan(x.w));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_float8 _simd_atan_f8(simd_float8 x);
+static inline SIMD_CFUNC simd_float8 __tg_atan(simd_float8 x) {
+ return _simd_atan_f8(x);
+}
+#else
+static inline SIMD_CFUNC simd_float8 __tg_atan(simd_float8 x) {
+ return simd_make_float8(__tg_atan(x.lo), __tg_atan(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_float16 _simd_atan_f16(simd_float16 x);
+static inline SIMD_CFUNC simd_float16 __tg_atan(simd_float16 x) {
+ return _simd_atan_f16(x);
+}
+#else
+static inline SIMD_CFUNC simd_float16 __tg_atan(simd_float16 x) {
+ return simd_make_float16(__tg_atan(x.lo), __tg_atan(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_atan_d2(simd_double2 x);
+static inline SIMD_CFUNC simd_double2 __tg_atan(simd_double2 x) {
+ return _simd_atan_d2(x);
+}
+#else
+static inline SIMD_CFUNC simd_double2 __tg_atan(simd_double2 x) {
+ return simd_make_double2(atan(x.x), atan(x.y));
+}
+#endif
+
+static inline SIMD_CFUNC simd_double3 __tg_atan(simd_double3 x) {
+ return simd_make_double3(__tg_atan(simd_make_double4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_double4 _simd_atan_d4(simd_double4 x);
+static inline SIMD_CFUNC simd_double4 __tg_atan(simd_double4 x) {
+ return _simd_atan_d4(x);
+}
+#else
+static inline SIMD_CFUNC simd_double4 __tg_atan(simd_double4 x) {
+ return simd_make_double4(__tg_atan(x.lo), __tg_atan(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_double8 _simd_atan_d8(simd_double8 x);
+static inline SIMD_CFUNC simd_double8 __tg_atan(simd_double8 x) {
+ return _simd_atan_d8(x);
+}
+#else
+static inline SIMD_CFUNC simd_double8 __tg_atan(simd_double8 x) {
+ return simd_make_double8(__tg_atan(x.lo), __tg_atan(x.hi));
+}
+#endif
+
+#pragma mark - tan implementation
+static inline SIMD_CFUNC simd_float2 __tg_tan(simd_float2 x) {
+ return simd_make_float2(__tg_tan(simd_make_float4(x)));
+}
+
+static inline SIMD_CFUNC simd_float3 __tg_tan(simd_float3 x) {
+ return simd_make_float3(__tg_tan(simd_make_float4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_tan_f4(simd_float4 x);
+static inline SIMD_CFUNC simd_float4 __tg_tan(simd_float4 x) {
+ return _simd_tan_f4(x);
+}
+#else
+static inline SIMD_CFUNC simd_float4 __tg_tan(simd_float4 x) {
+ return simd_make_float4(tan(x.x), tan(x.y), tan(x.z), tan(x.w));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_float8 _simd_tan_f8(simd_float8 x);
+static inline SIMD_CFUNC simd_float8 __tg_tan(simd_float8 x) {
+ return _simd_tan_f8(x);
+}
+#else
+static inline SIMD_CFUNC simd_float8 __tg_tan(simd_float8 x) {
+ return simd_make_float8(__tg_tan(x.lo), __tg_tan(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_float16 _simd_tan_f16(simd_float16 x);
+static inline SIMD_CFUNC simd_float16 __tg_tan(simd_float16 x) {
+ return _simd_tan_f16(x);
+}
+#else
+static inline SIMD_CFUNC simd_float16 __tg_tan(simd_float16 x) {
+ return simd_make_float16(__tg_tan(x.lo), __tg_tan(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_tan_d2(simd_double2 x);
+static inline SIMD_CFUNC simd_double2 __tg_tan(simd_double2 x) {
+ return _simd_tan_d2(x);
+}
+#else
+static inline SIMD_CFUNC simd_double2 __tg_tan(simd_double2 x) {
+ return simd_make_double2(tan(x.x), tan(x.y));
+}
+#endif
+
+static inline SIMD_CFUNC simd_double3 __tg_tan(simd_double3 x) {
+ return simd_make_double3(__tg_tan(simd_make_double4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_double4 _simd_tan_d4(simd_double4 x);
+static inline SIMD_CFUNC simd_double4 __tg_tan(simd_double4 x) {
+ return _simd_tan_d4(x);
+}
+#else
+static inline SIMD_CFUNC simd_double4 __tg_tan(simd_double4 x) {
+ return simd_make_double4(__tg_tan(x.lo), __tg_tan(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_double8 _simd_tan_d8(simd_double8 x);
+static inline SIMD_CFUNC simd_double8 __tg_tan(simd_double8 x) {
+ return _simd_tan_d8(x);
+}
+#else
+static inline SIMD_CFUNC simd_double8 __tg_tan(simd_double8 x) {
+ return simd_make_double8(__tg_tan(x.lo), __tg_tan(x.hi));
+}
+#endif
+
+#pragma mark - cospi implementation
+#if SIMD_LIBRARY_VERSION >= 1
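+/* cospi, sinpi and tanpi have no C standard scalar counterparts; the
+ * lanewise fallbacks call the __cospi/__sinpi/__tanpi extensions instead,
+ * which is presumably why these three sections carry the additional
+ * SIMD_LIBRARY_VERSION >= 1 guard. */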
+static inline SIMD_CFUNC simd_float2 __tg_cospi(simd_float2 x) {
+ return simd_make_float2(__tg_cospi(simd_make_float4(x)));
+}
+
+static inline SIMD_CFUNC simd_float3 __tg_cospi(simd_float3 x) {
+ return simd_make_float3(__tg_cospi(simd_make_float4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_cospi_f4(simd_float4 x);
+static inline SIMD_CFUNC simd_float4 __tg_cospi(simd_float4 x) {
+ return _simd_cospi_f4(x);
+}
+#else
+static inline SIMD_CFUNC simd_float4 __tg_cospi(simd_float4 x) {
+ return simd_make_float4(__cospi(x.x), __cospi(x.y), __cospi(x.z), __cospi(x.w));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_float8 _simd_cospi_f8(simd_float8 x);
+static inline SIMD_CFUNC simd_float8 __tg_cospi(simd_float8 x) {
+ return _simd_cospi_f8(x);
+}
+#else
+static inline SIMD_CFUNC simd_float8 __tg_cospi(simd_float8 x) {
+ return simd_make_float8(__tg_cospi(x.lo), __tg_cospi(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_float16 _simd_cospi_f16(simd_float16 x);
+static inline SIMD_CFUNC simd_float16 __tg_cospi(simd_float16 x) {
+ return _simd_cospi_f16(x);
+}
+#else
+static inline SIMD_CFUNC simd_float16 __tg_cospi(simd_float16 x) {
+ return simd_make_float16(__tg_cospi(x.lo), __tg_cospi(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_cospi_d2(simd_double2 x);
+static inline SIMD_CFUNC simd_double2 __tg_cospi(simd_double2 x) {
+ return _simd_cospi_d2(x);
+}
+#else
+static inline SIMD_CFUNC simd_double2 __tg_cospi(simd_double2 x) {
+ return simd_make_double2(__cospi(x.x), __cospi(x.y));
+}
+#endif
+
+static inline SIMD_CFUNC simd_double3 __tg_cospi(simd_double3 x) {
+ return simd_make_double3(__tg_cospi(simd_make_double4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_double4 _simd_cospi_d4(simd_double4 x);
+static inline SIMD_CFUNC simd_double4 __tg_cospi(simd_double4 x) {
+ return _simd_cospi_d4(x);
+}
+#else
+static inline SIMD_CFUNC simd_double4 __tg_cospi(simd_double4 x) {
+ return simd_make_double4(__tg_cospi(x.lo), __tg_cospi(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_double8 _simd_cospi_d8(simd_double8 x);
+static inline SIMD_CFUNC simd_double8 __tg_cospi(simd_double8 x) {
+ return _simd_cospi_d8(x);
+}
+#else
+static inline SIMD_CFUNC simd_double8 __tg_cospi(simd_double8 x) {
+ return simd_make_double8(__tg_cospi(x.lo), __tg_cospi(x.hi));
+}
+#endif
+
+#endif /* SIMD_LIBRARY_VERSION */
+#pragma mark - sinpi implementation
+#if SIMD_LIBRARY_VERSION >= 1
+static inline SIMD_CFUNC simd_float2 __tg_sinpi(simd_float2 x) {
+ return simd_make_float2(__tg_sinpi(simd_make_float4(x)));
+}
+
+static inline SIMD_CFUNC simd_float3 __tg_sinpi(simd_float3 x) {
+ return simd_make_float3(__tg_sinpi(simd_make_float4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_sinpi_f4(simd_float4 x);
+static inline SIMD_CFUNC simd_float4 __tg_sinpi(simd_float4 x) {
+ return _simd_sinpi_f4(x);
+}
+#else
+static inline SIMD_CFUNC simd_float4 __tg_sinpi(simd_float4 x) {
+ return simd_make_float4(__sinpi(x.x), __sinpi(x.y), __sinpi(x.z), __sinpi(x.w));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_float8 _simd_sinpi_f8(simd_float8 x);
+static inline SIMD_CFUNC simd_float8 __tg_sinpi(simd_float8 x) {
+ return _simd_sinpi_f8(x);
+}
+#else
+static inline SIMD_CFUNC simd_float8 __tg_sinpi(simd_float8 x) {
+ return simd_make_float8(__tg_sinpi(x.lo), __tg_sinpi(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_float16 _simd_sinpi_f16(simd_float16 x);
+static inline SIMD_CFUNC simd_float16 __tg_sinpi(simd_float16 x) {
+ return _simd_sinpi_f16(x);
+}
+#else
+static inline SIMD_CFUNC simd_float16 __tg_sinpi(simd_float16 x) {
+ return simd_make_float16(__tg_sinpi(x.lo), __tg_sinpi(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_sinpi_d2(simd_double2 x);
+static inline SIMD_CFUNC simd_double2 __tg_sinpi(simd_double2 x) {
+ return _simd_sinpi_d2(x);
+}
+#else
+static inline SIMD_CFUNC simd_double2 __tg_sinpi(simd_double2 x) {
+ return simd_make_double2(__sinpi(x.x), __sinpi(x.y));
+}
+#endif
+
+static inline SIMD_CFUNC simd_double3 __tg_sinpi(simd_double3 x) {
+ return simd_make_double3(__tg_sinpi(simd_make_double4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_double4 _simd_sinpi_d4(simd_double4 x);
+static inline SIMD_CFUNC simd_double4 __tg_sinpi(simd_double4 x) {
+ return _simd_sinpi_d4(x);
+}
+#else
+static inline SIMD_CFUNC simd_double4 __tg_sinpi(simd_double4 x) {
+ return simd_make_double4(__tg_sinpi(x.lo), __tg_sinpi(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_double8 _simd_sinpi_d8(simd_double8 x);
+static inline SIMD_CFUNC simd_double8 __tg_sinpi(simd_double8 x) {
+ return _simd_sinpi_d8(x);
+}
+#else
+static inline SIMD_CFUNC simd_double8 __tg_sinpi(simd_double8 x) {
+ return simd_make_double8(__tg_sinpi(x.lo), __tg_sinpi(x.hi));
+}
+#endif
+
+#endif /* SIMD_LIBRARY_VERSION */
+#pragma mark - tanpi implementation
+#if SIMD_LIBRARY_VERSION >= 1
+static inline SIMD_CFUNC simd_float2 __tg_tanpi(simd_float2 x) {
+ return simd_make_float2(__tg_tanpi(simd_make_float4(x)));
+}
+
+static inline SIMD_CFUNC simd_float3 __tg_tanpi(simd_float3 x) {
+ return simd_make_float3(__tg_tanpi(simd_make_float4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_tanpi_f4(simd_float4 x);
+static inline SIMD_CFUNC simd_float4 __tg_tanpi(simd_float4 x) {
+ return _simd_tanpi_f4(x);
+}
+#else
+static inline SIMD_CFUNC simd_float4 __tg_tanpi(simd_float4 x) {
+ return simd_make_float4(__tanpi(x.x), __tanpi(x.y), __tanpi(x.z), __tanpi(x.w));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_float8 _simd_tanpi_f8(simd_float8 x);
+static inline SIMD_CFUNC simd_float8 __tg_tanpi(simd_float8 x) {
+ return _simd_tanpi_f8(x);
+}
+#else
+static inline SIMD_CFUNC simd_float8 __tg_tanpi(simd_float8 x) {
+ return simd_make_float8(__tg_tanpi(x.lo), __tg_tanpi(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_float16 _simd_tanpi_f16(simd_float16 x);
+static inline SIMD_CFUNC simd_float16 __tg_tanpi(simd_float16 x) {
+ return _simd_tanpi_f16(x);
+}
+#else
+static inline SIMD_CFUNC simd_float16 __tg_tanpi(simd_float16 x) {
+ return simd_make_float16(__tg_tanpi(x.lo), __tg_tanpi(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_tanpi_d2(simd_double2 x);
+static inline SIMD_CFUNC simd_double2 __tg_tanpi(simd_double2 x) {
+ return _simd_tanpi_d2(x);
+}
+#else
+static inline SIMD_CFUNC simd_double2 __tg_tanpi(simd_double2 x) {
+ return simd_make_double2(__tanpi(x.x), __tanpi(x.y));
+}
+#endif
+
+static inline SIMD_CFUNC simd_double3 __tg_tanpi(simd_double3 x) {
+ return simd_make_double3(__tg_tanpi(simd_make_double4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_double4 _simd_tanpi_d4(simd_double4 x);
+static inline SIMD_CFUNC simd_double4 __tg_tanpi(simd_double4 x) {
+ return _simd_tanpi_d4(x);
+}
+#else
+static inline SIMD_CFUNC simd_double4 __tg_tanpi(simd_double4 x) {
+ return simd_make_double4(__tg_tanpi(x.lo), __tg_tanpi(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_double8 _simd_tanpi_d8(simd_double8 x);
+static inline SIMD_CFUNC simd_double8 __tg_tanpi(simd_double8 x) {
+ return _simd_tanpi_d8(x);
+}
+#else
+static inline SIMD_CFUNC simd_double8 __tg_tanpi(simd_double8 x) {
+ return simd_make_double8(__tg_tanpi(x.lo), __tg_tanpi(x.hi));
+}
+#endif
+
+#endif /* SIMD_LIBRARY_VERSION */
+#pragma mark - acosh implementation
+static inline SIMD_CFUNC simd_float2 __tg_acosh(simd_float2 x) {
+ return simd_make_float2(__tg_acosh(simd_make_float4(x)));
+}
+
+static inline SIMD_CFUNC simd_float3 __tg_acosh(simd_float3 x) {
+ return simd_make_float3(__tg_acosh(simd_make_float4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_acosh_f4(simd_float4 x);
+static inline SIMD_CFUNC simd_float4 __tg_acosh(simd_float4 x) {
+ return _simd_acosh_f4(x);
+}
+#else
+static inline SIMD_CFUNC simd_float4 __tg_acosh(simd_float4 x) {
+ return simd_make_float4(acosh(x.x), acosh(x.y), acosh(x.z), acosh(x.w));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_float8 _simd_acosh_f8(simd_float8 x);
+static inline SIMD_CFUNC simd_float8 __tg_acosh(simd_float8 x) {
+ return _simd_acosh_f8(x);
+}
+#else
+static inline SIMD_CFUNC simd_float8 __tg_acosh(simd_float8 x) {
+ return simd_make_float8(__tg_acosh(x.lo), __tg_acosh(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_float16 _simd_acosh_f16(simd_float16 x);
+static inline SIMD_CFUNC simd_float16 __tg_acosh(simd_float16 x) {
+ return _simd_acosh_f16(x);
+}
+#else
+static inline SIMD_CFUNC simd_float16 __tg_acosh(simd_float16 x) {
+ return simd_make_float16(__tg_acosh(x.lo), __tg_acosh(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_acosh_d2(simd_double2 x);
+static inline SIMD_CFUNC simd_double2 __tg_acosh(simd_double2 x) {
+ return _simd_acosh_d2(x);
+}
+#else
+static inline SIMD_CFUNC simd_double2 __tg_acosh(simd_double2 x) {
+ return simd_make_double2(acosh(x.x), acosh(x.y));
+}
+#endif
+
+static inline SIMD_CFUNC simd_double3 __tg_acosh(simd_double3 x) {
+ return simd_make_double3(__tg_acosh(simd_make_double4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_double4 _simd_acosh_d4(simd_double4 x);
+static inline SIMD_CFUNC simd_double4 __tg_acosh(simd_double4 x) {
+ return _simd_acosh_d4(x);
+}
+#else
+static inline SIMD_CFUNC simd_double4 __tg_acosh(simd_double4 x) {
+ return simd_make_double4(__tg_acosh(x.lo), __tg_acosh(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_double8 _simd_acosh_d8(simd_double8 x);
+static inline SIMD_CFUNC simd_double8 __tg_acosh(simd_double8 x) {
+ return _simd_acosh_d8(x);
+}
+#else
+static inline SIMD_CFUNC simd_double8 __tg_acosh(simd_double8 x) {
+ return simd_make_double8(__tg_acosh(x.lo), __tg_acosh(x.hi));
+}
+#endif
+
+#pragma mark - asinh implementation
+static inline SIMD_CFUNC simd_float2 __tg_asinh(simd_float2 x) {
+ return simd_make_float2(__tg_asinh(simd_make_float4(x)));
+}
+
+static inline SIMD_CFUNC simd_float3 __tg_asinh(simd_float3 x) {
+ return simd_make_float3(__tg_asinh(simd_make_float4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_asinh_f4(simd_float4 x);
+static inline SIMD_CFUNC simd_float4 __tg_asinh(simd_float4 x) {
+ return _simd_asinh_f4(x);
+}
+#else
+static inline SIMD_CFUNC simd_float4 __tg_asinh(simd_float4 x) {
+ return simd_make_float4(asinh(x.x), asinh(x.y), asinh(x.z), asinh(x.w));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_float8 _simd_asinh_f8(simd_float8 x);
+static inline SIMD_CFUNC simd_float8 __tg_asinh(simd_float8 x) {
+ return _simd_asinh_f8(x);
+}
+#else
+static inline SIMD_CFUNC simd_float8 __tg_asinh(simd_float8 x) {
+ return simd_make_float8(__tg_asinh(x.lo), __tg_asinh(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_float16 _simd_asinh_f16(simd_float16 x);
+static inline SIMD_CFUNC simd_float16 __tg_asinh(simd_float16 x) {
+ return _simd_asinh_f16(x);
+}
+#else
+static inline SIMD_CFUNC simd_float16 __tg_asinh(simd_float16 x) {
+ return simd_make_float16(__tg_asinh(x.lo), __tg_asinh(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_asinh_d2(simd_double2 x);
+static inline SIMD_CFUNC simd_double2 __tg_asinh(simd_double2 x) {
+ return _simd_asinh_d2(x);
+}
+#else
+static inline SIMD_CFUNC simd_double2 __tg_asinh(simd_double2 x) {
+ return simd_make_double2(asinh(x.x), asinh(x.y));
+}
+#endif
+
+static inline SIMD_CFUNC simd_double3 __tg_asinh(simd_double3 x) {
+ return simd_make_double3(__tg_asinh(simd_make_double4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_double4 _simd_asinh_d4(simd_double4 x);
+static inline SIMD_CFUNC simd_double4 __tg_asinh(simd_double4 x) {
+ return _simd_asinh_d4(x);
+}
+#else
+static inline SIMD_CFUNC simd_double4 __tg_asinh(simd_double4 x) {
+ return simd_make_double4(__tg_asinh(x.lo), __tg_asinh(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_double8 _simd_asinh_d8(simd_double8 x);
+static inline SIMD_CFUNC simd_double8 __tg_asinh(simd_double8 x) {
+ return _simd_asinh_d8(x);
+}
+#else
+static inline SIMD_CFUNC simd_double8 __tg_asinh(simd_double8 x) {
+ return simd_make_double8(__tg_asinh(x.lo), __tg_asinh(x.hi));
+}
+#endif
+
+#pragma mark - atanh implementation
+static inline SIMD_CFUNC simd_float2 __tg_atanh(simd_float2 x) {
+ return simd_make_float2(__tg_atanh(simd_make_float4(x)));
+}
+
+static inline SIMD_CFUNC simd_float3 __tg_atanh(simd_float3 x) {
+ return simd_make_float3(__tg_atanh(simd_make_float4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_atanh_f4(simd_float4 x);
+static inline SIMD_CFUNC simd_float4 __tg_atanh(simd_float4 x) {
+ return _simd_atanh_f4(x);
+}
+#else
+static inline SIMD_CFUNC simd_float4 __tg_atanh(simd_float4 x) {
+ return simd_make_float4(atanh(x.x), atanh(x.y), atanh(x.z), atanh(x.w));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_float8 _simd_atanh_f8(simd_float8 x);
+static inline SIMD_CFUNC simd_float8 __tg_atanh(simd_float8 x) {
+ return _simd_atanh_f8(x);
+}
+#else
+static inline SIMD_CFUNC simd_float8 __tg_atanh(simd_float8 x) {
+ return simd_make_float8(__tg_atanh(x.lo), __tg_atanh(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_float16 _simd_atanh_f16(simd_float16 x);
+static inline SIMD_CFUNC simd_float16 __tg_atanh(simd_float16 x) {
+ return _simd_atanh_f16(x);
+}
+#else
+static inline SIMD_CFUNC simd_float16 __tg_atanh(simd_float16 x) {
+ return simd_make_float16(__tg_atanh(x.lo), __tg_atanh(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_atanh_d2(simd_double2 x);
+static inline SIMD_CFUNC simd_double2 __tg_atanh(simd_double2 x) {
+ return _simd_atanh_d2(x);
+}
+#else
+static inline SIMD_CFUNC simd_double2 __tg_atanh(simd_double2 x) {
+ return simd_make_double2(atanh(x.x), atanh(x.y));
+}
+#endif
+
+static inline SIMD_CFUNC simd_double3 __tg_atanh(simd_double3 x) {
+ return simd_make_double3(__tg_atanh(simd_make_double4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_double4 _simd_atanh_d4(simd_double4 x);
+static inline SIMD_CFUNC simd_double4 __tg_atanh(simd_double4 x) {
+ return _simd_atanh_d4(x);
+}
+#else
+static inline SIMD_CFUNC simd_double4 __tg_atanh(simd_double4 x) {
+ return simd_make_double4(__tg_atanh(x.lo), __tg_atanh(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_double8 _simd_atanh_d8(simd_double8 x);
+static inline SIMD_CFUNC simd_double8 __tg_atanh(simd_double8 x) {
+ return _simd_atanh_d8(x);
+}
+#else
+static inline SIMD_CFUNC simd_double8 __tg_atanh(simd_double8 x) {
+ return simd_make_double8(__tg_atanh(x.lo), __tg_atanh(x.hi));
+}
+#endif
+
+#pragma mark - cosh implementation
+static inline SIMD_CFUNC simd_float2 __tg_cosh(simd_float2 x) {
+ return simd_make_float2(__tg_cosh(simd_make_float4(x)));
+}
+
+static inline SIMD_CFUNC simd_float3 __tg_cosh(simd_float3 x) {
+ return simd_make_float3(__tg_cosh(simd_make_float4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_cosh_f4(simd_float4 x);
+static inline SIMD_CFUNC simd_float4 __tg_cosh(simd_float4 x) {
+ return _simd_cosh_f4(x);
+}
+#else
+static inline SIMD_CFUNC simd_float4 __tg_cosh(simd_float4 x) {
+ return simd_make_float4(cosh(x.x), cosh(x.y), cosh(x.z), cosh(x.w));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_float8 _simd_cosh_f8(simd_float8 x);
+static inline SIMD_CFUNC simd_float8 __tg_cosh(simd_float8 x) {
+ return _simd_cosh_f8(x);
+}
+#else
+static inline SIMD_CFUNC simd_float8 __tg_cosh(simd_float8 x) {
+ return simd_make_float8(__tg_cosh(x.lo), __tg_cosh(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_float16 _simd_cosh_f16(simd_float16 x);
+static inline SIMD_CFUNC simd_float16 __tg_cosh(simd_float16 x) {
+ return _simd_cosh_f16(x);
+}
+#else
+static inline SIMD_CFUNC simd_float16 __tg_cosh(simd_float16 x) {
+ return simd_make_float16(__tg_cosh(x.lo), __tg_cosh(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_cosh_d2(simd_double2 x);
+static inline SIMD_CFUNC simd_double2 __tg_cosh(simd_double2 x) {
+ return _simd_cosh_d2(x);
+}
+#else
+static inline SIMD_CFUNC simd_double2 __tg_cosh(simd_double2 x) {
+ return simd_make_double2(cosh(x.x), cosh(x.y));
+}
+#endif
+
+static inline SIMD_CFUNC simd_double3 __tg_cosh(simd_double3 x) {
+ return simd_make_double3(__tg_cosh(simd_make_double4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_double4 _simd_cosh_d4(simd_double4 x);
+static inline SIMD_CFUNC simd_double4 __tg_cosh(simd_double4 x) {
+ return _simd_cosh_d4(x);
+}
+#else
+static inline SIMD_CFUNC simd_double4 __tg_cosh(simd_double4 x) {
+ return simd_make_double4(__tg_cosh(x.lo), __tg_cosh(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_double8 _simd_cosh_d8(simd_double8 x);
+static inline SIMD_CFUNC simd_double8 __tg_cosh(simd_double8 x) {
+ return _simd_cosh_d8(x);
+}
+#else
+static inline SIMD_CFUNC simd_double8 __tg_cosh(simd_double8 x) {
+ return simd_make_double8(__tg_cosh(x.lo), __tg_cosh(x.hi));
+}
+#endif
+
+#pragma mark - sinh implementation
+static inline SIMD_CFUNC simd_float2 __tg_sinh(simd_float2 x) {
+ return simd_make_float2(__tg_sinh(simd_make_float4(x)));
+}
+
+static inline SIMD_CFUNC simd_float3 __tg_sinh(simd_float3 x) {
+ return simd_make_float3(__tg_sinh(simd_make_float4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_sinh_f4(simd_float4 x);
+static inline SIMD_CFUNC simd_float4 __tg_sinh(simd_float4 x) {
+ return _simd_sinh_f4(x);
+}
+#else
+static inline SIMD_CFUNC simd_float4 __tg_sinh(simd_float4 x) {
+ return simd_make_float4(sinh(x.x), sinh(x.y), sinh(x.z), sinh(x.w));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_float8 _simd_sinh_f8(simd_float8 x);
+static inline SIMD_CFUNC simd_float8 __tg_sinh(simd_float8 x) {
+ return _simd_sinh_f8(x);
+}
+#else
+static inline SIMD_CFUNC simd_float8 __tg_sinh(simd_float8 x) {
+ return simd_make_float8(__tg_sinh(x.lo), __tg_sinh(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_float16 _simd_sinh_f16(simd_float16 x);
+static inline SIMD_CFUNC simd_float16 __tg_sinh(simd_float16 x) {
+ return _simd_sinh_f16(x);
+}
+#else
+static inline SIMD_CFUNC simd_float16 __tg_sinh(simd_float16 x) {
+ return simd_make_float16(__tg_sinh(x.lo), __tg_sinh(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_sinh_d2(simd_double2 x);
+static inline SIMD_CFUNC simd_double2 __tg_sinh(simd_double2 x) {
+ return _simd_sinh_d2(x);
+}
+#else
+static inline SIMD_CFUNC simd_double2 __tg_sinh(simd_double2 x) {
+ return simd_make_double2(sinh(x.x), sinh(x.y));
+}
+#endif
+
+static inline SIMD_CFUNC simd_double3 __tg_sinh(simd_double3 x) {
+ return simd_make_double3(__tg_sinh(simd_make_double4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_double4 _simd_sinh_d4(simd_double4 x);
+static inline SIMD_CFUNC simd_double4 __tg_sinh(simd_double4 x) {
+ return _simd_sinh_d4(x);
+}
+#else
+static inline SIMD_CFUNC simd_double4 __tg_sinh(simd_double4 x) {
+ return simd_make_double4(__tg_sinh(x.lo), __tg_sinh(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_double8 _simd_sinh_d8(simd_double8 x);
+static inline SIMD_CFUNC simd_double8 __tg_sinh(simd_double8 x) {
+ return _simd_sinh_d8(x);
+}
+#else
+static inline SIMD_CFUNC simd_double8 __tg_sinh(simd_double8 x) {
+ return simd_make_double8(__tg_sinh(x.lo), __tg_sinh(x.hi));
+}
+#endif
+
+#pragma mark - tanh implementation
+static inline SIMD_CFUNC simd_float2 __tg_tanh(simd_float2 x) {
+ return simd_make_float2(__tg_tanh(simd_make_float4(x)));
+}
+
+static inline SIMD_CFUNC simd_float3 __tg_tanh(simd_float3 x) {
+ return simd_make_float3(__tg_tanh(simd_make_float4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_tanh_f4(simd_float4 x);
+static inline SIMD_CFUNC simd_float4 __tg_tanh(simd_float4 x) {
+ return _simd_tanh_f4(x);
+}
+#else
+static inline SIMD_CFUNC simd_float4 __tg_tanh(simd_float4 x) {
+ return simd_make_float4(tanh(x.x), tanh(x.y), tanh(x.z), tanh(x.w));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_float8 _simd_tanh_f8(simd_float8 x);
+static inline SIMD_CFUNC simd_float8 __tg_tanh(simd_float8 x) {
+ return _simd_tanh_f8(x);
+}
+#else
+static inline SIMD_CFUNC simd_float8 __tg_tanh(simd_float8 x) {
+ return simd_make_float8(__tg_tanh(x.lo), __tg_tanh(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_float16 _simd_tanh_f16(simd_float16 x);
+static inline SIMD_CFUNC simd_float16 __tg_tanh(simd_float16 x) {
+ return _simd_tanh_f16(x);
+}
+#else
+static inline SIMD_CFUNC simd_float16 __tg_tanh(simd_float16 x) {
+ return simd_make_float16(__tg_tanh(x.lo), __tg_tanh(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_tanh_d2(simd_double2 x);
+static inline SIMD_CFUNC simd_double2 __tg_tanh(simd_double2 x) {
+ return _simd_tanh_d2(x);
+}
+#else
+static inline SIMD_CFUNC simd_double2 __tg_tanh(simd_double2 x) {
+ return simd_make_double2(tanh(x.x), tanh(x.y));
+}
+#endif
+
+static inline SIMD_CFUNC simd_double3 __tg_tanh(simd_double3 x) {
+ return simd_make_double3(__tg_tanh(simd_make_double4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_double4 _simd_tanh_d4(simd_double4 x);
+static inline SIMD_CFUNC simd_double4 __tg_tanh(simd_double4 x) {
+ return _simd_tanh_d4(x);
+}
+#else
+static inline SIMD_CFUNC simd_double4 __tg_tanh(simd_double4 x) {
+ return simd_make_double4(__tg_tanh(x.lo), __tg_tanh(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_double8 _simd_tanh_d8(simd_double8 x);
+static inline SIMD_CFUNC simd_double8 __tg_tanh(simd_double8 x) {
+ return _simd_tanh_d8(x);
+}
+#else
+static inline SIMD_CFUNC simd_double8 __tg_tanh(simd_double8 x) {
+ return simd_make_double8(__tg_tanh(x.lo), __tg_tanh(x.hi));
+}
+#endif
+
+#pragma mark - exp implementation
+static inline SIMD_CFUNC simd_float2 __tg_exp(simd_float2 x) {
+ return simd_make_float2(__tg_exp(simd_make_float4(x)));
+}
+
+static inline SIMD_CFUNC simd_float3 __tg_exp(simd_float3 x) {
+ return simd_make_float3(__tg_exp(simd_make_float4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_exp_f4(simd_float4 x);
+static inline SIMD_CFUNC simd_float4 __tg_exp(simd_float4 x) {
+ return _simd_exp_f4(x);
+}
+#else
+static inline SIMD_CFUNC simd_float4 __tg_exp(simd_float4 x) {
+ return simd_make_float4(exp(x.x), exp(x.y), exp(x.z), exp(x.w));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_float8 _simd_exp_f8(simd_float8 x);
+static inline SIMD_CFUNC simd_float8 __tg_exp(simd_float8 x) {
+ return _simd_exp_f8(x);
+}
+#else
+static inline SIMD_CFUNC simd_float8 __tg_exp(simd_float8 x) {
+ return simd_make_float8(__tg_exp(x.lo), __tg_exp(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_float16 _simd_exp_f16(simd_float16 x);
+static inline SIMD_CFUNC simd_float16 __tg_exp(simd_float16 x) {
+ return _simd_exp_f16(x);
+}
+#else
+static inline SIMD_CFUNC simd_float16 __tg_exp(simd_float16 x) {
+ return simd_make_float16(__tg_exp(x.lo), __tg_exp(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_exp_d2(simd_double2 x);
+static inline SIMD_CFUNC simd_double2 __tg_exp(simd_double2 x) {
+ return _simd_exp_d2(x);
+}
+#else
+static inline SIMD_CFUNC simd_double2 __tg_exp(simd_double2 x) {
+ return simd_make_double2(exp(x.x), exp(x.y));
+}
+#endif
+
+static inline SIMD_CFUNC simd_double3 __tg_exp(simd_double3 x) {
+ return simd_make_double3(__tg_exp(simd_make_double4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_double4 _simd_exp_d4(simd_double4 x);
+static inline SIMD_CFUNC simd_double4 __tg_exp(simd_double4 x) {
+ return _simd_exp_d4(x);
+}
+#else
+static inline SIMD_CFUNC simd_double4 __tg_exp(simd_double4 x) {
+ return simd_make_double4(__tg_exp(x.lo), __tg_exp(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_double8 _simd_exp_d8(simd_double8 x);
+static inline SIMD_CFUNC simd_double8 __tg_exp(simd_double8 x) {
+ return _simd_exp_d8(x);
+}
+#else
+static inline SIMD_CFUNC simd_double8 __tg_exp(simd_double8 x) {
+ return simd_make_double8(__tg_exp(x.lo), __tg_exp(x.hi));
+}
+#endif
+
+#pragma mark - exp2 implementation
+static inline SIMD_CFUNC simd_float2 __tg_exp2(simd_float2 x) {
+ return simd_make_float2(__tg_exp2(simd_make_float4(x)));
+}
+
+static inline SIMD_CFUNC simd_float3 __tg_exp2(simd_float3 x) {
+ return simd_make_float3(__tg_exp2(simd_make_float4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_exp2_f4(simd_float4 x);
+static inline SIMD_CFUNC simd_float4 __tg_exp2(simd_float4 x) {
+ return _simd_exp2_f4(x);
+}
+#else
+static inline SIMD_CFUNC simd_float4 __tg_exp2(simd_float4 x) {
+ return simd_make_float4(exp2(x.x), exp2(x.y), exp2(x.z), exp2(x.w));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_float8 _simd_exp2_f8(simd_float8 x);
+static inline SIMD_CFUNC simd_float8 __tg_exp2(simd_float8 x) {
+ return _simd_exp2_f8(x);
+}
+#else
+static inline SIMD_CFUNC simd_float8 __tg_exp2(simd_float8 x) {
+ return simd_make_float8(__tg_exp2(x.lo), __tg_exp2(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_float16 _simd_exp2_f16(simd_float16 x);
+static inline SIMD_CFUNC simd_float16 __tg_exp2(simd_float16 x) {
+ return _simd_exp2_f16(x);
+}
+#else
+static inline SIMD_CFUNC simd_float16 __tg_exp2(simd_float16 x) {
+ return simd_make_float16(__tg_exp2(x.lo), __tg_exp2(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_exp2_d2(simd_double2 x);
+static inline SIMD_CFUNC simd_double2 __tg_exp2(simd_double2 x) {
+ return _simd_exp2_d2(x);
+}
+#else
+static inline SIMD_CFUNC simd_double2 __tg_exp2(simd_double2 x) {
+ return simd_make_double2(exp2(x.x), exp2(x.y));
+}
+#endif
+
+static inline SIMD_CFUNC simd_double3 __tg_exp2(simd_double3 x) {
+ return simd_make_double3(__tg_exp2(simd_make_double4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_double4 _simd_exp2_d4(simd_double4 x);
+static inline SIMD_CFUNC simd_double4 __tg_exp2(simd_double4 x) {
+ return _simd_exp2_d4(x);
+}
+#else
+static inline SIMD_CFUNC simd_double4 __tg_exp2(simd_double4 x) {
+ return simd_make_double4(__tg_exp2(x.lo), __tg_exp2(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_double8 _simd_exp2_d8(simd_double8 x);
+static inline SIMD_CFUNC simd_double8 __tg_exp2(simd_double8 x) {
+ return _simd_exp2_d8(x);
+}
+#else
+static inline SIMD_CFUNC simd_double8 __tg_exp2(simd_double8 x) {
+ return simd_make_double8(__tg_exp2(x.lo), __tg_exp2(x.hi));
+}
+#endif
+
+#pragma mark - exp10 implementation
+#if SIMD_LIBRARY_VERSION >= 1
+static inline SIMD_CFUNC simd_float2 __tg_exp10(simd_float2 x) {
+ return simd_make_float2(__tg_exp10(simd_make_float4(x)));
+}
+
+static inline SIMD_CFUNC simd_float3 __tg_exp10(simd_float3 x) {
+ return simd_make_float3(__tg_exp10(simd_make_float4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_exp10_f4(simd_float4 x);
+static inline SIMD_CFUNC simd_float4 __tg_exp10(simd_float4 x) {
+ return _simd_exp10_f4(x);
+}
+#else
+static inline SIMD_CFUNC simd_float4 __tg_exp10(simd_float4 x) {
+ return simd_make_float4(__exp10(x.x), __exp10(x.y), __exp10(x.z), __exp10(x.w));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_float8 _simd_exp10_f8(simd_float8 x);
+static inline SIMD_CFUNC simd_float8 __tg_exp10(simd_float8 x) {
+ return _simd_exp10_f8(x);
+}
+#else
+static inline SIMD_CFUNC simd_float8 __tg_exp10(simd_float8 x) {
+ return simd_make_float8(__tg_exp10(x.lo), __tg_exp10(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_float16 _simd_exp10_f16(simd_float16 x);
+static inline SIMD_CFUNC simd_float16 __tg_exp10(simd_float16 x) {
+ return _simd_exp10_f16(x);
+}
+#else
+static inline SIMD_CFUNC simd_float16 __tg_exp10(simd_float16 x) {
+ return simd_make_float16(__tg_exp10(x.lo), __tg_exp10(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_exp10_d2(simd_double2 x);
+static inline SIMD_CFUNC simd_double2 __tg_exp10(simd_double2 x) {
+ return _simd_exp10_d2(x);
+}
+#else
+static inline SIMD_CFUNC simd_double2 __tg_exp10(simd_double2 x) {
+ return simd_make_double2(__exp10(x.x), __exp10(x.y));
+}
+#endif
+
+static inline SIMD_CFUNC simd_double3 __tg_exp10(simd_double3 x) {
+ return simd_make_double3(__tg_exp10(simd_make_double4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_double4 _simd_exp10_d4(simd_double4 x);
+static inline SIMD_CFUNC simd_double4 __tg_exp10(simd_double4 x) {
+ return _simd_exp10_d4(x);
+}
+#else
+static inline SIMD_CFUNC simd_double4 __tg_exp10(simd_double4 x) {
+ return simd_make_double4(__tg_exp10(x.lo), __tg_exp10(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_double8 _simd_exp10_d8(simd_double8 x);
+static inline SIMD_CFUNC simd_double8 __tg_exp10(simd_double8 x) {
+ return _simd_exp10_d8(x);
+}
+#else
+static inline SIMD_CFUNC simd_double8 __tg_exp10(simd_double8 x) {
+ return simd_make_double8(__tg_exp10(x.lo), __tg_exp10(x.hi));
+}
+#endif
+
+#endif /* SIMD_LIBRARY_VERSION */
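exp10 is the one function so far that is additionally gated on SIMD_LIBRARY_VERSION >= 1: its scalar fallback calls __exp10, an Apple libm extension rather than a C99 function, so the whole overload set is presumably withheld on library versions that lack it. Where no exp10 primitive exists at all, a portable stand-in (a sketch only — less accurate than a dedicated kernel, and the helper name is made up) rewrites 10^x as 2^(x·log2 10):

    #include <math.h>

    static inline float exp10_approx(float x) {
        return exp2f(x * 3.3219281f);   /* log2(10) */
    }
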
+#pragma mark - expm1 implementation
+static inline SIMD_CFUNC simd_float2 __tg_expm1(simd_float2 x) {
+ return simd_make_float2(__tg_expm1(simd_make_float4(x)));
+}
+
+static inline SIMD_CFUNC simd_float3 __tg_expm1(simd_float3 x) {
+ return simd_make_float3(__tg_expm1(simd_make_float4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_expm1_f4(simd_float4 x);
+static inline SIMD_CFUNC simd_float4 __tg_expm1(simd_float4 x) {
+ return _simd_expm1_f4(x);
+}
+#else
+static inline SIMD_CFUNC simd_float4 __tg_expm1(simd_float4 x) {
+ return simd_make_float4(expm1(x.x), expm1(x.y), expm1(x.z), expm1(x.w));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_float8 _simd_expm1_f8(simd_float8 x);
+static inline SIMD_CFUNC simd_float8 __tg_expm1(simd_float8 x) {
+ return _simd_expm1_f8(x);
+}
+#else
+static inline SIMD_CFUNC simd_float8 __tg_expm1(simd_float8 x) {
+ return simd_make_float8(__tg_expm1(x.lo), __tg_expm1(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_float16 _simd_expm1_f16(simd_float16 x);
+static inline SIMD_CFUNC simd_float16 __tg_expm1(simd_float16 x) {
+ return _simd_expm1_f16(x);
+}
+#else
+static inline SIMD_CFUNC simd_float16 __tg_expm1(simd_float16 x) {
+ return simd_make_float16(__tg_expm1(x.lo), __tg_expm1(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_expm1_d2(simd_double2 x);
+static inline SIMD_CFUNC simd_double2 __tg_expm1(simd_double2 x) {
+ return _simd_expm1_d2(x);
+}
+#else
+static inline SIMD_CFUNC simd_double2 __tg_expm1(simd_double2 x) {
+ return simd_make_double2(expm1(x.x), expm1(x.y));
+}
+#endif
+
+static inline SIMD_CFUNC simd_double3 __tg_expm1(simd_double3 x) {
+ return simd_make_double3(__tg_expm1(simd_make_double4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_double4 _simd_expm1_d4(simd_double4 x);
+static inline SIMD_CFUNC simd_double4 __tg_expm1(simd_double4 x) {
+ return _simd_expm1_d4(x);
+}
+#else
+static inline SIMD_CFUNC simd_double4 __tg_expm1(simd_double4 x) {
+ return simd_make_double4(__tg_expm1(x.lo), __tg_expm1(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_double8 _simd_expm1_d8(simd_double8 x);
+static inline SIMD_CFUNC simd_double8 __tg_expm1(simd_double8 x) {
+ return _simd_expm1_d8(x);
+}
+#else
+static inline SIMD_CFUNC simd_double8 __tg_expm1(simd_double8 x) {
+ return simd_make_double8(__tg_expm1(x.lo), __tg_expm1(x.hi));
+}
+#endif
+
+#pragma mark - log implementation
+static inline SIMD_CFUNC simd_float2 __tg_log(simd_float2 x) {
+ return simd_make_float2(__tg_log(simd_make_float4(x)));
+}
+
+static inline SIMD_CFUNC simd_float3 __tg_log(simd_float3 x) {
+ return simd_make_float3(__tg_log(simd_make_float4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_log_f4(simd_float4 x);
+static inline SIMD_CFUNC simd_float4 __tg_log(simd_float4 x) {
+ return _simd_log_f4(x);
+}
+#else
+static inline SIMD_CFUNC simd_float4 __tg_log(simd_float4 x) {
+ return simd_make_float4(log(x.x), log(x.y), log(x.z), log(x.w));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_float8 _simd_log_f8(simd_float8 x);
+static inline SIMD_CFUNC simd_float8 __tg_log(simd_float8 x) {
+ return _simd_log_f8(x);
+}
+#else
+static inline SIMD_CFUNC simd_float8 __tg_log(simd_float8 x) {
+ return simd_make_float8(__tg_log(x.lo), __tg_log(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_float16 _simd_log_f16(simd_float16 x);
+static inline SIMD_CFUNC simd_float16 __tg_log(simd_float16 x) {
+ return _simd_log_f16(x);
+}
+#else
+static inline SIMD_CFUNC simd_float16 __tg_log(simd_float16 x) {
+ return simd_make_float16(__tg_log(x.lo), __tg_log(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_log_d2(simd_double2 x);
+static inline SIMD_CFUNC simd_double2 __tg_log(simd_double2 x) {
+ return _simd_log_d2(x);
+}
+#else
+static inline SIMD_CFUNC simd_double2 __tg_log(simd_double2 x) {
+ return simd_make_double2(log(x.x), log(x.y));
+}
+#endif
+
+static inline SIMD_CFUNC simd_double3 __tg_log(simd_double3 x) {
+ return simd_make_double3(__tg_log(simd_make_double4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_double4 _simd_log_d4(simd_double4 x);
+static inline SIMD_CFUNC simd_double4 __tg_log(simd_double4 x) {
+ return _simd_log_d4(x);
+}
+#else
+static inline SIMD_CFUNC simd_double4 __tg_log(simd_double4 x) {
+ return simd_make_double4(__tg_log(x.lo), __tg_log(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_double8 _simd_log_d8(simd_double8 x);
+static inline SIMD_CFUNC simd_double8 __tg_log(simd_double8 x) {
+ return _simd_log_d8(x);
+}
+#else
+static inline SIMD_CFUNC simd_double8 __tg_log(simd_double8 x) {
+ return simd_make_double8(__tg_log(x.lo), __tg_log(x.hi));
+}
+#endif
+
+#pragma mark - log2 implementation
+static inline SIMD_CFUNC simd_float2 __tg_log2(simd_float2 x) {
+ return simd_make_float2(__tg_log2(simd_make_float4(x)));
+}
+
+static inline SIMD_CFUNC simd_float3 __tg_log2(simd_float3 x) {
+ return simd_make_float3(__tg_log2(simd_make_float4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_log2_f4(simd_float4 x);
+static inline SIMD_CFUNC simd_float4 __tg_log2(simd_float4 x) {
+ return _simd_log2_f4(x);
+}
+#else
+static inline SIMD_CFUNC simd_float4 __tg_log2(simd_float4 x) {
+ return simd_make_float4(log2(x.x), log2(x.y), log2(x.z), log2(x.w));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_float8 _simd_log2_f8(simd_float8 x);
+static inline SIMD_CFUNC simd_float8 __tg_log2(simd_float8 x) {
+ return _simd_log2_f8(x);
+}
+#else
+static inline SIMD_CFUNC simd_float8 __tg_log2(simd_float8 x) {
+ return simd_make_float8(__tg_log2(x.lo), __tg_log2(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_float16 _simd_log2_f16(simd_float16 x);
+static inline SIMD_CFUNC simd_float16 __tg_log2(simd_float16 x) {
+ return _simd_log2_f16(x);
+}
+#else
+static inline SIMD_CFUNC simd_float16 __tg_log2(simd_float16 x) {
+ return simd_make_float16(__tg_log2(x.lo), __tg_log2(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_log2_d2(simd_double2 x);
+static inline SIMD_CFUNC simd_double2 __tg_log2(simd_double2 x) {
+ return _simd_log2_d2(x);
+}
+#else
+static inline SIMD_CFUNC simd_double2 __tg_log2(simd_double2 x) {
+ return simd_make_double2(log2(x.x), log2(x.y));
+}
+#endif
+
+static inline SIMD_CFUNC simd_double3 __tg_log2(simd_double3 x) {
+ return simd_make_double3(__tg_log2(simd_make_double4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_double4 _simd_log2_d4(simd_double4 x);
+static inline SIMD_CFUNC simd_double4 __tg_log2(simd_double4 x) {
+ return _simd_log2_d4(x);
+}
+#else
+static inline SIMD_CFUNC simd_double4 __tg_log2(simd_double4 x) {
+ return simd_make_double4(__tg_log2(x.lo), __tg_log2(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_double8 _simd_log2_d8(simd_double8 x);
+static inline SIMD_CFUNC simd_double8 __tg_log2(simd_double8 x) {
+ return _simd_log2_d8(x);
+}
+#else
+static inline SIMD_CFUNC simd_double8 __tg_log2(simd_double8 x) {
+ return simd_make_double8(__tg_log2(x.lo), __tg_log2(x.hi));
+}
+#endif
+
+#pragma mark - log10 implementation
+static inline SIMD_CFUNC simd_float2 __tg_log10(simd_float2 x) {
+ return simd_make_float2(__tg_log10(simd_make_float4(x)));
+}
+
+static inline SIMD_CFUNC simd_float3 __tg_log10(simd_float3 x) {
+ return simd_make_float3(__tg_log10(simd_make_float4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_log10_f4(simd_float4 x);
+static inline SIMD_CFUNC simd_float4 __tg_log10(simd_float4 x) {
+ return _simd_log10_f4(x);
+}
+#else
+static inline SIMD_CFUNC simd_float4 __tg_log10(simd_float4 x) {
+ return simd_make_float4(log10(x.x), log10(x.y), log10(x.z), log10(x.w));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_float8 _simd_log10_f8(simd_float8 x);
+static inline SIMD_CFUNC simd_float8 __tg_log10(simd_float8 x) {
+ return _simd_log10_f8(x);
+}
+#else
+static inline SIMD_CFUNC simd_float8 __tg_log10(simd_float8 x) {
+ return simd_make_float8(__tg_log10(x.lo), __tg_log10(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_float16 _simd_log10_f16(simd_float16 x);
+static inline SIMD_CFUNC simd_float16 __tg_log10(simd_float16 x) {
+ return _simd_log10_f16(x);
+}
+#else
+static inline SIMD_CFUNC simd_float16 __tg_log10(simd_float16 x) {
+ return simd_make_float16(__tg_log10(x.lo), __tg_log10(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_log10_d2(simd_double2 x);
+static inline SIMD_CFUNC simd_double2 __tg_log10(simd_double2 x) {
+ return _simd_log10_d2(x);
+}
+#else
+static inline SIMD_CFUNC simd_double2 __tg_log10(simd_double2 x) {
+ return simd_make_double2(log10(x.x), log10(x.y));
+}
+#endif
+
+static inline SIMD_CFUNC simd_double3 __tg_log10(simd_double3 x) {
+ return simd_make_double3(__tg_log10(simd_make_double4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_double4 _simd_log10_d4(simd_double4 x);
+static inline SIMD_CFUNC simd_double4 __tg_log10(simd_double4 x) {
+ return _simd_log10_d4(x);
+}
+#else
+static inline SIMD_CFUNC simd_double4 __tg_log10(simd_double4 x) {
+ return simd_make_double4(__tg_log10(x.lo), __tg_log10(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_double8 _simd_log10_d8(simd_double8 x);
+static inline SIMD_CFUNC simd_double8 __tg_log10(simd_double8 x) {
+ return _simd_log10_d8(x);
+}
+#else
+static inline SIMD_CFUNC simd_double8 __tg_log10(simd_double8 x) {
+ return simd_make_double8(__tg_log10(x.lo), __tg_log10(x.hi));
+}
+#endif
+
+#pragma mark - log1p implementation
+static inline SIMD_CFUNC simd_float2 __tg_log1p(simd_float2 x) {
+ return simd_make_float2(__tg_log1p(simd_make_float4(x)));
+}
+
+static inline SIMD_CFUNC simd_float3 __tg_log1p(simd_float3 x) {
+ return simd_make_float3(__tg_log1p(simd_make_float4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_log1p_f4(simd_float4 x);
+static inline SIMD_CFUNC simd_float4 __tg_log1p(simd_float4 x) {
+ return _simd_log1p_f4(x);
+}
+#else
+static inline SIMD_CFUNC simd_float4 __tg_log1p(simd_float4 x) {
+ return simd_make_float4(log1p(x.x), log1p(x.y), log1p(x.z), log1p(x.w));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_float8 _simd_log1p_f8(simd_float8 x);
+static inline SIMD_CFUNC simd_float8 __tg_log1p(simd_float8 x) {
+ return _simd_log1p_f8(x);
+}
+#else
+static inline SIMD_CFUNC simd_float8 __tg_log1p(simd_float8 x) {
+ return simd_make_float8(__tg_log1p(x.lo), __tg_log1p(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_float16 _simd_log1p_f16(simd_float16 x);
+static inline SIMD_CFUNC simd_float16 __tg_log1p(simd_float16 x) {
+ return _simd_log1p_f16(x);
+}
+#else
+static inline SIMD_CFUNC simd_float16 __tg_log1p(simd_float16 x) {
+ return simd_make_float16(__tg_log1p(x.lo), __tg_log1p(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_log1p_d2(simd_double2 x);
+static inline SIMD_CFUNC simd_double2 __tg_log1p(simd_double2 x) {
+ return _simd_log1p_d2(x);
+}
+#else
+static inline SIMD_CFUNC simd_double2 __tg_log1p(simd_double2 x) {
+ return simd_make_double2(log1p(x.x), log1p(x.y));
+}
+#endif
+
+static inline SIMD_CFUNC simd_double3 __tg_log1p(simd_double3 x) {
+ return simd_make_double3(__tg_log1p(simd_make_double4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_double4 _simd_log1p_d4(simd_double4 x);
+static inline SIMD_CFUNC simd_double4 __tg_log1p(simd_double4 x) {
+ return _simd_log1p_d4(x);
+}
+#else
+static inline SIMD_CFUNC simd_double4 __tg_log1p(simd_double4 x) {
+ return simd_make_double4(__tg_log1p(x.lo), __tg_log1p(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_double8 _simd_log1p_d8(simd_double8 x);
+static inline SIMD_CFUNC simd_double8 __tg_log1p(simd_double8 x) {
+ return _simd_log1p_d8(x);
+}
+#else
+static inline SIMD_CFUNC simd_double8 __tg_log1p(simd_double8 x) {
+ return simd_make_double8(__tg_log1p(x.lo), __tg_log1p(x.hi));
+}
+#endif
+
+#pragma mark - cbrt implementation
+static inline SIMD_CFUNC simd_float2 __tg_cbrt(simd_float2 x) {
+ return simd_make_float2(__tg_cbrt(simd_make_float4(x)));
+}
+
+static inline SIMD_CFUNC simd_float3 __tg_cbrt(simd_float3 x) {
+ return simd_make_float3(__tg_cbrt(simd_make_float4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_cbrt_f4(simd_float4 x);
+static inline SIMD_CFUNC simd_float4 __tg_cbrt(simd_float4 x) {
+ return _simd_cbrt_f4(x);
+}
+#else
+static inline SIMD_CFUNC simd_float4 __tg_cbrt(simd_float4 x) {
+ return simd_make_float4(cbrt(x.x), cbrt(x.y), cbrt(x.z), cbrt(x.w));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_float8 _simd_cbrt_f8(simd_float8 x);
+static inline SIMD_CFUNC simd_float8 __tg_cbrt(simd_float8 x) {
+ return _simd_cbrt_f8(x);
+}
+#else
+static inline SIMD_CFUNC simd_float8 __tg_cbrt(simd_float8 x) {
+ return simd_make_float8(__tg_cbrt(x.lo), __tg_cbrt(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_float16 _simd_cbrt_f16(simd_float16 x);
+static inline SIMD_CFUNC simd_float16 __tg_cbrt(simd_float16 x) {
+ return _simd_cbrt_f16(x);
+}
+#else
+static inline SIMD_CFUNC simd_float16 __tg_cbrt(simd_float16 x) {
+ return simd_make_float16(__tg_cbrt(x.lo), __tg_cbrt(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_cbrt_d2(simd_double2 x);
+static inline SIMD_CFUNC simd_double2 __tg_cbrt(simd_double2 x) {
+ return _simd_cbrt_d2(x);
+}
+#else
+static inline SIMD_CFUNC simd_double2 __tg_cbrt(simd_double2 x) {
+ return simd_make_double2(cbrt(x.x), cbrt(x.y));
+}
+#endif
+
+static inline SIMD_CFUNC simd_double3 __tg_cbrt(simd_double3 x) {
+ return simd_make_double3(__tg_cbrt(simd_make_double4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_double4 _simd_cbrt_d4(simd_double4 x);
+static inline SIMD_CFUNC simd_double4 __tg_cbrt(simd_double4 x) {
+ return _simd_cbrt_d4(x);
+}
+#else
+static inline SIMD_CFUNC simd_double4 __tg_cbrt(simd_double4 x) {
+ return simd_make_double4(__tg_cbrt(x.lo), __tg_cbrt(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_double8 _simd_cbrt_d8(simd_double8 x);
+static inline SIMD_CFUNC simd_double8 __tg_cbrt(simd_double8 x) {
+ return _simd_cbrt_d8(x);
+}
+#else
+static inline SIMD_CFUNC simd_double8 __tg_cbrt(simd_double8 x) {
+ return simd_make_double8(__tg_cbrt(x.lo), __tg_cbrt(x.hi));
+}
+#endif
+
+#pragma mark - erf implementation
+static inline SIMD_CFUNC simd_float2 __tg_erf(simd_float2 x) {
+ return simd_make_float2(__tg_erf(simd_make_float4(x)));
+}
+
+static inline SIMD_CFUNC simd_float3 __tg_erf(simd_float3 x) {
+ return simd_make_float3(__tg_erf(simd_make_float4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_erf_f4(simd_float4 x);
+static inline SIMD_CFUNC simd_float4 __tg_erf(simd_float4 x) {
+ return _simd_erf_f4(x);
+}
+#else
+static inline SIMD_CFUNC simd_float4 __tg_erf(simd_float4 x) {
+ return simd_make_float4(erf(x.x), erf(x.y), erf(x.z), erf(x.w));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_float8 _simd_erf_f8(simd_float8 x);
+static inline SIMD_CFUNC simd_float8 __tg_erf(simd_float8 x) {
+ return _simd_erf_f8(x);
+}
+#else
+static inline SIMD_CFUNC simd_float8 __tg_erf(simd_float8 x) {
+ return simd_make_float8(__tg_erf(x.lo), __tg_erf(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_float16 _simd_erf_f16(simd_float16 x);
+static inline SIMD_CFUNC simd_float16 __tg_erf(simd_float16 x) {
+ return _simd_erf_f16(x);
+}
+#else
+static inline SIMD_CFUNC simd_float16 __tg_erf(simd_float16 x) {
+ return simd_make_float16(__tg_erf(x.lo), __tg_erf(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_erf_d2(simd_double2 x);
+static inline SIMD_CFUNC simd_double2 __tg_erf(simd_double2 x) {
+ return _simd_erf_d2(x);
+}
+#else
+static inline SIMD_CFUNC simd_double2 __tg_erf(simd_double2 x) {
+ return simd_make_double2(erf(x.x), erf(x.y));
+}
+#endif
+
+static inline SIMD_CFUNC simd_double3 __tg_erf(simd_double3 x) {
+ return simd_make_double3(__tg_erf(simd_make_double4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_double4 _simd_erf_d4(simd_double4 x);
+static inline SIMD_CFUNC simd_double4 __tg_erf(simd_double4 x) {
+ return _simd_erf_d4(x);
+}
+#else
+static inline SIMD_CFUNC simd_double4 __tg_erf(simd_double4 x) {
+ return simd_make_double4(__tg_erf(x.lo), __tg_erf(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_double8 _simd_erf_d8(simd_double8 x);
+static inline SIMD_CFUNC simd_double8 __tg_erf(simd_double8 x) {
+ return _simd_erf_d8(x);
+}
+#else
+static inline SIMD_CFUNC simd_double8 __tg_erf(simd_double8 x) {
+ return simd_make_double8(__tg_erf(x.lo), __tg_erf(x.hi));
+}
+#endif
+
+#pragma mark - erfc implementation
+static inline SIMD_CFUNC simd_float2 __tg_erfc(simd_float2 x) {
+ return simd_make_float2(__tg_erfc(simd_make_float4(x)));
+}
+
+static inline SIMD_CFUNC simd_float3 __tg_erfc(simd_float3 x) {
+ return simd_make_float3(__tg_erfc(simd_make_float4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_erfc_f4(simd_float4 x);
+static inline SIMD_CFUNC simd_float4 __tg_erfc(simd_float4 x) {
+ return _simd_erfc_f4(x);
+}
+#else
+static inline SIMD_CFUNC simd_float4 __tg_erfc(simd_float4 x) {
+ return simd_make_float4(erfc(x.x), erfc(x.y), erfc(x.z), erfc(x.w));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_float8 _simd_erfc_f8(simd_float8 x);
+static inline SIMD_CFUNC simd_float8 __tg_erfc(simd_float8 x) {
+ return _simd_erfc_f8(x);
+}
+#else
+static inline SIMD_CFUNC simd_float8 __tg_erfc(simd_float8 x) {
+ return simd_make_float8(__tg_erfc(x.lo), __tg_erfc(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_float16 _simd_erfc_f16(simd_float16 x);
+static inline SIMD_CFUNC simd_float16 __tg_erfc(simd_float16 x) {
+ return _simd_erfc_f16(x);
+}
+#else
+static inline SIMD_CFUNC simd_float16 __tg_erfc(simd_float16 x) {
+ return simd_make_float16(__tg_erfc(x.lo), __tg_erfc(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_erfc_d2(simd_double2 x);
+static inline SIMD_CFUNC simd_double2 __tg_erfc(simd_double2 x) {
+ return _simd_erfc_d2(x);
+}
+#else
+static inline SIMD_CFUNC simd_double2 __tg_erfc(simd_double2 x) {
+ return simd_make_double2(erfc(x.x), erfc(x.y));
+}
+#endif
+
+static inline SIMD_CFUNC simd_double3 __tg_erfc(simd_double3 x) {
+ return simd_make_double3(__tg_erfc(simd_make_double4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_double4 _simd_erfc_d4(simd_double4 x);
+static inline SIMD_CFUNC simd_double4 __tg_erfc(simd_double4 x) {
+ return _simd_erfc_d4(x);
+}
+#else
+static inline SIMD_CFUNC simd_double4 __tg_erfc(simd_double4 x) {
+ return simd_make_double4(__tg_erfc(x.lo), __tg_erfc(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_double8 _simd_erfc_d8(simd_double8 x);
+static inline SIMD_CFUNC simd_double8 __tg_erfc(simd_double8 x) {
+ return _simd_erfc_d8(x);
+}
+#else
+static inline SIMD_CFUNC simd_double8 __tg_erfc(simd_double8 x) {
+ return simd_make_double8(__tg_erfc(x.lo), __tg_erfc(x.hi));
+}
+#endif
+
+#pragma mark - tgamma implementation
+static inline SIMD_CFUNC simd_float2 __tg_tgamma(simd_float2 x) {
+ return simd_make_float2(__tg_tgamma(simd_make_float4(x)));
+}
+
+static inline SIMD_CFUNC simd_float3 __tg_tgamma(simd_float3 x) {
+ return simd_make_float3(__tg_tgamma(simd_make_float4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_tgamma_f4(simd_float4 x);
+static inline SIMD_CFUNC simd_float4 __tg_tgamma(simd_float4 x) {
+ return _simd_tgamma_f4(x);
+}
+#else
+static inline SIMD_CFUNC simd_float4 __tg_tgamma(simd_float4 x) {
+ return simd_make_float4(tgamma(x.x), tgamma(x.y), tgamma(x.z), tgamma(x.w));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_float8 _simd_tgamma_f8(simd_float8 x);
+static inline SIMD_CFUNC simd_float8 __tg_tgamma(simd_float8 x) {
+ return _simd_tgamma_f8(x);
+}
+#else
+static inline SIMD_CFUNC simd_float8 __tg_tgamma(simd_float8 x) {
+ return simd_make_float8(__tg_tgamma(x.lo), __tg_tgamma(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_float16 _simd_tgamma_f16(simd_float16 x);
+static inline SIMD_CFUNC simd_float16 __tg_tgamma(simd_float16 x) {
+ return _simd_tgamma_f16(x);
+}
+#else
+static inline SIMD_CFUNC simd_float16 __tg_tgamma(simd_float16 x) {
+ return simd_make_float16(__tg_tgamma(x.lo), __tg_tgamma(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_tgamma_d2(simd_double2 x);
+static inline SIMD_CFUNC simd_double2 __tg_tgamma(simd_double2 x) {
+ return _simd_tgamma_d2(x);
+}
+#else
+static inline SIMD_CFUNC simd_double2 __tg_tgamma(simd_double2 x) {
+ return simd_make_double2(tgamma(x.x), tgamma(x.y));
+}
+#endif
+
+static inline SIMD_CFUNC simd_double3 __tg_tgamma(simd_double3 x) {
+ return simd_make_double3(__tg_tgamma(simd_make_double4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_double4 _simd_tgamma_d4(simd_double4 x);
+static inline SIMD_CFUNC simd_double4 __tg_tgamma(simd_double4 x) {
+ return _simd_tgamma_d4(x);
+}
+#else
+static inline SIMD_CFUNC simd_double4 __tg_tgamma(simd_double4 x) {
+ return simd_make_double4(__tg_tgamma(x.lo), __tg_tgamma(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_double8 _simd_tgamma_d8(simd_double8 x);
+static inline SIMD_CFUNC simd_double8 __tg_tgamma(simd_double8 x) {
+ return _simd_tgamma_d8(x);
+}
+#else
+static inline SIMD_CFUNC simd_double8 __tg_tgamma(simd_double8 x) {
+ return simd_make_double8(__tg_tgamma(x.lo), __tg_tgamma(x.hi));
+}
+#endif
+
+#pragma mark - round implementation
+static inline SIMD_CFUNC simd_float2 __tg_round(simd_float2 x) {
+ return simd_make_float2(__tg_round(simd_make_float4(x)));
+}
+
+static inline SIMD_CFUNC simd_float3 __tg_round(simd_float3 x) {
+ return simd_make_float3(__tg_round(simd_make_float4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_round_f4(simd_float4 x);
+static inline SIMD_CFUNC simd_float4 __tg_round(simd_float4 x) {
+#if defined __arm64__
+ return vrndaq_f32(x);
+#else
+ return _simd_round_f4(x);
+#endif
+}
+#else
+static inline SIMD_CFUNC simd_float4 __tg_round(simd_float4 x) {
+ return simd_make_float4(round(x.x), round(x.y), round(x.z), round(x.w));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_float8 _simd_round_f8(simd_float8 x);
+static inline SIMD_CFUNC simd_float8 __tg_round(simd_float8 x) {
+ return _simd_round_f8(x);
+}
+#else
+static inline SIMD_CFUNC simd_float8 __tg_round(simd_float8 x) {
+ return simd_make_float8(__tg_round(x.lo), __tg_round(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_float16 _simd_round_f16(simd_float16 x);
+static inline SIMD_CFUNC simd_float16 __tg_round(simd_float16 x) {
+ return _simd_round_f16(x);
+}
+#else
+static inline SIMD_CFUNC simd_float16 __tg_round(simd_float16 x) {
+ return simd_make_float16(__tg_round(x.lo), __tg_round(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_round_d2(simd_double2 x);
+static inline SIMD_CFUNC simd_double2 __tg_round(simd_double2 x) {
+#if defined __arm64__
+ return vrndaq_f64(x);
+#else
+ return _simd_round_d2(x);
+#endif
+}
+#else
+static inline SIMD_CFUNC simd_double2 __tg_round(simd_double2 x) {
+ return simd_make_double2(round(x.x), round(x.y));
+}
+#endif
+
+static inline SIMD_CFUNC simd_double3 __tg_round(simd_double3 x) {
+ return simd_make_double3(__tg_round(simd_make_double4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_double4 _simd_round_d4(simd_double4 x);
+static inline SIMD_CFUNC simd_double4 __tg_round(simd_double4 x) {
+ return _simd_round_d4(x);
+}
+#else
+static inline SIMD_CFUNC simd_double4 __tg_round(simd_double4 x) {
+ return simd_make_double4(__tg_round(x.lo), __tg_round(x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_double8 _simd_round_d8(simd_double8 x);
+static inline SIMD_CFUNC simd_double8 __tg_round(simd_double8 x) {
+ return _simd_round_d8(x);
+}
+#else
+static inline SIMD_CFUNC simd_double8 __tg_round(simd_double8 x) {
+ return simd_make_double8(__tg_round(x.lo), __tg_round(x.hi));
+}
+#endif
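round is the first function above with a per-architecture special case: on arm64 the float4 and double2 kernels lower to a single NEON instruction (vrndaq_f32 / vrndaq_f64, i.e. FRINTA), which rounds to nearest with ties away from zero — exactly the semantics of C round() — so no out-of-line call is needed. For example:

    simd_float4 r = __tg_round(simd_make_float4(0.5f, 1.5f, -0.5f, 2.4f));
    /* r = (1, 2, -1, 2): halfway cases round away from zero */
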
+
+#pragma mark - atan2 implementation
+static inline SIMD_CFUNC simd_float2 __tg_atan2(simd_float2 y, simd_float2 x) {
+ return simd_make_float2(__tg_atan2(simd_make_float4(y), simd_make_float4(x)));
+}
+
+static inline SIMD_CFUNC simd_float3 __tg_atan2(simd_float3 y, simd_float3 x) {
+ return simd_make_float3(__tg_atan2(simd_make_float4(y), simd_make_float4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_atan2_f4(simd_float4 y, simd_float4 x);
+static inline SIMD_CFUNC simd_float4 __tg_atan2(simd_float4 y, simd_float4 x) {
+ return _simd_atan2_f4(y, x);
+}
+#else
+static inline SIMD_CFUNC simd_float4 __tg_atan2(simd_float4 y, simd_float4 x) {
+ return simd_make_float4(atan2(y.x, x.x), atan2(y.y, x.y), atan2(y.z, x.z), atan2(y.w, x.w));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_float8 _simd_atan2_f8(simd_float8 y, simd_float8 x);
+static inline SIMD_CFUNC simd_float8 __tg_atan2(simd_float8 y, simd_float8 x) {
+ return _simd_atan2_f8(y, x);
+}
+#else
+static inline SIMD_CFUNC simd_float8 __tg_atan2(simd_float8 y, simd_float8 x) {
+ return simd_make_float8(__tg_atan2(y.lo, x.lo), __tg_atan2(y.hi, x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_float16 _simd_atan2_f16(simd_float16 y, simd_float16 x);
+static inline SIMD_CFUNC simd_float16 __tg_atan2(simd_float16 y, simd_float16 x) {
+ return _simd_atan2_f16(y, x);
+}
+#else
+static inline SIMD_CFUNC simd_float16 __tg_atan2(simd_float16 y, simd_float16 x) {
+ return simd_make_float16(__tg_atan2(y.lo, x.lo), __tg_atan2(y.hi, x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_atan2_d2(simd_double2 y, simd_double2 x);
+static inline SIMD_CFUNC simd_double2 __tg_atan2(simd_double2 y, simd_double2 x) {
+ return _simd_atan2_d2(y, x);
+}
+#else
+static inline SIMD_CFUNC simd_double2 __tg_atan2(simd_double2 y, simd_double2 x) {
+ return simd_make_double2(atan2(y.x, x.x), atan2(y.y, x.y));
+}
+#endif
+
+static inline SIMD_CFUNC simd_double3 __tg_atan2(simd_double3 y, simd_double3 x) {
+ return simd_make_double3(__tg_atan2(simd_make_double4(y), simd_make_double4(x)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_double4 _simd_atan2_d4(simd_double4 y, simd_double4 x);
+static inline SIMD_CFUNC simd_double4 __tg_atan2(simd_double4 y, simd_double4 x) {
+ return _simd_atan2_d4(y, x);
+}
+#else
+static inline SIMD_CFUNC simd_double4 __tg_atan2(simd_double4 y, simd_double4 x) {
+ return simd_make_double4(__tg_atan2(y.lo, x.lo), __tg_atan2(y.hi, x.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_double8 _simd_atan2_d8(simd_double8 y, simd_double8 x);
+static inline SIMD_CFUNC simd_double8 __tg_atan2(simd_double8 y, simd_double8 x) {
+ return _simd_atan2_d8(y, x);
+}
+#else
+static inline SIMD_CFUNC simd_double8 __tg_atan2(simd_double8 y, simd_double8 x) {
+ return simd_make_double8(__tg_atan2(y.lo, x.lo), __tg_atan2(y.hi, x.hi));
+}
+#endif
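As in libm, the argument order of atan2 is (y, x) — ordinate first — so the full signed angle of the point (x, y) is recovered in all four quadrants. For instance:

    simd_float4 a = __tg_atan2(simd_make_float4( 1,  1, -1,  0),
                               simd_make_float4( 1, -1, -1, -1));
    /* a ≈ (π/4, 3π/4, -3π/4, π) */
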
+
+#pragma mark - hypot implementation
+static inline SIMD_CFUNC simd_float2 __tg_hypot(simd_float2 x, simd_float2 y) {
+ return simd_make_float2(__tg_hypot(simd_make_float4(x), simd_make_float4(y)));
+}
+
+static inline SIMD_CFUNC simd_float3 __tg_hypot(simd_float3 x, simd_float3 y) {
+ return simd_make_float3(__tg_hypot(simd_make_float4(x), simd_make_float4(y)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_hypot_f4(simd_float4 x, simd_float4 y);
+static inline SIMD_CFUNC simd_float4 __tg_hypot(simd_float4 x, simd_float4 y) {
+ return _simd_hypot_f4(x, y);
+}
+#else
+static inline SIMD_CFUNC simd_float4 __tg_hypot(simd_float4 x, simd_float4 y) {
+ return simd_make_float4(hypot(x.x, y.x), hypot(x.y, y.y), hypot(x.z, y.z), hypot(x.w, y.w));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_float8 _simd_hypot_f8(simd_float8 x, simd_float8 y);
+static inline SIMD_CFUNC simd_float8 __tg_hypot(simd_float8 x, simd_float8 y) {
+ return _simd_hypot_f8(x, y);
+}
+#else
+static inline SIMD_CFUNC simd_float8 __tg_hypot(simd_float8 x, simd_float8 y) {
+ return simd_make_float8(__tg_hypot(x.lo, y.lo), __tg_hypot(x.hi, y.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_float16 _simd_hypot_f16(simd_float16 x, simd_float16 y);
+static inline SIMD_CFUNC simd_float16 __tg_hypot(simd_float16 x, simd_float16 y) {
+ return _simd_hypot_f16(x, y);
+}
+#else
+static inline SIMD_CFUNC simd_float16 __tg_hypot(simd_float16 x, simd_float16 y) {
+ return simd_make_float16(__tg_hypot(x.lo, y.lo), __tg_hypot(x.hi, y.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_hypot_d2(simd_double2 x, simd_double2 y);
+static inline SIMD_CFUNC simd_double2 __tg_hypot(simd_double2 x, simd_double2 y) {
+ return _simd_hypot_d2(x, y);
+}
+#else
+static inline SIMD_CFUNC simd_double2 __tg_hypot(simd_double2 x, simd_double2 y) {
+ return simd_make_double2(hypot(x.x, y.x), hypot(x.y, y.y));
+}
+#endif
+
+static inline SIMD_CFUNC simd_double3 __tg_hypot(simd_double3 x, simd_double3 y) {
+ return simd_make_double3(__tg_hypot(simd_make_double4(x), simd_make_double4(y)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_double4 _simd_hypot_d4(simd_double4 x, simd_double4 y);
+static inline SIMD_CFUNC simd_double4 __tg_hypot(simd_double4 x, simd_double4 y) {
+ return _simd_hypot_d4(x, y);
+}
+#else
+static inline SIMD_CFUNC simd_double4 __tg_hypot(simd_double4 x, simd_double4 y) {
+ return simd_make_double4(__tg_hypot(x.lo, y.lo), __tg_hypot(x.hi, y.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_double8 _simd_hypot_d8(simd_double8 x, simd_double8 y);
+static inline SIMD_CFUNC simd_double8 __tg_hypot(simd_double8 x, simd_double8 y) {
+ return _simd_hypot_d8(x, y);
+}
+#else
+static inline SIMD_CFUNC simd_double8 __tg_hypot(simd_double8 x, simd_double8 y) {
+ return simd_make_double8(__tg_hypot(x.lo, y.lo), __tg_hypot(x.hi, y.hi));
+}
+#endif
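hypot earns its library call because it computes sqrt(x*x + y*y) without the intermediate overflow or underflow that the naive two-multiplies-and-a-sqrt expression suffers:

    float x = 1e30f, y = 1e30f;
    float naive = sqrtf(x*x + y*y);   /* x*x overflows: result is +inf */
    float good  = hypotf(x, y);       /* ~1.414e30f, well within float range */
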
+
+#pragma mark - pow implementation
+static inline SIMD_CFUNC simd_float2 __tg_pow(simd_float2 x, simd_float2 y) {
+ return simd_make_float2(__tg_pow(simd_make_float4(x), simd_make_float4(y)));
+}
+
+static inline SIMD_CFUNC simd_float3 __tg_pow(simd_float3 x, simd_float3 y) {
+ return simd_make_float3(__tg_pow(simd_make_float4(x), simd_make_float4(y)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_pow_f4(simd_float4 x, simd_float4 y);
+static inline SIMD_CFUNC simd_float4 __tg_pow(simd_float4 x, simd_float4 y) {
+ return _simd_pow_f4(x, y);
+}
+#else
+static inline SIMD_CFUNC simd_float4 __tg_pow(simd_float4 x, simd_float4 y) {
+ return simd_make_float4(pow(x.x, y.x), pow(x.y, y.y), pow(x.z, y.z), pow(x.w, y.w));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_float8 _simd_pow_f8(simd_float8 x, simd_float8 y);
+static inline SIMD_CFUNC simd_float8 __tg_pow(simd_float8 x, simd_float8 y) {
+ return _simd_pow_f8(x, y);
+}
+#else
+static inline SIMD_CFUNC simd_float8 __tg_pow(simd_float8 x, simd_float8 y) {
+ return simd_make_float8(__tg_pow(x.lo, y.lo), __tg_pow(x.hi, y.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_float16 _simd_pow_f16(simd_float16 x, simd_float16 y);
+static inline SIMD_CFUNC simd_float16 __tg_pow(simd_float16 x, simd_float16 y) {
+ return _simd_pow_f16(x, y);
+}
+#else
+static inline SIMD_CFUNC simd_float16 __tg_pow(simd_float16 x, simd_float16 y) {
+ return simd_make_float16(__tg_pow(x.lo, y.lo), __tg_pow(x.hi, y.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_pow_d2(simd_double2 x, simd_double2 y);
+static inline SIMD_CFUNC simd_double2 __tg_pow(simd_double2 x, simd_double2 y) {
+ return _simd_pow_d2(x, y);
+}
+#else
+static inline SIMD_CFUNC simd_double2 __tg_pow(simd_double2 x, simd_double2 y) {
+ return simd_make_double2(pow(x.x, y.x), pow(x.y, y.y));
+}
+#endif
+
+static inline SIMD_CFUNC simd_double3 __tg_pow(simd_double3 x, simd_double3 y) {
+ return simd_make_double3(__tg_pow(simd_make_double4(x), simd_make_double4(y)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_double4 _simd_pow_d4(simd_double4 x, simd_double4 y);
+static inline SIMD_CFUNC simd_double4 __tg_pow(simd_double4 x, simd_double4 y) {
+ return _simd_pow_d4(x, y);
+}
+#else
+static inline SIMD_CFUNC simd_double4 __tg_pow(simd_double4 x, simd_double4 y) {
+ return simd_make_double4(__tg_pow(x.lo, y.lo), __tg_pow(x.hi, y.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_double8 _simd_pow_d8(simd_double8 x, simd_double8 y);
+static inline SIMD_CFUNC simd_double8 __tg_pow(simd_double8 x, simd_double8 y) {
+ return _simd_pow_d8(x, y);
+}
+#else
+static inline SIMD_CFUNC simd_double8 __tg_pow(simd_double8 x, simd_double8 y) {
+ return simd_make_double8(__tg_pow(x.lo, y.lo), __tg_pow(x.hi, y.hi));
+}
+#endif
+
+#pragma mark - fmod implementation
+static inline SIMD_CFUNC simd_float2 __tg_fmod(simd_float2 x, simd_float2 y) {
+ return simd_make_float2(__tg_fmod(simd_make_float4(x), simd_make_float4(y)));
+}
+
+static inline SIMD_CFUNC simd_float3 __tg_fmod(simd_float3 x, simd_float3 y) {
+ return simd_make_float3(__tg_fmod(simd_make_float4(x), simd_make_float4(y)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_fmod_f4(simd_float4 x, simd_float4 y);
+static inline SIMD_CFUNC simd_float4 __tg_fmod(simd_float4 x, simd_float4 y) {
+ return _simd_fmod_f4(x, y);
+}
+#else
+static inline SIMD_CFUNC simd_float4 __tg_fmod(simd_float4 x, simd_float4 y) {
+ return simd_make_float4(fmod(x.x, y.x), fmod(x.y, y.y), fmod(x.z, y.z), fmod(x.w, y.w));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_float8 _simd_fmod_f8(simd_float8 x, simd_float8 y);
+static inline SIMD_CFUNC simd_float8 __tg_fmod(simd_float8 x, simd_float8 y) {
+ return _simd_fmod_f8(x, y);
+}
+#else
+static inline SIMD_CFUNC simd_float8 __tg_fmod(simd_float8 x, simd_float8 y) {
+ return simd_make_float8(__tg_fmod(x.lo, y.lo), __tg_fmod(x.hi, y.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_float16 _simd_fmod_f16(simd_float16 x, simd_float16 y);
+static inline SIMD_CFUNC simd_float16 __tg_fmod(simd_float16 x, simd_float16 y) {
+ return _simd_fmod_f16(x, y);
+}
+#else
+static inline SIMD_CFUNC simd_float16 __tg_fmod(simd_float16 x, simd_float16 y) {
+ return simd_make_float16(__tg_fmod(x.lo, y.lo), __tg_fmod(x.hi, y.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_fmod_d2(simd_double2 x, simd_double2 y);
+static inline SIMD_CFUNC simd_double2 __tg_fmod(simd_double2 x, simd_double2 y) {
+ return _simd_fmod_d2(x, y);
+}
+#else
+static inline SIMD_CFUNC simd_double2 __tg_fmod(simd_double2 x, simd_double2 y) {
+ return simd_make_double2(fmod(x.x, y.x), fmod(x.y, y.y));
+}
+#endif
+
+static inline SIMD_CFUNC simd_double3 __tg_fmod(simd_double3 x, simd_double3 y) {
+ return simd_make_double3(__tg_fmod(simd_make_double4(x), simd_make_double4(y)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_double4 _simd_fmod_d4(simd_double4 x, simd_double4 y);
+static inline SIMD_CFUNC simd_double4 __tg_fmod(simd_double4 x, simd_double4 y) {
+ return _simd_fmod_d4(x, y);
+}
+#else
+static inline SIMD_CFUNC simd_double4 __tg_fmod(simd_double4 x, simd_double4 y) {
+ return simd_make_double4(__tg_fmod(x.lo, y.lo), __tg_fmod(x.hi, y.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_double8 _simd_fmod_d8(simd_double8 x, simd_double8 y);
+static inline SIMD_CFUNC simd_double8 __tg_fmod(simd_double8 x, simd_double8 y) {
+ return _simd_fmod_d8(x, y);
+}
+#else
+static inline SIMD_CFUNC simd_double8 __tg_fmod(simd_double8 x, simd_double8 y) {
+ return simd_make_double8(__tg_fmod(x.lo, y.lo), __tg_fmod(x.hi, y.hi));
+}
+#endif
+
+#pragma mark - remainder implementation
+static inline SIMD_CFUNC simd_float2 __tg_remainder(simd_float2 x, simd_float2 y) {
+ return simd_make_float2(__tg_remainder(simd_make_float4(x), simd_make_float4(y)));
+}
+
+static inline SIMD_CFUNC simd_float3 __tg_remainder(simd_float3 x, simd_float3 y) {
+ return simd_make_float3(__tg_remainder(simd_make_float4(x), simd_make_float4(y)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_remainder_f4(simd_float4 x, simd_float4 y);
+static inline SIMD_CFUNC simd_float4 __tg_remainder(simd_float4 x, simd_float4 y) {
+ return _simd_remainder_f4(x, y);
+}
+#else
+static inline SIMD_CFUNC simd_float4 __tg_remainder(simd_float4 x, simd_float4 y) {
+ return simd_make_float4(remainder(x.x, y.x), remainder(x.y, y.y), remainder(x.z, y.z), remainder(x.w, y.w));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_float8 _simd_remainder_f8(simd_float8 x, simd_float8 y);
+static inline SIMD_CFUNC simd_float8 __tg_remainder(simd_float8 x, simd_float8 y) {
+ return _simd_remainder_f8(x, y);
+}
+#else
+static inline SIMD_CFUNC simd_float8 __tg_remainder(simd_float8 x, simd_float8 y) {
+ return simd_make_float8(__tg_remainder(x.lo, y.lo), __tg_remainder(x.hi, y.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_float16 _simd_remainder_f16(simd_float16 x, simd_float16 y);
+static inline SIMD_CFUNC simd_float16 __tg_remainder(simd_float16 x, simd_float16 y) {
+ return _simd_remainder_f16(x, y);
+}
+#else
+static inline SIMD_CFUNC simd_float16 __tg_remainder(simd_float16 x, simd_float16 y) {
+ return simd_make_float16(__tg_remainder(x.lo, y.lo), __tg_remainder(x.hi, y.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_remainder_d2(simd_double2 x, simd_double2 y);
+static inline SIMD_CFUNC simd_double2 __tg_remainder(simd_double2 x, simd_double2 y) {
+ return _simd_remainder_d2(x, y);
+}
+#else
+static inline SIMD_CFUNC simd_double2 __tg_remainder(simd_double2 x, simd_double2 y) {
+ return simd_make_double2(remainder(x.x, y.x), remainder(x.y, y.y));
+}
+#endif
+
+static inline SIMD_CFUNC simd_double3 __tg_remainder(simd_double3 x, simd_double3 y) {
+ return simd_make_double3(__tg_remainder(simd_make_double4(x), simd_make_double4(y)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_double4 _simd_remainder_d4(simd_double4 x, simd_double4 y);
+static inline SIMD_CFUNC simd_double4 __tg_remainder(simd_double4 x, simd_double4 y) {
+ return _simd_remainder_d4(x, y);
+}
+#else
+static inline SIMD_CFUNC simd_double4 __tg_remainder(simd_double4 x, simd_double4 y) {
+ return simd_make_double4(__tg_remainder(x.lo, y.lo), __tg_remainder(x.hi, y.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_double8 _simd_remainder_d8(simd_double8 x, simd_double8 y);
+static inline SIMD_CFUNC simd_double8 __tg_remainder(simd_double8 x, simd_double8 y) {
+ return _simd_remainder_d8(x, y);
+}
+#else
+static inline SIMD_CFUNC simd_double8 __tg_remainder(simd_double8 x, simd_double8 y) {
+ return simd_make_double8(__tg_remainder(x.lo, y.lo), __tg_remainder(x.hi, y.hi));
+}
+#endif
+
+#pragma mark - nextafter implementation
+static inline SIMD_CFUNC simd_float2 __tg_nextafter(simd_float2 x, simd_float2 y) {
+ return simd_make_float2(__tg_nextafter(simd_make_float4(x), simd_make_float4(y)));
+}
+
+static inline SIMD_CFUNC simd_float3 __tg_nextafter(simd_float3 x, simd_float3 y) {
+ return simd_make_float3(__tg_nextafter(simd_make_float4(x), simd_make_float4(y)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_nextafter_f4(simd_float4 x, simd_float4 y);
+static inline SIMD_CFUNC simd_float4 __tg_nextafter(simd_float4 x, simd_float4 y) {
+ return _simd_nextafter_f4(x, y);
+}
+#else
+static inline SIMD_CFUNC simd_float4 __tg_nextafter(simd_float4 x, simd_float4 y) {
+ return simd_make_float4(nextafter(x.x, y.x), nextafter(x.y, y.y), nextafter(x.z, y.z), nextafter(x.w, y.w));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_float8 _simd_nextafter_f8(simd_float8 x, simd_float8 y);
+static inline SIMD_CFUNC simd_float8 __tg_nextafter(simd_float8 x, simd_float8 y) {
+ return _simd_nextafter_f8(x, y);
+}
+#else
+static inline SIMD_CFUNC simd_float8 __tg_nextafter(simd_float8 x, simd_float8 y) {
+ return simd_make_float8(__tg_nextafter(x.lo, y.lo), __tg_nextafter(x.hi, y.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_float16 _simd_nextafter_f16(simd_float16 x, simd_float16 y);
+static inline SIMD_CFUNC simd_float16 __tg_nextafter(simd_float16 x, simd_float16 y) {
+ return _simd_nextafter_f16(x, y);
+}
+#else
+static inline SIMD_CFUNC simd_float16 __tg_nextafter(simd_float16 x, simd_float16 y) {
+ return simd_make_float16(__tg_nextafter(x.lo, y.lo), __tg_nextafter(x.hi, y.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_nextafter_d2(simd_double2 x, simd_double2 y);
+static inline SIMD_CFUNC simd_double2 __tg_nextafter(simd_double2 x, simd_double2 y) {
+ return _simd_nextafter_d2(x, y);
+}
+#else
+static inline SIMD_CFUNC simd_double2 __tg_nextafter(simd_double2 x, simd_double2 y) {
+ return simd_make_double2(nextafter(x.x, y.x), nextafter(x.y, y.y));
+}
+#endif
+
+static inline SIMD_CFUNC simd_double3 __tg_nextafter(simd_double3 x, simd_double3 y) {
+ return simd_make_double3(__tg_nextafter(simd_make_double4(x), simd_make_double4(y)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX2__
+extern simd_double4 _simd_nextafter_d4(simd_double4 x, simd_double4 y);
+static inline SIMD_CFUNC simd_double4 __tg_nextafter(simd_double4 x, simd_double4 y) {
+ return _simd_nextafter_d4(x, y);
+}
+#else
+static inline SIMD_CFUNC simd_double4 __tg_nextafter(simd_double4 x, simd_double4 y) {
+ return simd_make_double4(__tg_nextafter(x.lo, y.lo), __tg_nextafter(x.hi, y.hi));
+}
+#endif
+
+#if SIMD_LIBRARY_VERSION >= 3 && defined __x86_64__ && defined __AVX512F__
+extern simd_double8 _simd_nextafter_d8(simd_double8 x, simd_double8 y);
+static inline SIMD_CFUNC simd_double8 __tg_nextafter(simd_double8 x, simd_double8 y) {
+ return _simd_nextafter_d8(x, y);
+}
+#else
+static inline SIMD_CFUNC simd_double8 __tg_nextafter(simd_double8 x, simd_double8 y) {
+ return simd_make_double8(__tg_nextafter(x.lo, y.lo), __tg_nextafter(x.hi, y.hi));
+}
+#endif
+
+static inline SIMD_CFUNC simd_float2 __tg_fdim(simd_float2 x, simd_float2 y) { return simd_bitselect(x-y, 0, x<y); }
+static inline SIMD_CFUNC simd_float3 __tg_fdim(simd_float3 x, simd_float3 y) { return simd_bitselect(x-y, 0, x<y); }
+static inline SIMD_CFUNC simd_float4 __tg_fdim(simd_float4 x, simd_float4 y) { return simd_bitselect(x-y, 0, x<y); }
+static inline SIMD_CFUNC simd_float8 __tg_fdim(simd_float8 x, simd_float8 y) { return simd_bitselect(x-y, 0, x<y); }
+static inline SIMD_CFUNC simd_float16 __tg_fdim(simd_float16 x, simd_float16 y) { return simd_bitselect(x-y, 0, x<y); }
+static inline SIMD_CFUNC simd_double2 __tg_fdim(simd_double2 x, simd_double2 y) { return simd_bitselect(x-y, 0, x<y); }
+static inline SIMD_CFUNC simd_double3 __tg_fdim(simd_double3 x, simd_double3 y) { return simd_bitselect(x-y, 0, x<y); }
+static inline SIMD_CFUNC simd_double4 __tg_fdim(simd_double4 x, simd_double4 y) { return simd_bitselect(x-y, 0, x<y); }
+static inline SIMD_CFUNC simd_double8 __tg_fdim(simd_double8 x, simd_double8 y) { return simd_bitselect(x-y, 0, x<y); }
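fdim (the positive difference) is the first function implemented inline at every width, with a single bitselect: the comparison x < y yields an all-ones lane mask where it holds, so simd_bitselect(x - y, 0, x < y) keeps x - y in lanes where x >= y and forces zero elsewhere. Per lane this is equivalent to the scalar:

    static inline float fdim_lane(float x, float y) {   /* illustrative only */
        return (x < y) ? 0.0f : x - y;
    }

NaN lanes fail the x < y test and so take the x - y path, propagating NaN just as libm's fdim does.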
+
+static inline SIMD_CFUNC simd_float2 __tg_fma(simd_float2 x, simd_float2 y, simd_float2 z) {
+#if defined __arm64__ || defined __ARM_VFPV4__
+ return vfma_f32(z, x, y);
+#else
+ return simd_make_float2(__tg_fma(simd_make_float4_undef(x), simd_make_float4_undef(y), simd_make_float4_undef(z)));
+#endif
+}
+
+static inline SIMD_CFUNC simd_float3 __tg_fma(simd_float3 x, simd_float3 y, simd_float3 z) {
+ return simd_make_float3(__tg_fma(simd_make_float4(x), simd_make_float4(y), simd_make_float4(z)));
+}
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_float4 _simd_fma_f4(simd_float4 x, simd_float4 y, simd_float4 z);
+#endif
+static inline SIMD_CFUNC simd_float4 __tg_fma(simd_float4 x, simd_float4 y, simd_float4 z) {
+#if defined __arm64__ || defined __ARM_VFPV4__
+ return vfmaq_f32(z, x, y);
+#elif (defined __i386__ || defined __x86_64__) && defined __FMA__
+ return _mm_fmadd_ps(x, y, z);
+#elif SIMD_LIBRARY_VERSION >= 3
+ return _simd_fma_f4(x, y, z);
+#else
+ return simd_make_float4(fma(x.x, y.x, z.x), fma(x.y, y.y, z.y), fma(x.z, y.z, z.z), fma(x.w, y.w, z.w));
+#endif
+}
+
+static inline SIMD_CFUNC simd_float8 __tg_fma(simd_float8 x, simd_float8 y, simd_float8 z) {
+#if (defined __i386__ || defined __x86_64__) && defined __FMA__
+ return _mm256_fmadd_ps(x, y, z);
+#else
+ return simd_make_float8(__tg_fma(x.lo, y.lo, z.lo), __tg_fma(x.hi, y.hi, z.hi));
+#endif
+}
+
+static inline SIMD_CFUNC simd_float16 __tg_fma(simd_float16 x, simd_float16 y, simd_float16 z) {
+#if defined __x86_64__ && defined __AVX512F__
+ return _mm512_fmadd_ps(x, y, z);
+#else
+ return simd_make_float16(__tg_fma(x.lo, y.lo, z.lo), __tg_fma(x.hi, y.hi, z.hi));
+#endif
+}
+
+#if SIMD_LIBRARY_VERSION >= 3
+extern simd_double2 _simd_fma_d2(simd_double2 x, simd_double2 y, simd_double2 z);
+#endif
+static inline SIMD_CFUNC simd_double2 __tg_fma(simd_double2 x, simd_double2 y, simd_double2 z) {
+#if defined __arm64__
+ return vfmaq_f64(z, x, y);
+#elif (defined __i386__ || defined __x86_64__) && defined __FMA__
+ return _mm_fmadd_pd(x, y, z);
+#elif SIMD_LIBRARY_VERSION >= 3
+ return _simd_fma_d2(x, y, z);
+#else
+ return simd_make_double2(fma(x.x, y.x, z.x), fma(x.y, y.y, z.y));
+#endif
+}
+
+static inline SIMD_CFUNC simd_double3 __tg_fma(simd_double3 x, simd_double3 y, simd_double3 z) {
+ return simd_make_double3(__tg_fma(simd_make_double4(x), simd_make_double4(y), simd_make_double4(z)));
+}
+
+static inline SIMD_CFUNC simd_double4 __tg_fma(simd_double4 x, simd_double4 y, simd_double4 z) {
+#if (defined __i386__ || defined __x86_64__) && defined __FMA__
+ return _mm256_fmadd_pd(x, y, z);
+#else
+ return simd_make_double4(__tg_fma(x.lo, y.lo, z.lo), __tg_fma(x.hi, y.hi, z.hi));
+#endif
+}
+
+static inline SIMD_CFUNC simd_double8 __tg_fma(simd_double8 x, simd_double8 y, simd_double8 z) {
+#if defined __x86_64__ && defined __AVX512F__
+ return _mm512_fmadd_pd(x, y, z);
+#else
+ return simd_make_double8(__tg_fma(x.lo, y.lo, z.lo), __tg_fma(x.hi, y.hi, z.hi));
+#endif
+}
+
+static inline SIMD_CFUNC float simd_muladd(float x, float y, float z) {
+#pragma STDC FP_CONTRACT ON
+ return x*y + z;
+}
+static inline SIMD_CFUNC simd_float2 simd_muladd(simd_float2 x, simd_float2 y, simd_float2 z) {
+#pragma STDC FP_CONTRACT ON
+ return x*y + z;
+}
+static inline SIMD_CFUNC simd_float3 simd_muladd(simd_float3 x, simd_float3 y, simd_float3 z) {
+#pragma STDC FP_CONTRACT ON
+ return x*y + z;
+}
+static inline SIMD_CFUNC simd_float4 simd_muladd(simd_float4 x, simd_float4 y, simd_float4 z) {
+#pragma STDC FP_CONTRACT ON
+ return x*y + z;
+}
+static inline SIMD_CFUNC simd_float8 simd_muladd(simd_float8 x, simd_float8 y, simd_float8 z) {
+#pragma STDC FP_CONTRACT ON
+ return x*y + z;
+}
+static inline SIMD_CFUNC simd_float16 simd_muladd(simd_float16 x, simd_float16 y, simd_float16 z) {
+#pragma STDC FP_CONTRACT ON
+ return x*y + z;
+}
+static inline SIMD_CFUNC double simd_muladd(double x, double y, double z) {
+#pragma STDC FP_CONTRACT ON
+ return x*y + z;
+}
+static inline SIMD_CFUNC simd_double2 simd_muladd(simd_double2 x, simd_double2 y, simd_double2 z) {
+#pragma STDC FP_CONTRACT ON
+ return x*y + z;
+}
+static inline SIMD_CFUNC simd_double3 simd_muladd(simd_double3 x, simd_double3 y, simd_double3 z) {
+#pragma STDC FP_CONTRACT ON
+ return x*y + z;
+}
+static inline SIMD_CFUNC simd_double4 simd_muladd(simd_double4 x, simd_double4 y, simd_double4 z) {
+#pragma STDC FP_CONTRACT ON
+ return x*y + z;
+}
+static inline SIMD_CFUNC simd_double8 simd_muladd(simd_double8 x, simd_double8 y, simd_double8 z) {
+#pragma STDC FP_CONTRACT ON
+ return x*y + z;
+}
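+
+/* A sketch contrasting the two entry points above (the wrapper name is
+ * illustrative): __tg_fma always performs a fused multiply-add with a
+ * single rounding, while simd_muladd merely re-enables FP_CONTRACT, so
+ * the compiler may emit either an fma or a separate multiply and add,
+ * whichever is faster for the target. */
+static inline simd_float4 _simd_muladd_demo(simd_float4 x, simd_float4 y, simd_float4 z) {
+  simd_float4 fused = __tg_fma(x, y, z);    /* guaranteed single rounding */
+  simd_float4 fast  = simd_muladd(x, y, z); /* contraction permitted */
+  return fused - fast;  /* zero in every lane where the compiler fused */
+}
+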
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+#endif /* SIMD_COMPILER_HAS_REQUIRED_FEATURES */
+#endif /* SIMD_MATH_HEADER */
\ No newline at end of file
diff --git a/lib/libc/include/aarch64-macos-gnu/simd/packed.h b/lib/libc/include/aarch64-macos-gnu/simd/packed.h
new file mode 100644
index 0000000000..ddbd861090
--- /dev/null
+++ b/lib/libc/include/aarch64-macos-gnu/simd/packed.h
@@ -0,0 +1,1031 @@
+/*! @header
+ * This header defines fixed size vector types with relaxed alignment. For
+ * each vector type defined by <simd/vector_types.h> that is not a 1- or 3-
+ * element vector, there is a corresponding type defined by this header that
+ * requires only the alignment matching that of the underlying scalar type.
+ *
+ * These types should be used to access buffers that may not be sufficiently
+ * aligned to allow them to be accessed using the "normal" simd vector types.
+ * As an example of this usage, suppose that you want to load a vector of
+ * four floats from an array of floats. The type simd_float4 has sixteen byte
+ * alignment, whereas an array of floats has only four byte alignment.
+ * Thus, naively casting a pointer into the array to (simd_float4 *) would
+ * invoke undefined behavior, and likely produce an alignment fault at
+ * runtime. Instead, use the corresponding packed type to load from the array:
+ *
+ * <pre>
+ * @textblock
+ * simd_float4 vector = *(simd_packed_float4 *)&array[i];
+ * // do something with vector ...
+ * @/textblock
+ * </pre>
+ *
+ * It's important to note that the packed_ types are only needed to work with
+ * memory; once the data is loaded, we simply operate on it as usual using
+ * the simd_float4 type, as illustrated above.
+ *
+ * @copyright 2014-2017 Apple, Inc. All rights reserved.
+ * @unsorted */
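+
+/* The same cast works in the store direction; a sketch mirroring the load
+ * example above, under the same assumptions about `array` and `i`:
+ *
+ *   *(simd_packed_float4 *)&array[i] = vector;
+ *
+ * The packed pointer relaxes only the alignment requirement; the element
+ * type and count must still match. */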
+
+#ifndef SIMD_PACKED_TYPES
+#define SIMD_PACKED_TYPES
+
+# include <simd/vector_types.h>
+# if SIMD_COMPILER_HAS_REQUIRED_FEATURES
+/*! @abstract A vector of two 8-bit signed (twos-complement) integers with
+ * relaxed alignment.
+ * @description In C++ and Metal, this type is also available as
+ * simd::packed::char2. The alignment of this type is that of the
+ * underlying scalar element type, so you can use it to load or store from
+ * an array of that type. */
+typedef __attribute__((__ext_vector_type__(2),__aligned__(1))) char simd_packed_char2;
+
+/*! @abstract A vector of four 8-bit signed (twos-complement) integers with
+ * relaxed alignment.
+ * @description In C++ and Metal, this type is also available as
+ * simd::packed::char4. The alignment of this type is that of the
+ * underlying scalar element type, so you can use it to load or store from
+ * an array of that type. */
+typedef __attribute__((__ext_vector_type__(4),__aligned__(1))) char simd_packed_char4;
+
+/*! @abstract A vector of eight 8-bit signed (twos-complement) integers with
+ * relaxed alignment.
+ * @description In C++ this type is also available as simd::packed::char8.
+ * This type is not available in Metal. The alignment of this type is only
+ * that of the underlying scalar element type, so you can use it to load or
+ * store from an array of that type. */
+typedef __attribute__((__ext_vector_type__(8),__aligned__(1))) char simd_packed_char8;
+
+/*! @abstract A vector of sixteen 8-bit signed (twos-complement) integers
+ * with relaxed alignment.
+ * @description In C++ this type is also available as simd::packed::char16.
+ * This type is not available in Metal. The alignment of this type is only
+ * that of the underlying scalar element type, so you can use it to load or
+ * store from an array of that type. */
+typedef __attribute__((__ext_vector_type__(16),__aligned__(1))) char simd_packed_char16;
+
+/*! @abstract A vector of thirty-two 8-bit signed (twos-complement) integers
+ * with relaxed alignment.
+ * @description In C++ this type is also available as simd::packed::char32.
+ * This type is not available in Metal. The alignment of this type is only
+ * that of the underlying scalar element type, so you can use it to load or
+ * store from an array of that type. */
+typedef __attribute__((__ext_vector_type__(32),__aligned__(1))) char simd_packed_char32;
+
+/*! @abstract A vector of sixty-four 8-bit signed (twos-complement) integers
+ * with relaxed alignment.
+ * @description In C++ this type is also available as simd::packed::char64.
+ * This type is not available in Metal. The alignment of this type is only
+ * that of the underlying scalar element type, so you can use it to load or
+ * store from an array of that type. */
+typedef __attribute__((__ext_vector_type__(64),__aligned__(1))) char simd_packed_char64;
+
+/*! @abstract A vector of two 8-bit unsigned integers with relaxed
+ * alignment.
+ * @description In C++ and Metal, this type is also available as
+ * simd::packed::uchar2. The alignment of this type is that of the
+ * underlying scalar element type, so you can use it to load or store from
+ * an array of that type. */
+typedef __attribute__((__ext_vector_type__(2),__aligned__(1))) unsigned char simd_packed_uchar2;
+
+/*! @abstract A vector of four 8-bit unsigned integers with relaxed
+ * alignment.
+ * @description In C++ and Metal, this type is also available as
+ * simd::packed::uchar4. The alignment of this type is that of the
+ * underlying scalar element type, so you can use it to load or store from
+ * an array of that type. */
+typedef __attribute__((__ext_vector_type__(4),__aligned__(1))) unsigned char simd_packed_uchar4;
+
+/*! @abstract A vector of eight 8-bit unsigned integers with relaxed
+ * alignment.
+ * @description In C++ this type is also available as simd::packed::uchar8.
+ * This type is not available in Metal. The alignment of this type is only
+ * that of the underlying scalar element type, so you can use it to load or
+ * store from an array of that type. */
+typedef __attribute__((__ext_vector_type__(8),__aligned__(1))) unsigned char simd_packed_uchar8;
+
+/*! @abstract A vector of sixteen 8-bit unsigned integers with relaxed
+ * alignment.
+ * @description In C++ this type is also available as
+ * simd::packed::uchar16. This type is not available in Metal. The
+ * alignment of this type is only that of the underlying scalar element
+ * type, so you can use it to load or store from an array of that type. */
+typedef __attribute__((__ext_vector_type__(16),__aligned__(1))) unsigned char simd_packed_uchar16;
+
+/*! @abstract A vector of thirty-two 8-bit unsigned integers with relaxed
+ * alignment.
+ * @description In C++ this type is also available as
+ * simd::packed::uchar32. This type is not available in Metal. The
+ * alignment of this type is only that of the underlying scalar element
+ * type, so you can use it to load or store from an array of that type. */
+typedef __attribute__((__ext_vector_type__(32),__aligned__(1))) unsigned char simd_packed_uchar32;
+
+/*! @abstract A vector of sixty-four 8-bit unsigned integers with relaxed
+ * alignment.
+ * @description In C++ this type is also available as
+ * simd::packed::uchar64. This type is not available in Metal. The
+ * alignment of this type is only that of the underlying scalar element
+ * type, so you can use it to load or store from an array of that type. */
+typedef __attribute__((__ext_vector_type__(64),__aligned__(1))) unsigned char simd_packed_uchar64;
+
+/*! @abstract A vector of two 16-bit signed (twos-complement) integers with
+ * relaxed alignment.
+ * @description In C++ and Metal, this type is also available as
+ * simd::packed::short2. The alignment of this type is that of the
+ * underlying scalar element type, so you can use it to load or store from
+ * an array of that type. */
+typedef __attribute__((__ext_vector_type__(2),__aligned__(2))) short simd_packed_short2;
+
+/*! @abstract A vector of four 16-bit signed (twos-complement) integers with
+ * relaxed alignment.
+ * @description In C++ and Metal, this type is also available as
+ * simd::packed::short4. The alignment of this type is that of the
+ * underlying scalar element type, so you can use it to load or store from
+ * an array of that type. */
+typedef __attribute__((__ext_vector_type__(4),__aligned__(2))) short simd_packed_short4;
+
+/*! @abstract A vector of eight 16-bit signed (twos-complement) integers
+ * with relaxed alignment.
+ * @description In C++ this type is also available as simd::packed::short8.
+ * This type is not available in Metal. The alignment of this type is only
+ * that of the underlying scalar element type, so you can use it to load or
+ * store from an array of that type. */
+typedef __attribute__((__ext_vector_type__(8),__aligned__(2))) short simd_packed_short8;
+
+/*! @abstract A vector of sixteen 16-bit signed (twos-complement) integers
+ * with relaxed alignment.
+ * @description In C++ this type is also available as
+ * simd::packed::short16. This type is not available in Metal. The
+ * alignment of this type is only that of the underlying scalar element
+ * type, so you can use it to load or store from an array of that type. */
+typedef __attribute__((__ext_vector_type__(16),__aligned__(2))) short simd_packed_short16;
+
+/*! @abstract A vector of thirty-two 16-bit signed (twos-complement)
+ * integers with relaxed alignment.
+ * @description In C++ this type is also available as
+ * simd::packed::short32. This type is not available in Metal. The
+ * alignment of this type is only that of the underlying scalar element
+ * type, so you can use it to load or store from an array of that type. */
+typedef __attribute__((__ext_vector_type__(32),__aligned__(2))) short simd_packed_short32;
+
+/*! @abstract A vector of two 16-bit unsigned integers with relaxed
+ * alignment.
+ * @description In C++ and Metal, this type is also available as
+ * simd::packed::ushort2. The alignment of this type is that of the
+ * underlying scalar element type, so you can use it to load or store from
+ * an array of that type. */
+typedef __attribute__((__ext_vector_type__(2),__aligned__(2))) unsigned short simd_packed_ushort2;
+
+/*! @abstract A vector of four 16-bit unsigned integers with relaxed
+ * alignment.
+ * @description In C++ and Metal, this type is also available as
+ * simd::packed::ushort4. The alignment of this type is that of the
+ * underlying scalar element type, so you can use it to load or store from
+ * an array of that type. */
+typedef __attribute__((__ext_vector_type__(4),__aligned__(2))) unsigned short simd_packed_ushort4;
+
+/*! @abstract A vector of eight 16-bit unsigned integers with relaxed
+ * alignment.
+ * @description In C++ this type is also available as
+ * simd::packed::ushort8. This type is not available in Metal. The
+ * alignment of this type is only that of the underlying scalar element
+ * type, so you can use it to load or store from an array of that type. */
+typedef __attribute__((__ext_vector_type__(8),__aligned__(2))) unsigned short simd_packed_ushort8;
+
+/*! @abstract A vector of sixteen 16-bit unsigned integers with relaxed
+ * alignment.
+ * @description In C++ this type is also available as
+ * simd::packed::ushort16. This type is not available in Metal. The
+ * alignment of this type is only that of the underlying scalar element
+ * type, so you can use it to load or store from an array of that type. */
+typedef __attribute__((__ext_vector_type__(16),__aligned__(2))) unsigned short simd_packed_ushort16;
+
+/*! @abstract A vector of thirty-two 16-bit unsigned integers with relaxed
+ * alignment.
+ * @description In C++ this type is also available as
+ * simd::packed::ushort32. This type is not available in Metal. The
+ * alignment of this type is only that of the underlying scalar element
+ * type, so you can use it to load or store from an array of that type. */
+typedef __attribute__((__ext_vector_type__(32),__aligned__(2))) unsigned short simd_packed_ushort32;
+
+/*! @abstract A vector of two 32-bit signed (twos-complement) integers with
+ * relaxed alignment.
+ * @description In C++ and Metal, this type is also available as
+ * simd::packed::int2. The alignment of this type is that of the underlying
+ * scalar element type, so you can use it to load or store from an array of
+ * that type. */
+typedef __attribute__((__ext_vector_type__(2),__aligned__(4))) int simd_packed_int2;
+
+/*! @abstract A vector of four 32-bit signed (twos-complement) integers with
+ * relaxed alignment.
+ * @description In C++ and Metal, this type is also available as
+ * simd::packed::int4. The alignment of this type is that of the underlying
+ * scalar element type, so you can use it to load or store from an array of
+ * that type. */
+typedef __attribute__((__ext_vector_type__(4),__aligned__(4))) int simd_packed_int4;
+
+/*! @abstract A vector of eight 32-bit signed (twos-complement) integers
+ * with relaxed alignment.
+ * @description In C++ this type is also available as simd::packed::int8.
+ * This type is not available in Metal. The alignment of this type is only
+ * that of the underlying scalar element type, so you can use it to load or
+ * store from an array of that type. */
+typedef __attribute__((__ext_vector_type__(8),__aligned__(4))) int simd_packed_int8;
+
+/*! @abstract A vector of sixteen 32-bit signed (twos-complement) integers
+ * with relaxed alignment.
+ * @description In C++ this type is also available as simd::packed::int16.
+ * This type is not available in Metal. The alignment of this type is only
+ * that of the underlying scalar element type, so you can use it to load or
+ * store from an array of that type. */
+typedef __attribute__((__ext_vector_type__(16),__aligned__(4))) int simd_packed_int16;
+
+/*! @abstract A vector of two 32-bit unsigned integers with relaxed
+ * alignment.
+ * @description In C++ and Metal, this type is also available as
+ * simd::packed::uint2. The alignment of this type is that of the
+ * underlying scalar element type, so you can use it to load or store from
+ * an array of that type. */
+typedef __attribute__((__ext_vector_type__(2),__aligned__(4))) unsigned int simd_packed_uint2;
+
+/*! @abstract A vector of four 32-bit unsigned integers with relaxed
+ * alignment.
+ * @description In C++ and Metal, this type is also available as
+ * simd::packed::uint4. The alignment of this type is that of the
+ * underlying scalar element type, so you can use it to load or store from
+ * an array of that type. */
+typedef __attribute__((__ext_vector_type__(4),__aligned__(4))) unsigned int simd_packed_uint4;
+
+/*! @abstract A vector of eight 32-bit unsigned integers with relaxed
+ * alignment.
+ * @description In C++ this type is also available as simd::packed::uint8.
+ * This type is not available in Metal. The alignment of this type is only
+ * that of the underlying scalar element type, so you can use it to load or
+ * store from an array of that type. */
+typedef __attribute__((__ext_vector_type__(8),__aligned__(4))) unsigned int simd_packed_uint8;
+
+/*! @abstract A vector of sixteen 32-bit unsigned integers with relaxed
+ * alignment.
+ * @description In C++ this type is also available as simd::packed::uint16.
+ * This type is not available in Metal. The alignment of this type is only
+ * that of the underlying scalar element type, so you can use it to load or
+ * store from an array of that type. */
+typedef __attribute__((__ext_vector_type__(16),__aligned__(4))) unsigned int simd_packed_uint16;
+
+/*! @abstract A vector of two 32-bit floating-point numbers with relaxed
+ * alignment.
+ * @description In C++ and Metal, this type is also available as
+ * simd::packed::float2. The alignment of this type is that of the
+ * underlying scalar element type, so you can use it to load or store from
+ * an array of that type. */
+typedef __attribute__((__ext_vector_type__(2),__aligned__(4))) float simd_packed_float2;
+
+/*! @abstract A vector of four 32-bit floating-point numbers with relaxed
+ * alignment.
+ * @description In C++ and Metal, this type is also available as
+ * simd::packed::float4. The alignment of this type is that of the
+ * underlying scalar element type, so you can use it to load or store from
+ * an array of that type. */
+typedef __attribute__((__ext_vector_type__(4),__aligned__(4))) float simd_packed_float4;
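+
+/* A brief sketch of what the relaxed alignment buys (the function name is
+ * illustrative; a C11 translation unit is assumed): simd_float4 requires
+ * 16-byte alignment, while simd_packed_float4 requires only that of
+ * float, so it can alias any in-bounds position of a float array. */
+_Static_assert(_Alignof(simd_packed_float4) == _Alignof(float),
+               "packed vectors align like their scalar elements");
+static inline simd_float4 _simd_load_packed_example(const float *p) {
+  return *(const simd_packed_float4 *)p;  /* needs only 4-byte alignment */
+}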
+
+/*! @abstract A vector of eight 32-bit floating-point numbers with relaxed
+ * alignment.
+ * @description In C++ this type is also available as simd::packed::float8.
+ * This type is not available in Metal. The alignment of this type is only
+ * that of the underlying scalar element type, so you can use it to load or
+ * store from an array of that type. */
+typedef __attribute__((__ext_vector_type__(8),__aligned__(4))) float simd_packed_float8;
+
+/*! @abstract A vector of sixteen 32-bit floating-point numbers with relaxed
+ * alignment.
+ * @description In C++ this type is also available as
+ * simd::packed::float16. This type is not available in Metal. The
+ * alignment of this type is only that of the underlying scalar element
+ * type, so you can use it to load or store from an array of that type. */
+typedef __attribute__((__ext_vector_type__(16),__aligned__(4))) float simd_packed_float16;
+
+/*! @abstract A vector of two 64-bit signed (twos-complement) integers with
+ * relaxed alignment.
+ * @description In C++ and Metal, this type is also available as
+ * simd::packed::long2. The alignment of this type is that of the
+ * underlying scalar element type, so you can use it to load or store from
+ * an array of that type. */
+#if defined __LP64__
+typedef __attribute__((__ext_vector_type__(2),__aligned__(8))) simd_long1 simd_packed_long2;
+#else
+typedef __attribute__((__ext_vector_type__(2),__aligned__(4))) simd_long1 simd_packed_long2;
+#endif
+
+/*! @abstract A vector of four 64-bit signed (twos-complement) integers with
+ * relaxed alignment.
+ * @description In C++ and Metal, this type is also available as
+ * simd::packed::long4. The alignment of this type is that of the
+ * underlying scalar element type, so you can use it to load or store from
+ * an array of that type. */
+#if defined __LP64__
+typedef __attribute__((__ext_vector_type__(4),__aligned__(8))) simd_long1 simd_packed_long4;
+#else
+typedef __attribute__((__ext_vector_type__(4),__aligned__(4))) simd_long1 simd_packed_long4;
+#endif
+
+/*! @abstract A vector of eight 64-bit signed (twos-complement) integers
+ * with relaxed alignment.
+ * @description In C++ this type is also available as simd::packed::long8.
+ * This type is not available in Metal. The alignment of this type is only
+ * that of the underlying scalar element type, so you can use it to load or
+ * store from an array of that type. */
+#if defined __LP64__
+typedef __attribute__((__ext_vector_type__(8),__aligned__(8))) simd_long1 simd_packed_long8;
+#else
+typedef __attribute__((__ext_vector_type__(8),__aligned__(4))) simd_long1 simd_packed_long8;
+#endif
+
+/*! @abstract A vector of two 64-bit unsigned integers with relaxed
+ * alignment.
+ * @description In C++ and Metal, this type is also available as
+ * simd::packed::ulong2. The alignment of this type is that of the
+ * underlying scalar element type, so you can use it to load or store from
+ * an array of that type. */
+#if defined __LP64__
+typedef __attribute__((__ext_vector_type__(2),__aligned__(8))) simd_ulong1 simd_packed_ulong2;
+#else
+typedef __attribute__((__ext_vector_type__(2),__aligned__(4))) simd_ulong1 simd_packed_ulong2;
+#endif
+
+/*! @abstract A vector of four 64-bit unsigned integers with relaxed
+ * alignment.
+ * @description In C++ and Metal, this type is also available as
+ * simd::packed::ulong4. The alignment of this type is that of the
+ * underlying scalar element type, so you can use it to load or store from
+ * an array of that type. */
+#if defined __LP64__
+typedef __attribute__((__ext_vector_type__(4),__aligned__(8))) simd_ulong1 simd_packed_ulong4;
+#else
+typedef __attribute__((__ext_vector_type__(4),__aligned__(4))) simd_ulong1 simd_packed_ulong4;
+#endif
+
+/*! @abstract A vector of eight 64-bit unsigned integers with relaxed
+ * alignment.
+ * @description In C++ this type is also available as simd::packed::ulong8.
+ * This type is not available in Metal. The alignment of this type is only
+ * that of the underlying scalar element type, so you can use it to load or
+ * store from an array of that type. */
+#if defined __LP64__
+typedef __attribute__((__ext_vector_type__(8),__aligned__(8))) simd_ulong1 simd_packed_ulong8;
+#else
+typedef __attribute__((__ext_vector_type__(8),__aligned__(4))) simd_ulong1 simd_packed_ulong8;
+#endif
+
+/*! @abstract A vector of two 64-bit floating-point numbers with relaxed
+ * alignment.
+ * @description In C++ and Metal, this type is also available as
+ * simd::packed::double2. The alignment of this type is that of the
+ * underlying scalar element type, so you can use it to load or store from
+ * an array of that type. */
+#if defined __LP64__
+typedef __attribute__((__ext_vector_type__(2),__aligned__(8))) double simd_packed_double2;
+#else
+typedef __attribute__((__ext_vector_type__(2),__aligned__(4))) double simd_packed_double2;
+#endif
+
+/*! @abstract A vector of four 64-bit floating-point numbers with relaxed
+ * alignment.
+ * @description In C++ and Metal, this type is also available as
+ * simd::packed::double4. The alignment of this type is that of the
+ * underlying scalar element type, so you can use it to load or store from
+ * an array of that type. */
+#if defined __LP64__
+typedef __attribute__((__ext_vector_type__(4),__aligned__(8))) double simd_packed_double4;
+#else
+typedef __attribute__((__ext_vector_type__(4),__aligned__(4))) double simd_packed_double4;
+#endif
+
+/*! @abstract A vector of eight 64-bit floating-point numbers with relaxed
+ * alignment.
+ * @description In C++ this type is also available as
+ * simd::packed::double8. This type is not available in Metal. The
+ * alignment of this type is only that of the underlying scalar element
+ * type, so you can use it to load or store from an array of that type. */
+#if defined __LP64__
+typedef __attribute__((__ext_vector_type__(8),__aligned__(8))) double simd_packed_double8;
+#else
+typedef __attribute__((__ext_vector_type__(8),__aligned__(4))) double simd_packed_double8;
+#endif
+
+/* MARK: C++ vector types */
+#if defined __cplusplus
+namespace simd {
+ namespace packed {
+ /*! @abstract A vector of two 8-bit signed (twos-complement) integers
+ * with relaxed alignment.
+ * @description In C or Objective-C, this type is available as
+ * simd_packed_char2. The alignment of this type is only that of the
+ * underlying scalar element type, so you can use it to load or store
+ * from an array of that type. */
+typedef ::simd_packed_char2 char2;
+
+ /*! @abstract A vector of four 8-bit signed (twos-complement) integers
+ * with relaxed alignment.
+ * @description In C or Objective-C, this type is available as
+ * simd_packed_char4. The alignment of this type is only that of the
+ * underlying scalar element type, so you can use it to load or store
+ * from an array of that type. */
+typedef ::simd_packed_char4 char4;
+
+ /*! @abstract A vector of eight 8-bit signed (twos-complement) integers
+ * with relaxed alignment.
+ * @description This type is not available in Metal. In C or
+ * Objective-C, this type is available as simd_packed_char8. The
+ * alignment of this type is only that of the underlying scalar element
+ * type, so you can use it to load or store from an array of that type. */
+typedef ::simd_packed_char8 char8;
+
+ /*! @abstract A vector of sixteen 8-bit signed (twos-complement)
+ * integers with relaxed alignment.
+ * @description This type is not available in Metal. In C or
+ * Objective-C, this type is available as simd_packed_char16. The
+ * alignment of this type is only that of the underlying scalar element
+ * type, so you can use it to load or store from an array of that type. */
+typedef ::simd_packed_char16 char16;
+
+ /*! @abstract A vector of thirty-two 8-bit signed (twos-complement)
+ * integers with relaxed alignment.
+ * @description This type is not available in Metal. In C or
+ * Objective-C, this type is available as simd_packed_char32. The
+ * alignment of this type is only that of the underlying scalar element
+ * type, so you can use it to load or store from an array of that type. */
+typedef ::simd_packed_char32 char32;
+
+ /*! @abstract A vector of sixty-four 8-bit signed (twos-complement)
+ * integers with relaxed alignment.
+ * @description This type is not available in Metal. In C or
+ * Objective-C, this type is available as simd_packed_char64. The
+ * alignment of this type is only that of the underlying scalar element
+ * type, so you can use it to load or store from an array of that type. */
+typedef ::simd_packed_char64 char64;
+
+ /*! @abstract A vector of two 8-bit unsigned integers with relaxed
+ * alignment.
+ * @description In C or Objective-C, this type is available as
+ * simd_packed_uchar2. The alignment of this type is only that of the
+ * underlying scalar element type, so you can use it to load or store
+ * from an array of that type. */
+typedef ::simd_packed_uchar2 uchar2;
+
+ /*! @abstract A vector of four 8-bit unsigned integers with relaxed
+ * alignment.
+ * @description In C or Objective-C, this type is available as
+ * simd_packed_uchar4. The alignment of this type is only that of the
+ * underlying scalar element type, so you can use it to load or store
+ * from an array of that type. */
+typedef ::simd_packed_uchar4 uchar4;
+
+ /*! @abstract A vector of eight 8-bit unsigned integers with relaxed
+ * alignment.
+ * @description This type is not available in Metal. In C or
+ * Objective-C, this type is available as simd_packed_uchar8. The
+ * alignment of this type is only that of the underlying scalar element
+ * type, so you can use it to load or store from an array of that type. */
+typedef ::simd_packed_uchar8 uchar8;
+
+ /*! @abstract A vector of sixteen 8-bit unsigned integers with relaxed
+ * alignment.
+ * @description This type is not available in Metal. In C or
+ * Objective-C, this type is available as simd_packed_uchar16. The
+ * alignment of this type is only that of the underlying scalar element
+ * type, so you can use it to load or store from an array of that type. */
+typedef ::simd_packed_uchar16 uchar16;
+
+ /*! @abstract A vector of thirty-two 8-bit unsigned integers with
+ * relaxed alignment.
+ * @description This type is not available in Metal. In C or
+ * Objective-C, this type is available as simd_packed_uchar32. The
+ * alignment of this type is only that of the underlying scalar element
+ * type, so you can use it to load or store from an array of that type. */
+typedef ::simd_packed_uchar32 uchar32;
+
+ /*! @abstract A vector of sixty-four 8-bit unsigned integers with
+ * relaxed alignment.
+ * @description This type is not available in Metal. In C or
+ * Objective-C, this type is available as simd_packed_uchar64. The
+ * alignment of this type is only that of the underlying scalar element
+ * type, so you can use it to load or store from an array of that type. */
+typedef ::simd_packed_uchar64 uchar64;
+
+ /*! @abstract A vector of two 16-bit signed (twos-complement) integers
+ * with relaxed alignment.
+ * @description In C or Objective-C, this type is available as
+ * simd_packed_short2. The alignment of this type is only that of the
+ * underlying scalar element type, so you can use it to load or store
+ * from an array of that type. */
+typedef ::simd_packed_short2 short2;
+
+ /*! @abstract A vector of four 16-bit signed (twos-complement) integers
+ * with relaxed alignment.
+ * @description In C or Objective-C, this type is available as
+ * simd_packed_short4. The alignment of this type is only that of the
+ * underlying scalar element type, so you can use it to load or store
+ * from an array of that type. */
+typedef ::simd_packed_short4 short4;
+
+ /*! @abstract A vector of eight 16-bit signed (twos-complement) integers
+ * with relaxed alignment.
+ * @description This type is not available in Metal. In C or
+ * Objective-C, this type is available as simd_packed_short8. The
+ * alignment of this type is only that of the underlying scalar element
+ * type, so you can use it to load or store from an array of that type. */
+typedef ::simd_packed_short8 short8;
+
+ /*! @abstract A vector of sixteen 16-bit signed (twos-complement)
+ * integers with relaxed alignment.
+ * @description This type is not available in Metal. In C or
+ * Objective-C, this type is available as simd_packed_short16. The
+ * alignment of this type is only that of the underlying scalar element
+ * type, so you can use it to load or store from an array of that type. */
+typedef ::simd_packed_short16 short16;
+
+ /*! @abstract A vector of thirty-two 16-bit signed (twos-complement)
+ * integers with relaxed alignment.
+ * @description This type is not available in Metal. In C or
+ * Objective-C, this type is available as simd_packed_short32. The
+ * alignment of this type is only that of the underlying scalar element
+ * type, so you can use it to load or store from an array of that type. */
+typedef ::simd_packed_short32 short32;
+
+ /*! @abstract A vector of two 16-bit unsigned integers with relaxed
+ * alignment.
+ * @description In C or Objective-C, this type is available as
+ * simd_packed_ushort2. The alignment of this type is only that of the
+ * underlying scalar element type, so you can use it to load or store
+ * from an array of that type. */
+typedef ::simd_packed_ushort2 ushort2;
+
+ /*! @abstract A vector of four 16-bit unsigned integers with relaxed
+ * alignment.
+ * @description In C or Objective-C, this type is available as
+ * simd_packed_ushort4. The alignment of this type is only that of the
+ * underlying scalar element type, so you can use it to load or store
+ * from an array of that type. */
+typedef ::simd_packed_ushort4 ushort4;
+
+ /*! @abstract A vector of eight 16-bit unsigned integers with relaxed
+ * alignment.
+ * @description This type is not available in Metal. In C or
+ * Objective-C, this type is available as simd_packed_ushort8. The
+ * alignment of this type is only that of the underlying scalar element
+ * type, so you can use it to load or store from an array of that type. */
+typedef ::simd_packed_ushort8 ushort8;
+
+ /*! @abstract A vector of sixteen 16-bit unsigned integers with relaxed
+ * alignment.
+ * @description This type is not available in Metal. In C or
+ * Objective-C, this type is available as simd_packed_ushort16. The
+ * alignment of this type is only that of the underlying scalar element
+ * type, so you can use it to load or store from an array of that type. */
+typedef ::simd_packed_ushort16 ushort16;
+
+ /*! @abstract A vector of thirty-two 16-bit unsigned integers with
+ * relaxed alignment.
+ * @description This type is not available in Metal. In C or
+ * Objective-C, this type is available as simd_packed_ushort32. The
+ * alignment of this type is only that of the underlying scalar element
+ * type, so you can use it to load or store from an array of that type. */
+typedef ::simd_packed_ushort32 ushort32;
+
+ /*! @abstract A vector of two 32-bit signed (twos-complement) integers
+ * with relaxed alignment.
+ * @description In C or Objective-C, this type is available as
+ * simd_packed_int2. The alignment of this type is only that of the
+ * underlying scalar element type, so you can use it to load or store
+ * from an array of that type. */
+typedef ::simd_packed_int2 int2;
+
+ /*! @abstract A vector of four 32-bit signed (twos-complement) integers
+ * with relaxed alignment.
+ * @description In C or Objective-C, this type is available as
+ * simd_packed_int4. The alignment of this type is only that of the
+ * underlying scalar element type, so you can use it to load or store
+ * from an array of that type. */
+typedef ::simd_packed_int4 int4;
+
+ /*! @abstract A vector of eight 32-bit signed (twos-complement) integers
+ * with relaxed alignment.
+ * @description This type is not available in Metal. In C or
+ * Objective-C, this type is available as simd_packed_int8. The
+ * alignment of this type is only that of the underlying scalar element
+ * type, so you can use it to load or store from an array of that type. */
+typedef ::simd_packed_int8 int8;
+
+ /*! @abstract A vector of sixteen 32-bit signed (twos-complement)
+ * integers with relaxed alignment.
+ * @description This type is not available in Metal. In C or
+ * Objective-C, this type is available as simd_packed_int16. The
+ * alignment of this type is only that of the underlying scalar element
+ * type, so you can use it to load or store from an array of that type. */
+typedef ::simd_packed_int16 int16;
+
+ /*! @abstract A vector of two 32-bit unsigned integers with relaxed
+ * alignment.
+ * @description In C or Objective-C, this type is available as
+ * simd_packed_uint2. The alignment of this type is only that of the
+ * underlying scalar element type, so you can use it to load or store
+ * from an array of that type. */
+typedef ::simd_packed_uint2 uint2;
+
+ /*! @abstract A vector of four 32-bit unsigned integers with relaxed
+ * alignment.
+ * @description In C or Objective-C, this type is available as
+ * simd_packed_uint4. The alignment of this type is only that of the
+ * underlying scalar element type, so you can use it to load or store
+ * from an array of that type. */
+typedef ::simd_packed_uint4 uint4;
+
+ /*! @abstract A vector of eight 32-bit unsigned integers with relaxed
+ * alignment.
+ * @description This type is not available in Metal. In C or
+ * Objective-C, this type is available as simd_packed_uint8. The
+ * alignment of this type is only that of the underlying scalar element
+ * type, so you can use it to load or store from an array of that type. */
+typedef ::simd_packed_uint8 uint8;
+
+ /*! @abstract A vector of sixteen 32-bit unsigned integers with relaxed
+ * alignment.
+ * @description This type is not available in Metal. In C or
+ * Objective-C, this type is available as simd_packed_uint16. The
+ * alignment of this type is only that of the underlying scalar element
+ * type, so you can use it to load or store from an array of that type. */
+typedef ::simd_packed_uint16 uint16;
+
+ /*! @abstract A vector of two 32-bit floating-point numbers with relaxed
+ * alignment.
+ * @description In C or Objective-C, this type is available as
+ * simd_packed_float2. The alignment of this type is only that of the
+ * underlying scalar element type, so you can use it to load or store
+ * from an array of that type. */
+typedef ::simd_packed_float2 float2;
+
+ /*! @abstract A vector of four 32-bit floating-point numbers with
+ * relaxed alignment.
+ * @description In C or Objective-C, this type is available as
+ * simd_packed_float4. The alignment of this type is only that of the
+ * underlying scalar element type, so you can use it to load or store
+ * from an array of that type. */
+typedef ::simd_packed_float4 float4;
+
+ /*! @abstract A vector of eight 32-bit floating-point numbers with
+ * relaxed alignment.
+ * @description This type is not available in Metal. In C or
+ * Objective-C, this type is available as simd_packed_float8. The
+ * alignment of this type is only that of the underlying scalar element
+ * type, so you can use it to load or store from an array of that type. */
+typedef ::simd_packed_float8 float8;
+
+ /*! @abstract A vector of sixteen 32-bit floating-point numbers with
+ * relaxed alignment.
+ * @description This type is not available in Metal. In C or
+ * Objective-C, this type is available as simd_packed_float16. The
+ * alignment of this type is only that of the underlying scalar element
+ * type, so you can use it to load or store from an array of that type. */
+typedef ::simd_packed_float16 float16;
+
+ /*! @abstract A vector of two 64-bit signed (twos-complement) integers
+ * with relaxed alignment.
+ * @description In C or Objective-C, this type is available as
+ * simd_packed_long2. The alignment of this type is only that of the
+ * underlying scalar element type, so you can use it to load or store
+ * from an array of that type. */
+typedef ::simd_packed_long2 long2;
+
+ /*! @abstract A vector of four 64-bit signed (twos-complement) integers
+ * with relaxed alignment.
+ * @description In C or Objective-C, this type is available as
+ * simd_packed_long4. The alignment of this type is only that of the
+ * underlying scalar element type, so you can use it to load or store
+ * from an array of that type. */
+typedef ::simd_packed_long4 long4;
+
+ /*! @abstract A vector of eight 64-bit signed (twos-complement) integers
+ * with relaxed alignment.
+ * @description This type is not available in Metal. In C or
+ * Objective-C, this type is available as simd_packed_long8. The
+ * alignment of this type is only that of the underlying scalar element
+ * type, so you can use it to load or store from an array of that type. */
+typedef ::simd_packed_long8 long8;
+
+ /*! @abstract A vector of two 64-bit unsigned integers with relaxed
+ * alignment.
+ * @description In C or Objective-C, this type is available as
+ * simd_packed_ulong2. The alignment of this type is only that of the
+ * underlying scalar element type, so you can use it to load or store
+ * from an array of that type. */
+typedef ::simd_packed_ulong2 ulong2;
+
+ /*! @abstract A vector of four 64-bit unsigned integers with relaxed
+ * alignment.
+ * @description In C or Objective-C, this type is available as
+ * simd_packed_ulong4. The alignment of this type is only that of the
+ * underlying scalar element type, so you can use it to load or store
+ * from an array of that type. */
+typedef ::simd_packed_ulong4 ulong4;
+
+ /*! @abstract A vector of eight 64-bit unsigned integers with relaxed
+ * alignment.
+ * @description This type is not available in Metal. In C or
+ * Objective-C, this type is available as simd_packed_ulong8. The
+ * alignment of this type is only that of the underlying scalar element
+ * type, so you can use it to load or store from an array of that type. */
+typedef ::simd_packed_ulong8 ulong8;
+
+ /*! @abstract A vector of two 64-bit floating-point numbers with relaxed
+ * alignment.
+ * @description In C or Objective-C, this type is available as
+ * simd_packed_double2. The alignment of this type is only that of the
+ * underlying scalar element type, so you can use it to load or store
+ * from an array of that type. */
+typedef ::simd_packed_double2 double2;
+
+ /*! @abstract A vector of four 64-bit floating-point numbers with
+ * relaxed alignment.
+ * @description In C or Objective-C, this type is available as
+ * simd_packed_double4. The alignment of this type is only that of the
+ * underlying scalar element type, so you can use it to load or store
+ * from an array of that type. */
+typedef ::simd_packed_double4 double4;
+
+ /*! @abstract A vector of eight 64-bit floating-point numbers with
+ * relaxed alignment.
+ * @description This type is not available in Metal. In C or
+ * Objective-C, this type is available as simd_packed_double8. The
+ * alignment of this type is only that of the underlying scalar element
+ * type, so you can use it to load or store from an array of that type. */
+typedef ::simd_packed_double8 double8;
+
+ } /* namespace simd::packed:: */
+} /* namespace simd:: */
+#endif /* __cplusplus */
+
+/* MARK: Deprecated vector types */
+/*! @group Deprecated vector types
+ * @discussion These are the original types used by earlier versions of the
+ * simd library; they are provided here for compatibility with existing source
+ * files. Use the new ("simd_"-prefixed) types for future development. */
+/*! @abstract A vector of two 8-bit signed (twos-complement) integers with
+ * relaxed alignment.
+ * @description This type is deprecated; you should use simd_packed_char2
+ * or simd::packed::char2 instead. */
+typedef simd_packed_char2 packed_char2;
+
+/*! @abstract A vector of four 8-bit signed (twos-complement) integers with
+ * relaxed alignment.
+ * @description This type is deprecated; you should use simd_packed_char4
+ * or simd::packed::char4 instead. */
+typedef simd_packed_char4 packed_char4;
+
+/*! @abstract A vector of eight 8-bit signed (twos-complement) integers with
+ * relaxed alignment.
+ * @description This type is deprecated; you should use simd_packed_char8
+ * or simd::packed::char8 instead. */
+typedef simd_packed_char8 packed_char8;
+
+/*! @abstract A vector of sixteen 8-bit signed (twos-complement) integers
+ * with relaxed alignment.
+ * @description This type is deprecated; you should use simd_packed_char16
+ * or simd::packed::char16 instead. */
+typedef simd_packed_char16 packed_char16;
+
+/*! @abstract A vector of thirty-two 8-bit signed (twos-complement) integers
+ * with relaxed alignment.
+ * @description This type is deprecated; you should use simd_packed_char32
+ * or simd::packed::char32 instead. */
+typedef simd_packed_char32 packed_char32;
+
+/*! @abstract A vector of sixty-four 8-bit signed (twos-complement) integers
+ * with relaxed alignment.
+ * @description This type is deprecated; you should use simd_packed_char64
+ * or simd::packed::char64 instead. */
+typedef simd_packed_char64 packed_char64;
+
+/*! @abstract A vector of two 8-bit unsigned integers with relaxed
+ * alignment.
+ * @description This type is deprecated; you should use simd_packed_uchar2
+ * or simd::packed::uchar2 instead. */
+typedef simd_packed_uchar2 packed_uchar2;
+
+/*! @abstract A vector of four 8-bit unsigned integers with relaxed
+ * alignment.
+ * @description This type is deprecated; you should use simd_packed_uchar4
+ * or simd::packed::uchar4 instead. */
+typedef simd_packed_uchar4 packed_uchar4;
+
+/*! @abstract A vector of eight 8-bit unsigned integers with relaxed
+ * alignment.
+ * @description This type is deprecated; you should use simd_packed_uchar8
+ * or simd::packed::uchar8 instead. */
+typedef simd_packed_uchar8 packed_uchar8;
+
+/*! @abstract A vector of sixteen 8-bit unsigned integers with relaxed
+ * alignment.
+ * @description This type is deprecated; you should use simd_packed_uchar16
+ * or simd::packed::uchar16 instead. */
+typedef simd_packed_uchar16 packed_uchar16;
+
+/*! @abstract A vector of thirty-two 8-bit unsigned integers with relaxed
+ * alignment.
+ * @description This type is deprecated; you should use simd_packed_uchar32
+ * or simd::packed::uchar32 instead. */
+typedef simd_packed_uchar32 packed_uchar32;
+
+/*! @abstract A vector of sixty-four 8-bit unsigned integers with relaxed
+ * alignment.
+ * @description This type is deprecated; you should use simd_packed_uchar64
+ * or simd::packed::uchar64 instead. */
+typedef simd_packed_uchar64 packed_uchar64;
+
+/*! @abstract A vector of two 16-bit signed (twos-complement) integers with
+ * relaxed alignment.
+ * @description This type is deprecated; you should use simd_packed_short2
+ * or simd::packed::short2 instead. */
+typedef simd_packed_short2 packed_short2;
+
+/*! @abstract A vector of four 16-bit signed (twos-complement) integers with
+ * relaxed alignment.
+ * @description This type is deprecated; you should use simd_packed_short4
+ * or simd::packed::short4 instead. */
+typedef simd_packed_short4 packed_short4;
+
+/*! @abstract A vector of eight 16-bit signed (twos-complement) integers
+ * with relaxed alignment.
+ * @description This type is deprecated; you should use simd_packed_short8
+ * or simd::packed::short8 instead. */
+typedef simd_packed_short8 packed_short8;
+
+/*! @abstract A vector of sixteen 16-bit signed (twos-complement) integers
+ * with relaxed alignment.
+ * @description This type is deprecated; you should use simd_packed_short16
+ * or simd::packed::short16 instead. */
+typedef simd_packed_short16 packed_short16;
+
+/*! @abstract A vector of thirty-two 16-bit signed (twos-complement)
+ * integers with relaxed alignment.
+ * @description This type is deprecated; you should use simd_packed_short32
+ * or simd::packed::short32 instead. */
+typedef simd_packed_short32 packed_short32;
+
+/*! @abstract A vector of two 16-bit unsigned integers with relaxed
+ * alignment.
+ * @description This type is deprecated; you should use simd_packed_ushort2
+ * or simd::packed::ushort2 instead. */
+typedef simd_packed_ushort2 packed_ushort2;
+
+/*! @abstract A vector of four 16-bit unsigned integers with relaxed
+ * alignment.
+ * @description This type is deprecated; you should use simd_packed_ushort4
+ * or simd::packed::ushort4 instead. */
+typedef simd_packed_ushort4 packed_ushort4;
+
+/*! @abstract A vector of eight 16-bit unsigned integers with relaxed
+ * alignment.
+ * @description This type is deprecated; you should use simd_packed_ushort8
+ * or simd::packed::ushort8 instead. */
+typedef simd_packed_ushort8 packed_ushort8;
+
+/*! @abstract A vector of sixteen 16-bit unsigned integers with relaxed
+ * alignment.
+ * @description This type is deprecated; you should use
+ * simd_packed_ushort16 or simd::packed::ushort16 instead. */
+typedef simd_packed_ushort16 packed_ushort16;
+
+/*! @abstract A vector of thirty-two 16-bit unsigned integers with relaxed
+ * alignment.
+ * @description This type is deprecated; you should use
+ * simd_packed_ushort32 or simd::packed::ushort32 instead. */
+typedef simd_packed_ushort32 packed_ushort32;
+
+/*! @abstract A vector of two 32-bit signed (twos-complement) integers with
+ * relaxed alignment.
+ * @description This type is deprecated; you should use simd_packed_int2 or
+ * simd::packed::int2 instead. */
+typedef simd_packed_int2 packed_int2;
+
+/*! @abstract A vector of four 32-bit signed (twos-complement) integers with
+ * relaxed alignment.
+ * @description This type is deprecated; you should use simd_packed_int4 or
+ * simd::packed::int4 instead. */
+typedef simd_packed_int4 packed_int4;
+
+/*! @abstract A vector of eight 32-bit signed (twos-complement) integers
+ * with relaxed alignment.
+ * @description This type is deprecated; you should use simd_packed_int8 or
+ * simd::packed::int8 instead. */
+typedef simd_packed_int8 packed_int8;
+
+/*! @abstract A vector of sixteen 32-bit signed (twos-complement) integers
+ * with relaxed alignment.
+ * @description This type is deprecated; you should use simd_packed_int16
+ * or simd::packed::int16 instead. */
+typedef simd_packed_int16 packed_int16;
+
+/*! @abstract A vector of two 32-bit unsigned integers with relaxed
+ * alignment.
+ * @description This type is deprecated; you should use simd_packed_uint2
+ * or simd::packed::uint2 instead. */
+typedef simd_packed_uint2 packed_uint2;
+
+/*! @abstract A vector of four 32-bit unsigned integers with relaxed
+ * alignment.
+ * @description This type is deprecated; you should use simd_packed_uint4
+ * or simd::packed::uint4 instead. */
+typedef simd_packed_uint4 packed_uint4;
+
+/*! @abstract A vector of eight 32-bit unsigned integers with relaxed
+ * alignment.
+ * @description This type is deprecated; you should use simd_packed_uint8
+ * or simd::packed::uint8 instead. */
+typedef simd_packed_uint8 packed_uint8;
+
+/*! @abstract A vector of sixteen 32-bit unsigned integers with relaxed
+ * alignment.
+ * @description This type is deprecated; you should use simd_packed_uint16
+ * or simd::packed::uint16 instead. */
+typedef simd_packed_uint16 packed_uint16;
+
+/*! @abstract A vector of two 32-bit floating-point numbers with relaxed
+ * alignment.
+ * @description This type is deprecated; you should use simd_packed_float2
+ * or simd::packed::float2 instead. */
+typedef simd_packed_float2 packed_float2;
+
+/*! @abstract A vector of four 32-bit floating-point numbers with relaxed
+ * alignment.
+ * @description This type is deprecated; you should use simd_packed_float4
+ * or simd::packed::float4 instead. */
+typedef simd_packed_float4 packed_float4;
+
+/*! @abstract A vector of eight 32-bit floating-point numbers with relaxed
+ * alignment.
+ * @description This type is deprecated; you should use simd_packed_float8
+ * or simd::packed::float8 instead. */
+typedef simd_packed_float8 packed_float8;
+
+/*! @abstract A vector of sixteen 32-bit floating-point numbers with relaxed
+ * alignment.
+ * @description This type is deprecated; you should use simd_packed_float16
+ * or simd::packed::float16 instead. */
+typedef simd_packed_float16 packed_float16;
+
+/*! @abstract A vector of two 64-bit signed (twos-complement) integers with
+ * relaxed alignment.
+ * @description This type is deprecated; you should use simd_packed_long2
+ * or simd::packed::long2 instead. */
+typedef simd_packed_long2 packed_long2;
+
+/*! @abstract A vector of four 64-bit signed (twos-complement) integers with
+ * relaxed alignment.
+ * @description This type is deprecated; you should use simd_packed_long4
+ * or simd::packed::long4 instead. */
+typedef simd_packed_long4 packed_long4;
+
+/*! @abstract A vector of eight 64-bit signed (twos-complement) integers
+ * with relaxed alignment.
+ * @description This type is deprecated; you should use simd_packed_long8
+ * or simd::packed::long8 instead. */
+typedef simd_packed_long8 packed_long8;
+
+/*! @abstract A vector of two 64-bit unsigned integers with relaxed
+ * alignment.
+ * @description This type is deprecated; you should use simd_packed_ulong2
+ * or simd::packed::ulong2 instead. */
+typedef simd_packed_ulong2 packed_ulong2;
+
+/*! @abstract A vector of four 64-bit unsigned integers with relaxed
+ * alignment.
+ * @description This type is deprecated; you should use simd_packed_ulong4
+ * or simd::packed::ulong4 instead. */
+typedef simd_packed_ulong4 packed_ulong4;
+
+/*! @abstract A vector of eight 64-bit unsigned integers with relaxed
+ * alignment.
+ * @description This type is deprecated; you should use simd_packed_ulong8
+ * or simd::packed::ulong8 instead. */
+typedef simd_packed_ulong8 packed_ulong8;
+
+/*! @abstract A vector of two 64-bit floating-point numbers with relaxed
+ * alignment.
+ * @description This type is deprecated; you should use simd_packed_double2
+ * or simd::packed::double2 instead. */
+typedef simd_packed_double2 packed_double2;
+
+/*! @abstract A vector of four 64-bit floating-point numbers with relaxed
+ * alignment.
+ * @description This type is deprecated; you should use simd_packed_double4
+ * or simd::packed::double4 instead. */
+typedef simd_packed_double4 packed_double4;
+
+/*! @abstract A vector of eight 64-bit floating-point numbers with relaxed
+ * alignment.
+ * @description This type is deprecated; you should use simd_packed_double8
+ * or simd::packed::double8 instead. */
+typedef simd_packed_double8 packed_double8;
+
+# endif /* SIMD_COMPILER_HAS_REQUIRED_FEATURES */
+#endif
\ No newline at end of file
diff --git a/lib/libc/include/aarch64-macos-gnu/simd/quaternion.h b/lib/libc/include/aarch64-macos-gnu/simd/quaternion.h
new file mode 100644
index 0000000000..b7c5e2909d
--- /dev/null
+++ b/lib/libc/include/aarch64-macos-gnu/simd/quaternion.h
@@ -0,0 +1,1194 @@
+/*! @header
+ * This header defines functions for constructing and using quaternions.
+ * @copyright 2015-2016 Apple, Inc. All rights reserved.
+ * @unsorted */
+
+#ifndef SIMD_QUATERNIONS
+#define SIMD_QUATERNIONS
+
+#include <simd/base.h>
+#if SIMD_COMPILER_HAS_REQUIRED_FEATURES
+#include <simd/vector.h>
+#include <simd/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* MARK: - C and Objective-C float interfaces */
+
+/*! @abstract Constructs a quaternion from four scalar values.
+ *
+ * @param ix The first component of the imaginary (vector) part.
+ * @param iy The second component of the imaginary (vector) part.
+ * @param iz The third component of the imaginary (vector) part.
+ *
+ * @param r The real (scalar) part. */
+static inline SIMD_CFUNC simd_quatf simd_quaternion(float ix, float iy, float iz, float r) {
+ return (simd_quatf){ { ix, iy, iz, r } };
+}
+
+/*! @abstract Constructs a quaternion from an array of four scalars.
+ *
+ * @discussion Note that the imaginary part of the quaternion comes from
+ * array elements 0, 1, and 2, and the real part comes from element 3. */
+static inline SIMD_NONCONST simd_quatf simd_quaternion(const float xyzr[4]) {
+ return (simd_quatf){ *(const simd_packed_float4 *)xyzr };
+}
+
+/*! @abstract Constructs a quaternion from a four-element vector.
+ *
+ * @discussion Note that the imaginary (vector) part of the quaternion comes
+ * from lanes 0, 1, and 2 of the vector, and the real (scalar) part comes from
+ * lane 3. */
+static inline SIMD_CFUNC simd_quatf simd_quaternion(simd_float4 xyzr) {
+ return (simd_quatf){ xyzr };
+}
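+
+/* A minimal construction sketch (the function name is illustrative): all
+ * three constructors below build the same identity-rotation quaternion,
+ * with zero imaginary part and real part one. */
+static inline void _simd_quaternion_ctor_demo(void) {
+  simd_quatf a = simd_quaternion(0.0f, 0.0f, 0.0f, 1.0f);
+  float xyzr[4] = { 0.0f, 0.0f, 0.0f, 1.0f };
+  simd_quatf b = simd_quaternion(xyzr);
+  simd_quatf c = simd_quaternion(simd_make_float4(0.0f, 0.0f, 0.0f, 1.0f));
+  (void)a; (void)b; (void)c;
+}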
+
+/*! @abstract Constructs a quaternion that rotates by `angle` radians about
+ * `axis`. */
+static inline SIMD_CFUNC simd_quatf simd_quaternion(float angle, simd_float3 axis);
+
+/*! @abstract Construct a quaternion that rotates from one vector to another.
+ *
+ * @param from A normalized three-element vector.
+ * @param to A normalized three-element vector.
+ *
+ * @discussion The rotation axis is `simd_cross(from, to)`. If `from` and
+ * `to` point in opposite directions (to within machine precision), an
+ * arbitrary rotation axis is chosen, and the angle is pi radians. */
+static SIMD_NOINLINE simd_quatf simd_quaternion(simd_float3 from, simd_float3 to);
+
+/*! @abstract Construct a quaternion from a 3x3 rotation `matrix`.
+ *
+ * @discussion If `matrix` is not orthogonal with determinant 1, the result
+ * is undefined. */
+static SIMD_NOINLINE simd_quatf simd_quaternion(simd_float3x3 matrix);
+
+/*! @abstract Construct a quaternion from a 4x4 rotation `matrix`.
+ *
+ * @discussion The last row and column of the matrix are ignored. This
+ * function is equivalent to calling simd_quaternion with the upper-left 3x3
+ * submatrix. */
+static SIMD_NOINLINE simd_quatf simd_quaternion(simd_float4x4 matrix);
+
+/*! @abstract The real (scalar) part of the quaternion `q`. */
+static inline SIMD_CFUNC float simd_real(simd_quatf q) {
+ return q.vector.w;
+}
+
+/*! @abstract The imaginary (vector) part of the quaternion `q`. */
+static inline SIMD_CFUNC simd_float3 simd_imag(simd_quatf q) {
+ return q.vector.xyz;
+}
+
+/*! @abstract The angle (in radians) of rotation represented by `q`. */
+static inline SIMD_CFUNC float simd_angle(simd_quatf q);
+
+/*! @abstract The normalized axis (a 3-element vector) around which the
+ * action of the quaternion `q` rotates. */
+static inline SIMD_CFUNC simd_float3 simd_axis(simd_quatf q);
+
+/*! @abstract The sum of the quaternions `p` and `q`. */
+static inline SIMD_CFUNC simd_quatf simd_add(simd_quatf p, simd_quatf q);
+
+/*! @abstract The difference of the quaternions `p` and `q`. */
+static inline SIMD_CFUNC simd_quatf simd_sub(simd_quatf p, simd_quatf q);
+
+/*! @abstract The product of the quaternions `p` and `q`. */
+static inline SIMD_CFUNC simd_quatf simd_mul(simd_quatf p, simd_quatf q);
+
+/*! @abstract The quaternion `q` scaled by the real value `a`. */
+static inline SIMD_CFUNC simd_quatf simd_mul(simd_quatf q, float a);
+
+/*! @abstract The quaternion `q` scaled by the real value `a`. */
+static inline SIMD_CFUNC simd_quatf simd_mul(float a, simd_quatf q);
+
+/*! @abstract The conjugate of the quaternion `q`. */
+static inline SIMD_CFUNC simd_quatf simd_conjugate(simd_quatf q);
+
+/*! @abstract The (multiplicative) inverse of the quaternion `q`. */
+static inline SIMD_CFUNC simd_quatf simd_inverse(simd_quatf q);
+
+/*! @abstract The negation (additive inverse) of the quaternion `q`. */
+static inline SIMD_CFUNC simd_quatf simd_negate(simd_quatf q);
+
+/*! @abstract The dot product of the quaternions `p` and `q` interpreted as
+ * four-dimensional vectors. */
+static inline SIMD_CFUNC float simd_dot(simd_quatf p, simd_quatf q);
+
+/*! @abstract The length of the quaternion `q`. */
+static inline SIMD_CFUNC float simd_length(simd_quatf q);
+
+/*! @abstract The unit quaternion obtained by normalizing `q`. */
+static inline SIMD_CFUNC simd_quatf simd_normalize(simd_quatf q);
+
+/*! @abstract Rotates the vector `v` by the quaternion `q`. */
+static inline SIMD_CFUNC simd_float3 simd_act(simd_quatf q, simd_float3 v);
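+
+/* For illustration only (editorial sketch, not part of the original header):
+ * rotating the x-axis a quarter turn about the z-axis:
+ *
+ *     simd_quatf qz = simd_quaternion((float)M_PI_2, (simd_float3){0, 0, 1});
+ *     simd_float3 v = simd_act(qz, (simd_float3){1, 0, 0}); // ~{0, 1, 0}
+ */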
+
+/*! @abstract Logarithm of the quaternion `q`.
+ * @discussion Do not call this function directly; use `log(q)` instead.
+ *
+ * We can write a quaternion `q` in the form: `r(cos(t) + sin(t)v)` where
+ * `r` is the length of `q`, `t` is an angle, and `v` is a unit 3-vector.
+ * The logarithm of `q` is `log(r) + tv`, just like the logarithm of the
+ * complex number `r*(cos(t) + i sin(t))` is `log(r) + it`.
+ *
+ * Note that this function is not robust against poorly-scaled non-unit
+ * quaternions, because it is primarily used for spline interpolation of
+ * unit quaternions. If you need to compute a robust logarithm of general
+ * quaternions, you can use the following approach:
+ *
+ * scale = simd_reduce_max(simd_abs(q.vector));
+ * logq = log(simd_recip(scale)*q);
+ * logq.real += log(scale);
+ * return logq; */
+static SIMD_NOINLINE simd_quatf __tg_log(simd_quatf q);
+
+/*! @abstract Inverse of `log( )`; the exponential map on quaternions.
+ * @discussion Do not call this function directly; use `exp(q)` instead. */
+static SIMD_NOINLINE simd_quatf __tg_exp(simd_quatf q);
+
+/*! @abstract Spherical linear interpolation along the shortest arc between
+ * quaternions `q0` and `q1`. */
+static SIMD_NOINLINE simd_quatf simd_slerp(simd_quatf q0, simd_quatf q1, float t);
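+
+/* For illustration only (editorial sketch, not part of the original header):
+ * the orientation halfway along the shorter arc between two unit
+ * quaternions q0 and q1:
+ *
+ *     simd_quatf mid = simd_slerp(q0, q1, 0.5f);
+ */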
+
+/*! @abstract Spherical linear interpolation along the longest arc between
+ * quaternions `q0` and `q1`. */
+static SIMD_NOINLINE simd_quatf simd_slerp_longest(simd_quatf q0, simd_quatf q1, float t);
+
+/*! @abstract Interpolate between quaternions along a spherical cubic spline.
+ *
+ * @discussion The function interpolates between q1 and q2. q0 is the left
+ * endpoint of the previous interval, and q3 is the right endpoint of the next
+ * interval. Use this function to smoothly interpolate between a sequence of
+ * rotations. */
+static SIMD_NOINLINE simd_quatf simd_spline(simd_quatf q0, simd_quatf q1, simd_quatf q2, simd_quatf q3, float t);
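+
+/* For illustration only (editorial sketch, not part of the original header):
+ * given an array `key[]` of unit quaternions with i >= 1 and i + 2 in
+ * range, a smooth orientation at parameter t within [key[i], key[i+1]]:
+ *
+ *     simd_quatf q = simd_spline(key[i-1], key[i], key[i+1], key[i+2], t);
+ */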
+
+/*! @abstract Spherical cubic Bezier interpolation between quaternions.
+ *
+ * @discussion The function treats q0 ... q3 as control points and uses slerp
+ * in place of lerp in the De Casteljau algorithm. The endpoints of
+ * interpolation are thus q0 and q3, and the curve will not generally pass
+ * through q1 or q2. Note that the convex hull property of "standard" Bezier
+ * curves does not hold on the sphere. */
+static SIMD_NOINLINE simd_quatf simd_bezier(simd_quatf q0, simd_quatf q1, simd_quatf q2, simd_quatf q3, float t);
+
+#ifdef __cplusplus
+} /* extern "C" */
+/* MARK: - C++ float interfaces */
+
+namespace simd {
+ struct quatf : ::simd_quatf {
+ /*! @abstract The identity quaternion. */
+ quatf( ) : ::simd_quatf(::simd_quaternion((float4){0,0,0,1})) { }
+
+ /*! @abstract Constructs a C++ quaternion from a C quaternion. */
+ quatf(::simd_quatf q) : ::simd_quatf(q) { }
+
+ /*! @abstract Constructs a quaternion from components. */
+ quatf(float ix, float iy, float iz, float r) : ::simd_quatf(::simd_quaternion(ix, iy, iz, r)) { }
+
+ /*! @abstract Constructs a quaternion from an array of scalars. */
+ quatf(const float xyzr[4]) : ::simd_quatf(::simd_quaternion(xyzr)) { }
+
+ /*! @abstract Constructs a quaternion from a vector. */
+ quatf(float4 xyzr) : ::simd_quatf(::simd_quaternion(xyzr)) { }
+
+ /*! @abstract Quaternion representing rotation about `axis` by `angle`
+ * radians. */
+ quatf(float angle, float3 axis) : ::simd_quatf(::simd_quaternion(angle, axis)) { }
+
+ /*! @abstract Quaternion that rotates `from` into `to`. */
+ quatf(float3 from, float3 to) : ::simd_quatf(::simd_quaternion(from, to)) { }
+
+ /*! @abstract Constructs a quaternion from a rotation matrix. */
+ quatf(::simd_float3x3 matrix) : ::simd_quatf(::simd_quaternion(matrix)) { }
+
+ /*! @abstract Constructs a quaternion from a rotation matrix. */
+ quatf(::simd_float4x4 matrix) : ::simd_quatf(::simd_quaternion(matrix)) { }
+
+ /*! @abstract The real (scalar) part of the quaternion. */
+ float real(void) const { return ::simd_real(*this); }
+
+ /*! @abstract The imaginary (vector) part of the quaternion. */
+ float3 imag(void) const { return ::simd_imag(*this); }
+
+ /*! @abstract The angle the quaternion rotates by. */
+ float angle(void) const { return ::simd_angle(*this); }
+
+ /*! @abstract The axis the quaternion rotates about. */
+ float3 axis(void) const { return ::simd_axis(*this); }
+
+ /*! @abstract The length of the quaternion. */
+ float length(void) const { return ::simd_length(*this); }
+
+ /*! @abstract Act on the vector `v` by rotation. */
+ float3 operator()(const ::simd_float3 v) const { return ::simd_act(*this, v); }
+ };
+
+ static SIMD_CPPFUNC quatf operator+(const ::simd_quatf p, const ::simd_quatf q) { return ::simd_add(p, q); }
+ static SIMD_CPPFUNC quatf operator-(const ::simd_quatf p, const ::simd_quatf q) { return ::simd_sub(p, q); }
+ static SIMD_CPPFUNC quatf operator-(const ::simd_quatf p) { return ::simd_negate(p); }
+ static SIMD_CPPFUNC quatf operator*(const float r, const ::simd_quatf p) { return ::simd_mul(r, p); }
+ static SIMD_CPPFUNC quatf operator*(const ::simd_quatf p, const float r) { return ::simd_mul(p, r); }
+ static SIMD_CPPFUNC quatf operator*(const ::simd_quatf p, const ::simd_quatf q) { return ::simd_mul(p, q); }
+ static SIMD_CPPFUNC quatf operator/(const ::simd_quatf p, const ::simd_quatf q) { return ::simd_mul(p, ::simd_inverse(q)); }
+ static SIMD_CPPFUNC quatf operator+=(quatf &p, const ::simd_quatf q) { return p = p+q; }
+ static SIMD_CPPFUNC quatf operator-=(quatf &p, const ::simd_quatf q) { return p = p-q; }
+ static SIMD_CPPFUNC quatf operator*=(quatf &p, const float r) { return p = p*r; }
+ static SIMD_CPPFUNC quatf operator*=(quatf &p, const ::simd_quatf q) { return p = p*q; }
+ static SIMD_CPPFUNC quatf operator/=(quatf &p, const ::simd_quatf q) { return p = p/q; }
+
+ /*! @abstract The conjugate of the quaternion `q`. */
+ static SIMD_CPPFUNC quatf conjugate(const ::simd_quatf p) { return ::simd_conjugate(p); }
+
+ /*! @abstract The (multiplicative) inverse of the quaternion `q`. */
+ static SIMD_CPPFUNC quatf inverse(const ::simd_quatf p) { return ::simd_inverse(p); }
+
+ /*! @abstract The dot product of the quaternions `p` and `q` interpreted as
+ * four-dimensional vectors. */
+ static SIMD_CPPFUNC float dot(const ::simd_quatf p, const ::simd_quatf q) { return ::simd_dot(p, q); }
+
+ /*! @abstract The unit quaternion obtained by normalizing `q`. */
+ static SIMD_CPPFUNC quatf normalize(const ::simd_quatf p) { return ::simd_normalize(p); }
+
+  /*! @abstract Logarithm of the quaternion `q`. */
+ static SIMD_CPPFUNC quatf log(const ::simd_quatf q) { return ::__tg_log(q); }
+
+  /*! @abstract Exponential map of the quaternion `q`. */
+ static SIMD_CPPFUNC quatf exp(const ::simd_quatf q) { return ::__tg_exp(q); }
+
+ /*! @abstract Spherical linear interpolation along the shortest arc between
+ * quaternions `q0` and `q1`. */
+ static SIMD_CPPFUNC quatf slerp(const ::simd_quatf p0, const ::simd_quatf p1, float t) { return ::simd_slerp(p0, p1, t); }
+
+ /*! @abstract Spherical linear interpolation along the longest arc between
+ * quaternions `q0` and `q1`. */
+ static SIMD_CPPFUNC quatf slerp_longest(const ::simd_quatf p0, const ::simd_quatf p1, float t) { return ::simd_slerp_longest(p0, p1, t); }
+
+ /*! @abstract Interpolate between quaternions along a spherical cubic spline.
+ *
+ * @discussion The function interpolates between q1 and q2. q0 is the left
+ * endpoint of the previous interval, and q3 is the right endpoint of the next
+ * interval. Use this function to smoothly interpolate between a sequence of
+ * rotations. */
+ static SIMD_CPPFUNC quatf spline(const ::simd_quatf p0, const ::simd_quatf p1, const ::simd_quatf p2, const ::simd_quatf p3, float t) { return ::simd_spline(p0, p1, p2, p3, t); }
+
+ /*! @abstract Spherical cubic Bezier interpolation between quaternions.
+ *
+ * @discussion The function treats q0 ... q3 as control points and uses slerp
+ * in place of lerp in the De Casteljau algorithm. The endpoints of
+ * interpolation are thus q0 and q3, and the curve will not generally pass
+ * through q1 or q2. Note that the convex hull property of "standard" Bezier
+ * curves does not hold on the sphere. */
+ static SIMD_CPPFUNC quatf bezier(const ::simd_quatf p0, const ::simd_quatf p1, const ::simd_quatf p2, const ::simd_quatf p3, float t) { return ::simd_bezier(p0, p1, p2, p3, t); }
+}
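+
+/* For illustration only (editorial sketch, not part of the original header):
+ * with these C++ interfaces, rotations compose with operator*; assuming
+ * <math.h> is visible for M_PI:
+ *
+ *     simd::quatf a(0.5f*(float)M_PI, (simd::float3){0, 0, 1});
+ *     simd::quatf b(0.5f*(float)M_PI, (simd::float3){1, 0, 0});
+ *     simd::float3 v = (b*a)((simd::float3){1, 0, 0}); // rotate by a, then b
+ */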
+
+extern "C" {
+#endif /* __cplusplus */
+
+/* MARK: - float implementations */
+
+#include <simd/math.h>
+#include <simd/geometry.h>
+
+/* tg_promote is implementation gobbledygook that enables the compile-time
+ * dispatching in tgmath.h to work its magic. */
+static simd_quatf __attribute__((__overloadable__)) __tg_promote(simd_quatf);
+
+/*! @abstract Constructs a quaternion from imaginary and real parts.
+ * @discussion This function is hidden behind an underscore to avoid confusion
+ * with the angle-axis constructor. */
+static inline SIMD_CFUNC simd_quatf _simd_quaternion(simd_float3 imag, float real) {
+ return simd_quaternion(simd_make_float4(imag, real));
+}
+
+static inline SIMD_CFUNC simd_quatf simd_quaternion(float angle, simd_float3 axis) {
+ return _simd_quaternion(sin(angle/2) * axis, cos(angle/2));
+}
+
+static inline SIMD_CFUNC float simd_angle(simd_quatf q) {
+ return 2*atan2(simd_length(q.vector.xyz), q.vector.w);
+}
+
+static inline SIMD_CFUNC simd_float3 simd_axis(simd_quatf q) {
+ return simd_normalize(q.vector.xyz);
+}
+
+static inline SIMD_CFUNC simd_quatf simd_add(simd_quatf p, simd_quatf q) {
+ return simd_quaternion(p.vector + q.vector);
+}
+
+static inline SIMD_CFUNC simd_quatf simd_sub(simd_quatf p, simd_quatf q) {
+ return simd_quaternion(p.vector - q.vector);
+}
+
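+/* In the product below, each __builtin_shufflevector call concatenates
+ * q.vector with -q.vector into an eight-lane vector (lanes 0-3 are q,
+ * lanes 4-7 are -q) and selects four lanes from it, yielding the signed
+ * permutation of q that multiplies one lane of p in the Hamilton product;
+ * e.g. indices 3,6,1,4 select (q.w, -q.z, q.y, -q.x), the vector scaled
+ * by p.x. */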
+static inline SIMD_CFUNC simd_quatf simd_mul(simd_quatf p, simd_quatf q) {
+ #pragma STDC FP_CONTRACT ON
+ return simd_quaternion((p.vector.x * __builtin_shufflevector(q.vector, -q.vector, 3,6,1,4) +
+ p.vector.y * __builtin_shufflevector(q.vector, -q.vector, 2,3,4,5)) +
+ (p.vector.z * __builtin_shufflevector(q.vector, -q.vector, 5,0,3,6) +
+ p.vector.w * q.vector));
+}
+
+static inline SIMD_CFUNC simd_quatf simd_mul(simd_quatf q, float a) {
+ return simd_quaternion(a * q.vector);
+}
+
+static inline SIMD_CFUNC simd_quatf simd_mul(float a, simd_quatf q) {
+ return simd_mul(q,a);
+}
+
+static inline SIMD_CFUNC simd_quatf simd_conjugate(simd_quatf q) {
+ return simd_quaternion(q.vector * (simd_float4){-1,-1,-1, 1});
+}
+
+static inline SIMD_CFUNC simd_quatf simd_inverse(simd_quatf q) {
+ return simd_quaternion(simd_conjugate(q).vector * simd_recip(simd_length_squared(q.vector)));
+}
+
+static inline SIMD_CFUNC simd_quatf simd_negate(simd_quatf q) {
+ return simd_quaternion(-q.vector);
+}
+
+static inline SIMD_CFUNC float simd_dot(simd_quatf p, simd_quatf q) {
+ return simd_dot(p.vector, q.vector);
+}
+
+static inline SIMD_CFUNC float simd_length(simd_quatf q) {
+ return simd_length(q.vector);
+}
+
+static inline SIMD_CFUNC simd_quatf simd_normalize(simd_quatf q) {
+ float length_squared = simd_length_squared(q.vector);
+ if (length_squared == 0) {
+ return simd_quaternion((simd_float4){0,0,0,1});
+ }
+ return simd_quaternion(q.vector * simd_rsqrt(length_squared));
+}
+
+#if defined __arm__ || defined __arm64__
+/*! @abstract Multiplies the vector `v` by the quaternion `q`.
+ *
+ * @discussion This IS NOT the action of `q` on `v` (i.e., this is not rotation
+ * by `q`). That operation is provided by `simd_act(q, v)`. This function is an
+ * implementation detail and you should not call it directly. It may be
+ * removed or modified in future versions of the simd module. */
+static inline SIMD_CFUNC simd_quatf _simd_mul_vq(simd_float3 v, simd_quatf q) {
+ #pragma STDC FP_CONTRACT ON
+ return simd_quaternion(v.x * __builtin_shufflevector(q.vector, -q.vector, 3,6,1,4) +
+ v.y * __builtin_shufflevector(q.vector, -q.vector, 2,3,4,5) +
+ v.z * __builtin_shufflevector(q.vector, -q.vector, 5,0,3,6));
+}
+#endif
+
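+/* On non-ARM targets, simd_act uses the standard two-cross-product
+ * expansion of q v q^-1 for unit q: with t = 2*cross(im(q), v), the
+ * rotated vector is v + re(q)*t + cross(im(q), t), avoiding full
+ * quaternion multiplications. */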
+static inline SIMD_CFUNC simd_float3 simd_act(simd_quatf q, simd_float3 v) {
+#if defined __arm__ || defined __arm64__
+ return simd_mul(q, _simd_mul_vq(v, simd_conjugate(q))).vector.xyz;
+#else
+ #pragma STDC FP_CONTRACT ON
+ simd_float3 t = 2*simd_cross(simd_imag(q),v);
+ return v + simd_real(q)*t + simd_cross(simd_imag(q), t);
+#endif
+}
+
+static SIMD_NOINLINE simd_quatf __tg_log(simd_quatf q) {
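+  // log(|q|^2)/2 == log(|q|), computed without taking a square root.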
+ float real = __tg_log(simd_length_squared(q.vector))/2;
+ if (simd_equal(simd_imag(q), 0)) return _simd_quaternion(0, real);
+ simd_float3 imag = __tg_acos(simd_real(q)/simd_length(q)) * simd_normalize(simd_imag(q));
+ return _simd_quaternion(imag, real);
+}
+
+static SIMD_NOINLINE simd_quatf __tg_exp(simd_quatf q) {
+ // angle is actually *twice* the angle of the rotation corresponding to
+ // the resulting quaternion, which is why we don't simply use the (angle,
+ // axis) constructor to generate `unit`.
+ float angle = simd_length(simd_imag(q));
+ if (angle == 0) return _simd_quaternion(0, exp(simd_real(q)));
+ simd_float3 axis = simd_normalize(simd_imag(q));
+ simd_quatf unit = _simd_quaternion(sin(angle)*axis, cosf(angle));
+ return simd_mul(exp(simd_real(q)), unit);
+}
+
+/*! @abstract Implementation detail of the `simd_quaternion(from, to)`
+ * initializer.
+ *
+ * @discussion Computes the quaternion that rotates `from` to `to` when they are
+ * separated by less than 90 degrees. Not numerically stable for larger
+ * angles. This function is an implementation detail and you should not
+ * call it directly. It may be removed or modified in future versions of the
+ * simd module. */
+static inline SIMD_CFUNC simd_quatf _simd_quaternion_reduced(simd_float3 from, simd_float3 to) {
+ simd_float3 half = simd_normalize(from + to);
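+  // `half` bisects the angle between `from` and `to`; if that angle is 2t,
+  // then dot(from, half) == cos(t) and |cross(from, half)| == sin(t), which
+  // are exactly the real and imaginary parts of the desired rotation.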
+ return _simd_quaternion(simd_cross(from, half), simd_dot(from, half));
+}
+
+static SIMD_NOINLINE simd_quatf simd_quaternion(simd_float3 from, simd_float3 to) {
+
+ // If the angle between from and to is not too big, we can compute the
+ // rotation accurately using a simple implementation.
+ if (simd_dot(from, to) >= 0) {
+ return _simd_quaternion_reduced(from, to);
+ }
+
+ // Because from and to are more than 90 degrees apart, we compute the
+ // rotation in two stages (from -> half), (half -> to) to preserve numerical
+ // accuracy.
+ simd_float3 half = from + to;
+
+ if (simd_length_squared(half) == 0) {
+ // half is nearly zero, so from and to point in nearly opposite directions
+ // and the rotation is numerically underspecified. Pick an axis orthogonal
+ // to the vectors, and use an angle of pi radians.
+ simd_float3 abs_from = simd_abs(from);
+ if (abs_from.x <= abs_from.y && abs_from.x <= abs_from.z)
+ return _simd_quaternion(simd_normalize(simd_cross(from, (simd_float3){1,0,0})), 0.f);
+ else if (abs_from.y <= abs_from.z)
+ return _simd_quaternion(simd_normalize(simd_cross(from, (simd_float3){0,1,0})), 0.f);
+ else
+ return _simd_quaternion(simd_normalize(simd_cross(from, (simd_float3){0,0,1})), 0.f);
+ }
+
+  // Compute the two-step rotation.
+ half = simd_normalize(half);
+ return simd_mul(_simd_quaternion_reduced(from, half),
+ _simd_quaternion_reduced(half, to));
+}
+
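+/* The conversion below is the standard trace-based method: the branch is
+ * chosen from the trace and the largest diagonal entry so that the square
+ * root is taken of a quantity bounded away from zero, and the remaining
+ * components are recovered from sums and differences of off-diagonal
+ * entries. */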
+static SIMD_NOINLINE simd_quatf simd_quaternion(simd_float3x3 matrix) {
+ const simd_float3 *mat = matrix.columns;
+ float trace = mat[0][0] + mat[1][1] + mat[2][2];
+ if (trace >= 0.0) {
+ float r = 2*sqrt(1 + trace);
+ float rinv = simd_recip(r);
+ return simd_quaternion(rinv*(mat[1][2] - mat[2][1]),
+ rinv*(mat[2][0] - mat[0][2]),
+ rinv*(mat[0][1] - mat[1][0]),
+ r/4);
+ } else if (mat[0][0] >= mat[1][1] && mat[0][0] >= mat[2][2]) {
+ float r = 2*sqrt(1 - mat[1][1] - mat[2][2] + mat[0][0]);
+ float rinv = simd_recip(r);
+ return simd_quaternion(r/4,
+ rinv*(mat[0][1] + mat[1][0]),
+ rinv*(mat[0][2] + mat[2][0]),
+ rinv*(mat[1][2] - mat[2][1]));
+ } else if (mat[1][1] >= mat[2][2]) {
+ float r = 2*sqrt(1 - mat[0][0] - mat[2][2] + mat[1][1]);
+ float rinv = simd_recip(r);
+ return simd_quaternion(rinv*(mat[0][1] + mat[1][0]),
+ r/4,
+ rinv*(mat[1][2] + mat[2][1]),
+ rinv*(mat[2][0] - mat[0][2]));
+ } else {
+ float r = 2*sqrt(1 - mat[0][0] - mat[1][1] + mat[2][2]);
+ float rinv = simd_recip(r);
+ return simd_quaternion(rinv*(mat[0][2] + mat[2][0]),
+ rinv*(mat[1][2] + mat[2][1]),
+ r/4,
+ rinv*(mat[0][1] - mat[1][0]));
+ }
+}
+
+static SIMD_NOINLINE simd_quatf simd_quaternion(simd_float4x4 matrix) {
+ const simd_float4 *mat = matrix.columns;
+ float trace = mat[0][0] + mat[1][1] + mat[2][2];
+ if (trace >= 0.0) {
+ float r = 2*sqrt(1 + trace);
+ float rinv = simd_recip(r);
+ return simd_quaternion(rinv*(mat[1][2] - mat[2][1]),
+ rinv*(mat[2][0] - mat[0][2]),
+ rinv*(mat[0][1] - mat[1][0]),
+ r/4);
+ } else if (mat[0][0] >= mat[1][1] && mat[0][0] >= mat[2][2]) {
+ float r = 2*sqrt(1 - mat[1][1] - mat[2][2] + mat[0][0]);
+ float rinv = simd_recip(r);
+ return simd_quaternion(r/4,
+ rinv*(mat[0][1] + mat[1][0]),
+ rinv*(mat[0][2] + mat[2][0]),
+ rinv*(mat[1][2] - mat[2][1]));
+ } else if (mat[1][1] >= mat[2][2]) {
+ float r = 2*sqrt(1 - mat[0][0] - mat[2][2] + mat[1][1]);
+ float rinv = simd_recip(r);
+ return simd_quaternion(rinv*(mat[0][1] + mat[1][0]),
+ r/4,
+ rinv*(mat[1][2] + mat[2][1]),
+ rinv*(mat[2][0] - mat[0][2]));
+ } else {
+ float r = 2*sqrt(1 - mat[0][0] - mat[1][1] + mat[2][2]);
+ float rinv = simd_recip(r);
+ return simd_quaternion(rinv*(mat[0][2] + mat[2][0]),
+ rinv*(mat[1][2] + mat[2][1]),
+ r/4,
+ rinv*(mat[0][1] - mat[1][0]));
+ }
+}
+
+/*! @abstract The angle between p and q interpreted as 4-dimensional vectors.
+ *
+ * @discussion This function is an implementation detail and you should not
+ * call it directly. It may be removed or modified in future versions of the
+ * simd module. */
+static SIMD_NOINLINE float _simd_angle(simd_quatf p, simd_quatf q) {
+ return 2*atan2(simd_length(p.vector - q.vector), simd_length(p.vector + q.vector));
+}
+
+/*! @abstract sin(x)/x.
+ *
+ * @discussion This function is an implementation detail and you should not
+ * call it directly. It may be removed or modified in future versions of the
+ * simd module. */
+static SIMD_CFUNC float _simd_sinc(float x) {
+ if (x == 0) return 1;
+ return sin(x)/x;
+}
+
+/*! @abstract Spherical lerp between q0 and q1.
+ *
+ * @discussion This function may interpolate along either the longer or
+ * shorter path between q0 and q1; it is used as an implementation detail
+ * in `simd_slerp` and `simd_slerp_longest`; you should use those functions
+ * instead of calling this directly. */
+static SIMD_NOINLINE simd_quatf _simd_slerp_internal(simd_quatf q0, simd_quatf q1, float t) {
+ float s = 1 - t;
+ float a = _simd_angle(q0, q1);
+ float r = simd_recip(_simd_sinc(a));
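+  // Writing the weights with sinc keeps them finite as a -> 0: the weight
+  // _simd_sinc(s*a)*r*s equals sin(s*a)/sin(a), the usual slerp weight,
+  // but evaluates to s when a == 0 (and likewise for the t term).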
+ return simd_normalize(simd_quaternion(_simd_sinc(s*a)*r*s*q0.vector + _simd_sinc(t*a)*r*t*q1.vector));
+}
+
+static SIMD_NOINLINE simd_quatf simd_slerp(simd_quatf q0, simd_quatf q1, float t) {
+ if (simd_dot(q0, q1) >= 0)
+ return _simd_slerp_internal(q0, q1, t);
+ return _simd_slerp_internal(q0, simd_negate(q1), t);
+}
+
+static SIMD_NOINLINE simd_quatf simd_slerp_longest(simd_quatf q0, simd_quatf q1, float t) {
+ if (simd_dot(q0, q1) >= 0)
+ return _simd_slerp_internal(q0, simd_negate(q1), t);
+ return _simd_slerp_internal(q0, q1, t);
+}
+
+/*! @discussion This function is an implementation detail and you should not
+ * call it directly. It may be removed or modified in future versions of the
+ * simd module. */
+static SIMD_NOINLINE simd_quatf _simd_intermediate(simd_quatf q0, simd_quatf q1, simd_quatf q2) {
+ simd_quatf p0 = __tg_log(simd_mul(q0, simd_inverse(q1)));
+ simd_quatf p2 = __tg_log(simd_mul(q2, simd_inverse(q1)));
+ return simd_normalize(simd_mul(q1, __tg_exp(simd_mul(-0.25, simd_add(p0,p2)))));
+}
+
+/*! @discussion This function is an implementation detail and you should not
+ * call it directly. It may be removed or modified in future versions of the
+ * simd module. */
+static SIMD_NOINLINE simd_quatf _simd_squad(simd_quatf q0, simd_quatf qa, simd_quatf qb, simd_quatf q1, float t) {
+ simd_quatf r0 = _simd_slerp_internal(q0, q1, t);
+ simd_quatf r1 = _simd_slerp_internal(qa, qb, t);
+ return _simd_slerp_internal(r0, r1, 2*t*(1 - t));
+}
+
+static SIMD_NOINLINE simd_quatf simd_spline(simd_quatf q0, simd_quatf q1, simd_quatf q2, simd_quatf q3, float t) {
+ simd_quatf qa = _simd_intermediate(q0, q1, q2);
+ simd_quatf qb = _simd_intermediate(q1, q2, q3);
+ return _simd_squad(q1, qa, qb, q2, t);
+}
+
+static SIMD_NOINLINE simd_quatf simd_bezier(simd_quatf q0, simd_quatf q1, simd_quatf q2, simd_quatf q3, float t) {
+ simd_quatf q01 = _simd_slerp_internal(q0, q1, t);
+ simd_quatf q12 = _simd_slerp_internal(q1, q2, t);
+ simd_quatf q23 = _simd_slerp_internal(q2, q3, t);
+ simd_quatf q012 = _simd_slerp_internal(q01, q12, t);
+ simd_quatf q123 = _simd_slerp_internal(q12, q23, t);
+ return _simd_slerp_internal(q012, q123, t);
+}
+
+/* MARK: - C and Objective-C double interfaces */
+
+/*! @abstract Constructs a quaternion from four scalar values.
+ *
+ * @param ix The first component of the imaginary (vector) part.
+ * @param iy The second component of the imaginary (vector) part.
+ * @param iz The third component of the imaginary (vector) part.
+ *
+ * @param r The real (scalar) part. */
+static inline SIMD_CFUNC simd_quatd simd_quaternion(double ix, double iy, double iz, double r) {
+ return (simd_quatd){ { ix, iy, iz, r } };
+}
+
+/*! @abstract Constructs a quaternion from an array of four scalars.
+ *
+ * @discussion Note that the imaginary part of the quaternion comes from
+ * array elements 0, 1, and 2, and the real part comes from element 3. */
+static inline SIMD_NONCONST simd_quatd simd_quaternion(const double xyzr[4]) {
+ return (simd_quatd){ *(const simd_packed_double4 *)xyzr };
+}
+
+/*! @abstract Constructs a quaternion from a four-element vector.
+ *
+ * @discussion Note that the imaginary (vector) part of the quaternion comes
+ * from lanes 0, 1, and 2 of the vector, and the real (scalar) part comes from
+ * lane 3. */
+static inline SIMD_CFUNC simd_quatd simd_quaternion(simd_double4 xyzr) {
+ return (simd_quatd){ xyzr };
+}
+
+/*! @abstract Constructs a quaternion that rotates by `angle` radians about
+ * `axis`. */
+static inline SIMD_CFUNC simd_quatd simd_quaternion(double angle, simd_double3 axis);
+
+/*! @abstract Construct a quaternion that rotates from one vector to another.
+ *
+ * @param from A normalized three-element vector.
+ * @param to A normalized three-element vector.
+ *
+ * @discussion The rotation axis is `simd_cross(from, to)`. If `from` and
+ * `to` point in opposite directions (to within machine precision), an
+ * arbitrary rotation axis is chosen, and the angle is pi radians. */
+static SIMD_NOINLINE simd_quatd simd_quaternion(simd_double3 from, simd_double3 to);
+
+/*! @abstract Construct a quaternion from a 3x3 rotation `matrix`.
+ *
+ * @discussion If `matrix` is not orthogonal with determinant 1, the result
+ * is undefined. */
+static SIMD_NOINLINE simd_quatd simd_quaternion(simd_double3x3 matrix);
+
+/*! @abstract Construct a quaternion from a 4x4 rotation `matrix`.
+ *
+ * @discussion The last row and column of the matrix are ignored. This
+ * function is equivalent to calling simd_quaternion with the upper-left 3x3
+ * submatrix. */
+static SIMD_NOINLINE simd_quatd simd_quaternion(simd_double4x4 matrix);
+
+/*! @abstract The real (scalar) part of the quaternion `q`. */
+static inline SIMD_CFUNC double simd_real(simd_quatd q) {
+ return q.vector.w;
+}
+
+/*! @abstract The imaginary (vector) part of the quaternion `q`. */
+static inline SIMD_CFUNC simd_double3 simd_imag(simd_quatd q) {
+ return q.vector.xyz;
+}
+
+/*! @abstract The angle (in radians) of rotation represented by `q`. */
+static inline SIMD_CFUNC double simd_angle(simd_quatd q);
+
+/*! @abstract The normalized axis (a 3-element vector) around which the
+ * action of the quaternion `q` rotates. */
+static inline SIMD_CFUNC simd_double3 simd_axis(simd_quatd q);
+
+/*! @abstract The sum of the quaternions `p` and `q`. */
+static inline SIMD_CFUNC simd_quatd simd_add(simd_quatd p, simd_quatd q);
+
+/*! @abstract The difference of the quaternions `p` and `q`. */
+static inline SIMD_CFUNC simd_quatd simd_sub(simd_quatd p, simd_quatd q);
+
+/*! @abstract The product of the quaternions `p` and `q`. */
+static inline SIMD_CFUNC simd_quatd simd_mul(simd_quatd p, simd_quatd q);
+
+/*! @abstract The quaternion `q` scaled by the real value `a`. */
+static inline SIMD_CFUNC simd_quatd simd_mul(simd_quatd q, double a);
+
+/*! @abstract The quaternion `q` scaled by the real value `a`. */
+static inline SIMD_CFUNC simd_quatd simd_mul(double a, simd_quatd q);
+
+/*! @abstract The conjugate of the quaternion `q`. */
+static inline SIMD_CFUNC simd_quatd simd_conjugate(simd_quatd q);
+
+/*! @abstract The (multiplicative) inverse of the quaternion `q`. */
+static inline SIMD_CFUNC simd_quatd simd_inverse(simd_quatd q);
+
+/*! @abstract The negation (additive inverse) of the quaternion `q`. */
+static inline SIMD_CFUNC simd_quatd simd_negate(simd_quatd q);
+
+/*! @abstract The dot product of the quaternions `p` and `q` interpreted as
+ * four-dimensional vectors. */
+static inline SIMD_CFUNC double simd_dot(simd_quatd p, simd_quatd q);
+
+/*! @abstract The length of the quaternion `q`. */
+static inline SIMD_CFUNC double simd_length(simd_quatd q);
+
+/*! @abstract The unit quaternion obtained by normalizing `q`. */
+static inline SIMD_CFUNC simd_quatd simd_normalize(simd_quatd q);
+
+/*! @abstract Rotates the vector `v` by the quaternion `q`. */
+static inline SIMD_CFUNC simd_double3 simd_act(simd_quatd q, simd_double3 v);
+
+/*! @abstract Logarithm of the quaternion `q`.
+ * @discussion Do not call this function directly; use `log(q)` instead.
+ *
+ * We can write a quaternion `q` in the form: `r(cos(t) + sin(t)v)` where
+ * `r` is the length of `q`, `t` is an angle, and `v` is a unit 3-vector.
+ * The logarithm of `q` is `log(r) + tv`, just like the logarithm of the
+ * complex number `r*(cos(t) + i sin(t))` is `log(r) + it`.
+ *
+ * Note that this function is not robust against poorly-scaled non-unit
+ * quaternions, because it is primarily used for spline interpolation of
+ * unit quaternions. If you need to compute a robust logarithm of general
+ * quaternions, you can use the following approach:
+ *
+ * scale = simd_reduce_max(simd_abs(q.vector));
+ * logq = log(simd_recip(scale)*q);
+ * logq.real += log(scale);
+ * return logq; */
+static SIMD_NOINLINE simd_quatd __tg_log(simd_quatd q);
+
+/*! @abstract Inverse of `log( )`; the exponential map on quaternions.
+ * @discussion Do not call this function directly; use `exp(q)` instead. */
+static SIMD_NOINLINE simd_quatd __tg_exp(simd_quatd q);
+
+/*! @abstract Spherical linear interpolation along the shortest arc between
+ * quaternions `q0` and `q1`. */
+static SIMD_NOINLINE simd_quatd simd_slerp(simd_quatd q0, simd_quatd q1, double t);
+
+/*! @abstract Spherical linear interpolation along the longest arc between
+ * quaternions `q0` and `q1`. */
+static SIMD_NOINLINE simd_quatd simd_slerp_longest(simd_quatd q0, simd_quatd q1, double t);
+
+/*! @abstract Interpolate between quaternions along a spherical cubic spline.
+ *
+ * @discussion The function interpolates between q1 and q2. q0 is the left
+ * endpoint of the previous interval, and q3 is the right endpoint of the next
+ * interval. Use this function to smoothly interpolate between a sequence of
+ * rotations. */
+static SIMD_NOINLINE simd_quatd simd_spline(simd_quatd q0, simd_quatd q1, simd_quatd q2, simd_quatd q3, double t);
+
+/*! @abstract Spherical cubic Bezier interpolation between quaternions.
+ *
+ * @discussion The function treats q0 ... q3 as control points and uses slerp
+ * in place of lerp in the De Casteljau algorithm. The endpoints of
+ * interpolation are thus q0 and q3, and the curve will not generally pass
+ * through q1 or q2. Note that the convex hull property of "standard" Bezier
+ * curves does not hold on the sphere. */
+static SIMD_NOINLINE simd_quatd simd_bezier(simd_quatd q0, simd_quatd q1, simd_quatd q2, simd_quatd q3, double t);
+
+#ifdef __cplusplus
+} /* extern "C" */
+/* MARK: - C++ double interfaces */
+
+namespace simd {
+ struct quatd : ::simd_quatd {
+ /*! @abstract The identity quaternion. */
+ quatd( ) : ::simd_quatd(::simd_quaternion((double4){0,0,0,1})) { }
+
+ /*! @abstract Constructs a C++ quaternion from a C quaternion. */
+ quatd(::simd_quatd q) : ::simd_quatd(q) { }
+
+ /*! @abstract Constructs a quaternion from components. */
+ quatd(double ix, double iy, double iz, double r) : ::simd_quatd(::simd_quaternion(ix, iy, iz, r)) { }
+
+ /*! @abstract Constructs a quaternion from an array of scalars. */
+ quatd(const double xyzr[4]) : ::simd_quatd(::simd_quaternion(xyzr)) { }
+
+ /*! @abstract Constructs a quaternion from a vector. */
+ quatd(double4 xyzr) : ::simd_quatd(::simd_quaternion(xyzr)) { }
+
+ /*! @abstract Quaternion representing rotation about `axis` by `angle`
+ * radians. */
+ quatd(double angle, double3 axis) : ::simd_quatd(::simd_quaternion(angle, axis)) { }
+
+ /*! @abstract Quaternion that rotates `from` into `to`. */
+ quatd(double3 from, double3 to) : ::simd_quatd(::simd_quaternion(from, to)) { }
+
+ /*! @abstract Constructs a quaternion from a rotation matrix. */
+ quatd(::simd_double3x3 matrix) : ::simd_quatd(::simd_quaternion(matrix)) { }
+
+ /*! @abstract Constructs a quaternion from a rotation matrix. */
+ quatd(::simd_double4x4 matrix) : ::simd_quatd(::simd_quaternion(matrix)) { }
+
+ /*! @abstract The real (scalar) part of the quaternion. */
+ double real(void) const { return ::simd_real(*this); }
+
+ /*! @abstract The imaginary (vector) part of the quaternion. */
+ double3 imag(void) const { return ::simd_imag(*this); }
+
+ /*! @abstract The angle the quaternion rotates by. */
+ double angle(void) const { return ::simd_angle(*this); }
+
+ /*! @abstract The axis the quaternion rotates about. */
+ double3 axis(void) const { return ::simd_axis(*this); }
+
+ /*! @abstract The length of the quaternion. */
+ double length(void) const { return ::simd_length(*this); }
+
+ /*! @abstract Act on the vector `v` by rotation. */
+ double3 operator()(const ::simd_double3 v) const { return ::simd_act(*this, v); }
+ };
+
+ static SIMD_CPPFUNC quatd operator+(const ::simd_quatd p, const ::simd_quatd q) { return ::simd_add(p, q); }
+ static SIMD_CPPFUNC quatd operator-(const ::simd_quatd p, const ::simd_quatd q) { return ::simd_sub(p, q); }
+ static SIMD_CPPFUNC quatd operator-(const ::simd_quatd p) { return ::simd_negate(p); }
+ static SIMD_CPPFUNC quatd operator*(const double r, const ::simd_quatd p) { return ::simd_mul(r, p); }
+ static SIMD_CPPFUNC quatd operator*(const ::simd_quatd p, const double r) { return ::simd_mul(p, r); }
+ static SIMD_CPPFUNC quatd operator*(const ::simd_quatd p, const ::simd_quatd q) { return ::simd_mul(p, q); }
+ static SIMD_CPPFUNC quatd operator/(const ::simd_quatd p, const ::simd_quatd q) { return ::simd_mul(p, ::simd_inverse(q)); }
+ static SIMD_CPPFUNC quatd operator+=(quatd &p, const ::simd_quatd q) { return p = p+q; }
+ static SIMD_CPPFUNC quatd operator-=(quatd &p, const ::simd_quatd q) { return p = p-q; }
+ static SIMD_CPPFUNC quatd operator*=(quatd &p, const double r) { return p = p*r; }
+ static SIMD_CPPFUNC quatd operator*=(quatd &p, const ::simd_quatd q) { return p = p*q; }
+ static SIMD_CPPFUNC quatd operator/=(quatd &p, const ::simd_quatd q) { return p = p/q; }
+
+ /*! @abstract The conjugate of the quaternion `q`. */
+ static SIMD_CPPFUNC quatd conjugate(const ::simd_quatd p) { return ::simd_conjugate(p); }
+
+ /*! @abstract The (multiplicative) inverse of the quaternion `q`. */
+ static SIMD_CPPFUNC quatd inverse(const ::simd_quatd p) { return ::simd_inverse(p); }
+
+ /*! @abstract The dot product of the quaternions `p` and `q` interpreted as
+ * four-dimensional vectors. */
+ static SIMD_CPPFUNC double dot(const ::simd_quatd p, const ::simd_quatd q) { return ::simd_dot(p, q); }
+
+ /*! @abstract The unit quaternion obtained by normalizing `q`. */
+ static SIMD_CPPFUNC quatd normalize(const ::simd_quatd p) { return ::simd_normalize(p); }
+
+  /*! @abstract Logarithm of the quaternion `q`. */
+ static SIMD_CPPFUNC quatd log(const ::simd_quatd q) { return ::__tg_log(q); }
+
+  /*! @abstract Exponential map of the quaternion `q`. */
+ static SIMD_CPPFUNC quatd exp(const ::simd_quatd q) { return ::__tg_exp(q); }
+
+ /*! @abstract Spherical linear interpolation along the shortest arc between
+ * quaternions `q0` and `q1`. */
+ static SIMD_CPPFUNC quatd slerp(const ::simd_quatd p0, const ::simd_quatd p1, double t) { return ::simd_slerp(p0, p1, t); }
+
+ /*! @abstract Spherical linear interpolation along the longest arc between
+ * quaternions `q0` and `q1`. */
+ static SIMD_CPPFUNC quatd slerp_longest(const ::simd_quatd p0, const ::simd_quatd p1, double t) { return ::simd_slerp_longest(p0, p1, t); }
+
+ /*! @abstract Interpolate between quaternions along a spherical cubic spline.
+ *
+ * @discussion The function interpolates between q1 and q2. q0 is the left
+ * endpoint of the previous interval, and q3 is the right endpoint of the next
+ * interval. Use this function to smoothly interpolate between a sequence of
+ * rotations. */
+ static SIMD_CPPFUNC quatd spline(const ::simd_quatd p0, const ::simd_quatd p1, const ::simd_quatd p2, const ::simd_quatd p3, double t) { return ::simd_spline(p0, p1, p2, p3, t); }
+
+ /*! @abstract Spherical cubic Bezier interpolation between quaternions.
+ *
+ * @discussion The function treats q0 ... q3 as control points and uses slerp
+ * in place of lerp in the De Casteljau algorithm. The endpoints of
+ * interpolation are thus q0 and q3, and the curve will not generally pass
+ * through q1 or q2. Note that the convex hull property of "standard" Bezier
+ * curves does not hold on the sphere. */
+ static SIMD_CPPFUNC quatd bezier(const ::simd_quatd p0, const ::simd_quatd p1, const ::simd_quatd p2, const ::simd_quatd p3, double t) { return ::simd_bezier(p0, p1, p2, p3, t); }
+}
+
+extern "C" {
+#endif /* __cplusplus */
+
+/* MARK: - double implementations */
+
+#include <simd/math.h>
+#include <simd/geometry.h>
+
+/* tg_promote is implementation gobbledygook that enables the compile-time
+ * dispatching in tgmath.h to work its magic. */
+static simd_quatd __attribute__((__overloadable__)) __tg_promote(simd_quatd);
+
+/*! @abstract Constructs a quaternion from imaginary and real parts.
+ * @discussion This function is hidden behind an underscore to avoid confusion
+ * with the angle-axis constructor. */
+static inline SIMD_CFUNC simd_quatd _simd_quaternion(simd_double3 imag, double real) {
+ return simd_quaternion(simd_make_double4(imag, real));
+}
+
+static inline SIMD_CFUNC simd_quatd simd_quaternion(double angle, simd_double3 axis) {
+ return _simd_quaternion(sin(angle/2) * axis, cos(angle/2));
+}
+
+static inline SIMD_CFUNC double simd_angle(simd_quatd q) {
+ return 2*atan2(simd_length(q.vector.xyz), q.vector.w);
+}
+
+static inline SIMD_CFUNC simd_double3 simd_axis(simd_quatd q) {
+ return simd_normalize(q.vector.xyz);
+}
+
+static inline SIMD_CFUNC simd_quatd simd_add(simd_quatd p, simd_quatd q) {
+ return simd_quaternion(p.vector + q.vector);
+}
+
+static inline SIMD_CFUNC simd_quatd simd_sub(simd_quatd p, simd_quatd q) {
+ return simd_quaternion(p.vector - q.vector);
+}
+
+static inline SIMD_CFUNC simd_quatd simd_mul(simd_quatd p, simd_quatd q) {
+ #pragma STDC FP_CONTRACT ON
+ return simd_quaternion((p.vector.x * __builtin_shufflevector(q.vector, -q.vector, 3,6,1,4) +
+ p.vector.y * __builtin_shufflevector(q.vector, -q.vector, 2,3,4,5)) +
+ (p.vector.z * __builtin_shufflevector(q.vector, -q.vector, 5,0,3,6) +
+ p.vector.w * q.vector));
+}
+
+static inline SIMD_CFUNC simd_quatd simd_mul(simd_quatd q, double a) {
+ return simd_quaternion(a * q.vector);
+}
+
+static inline SIMD_CFUNC simd_quatd simd_mul(double a, simd_quatd q) {
+ return simd_mul(q,a);
+}
+
+static inline SIMD_CFUNC simd_quatd simd_conjugate(simd_quatd q) {
+ return simd_quaternion(q.vector * (simd_double4){-1,-1,-1, 1});
+}
+
+static inline SIMD_CFUNC simd_quatd simd_inverse(simd_quatd q) {
+ return simd_quaternion(simd_conjugate(q).vector * simd_recip(simd_length_squared(q.vector)));
+}
+
+static inline SIMD_CFUNC simd_quatd simd_negate(simd_quatd q) {
+ return simd_quaternion(-q.vector);
+}
+
+static inline SIMD_CFUNC double simd_dot(simd_quatd p, simd_quatd q) {
+ return simd_dot(p.vector, q.vector);
+}
+
+static inline SIMD_CFUNC double simd_length(simd_quatd q) {
+ return simd_length(q.vector);
+}
+
+static inline SIMD_CFUNC simd_quatd simd_normalize(simd_quatd q) {
+ double length_squared = simd_length_squared(q.vector);
+ if (length_squared == 0) {
+ return simd_quaternion((simd_double4){0,0,0,1});
+ }
+ return simd_quaternion(q.vector * simd_rsqrt(length_squared));
+}
+
+#if defined __arm__ || defined __arm64__
+/*! @abstract Multiplies the vector `v` by the quaternion `q`.
+ *
+ * @discussion This IS NOT the action of `q` on `v` (i.e., this is not rotation
+ * by `q`). That operation is provided by `simd_act(q, v)`. This function is an
+ * implementation detail and you should not call it directly. It may be
+ * removed or modified in future versions of the simd module. */
+static inline SIMD_CFUNC simd_quatd _simd_mul_vq(simd_double3 v, simd_quatd q) {
+ #pragma STDC FP_CONTRACT ON
+ return simd_quaternion(v.x * __builtin_shufflevector(q.vector, -q.vector, 3,6,1,4) +
+ v.y * __builtin_shufflevector(q.vector, -q.vector, 2,3,4,5) +
+ v.z * __builtin_shufflevector(q.vector, -q.vector, 5,0,3,6));
+}
+#endif
+
+static inline SIMD_CFUNC simd_double3 simd_act(simd_quatd q, simd_double3 v) {
+#if defined __arm__ || defined __arm64__
+ return simd_mul(q, _simd_mul_vq(v, simd_conjugate(q))).vector.xyz;
+#else
+ #pragma STDC FP_CONTRACT ON
+ simd_double3 t = 2*simd_cross(simd_imag(q),v);
+ return v + simd_real(q)*t + simd_cross(simd_imag(q), t);
+#endif
+}
+
+static SIMD_NOINLINE simd_quatd __tg_log(simd_quatd q) {
+ double real = __tg_log(simd_length_squared(q.vector))/2;
+ if (simd_equal(simd_imag(q), 0)) return _simd_quaternion(0, real);
+ simd_double3 imag = __tg_acos(simd_real(q)/simd_length(q)) * simd_normalize(simd_imag(q));
+ return _simd_quaternion(imag, real);
+}
+
+static SIMD_NOINLINE simd_quatd __tg_exp(simd_quatd q) {
+ // angle is actually *twice* the angle of the rotation corresponding to
+ // the resulting quaternion, which is why we don't simply use the (angle,
+ // axis) constructor to generate `unit`.
+ double angle = simd_length(simd_imag(q));
+ if (angle == 0) return _simd_quaternion(0, exp(simd_real(q)));
+ simd_double3 axis = simd_normalize(simd_imag(q));
+  simd_quatd unit = _simd_quaternion(sin(angle)*axis, cos(angle));
+ return simd_mul(exp(simd_real(q)), unit);
+}
+
+/*! @abstract Implementation detail of the `simd_quaternion(from, to)`
+ * initializer.
+ *
+ * @discussion Computes the quaternion that rotates `from` to `to` when they are
+ * separated by less than 90 degrees. Not numerically stable for larger
+ * angles. This function is an implementation detail and you should not
+ * call it directly. It may be removed or modified in future versions of the
+ * simd module. */
+static inline SIMD_CFUNC simd_quatd _simd_quaternion_reduced(simd_double3 from, simd_double3 to) {
+ simd_double3 half = simd_normalize(from + to);
+ return _simd_quaternion(simd_cross(from, half), simd_dot(from, half));
+}
+
+static SIMD_NOINLINE simd_quatd simd_quaternion(simd_double3 from, simd_double3 to) {
+
+ // If the angle between from and to is not too big, we can compute the
+ // rotation accurately using a simple implementation.
+ if (simd_dot(from, to) >= 0) {
+ return _simd_quaternion_reduced(from, to);
+ }
+
+ // Because from and to are more than 90 degrees apart, we compute the
+ // rotation in two stages (from -> half), (half -> to) to preserve numerical
+ // accuracy.
+ simd_double3 half = from + to;
+
+ if (simd_length_squared(half) == 0) {
+ // half is nearly zero, so from and to point in nearly opposite directions
+ // and the rotation is numerically underspecified. Pick an axis orthogonal
+ // to the vectors, and use an angle of pi radians.
+ simd_double3 abs_from = simd_abs(from);
+ if (abs_from.x <= abs_from.y && abs_from.x <= abs_from.z)
+ return _simd_quaternion(simd_normalize(simd_cross(from, (simd_double3){1,0,0})), 0.f);
+ else if (abs_from.y <= abs_from.z)
+ return _simd_quaternion(simd_normalize(simd_cross(from, (simd_double3){0,1,0})), 0.f);
+ else
+ return _simd_quaternion(simd_normalize(simd_cross(from, (simd_double3){0,0,1})), 0.f);
+ }
+
+  // Compute the two-step rotation.
+ half = simd_normalize(half);
+ return simd_mul(_simd_quaternion_reduced(from, half),
+ _simd_quaternion_reduced(half, to));
+}
+
+static SIMD_NOINLINE simd_quatd simd_quaternion(simd_double3x3 matrix) {
+ const simd_double3 *mat = matrix.columns;
+ double trace = mat[0][0] + mat[1][1] + mat[2][2];
+ if (trace >= 0.0) {
+ double r = 2*sqrt(1 + trace);
+ double rinv = simd_recip(r);
+ return simd_quaternion(rinv*(mat[1][2] - mat[2][1]),
+ rinv*(mat[2][0] - mat[0][2]),
+ rinv*(mat[0][1] - mat[1][0]),
+ r/4);
+ } else if (mat[0][0] >= mat[1][1] && mat[0][0] >= mat[2][2]) {
+ double r = 2*sqrt(1 - mat[1][1] - mat[2][2] + mat[0][0]);
+ double rinv = simd_recip(r);
+ return simd_quaternion(r/4,
+ rinv*(mat[0][1] + mat[1][0]),
+ rinv*(mat[0][2] + mat[2][0]),
+ rinv*(mat[1][2] - mat[2][1]));
+ } else if (mat[1][1] >= mat[2][2]) {
+ double r = 2*sqrt(1 - mat[0][0] - mat[2][2] + mat[1][1]);
+ double rinv = simd_recip(r);
+ return simd_quaternion(rinv*(mat[0][1] + mat[1][0]),
+ r/4,
+ rinv*(mat[1][2] + mat[2][1]),
+ rinv*(mat[2][0] - mat[0][2]));
+ } else {
+ double r = 2*sqrt(1 - mat[0][0] - mat[1][1] + mat[2][2]);
+ double rinv = simd_recip(r);
+ return simd_quaternion(rinv*(mat[0][2] + mat[2][0]),
+ rinv*(mat[1][2] + mat[2][1]),
+ r/4,
+ rinv*(mat[0][1] - mat[1][0]));
+ }
+}
+
+static SIMD_NOINLINE simd_quatd simd_quaternion(simd_double4x4 matrix) {
+ const simd_double4 *mat = matrix.columns;
+ double trace = mat[0][0] + mat[1][1] + mat[2][2];
+ if (trace >= 0.0) {
+ double r = 2*sqrt(1 + trace);
+ double rinv = simd_recip(r);
+ return simd_quaternion(rinv*(mat[1][2] - mat[2][1]),
+ rinv*(mat[2][0] - mat[0][2]),
+ rinv*(mat[0][1] - mat[1][0]),
+ r/4);
+ } else if (mat[0][0] >= mat[1][1] && mat[0][0] >= mat[2][2]) {
+ double r = 2*sqrt(1 - mat[1][1] - mat[2][2] + mat[0][0]);
+ double rinv = simd_recip(r);
+ return simd_quaternion(r/4,
+ rinv*(mat[0][1] + mat[1][0]),
+ rinv*(mat[0][2] + mat[2][0]),
+ rinv*(mat[1][2] - mat[2][1]));
+ } else if (mat[1][1] >= mat[2][2]) {
+ double r = 2*sqrt(1 - mat[0][0] - mat[2][2] + mat[1][1]);
+ double rinv = simd_recip(r);
+ return simd_quaternion(rinv*(mat[0][1] + mat[1][0]),
+ r/4,
+ rinv*(mat[1][2] + mat[2][1]),
+ rinv*(mat[2][0] - mat[0][2]));
+ } else {
+ double r = 2*sqrt(1 - mat[0][0] - mat[1][1] + mat[2][2]);
+ double rinv = simd_recip(r);
+ return simd_quaternion(rinv*(mat[0][2] + mat[2][0]),
+ rinv*(mat[1][2] + mat[2][1]),
+ r/4,
+ rinv*(mat[0][1] - mat[1][0]));
+ }
+}
+
+/*! @abstract The angle between p and q interpreted as 4-dimensional vectors.
+ *
+ * @discussion This function is an implementation detail and you should not
+ * call it directly. It may be removed or modified in future versions of the
+ * simd module. */
+static SIMD_NOINLINE double _simd_angle(simd_quatd p, simd_quatd q) {
+ return 2*atan2(simd_length(p.vector - q.vector), simd_length(p.vector + q.vector));
+}
+
+/*! @abstract sin(x)/x.
+ *
+ * @discussion This function is an implementation detail and you should not
+ * call it directly. It may be removed or modified in future versions of the
+ * simd module. */
+static SIMD_CFUNC double _simd_sinc(double x) {
+ if (x == 0) return 1;
+ return sin(x)/x;
+}
+
+/*! @abstract Spherical lerp between q0 and q1.
+ *
+ * @discussion This function may interpolate along either the longer or
+ * shorter path between q0 and q1; it is used as an implementation detail
+ * in `simd_slerp` and `simd_slerp_longest`; you should use those functions
+ * instead of calling this directly. */
+static SIMD_NOINLINE simd_quatd _simd_slerp_internal(simd_quatd q0, simd_quatd q1, double t) {
+ double s = 1 - t;
+ double a = _simd_angle(q0, q1);
+ double r = simd_recip(_simd_sinc(a));
+ return simd_normalize(simd_quaternion(_simd_sinc(s*a)*r*s*q0.vector + _simd_sinc(t*a)*r*t*q1.vector));
+}
+
+static SIMD_NOINLINE simd_quatd simd_slerp(simd_quatd q0, simd_quatd q1, double t) {
+ if (simd_dot(q0, q1) >= 0)
+ return _simd_slerp_internal(q0, q1, t);
+ return _simd_slerp_internal(q0, simd_negate(q1), t);
+}
+
+static SIMD_NOINLINE simd_quatd simd_slerp_longest(simd_quatd q0, simd_quatd q1, double t) {
+ if (simd_dot(q0, q1) >= 0)
+ return _simd_slerp_internal(q0, simd_negate(q1), t);
+ return _simd_slerp_internal(q0, q1, t);
+}
+
+/*! @discussion This function is an implementation detail and you should not
+ * call it directly. It may be removed or modified in future versions of the
+ * simd module. */
+static SIMD_NOINLINE simd_quatd _simd_intermediate(simd_quatd q0, simd_quatd q1, simd_quatd q2) {
+ simd_quatd p0 = __tg_log(simd_mul(q0, simd_inverse(q1)));
+ simd_quatd p2 = __tg_log(simd_mul(q2, simd_inverse(q1)));
+ return simd_normalize(simd_mul(q1, __tg_exp(simd_mul(-0.25, simd_add(p0,p2)))));
+}
+
+/*! @discussion This function is an implementation detail and you should not
+ * call it directly. It may be removed or modified in future versions of the
+ * simd module. */
+static SIMD_NOINLINE simd_quatd _simd_squad(simd_quatd q0, simd_quatd qa, simd_quatd qb, simd_quatd q1, double t) {
+ simd_quatd r0 = _simd_slerp_internal(q0, q1, t);
+ simd_quatd r1 = _simd_slerp_internal(qa, qb, t);
+ return _simd_slerp_internal(r0, r1, 2*t*(1 - t));
+}
+
+static SIMD_NOINLINE simd_quatd simd_spline(simd_quatd q0, simd_quatd q1, simd_quatd q2, simd_quatd q3, double t) {
+ simd_quatd qa = _simd_intermediate(q0, q1, q2);
+ simd_quatd qb = _simd_intermediate(q1, q2, q3);
+ return _simd_squad(q1, qa, qb, q2, t);
+}
+
+static SIMD_NOINLINE simd_quatd simd_bezier(simd_quatd q0, simd_quatd q1, simd_quatd q2, simd_quatd q3, double t) {
+ simd_quatd q01 = _simd_slerp_internal(q0, q1, t);
+ simd_quatd q12 = _simd_slerp_internal(q1, q2, t);
+ simd_quatd q23 = _simd_slerp_internal(q2, q3, t);
+ simd_quatd q012 = _simd_slerp_internal(q01, q12, t);
+ simd_quatd q123 = _simd_slerp_internal(q12, q23, t);
+ return _simd_slerp_internal(q012, q123, t);
+}
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
+#endif /* SIMD_COMPILER_HAS_REQUIRED_FEATURES */
+#endif /* SIMD_QUATERNIONS */
\ No newline at end of file