| author | Andrew Kelley <andrew@ziglang.org> | 2020-01-22 17:42:02 -0500 |
|---|---|---|
| committer | Andrew Kelley <andrew@ziglang.org> | 2020-01-22 17:42:44 -0500 |
| commit | 74872263cc83bb7cbd8d548e83b441f4bf2f2249 | |
| tree | 6c4e7bb2cbcb97d3b16259ecebcb05915921ecfe /lib/include/ppc_wrappers | |
| parent | 97b2ac598b91194c09a96c9ed86e4f9b266d019c | |
lib/headers: update to clang 10.x C headers
upstream revision: 3cce3790072249cbe51b96cea26bc78019c11fd0
Diffstat (limited to 'lib/include/ppc_wrappers')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | lib/include/ppc_wrappers/emmintrin.h | 10 |
| -rw-r--r-- | lib/include/ppc_wrappers/mm_malloc.h | 6 |
| -rw-r--r-- | lib/include/ppc_wrappers/mmintrin.h | 7 |
| -rw-r--r-- | lib/include/ppc_wrappers/pmmintrin.h | 150 |
| -rw-r--r-- | lib/include/ppc_wrappers/smmintrin.h | 85 |
| -rw-r--r-- | lib/include/ppc_wrappers/tmmintrin.h | 495 |
| -rw-r--r-- | lib/include/ppc_wrappers/xmmintrin.h | 6 |
7 files changed, 757 insertions, 2 deletions
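The wrappers touched by this commit all share the same shape: on Linux/ppc64 targets they implement the Intel intrinsics in terms of VMX/VSX operations, otherwise they fall through to the next header on the include path via `#include_next`, and the SSE3/SSE4 headers refuse to compile unless `NO_WARN_X86_INTRINSICS` is defined. A minimal sketch of how consuming code picks these headers up follows; the compiler invocation and the sample values are illustrative assumptions, not part of the commit:

```c
/* Hypothetical consumer of the ppc_wrappers headers. Assumed build:
 *   clang -target powerpc64le-linux-gnu -DNO_WARN_X86_INTRINSICS demo.c
 * Without -DNO_WARN_X86_INTRINSICS the SSE3/SSE4 wrappers stop with an
 * #error; on non-ppc64 targets the wrapper falls through via #include_next. */
#include <pmmintrin.h>   /* pulls in emmintrin.h / xmmintrin.h as well */
#include <stdio.h>

int main(void) {
    __m128 x = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f); /* lanes: 1, 2, 3, 4 */
    __m128 y = _mm_set_ps(0.5f, 0.5f, 0.5f, 0.5f);
    __m128 r = _mm_addsub_ps(x, y);                /* {1-0.5, 2+0.5, 3-0.5, 4+0.5} */

    float out[4];
    _mm_storeu_ps(out, r);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
    return 0;
}
```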
diff --git a/lib/include/ppc_wrappers/emmintrin.h b/lib/include/ppc_wrappers/emmintrin.h index 617ce24acd..4dcb8485e2 100644 --- a/lib/include/ppc_wrappers/emmintrin.h +++ b/lib/include/ppc_wrappers/emmintrin.h @@ -35,6 +35,8 @@ #ifndef EMMINTRIN_H_ #define EMMINTRIN_H_ +#if defined(__linux__) && defined(__ppc64__) + #include <altivec.h> /* We need definitions from the SSE header files. */ @@ -1747,7 +1749,7 @@ _mm_sll_epi64 (__m128i __A, __m128i __B) lshift = vec_splat ((__v2du) __B, 0); shmask = vec_cmplt (lshift, shmax); result = vec_sl ((__v2du) __A, lshift); - result = vec_sel ((__v2du) shmask, result, shmask); + result = (__v2du)vec_sel ((__v2df) shmask, (__v2df)result, shmask); return (__m128i) result; } @@ -1841,7 +1843,7 @@ _mm_srl_epi64 (__m128i __A, __m128i __B) rshift = vec_splat ((__v2du) __B, 0); shmask = vec_cmplt (rshift, shmax); result = vec_sr ((__v2du) __A, rshift); - result = vec_sel ((__v2du) shmask, result, shmask); + result = (__v2du)vec_sel ((__v2df) shmask, (__v2df)result, shmask); return (__m128i) result; } @@ -2315,4 +2317,8 @@ _mm_castsi128_pd(__m128i __A) return (__m128d) __A; } +#else +#include_next <emmintrin.h> +#endif /* defined(__linux__) && defined(__ppc64__) */ + #endif /* EMMINTRIN_H_ */ diff --git a/lib/include/ppc_wrappers/mm_malloc.h b/lib/include/ppc_wrappers/mm_malloc.h index d91d7865c8..24b14c8e07 100644 --- a/lib/include/ppc_wrappers/mm_malloc.h +++ b/lib/include/ppc_wrappers/mm_malloc.h @@ -10,6 +10,8 @@ #ifndef _MM_MALLOC_H_INCLUDED #define _MM_MALLOC_H_INCLUDED +#if defined(__linux__) && defined(__ppc64__) + #include <stdlib.h> /* We can't depend on <stdlib.h> since the prototype of posix_memalign @@ -41,4 +43,8 @@ _mm_free (void * ptr) free (ptr); } +#else +#include_next <mm_malloc.h> +#endif + #endif /* _MM_MALLOC_H_INCLUDED */ diff --git a/lib/include/ppc_wrappers/mmintrin.h b/lib/include/ppc_wrappers/mmintrin.h index b949653adf..c55c44726f 100644 --- a/lib/include/ppc_wrappers/mmintrin.h +++ b/lib/include/ppc_wrappers/mmintrin.h @@ -35,6 +35,8 @@ #ifndef _MMINTRIN_H_INCLUDED #define _MMINTRIN_H_INCLUDED +#if defined(__linux__) && defined(__ppc64__) + #include <altivec.h> /* The Intel API is flexible enough that we must allow aliasing with other vector types, and their scalar components. */ @@ -1440,4 +1442,9 @@ extern __inline __m64 return (res.as_m64); #endif } + +#else +#include_next <mmintrin.h> +#endif /* defined(__linux__) && defined(__ppc64__) */ + #endif /* _MMINTRIN_H_INCLUDED */ diff --git a/lib/include/ppc_wrappers/pmmintrin.h b/lib/include/ppc_wrappers/pmmintrin.h new file mode 100644 index 0000000000..6d93383d54 --- /dev/null +++ b/lib/include/ppc_wrappers/pmmintrin.h @@ -0,0 +1,150 @@ +/*===---- pmmintrin.h - Implementation of SSE3 intrinsics on PowerPC -------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +/* Implemented from the specification included in the Intel C++ Compiler + User Guide and Reference, version 9.0. */ + +#ifndef NO_WARN_X86_INTRINSICS +/* This header is distributed to simplify porting x86_64 code that + makes explicit use of Intel intrinsics to powerpc64le. + It is the user's responsibility to determine if the results are + acceptable and make additional changes as necessary. 
+ Note that much code that uses Intel intrinsics can be rewritten in + standard C or GNU C extensions, which are more portable and better + optimized across multiple targets. + + In the specific case of X86 SSE3 intrinsics, the PowerPC VMX/VSX ISA + is a good match for most SIMD operations. However the Horizontal + add/sub requires the data pairs be permuted into a separate + registers with vertical even/odd alignment for the operation. + And the addsub operation requires the sign of only the even numbered + elements be flipped (xored with -0.0). + For larger blocks of code using these intrinsic implementations, + the compiler be should be able to schedule instructions to avoid + additional latency. + + In the specific case of the monitor and mwait instructions there are + no direct equivalent in the PowerISA at this time. So those + intrinsics are not implemented. */ +#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this warning." +#endif + +#ifndef PMMINTRIN_H_ +#define PMMINTRIN_H_ + +#if defined(__linux__) && defined(__ppc64__) + +/* We need definitions from the SSE2 and SSE header files*/ +#include <emmintrin.h> + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_addsub_ps (__m128 __X, __m128 __Y) +{ + const __v4sf even_n0 = {-0.0, 0.0, -0.0, 0.0}; + __v4sf even_neg_Y = vec_xor(__Y, even_n0); + return (__m128) vec_add (__X, even_neg_Y); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_addsub_pd (__m128d __X, __m128d __Y) +{ + const __v2df even_n0 = {-0.0, 0.0}; + __v2df even_neg_Y = vec_xor(__Y, even_n0); + return (__m128d) vec_add (__X, even_neg_Y); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadd_ps (__m128 __X, __m128 __Y) +{ + __vector unsigned char xform2 = { + 0x00, 0x01, 0x02, 0x03, + 0x08, 0x09, 0x0A, 0x0B, + 0x10, 0x11, 0x12, 0x13, + 0x18, 0x19, 0x1A, 0x1B + }; + __vector unsigned char xform1 = { + 0x04, 0x05, 0x06, 0x07, + 0x0C, 0x0D, 0x0E, 0x0F, + 0x14, 0x15, 0x16, 0x17, + 0x1C, 0x1D, 0x1E, 0x1F + }; + return (__m128) vec_add (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2), + vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsub_ps (__m128 __X, __m128 __Y) +{ + __vector unsigned char xform2 = { + 0x00, 0x01, 0x02, 0x03, + 0x08, 0x09, 0x0A, 0x0B, + 0x10, 0x11, 0x12, 0x13, + 0x18, 0x19, 0x1A, 0x1B + }; + __vector unsigned char xform1 = { + 0x04, 0x05, 0x06, 0x07, + 0x0C, 0x0D, 0x0E, 0x0F, + 0x14, 0x15, 0x16, 0x17, + 0x1C, 0x1D, 0x1E, 0x1F + }; + return (__m128) vec_sub (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2), + vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadd_pd (__m128d __X, __m128d __Y) +{ + return (__m128d) vec_add (vec_mergeh ((__v2df) __X, (__v2df)__Y), + vec_mergel ((__v2df) __X, (__v2df)__Y)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsub_pd (__m128d __X, __m128d __Y) +{ + return (__m128d) vec_sub (vec_mergeh ((__v2df) __X, (__v2df)__Y), + vec_mergel ((__v2df) __X, (__v2df)__Y)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movehdup_ps (__m128 __X) +{ + return (__m128)vec_mergeo ((__v4su)__X, (__v4su)__X); +} + +extern __inline __m128 __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm_moveldup_ps (__m128 __X) +{ + return (__m128)vec_mergee ((__v4su)__X, (__v4su)__X); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loaddup_pd (double const *__P) +{ + return (__m128d) vec_splats (*__P); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movedup_pd (__m128d __X) +{ + return _mm_shuffle_pd (__X, __X, _MM_SHUFFLE2 (0,0)); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_lddqu_si128 (__m128i const *__P) +{ + return (__m128i) (vec_vsx_ld(0, (signed int const *)__P)); +} + +/* POWER8 / POWER9 have no equivalent for _mm_monitor nor _mm_wait. */ + +#else +#include_next <pmmintrin.h> +#endif /* defined(__linux__) && defined(__ppc64__) */ + +#endif /* PMMINTRIN_H_ */ diff --git a/lib/include/ppc_wrappers/smmintrin.h b/lib/include/ppc_wrappers/smmintrin.h new file mode 100644 index 0000000000..56ef6ba76b --- /dev/null +++ b/lib/include/ppc_wrappers/smmintrin.h @@ -0,0 +1,85 @@ +/*===---- smmintrin.h - Implementation of SSE4 intrinsics on PowerPC -------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +/* Implemented from the specification included in the Intel C++ Compiler + User Guide and Reference, version 9.0. + + NOTE: This is NOT a complete implementation of the SSE4 intrinsics! */ + +#ifndef NO_WARN_X86_INTRINSICS +/* This header is distributed to simplify porting x86_64 code that + makes explicit use of Intel intrinsics to powerp64/powerpc64le. + + It is the user's responsibility to determine if the results are + acceptable and make additional changes as necessary. + + Note that much code that uses Intel intrinsics can be rewritten in + standard C or GNU C extensions, which are more portable and better + optimized across multiple targets. */ +#error \ + "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error." 
+#endif + +#ifndef SMMINTRIN_H_ +#define SMMINTRIN_H_ + +#if defined(__linux__) && defined(__ppc64__) + +#include <altivec.h> +#include <emmintrin.h> + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_extract_epi8(__m128i __X, const int __N) { + return (unsigned char)((__v16qi)__X)[__N & 15]; +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_extract_epi32(__m128i __X, const int __N) { + return ((__v4si)__X)[__N & 3]; +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_extract_epi64(__m128i __X, const int __N) { + return ((__v2di)__X)[__N & 1]; +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_extract_ps(__m128 __X, const int __N) { + return ((__v4si)__X)[__N & 3]; +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) { + __v16qi __charmask = vec_splats((signed char)__imm8); + __charmask = vec_gb(__charmask); + __v8hu __shortmask = (__v8hu)vec_unpackh(__charmask); +#ifdef __BIG_ENDIAN__ + __shortmask = vec_reve(__shortmask); +#endif + return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __shortmask); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) { + const __v16qu __seven = vec_splats((unsigned char)0x07); + __v16qu __lmask = vec_sra((__v16qu)__mask, __seven); + return (__m128i)vec_sel((__v16qu)__A, (__v16qu)__B, __lmask); +} + +#else +#include_next <smmintrin.h> +#endif /* defined(__linux__) && defined(__ppc64__) */ + +#endif /* _SMMINTRIN_H_ */ diff --git a/lib/include/ppc_wrappers/tmmintrin.h b/lib/include/ppc_wrappers/tmmintrin.h new file mode 100644 index 0000000000..b5a935d5e4 --- /dev/null +++ b/lib/include/ppc_wrappers/tmmintrin.h @@ -0,0 +1,495 @@ +/*===---- tmmintrin.h - Implementation of SSSE3 intrinsics on PowerPC ------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +/* Implemented from the specification included in the Intel C++ Compiler + User Guide and Reference, version 9.0. */ + +#ifndef NO_WARN_X86_INTRINSICS +/* This header is distributed to simplify porting x86_64 code that + makes explicit use of Intel intrinsics to powerpc64le. + + It is the user's responsibility to determine if the results are + acceptable and make additional changes as necessary. + + Note that much code that uses Intel intrinsics can be rewritten in + standard C or GNU C extensions, which are more portable and better + optimized across multiple targets. */ +#endif + +#ifndef TMMINTRIN_H_ +#define TMMINTRIN_H_ + +#if defined(__linux__) && defined(__ppc64__) + +#include <altivec.h> + +/* We need definitions from the SSE header files. 
*/ +#include <pmmintrin.h> + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_abs_epi16 (__m128i __A) +{ + return (__m128i) vec_abs ((__v8hi) __A); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_abs_epi32 (__m128i __A) +{ + return (__m128i) vec_abs ((__v4si) __A); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_abs_epi8 (__m128i __A) +{ + return (__m128i) vec_abs ((__v16qi) __A); +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_abs_pi16 (__m64 __A) +{ + __v8hi __B = (__v8hi) (__v2du) { __A, __A }; + return (__m64) ((__v2du) vec_abs (__B))[0]; +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_abs_pi32 (__m64 __A) +{ + __v4si __B = (__v4si) (__v2du) { __A, __A }; + return (__m64) ((__v2du) vec_abs (__B))[0]; +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_abs_pi8 (__m64 __A) +{ + __v16qi __B = (__v16qi) (__v2du) { __A, __A }; + return (__m64) ((__v2du) vec_abs (__B))[0]; +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_alignr_epi8 (__m128i __A, __m128i __B, const unsigned int __count) +{ + if (__builtin_constant_p (__count) && __count < 16) + { +#ifdef __LITTLE_ENDIAN__ + __A = (__m128i) vec_reve ((__v16qu) __A); + __B = (__m128i) vec_reve ((__v16qu) __B); +#endif + __A = (__m128i) vec_sld ((__v16qu) __B, (__v16qu) __A, __count); +#ifdef __LITTLE_ENDIAN__ + __A = (__m128i) vec_reve ((__v16qu) __A); +#endif + return __A; + } + + if (__count == 0) + return __B; + + if (__count >= 16) + { + if (__count >= 32) + { + const __v16qu zero = { 0 }; + return (__m128i) zero; + } + else + { + const __v16qu __shift = + vec_splats ((unsigned char) ((__count - 16) * 8)); +#ifdef __LITTLE_ENDIAN__ + return (__m128i) vec_sro ((__v16qu) __A, __shift); +#else + return (__m128i) vec_slo ((__v16qu) __A, __shift); +#endif + } + } + else + { + const __v16qu __shiftA = + vec_splats ((unsigned char) ((16 - __count) * 8)); + const __v16qu __shiftB = vec_splats ((unsigned char) (__count * 8)); +#ifdef __LITTLE_ENDIAN__ + __A = (__m128i) vec_slo ((__v16qu) __A, __shiftA); + __B = (__m128i) vec_sro ((__v16qu) __B, __shiftB); +#else + __A = (__m128i) vec_sro ((__v16qu) __A, __shiftA); + __B = (__m128i) vec_slo ((__v16qu) __B, __shiftB); +#endif + return (__m128i) vec_or ((__v16qu) __A, (__v16qu) __B); + } +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_alignr_pi8 (__m64 __A, __m64 __B, unsigned int __count) +{ + if (__count < 16) + { + __v2du __C = { __B, __A }; +#ifdef __LITTLE_ENDIAN__ + const __v4su __shift = { __count << 3, 0, 0, 0 }; + __C = (__v2du) vec_sro ((__v16qu) __C, (__v16qu) __shift); +#else + const __v4su __shift = { 0, 0, 0, __count << 3 }; + __C = (__v2du) vec_slo ((__v16qu) __C, (__v16qu) __shift); +#endif + return (__m64) __C[0]; + } + else + { + const __m64 __zero = { 0 }; + return __zero; + } +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadd_epi16 (__m128i __A, __m128i __B) +{ + const __v16qu __P = + { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; + const __v16qu __Q = + { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }; + __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P); + __v8hi __D = vec_perm 
((__v8hi) __A, (__v8hi) __B, __Q); + return (__m128i) vec_add (__C, __D); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadd_epi32 (__m128i __A, __m128i __B) +{ + const __v16qu __P = + { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 }; + const __v16qu __Q = + { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 }; + __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P); + __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q); + return (__m128i) vec_add (__C, __D); +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadd_pi16 (__m64 __A, __m64 __B) +{ + __v8hi __C = (__v8hi) (__v2du) { __A, __B }; + const __v16qu __P = + { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 }; + const __v16qu __Q = + { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 }; + __v8hi __D = vec_perm (__C, __C, __Q); + __C = vec_perm (__C, __C, __P); + __C = vec_add (__C, __D); + return (__m64) ((__v2du) __C)[1]; +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadd_pi32 (__m64 __A, __m64 __B) +{ + __v4si __C = (__v4si) (__v2du) { __A, __B }; + const __v16qu __P = + { 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11 }; + const __v16qu __Q = + { 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15 }; + __v4si __D = vec_perm (__C, __C, __Q); + __C = vec_perm (__C, __C, __P); + __C = vec_add (__C, __D); + return (__m64) ((__v2du) __C)[1]; +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadds_epi16 (__m128i __A, __m128i __B) +{ + __v4si __C = { 0 }, __D = { 0 }; + __C = vec_sum4s ((__v8hi) __A, __C); + __D = vec_sum4s ((__v8hi) __B, __D); + __C = (__v4si) vec_packs (__C, __D); + return (__m128i) __C; +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadds_pi16 (__m64 __A, __m64 __B) +{ + const __v4si __zero = { 0 }; + __v8hi __C = (__v8hi) (__v2du) { __A, __B }; + __v4si __D = vec_sum4s (__C, __zero); + __C = vec_packs (__D, __D); + return (__m64) ((__v2du) __C)[1]; +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsub_epi16 (__m128i __A, __m128i __B) +{ + const __v16qu __P = + { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; + const __v16qu __Q = + { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }; + __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P); + __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q); + return (__m128i) vec_sub (__C, __D); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsub_epi32 (__m128i __A, __m128i __B) +{ + const __v16qu __P = + { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 }; + const __v16qu __Q = + { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 }; + __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P); + __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q); + return (__m128i) vec_sub (__C, __D); +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsub_pi16 (__m64 __A, __m64 __B) +{ + const __v16qu __P = + { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 }; + const __v16qu __Q = + { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 }; + __v8hi __C = (__v8hi) (__v2du) { __A, __B }; + __v8hi __D = vec_perm (__C, __C, __Q); + __C = vec_perm (__C, __C, __P); + __C = vec_sub 
(__C, __D); + return (__m64) ((__v2du) __C)[1]; +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsub_pi32 (__m64 __A, __m64 __B) +{ + const __v16qu __P = + { 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11 }; + const __v16qu __Q = + { 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15 }; + __v4si __C = (__v4si) (__v2du) { __A, __B }; + __v4si __D = vec_perm (__C, __C, __Q); + __C = vec_perm (__C, __C, __P); + __C = vec_sub (__C, __D); + return (__m64) ((__v2du) __C)[1]; +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsubs_epi16 (__m128i __A, __m128i __B) +{ + const __v16qu __P = + { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; + const __v16qu __Q = + { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }; + __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P); + __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q); + return (__m128i) vec_subs (__C, __D); +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsubs_pi16 (__m64 __A, __m64 __B) +{ + const __v16qu __P = + { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 }; + const __v16qu __Q = + { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 }; + __v8hi __C = (__v8hi) (__v2du) { __A, __B }; + __v8hi __D = vec_perm (__C, __C, __P); + __v8hi __E = vec_perm (__C, __C, __Q); + __C = vec_subs (__D, __E); + return (__m64) ((__v2du) __C)[1]; +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shuffle_epi8 (__m128i __A, __m128i __B) +{ + const __v16qi __zero = { 0 }; + __vector __bool char __select = vec_cmplt ((__v16qi) __B, __zero); + __v16qi __C = vec_perm ((__v16qi) __A, (__v16qi) __A, (__v16qu) __B); + return (__m128i) vec_sel (__C, __zero, __select); +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shuffle_pi8 (__m64 __A, __m64 __B) +{ + const __v16qi __zero = { 0 }; + __v16qi __C = (__v16qi) (__v2du) { __A, __A }; + __v16qi __D = (__v16qi) (__v2du) { __B, __B }; + __vector __bool char __select = vec_cmplt ((__v16qi) __D, __zero); + __C = vec_perm ((__v16qi) __C, (__v16qi) __C, (__v16qu) __D); + __C = vec_sel (__C, __zero, __select); + return (__m64) ((__v2du) (__C))[0]; +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sign_epi8 (__m128i __A, __m128i __B) +{ + const __v16qi __zero = { 0 }; + __v16qi __selectneg = (__v16qi) vec_cmplt ((__v16qi) __B, __zero); + __v16qi __selectpos = + (__v16qi) vec_neg ((__v16qi) vec_cmpgt ((__v16qi) __B, __zero)); + __v16qi __conv = vec_add (__selectneg, __selectpos); + return (__m128i) vec_mul ((__v16qi) __A, (__v16qi) __conv); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sign_epi16 (__m128i __A, __m128i __B) +{ + const __v8hi __zero = { 0 }; + __v8hi __selectneg = (__v8hi) vec_cmplt ((__v8hi) __B, __zero); + __v8hi __selectpos = + (__v8hi) vec_neg ((__v8hi) vec_cmpgt ((__v8hi) __B, __zero)); + __v8hi __conv = vec_add (__selectneg, __selectpos); + return (__m128i) vec_mul ((__v8hi) __A, (__v8hi) __conv); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sign_epi32 (__m128i __A, __m128i __B) +{ + const __v4si __zero = { 0 }; + __v4si __selectneg = (__v4si) vec_cmplt ((__v4si) __B, __zero); + __v4si __selectpos = + (__v4si) vec_neg ((__v4si) 
vec_cmpgt ((__v4si) __B, __zero)); + __v4si __conv = vec_add (__selectneg, __selectpos); + return (__m128i) vec_mul ((__v4si) __A, (__v4si) __conv); +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sign_pi8 (__m64 __A, __m64 __B) +{ + const __v16qi __zero = { 0 }; + __v16qi __C = (__v16qi) (__v2du) { __A, __A }; + __v16qi __D = (__v16qi) (__v2du) { __B, __B }; + __C = (__v16qi) _mm_sign_epi8 ((__m128i) __C, (__m128i) __D); + return (__m64) ((__v2du) (__C))[0]; +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sign_pi16 (__m64 __A, __m64 __B) +{ + const __v8hi __zero = { 0 }; + __v8hi __C = (__v8hi) (__v2du) { __A, __A }; + __v8hi __D = (__v8hi) (__v2du) { __B, __B }; + __C = (__v8hi) _mm_sign_epi16 ((__m128i) __C, (__m128i) __D); + return (__m64) ((__v2du) (__C))[0]; +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sign_pi32 (__m64 __A, __m64 __B) +{ + const __v4si __zero = { 0 }; + __v4si __C = (__v4si) (__v2du) { __A, __A }; + __v4si __D = (__v4si) (__v2du) { __B, __B }; + __C = (__v4si) _mm_sign_epi32 ((__m128i) __C, (__m128i) __D); + return (__m64) ((__v2du) (__C))[0]; +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maddubs_epi16 (__m128i __A, __m128i __B) +{ + __v8hi __unsigned = vec_splats ((signed short) 0x00ff); + __v8hi __C = vec_and (vec_unpackh ((__v16qi) __A), __unsigned); + __v8hi __D = vec_and (vec_unpackl ((__v16qi) __A), __unsigned); + __v8hi __E = vec_unpackh ((__v16qi) __B); + __v8hi __F = vec_unpackl ((__v16qi) __B); + __C = vec_mul (__C, __E); + __D = vec_mul (__D, __F); + const __v16qu __odds = + { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; + const __v16qu __evens = + { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }; + __E = vec_perm (__C, __D, __odds); + __F = vec_perm (__C, __D, __evens); + return (__m128i) vec_adds (__E, __F); +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maddubs_pi16 (__m64 __A, __m64 __B) +{ + __v8hi __C = (__v8hi) (__v2du) { __A, __A }; + __C = vec_unpackl ((__v16qi) __C); + const __v8hi __unsigned = vec_splats ((signed short) 0x00ff); + __C = vec_and (__C, __unsigned); + __v8hi __D = (__v8hi) (__v2du) { __B, __B }; + __D = vec_unpackl ((__v16qi) __D); + __D = vec_mul (__C, __D); + const __v16qu __odds = + { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; + const __v16qu __evens = + { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }; + __C = vec_perm (__D, __D, __odds); + __D = vec_perm (__D, __D, __evens); + __C = vec_adds (__C, __D); + return (__m64) ((__v2du) (__C))[0]; +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mulhrs_epi16 (__m128i __A, __m128i __B) +{ + __v4si __C = vec_unpackh ((__v8hi) __A); + __v4si __D = vec_unpackh ((__v8hi) __B); + __C = vec_mul (__C, __D); + __D = vec_unpackl ((__v8hi) __A); + __v4si __E = vec_unpackl ((__v8hi) __B); + __D = vec_mul (__D, __E); + const __v4su __shift = vec_splats ((unsigned int) 14); + __C = vec_sr (__C, __shift); + __D = vec_sr (__D, __shift); + const __v4si __ones = vec_splats ((signed int) 1); + __C = vec_add (__C, __ones); + __C = vec_sr (__C, (__v4su) __ones); + __D = vec_add (__D, __ones); + __D = vec_sr (__D, (__v4su) __ones); + return (__m128i) vec_pack (__C, __D); +} + +extern __inline __m64 
+__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mulhrs_pi16 (__m64 __A, __m64 __B) +{ + __v4si __C = (__v4si) (__v2du) { __A, __A }; + __C = vec_unpackh ((__v8hi) __C); + __v4si __D = (__v4si) (__v2du) { __B, __B }; + __D = vec_unpackh ((__v8hi) __D); + __C = vec_mul (__C, __D); + const __v4su __shift = vec_splats ((unsigned int) 14); + __C = vec_sr (__C, __shift); + const __v4si __ones = vec_splats ((signed int) 1); + __C = vec_add (__C, __ones); + __C = vec_sr (__C, (__v4su) __ones); + __v8hi __E = vec_pack (__C, __D); + return (__m64) ((__v2du) (__E))[0]; +} + +#else +#include_next <tmmintrin.h> +#endif /* defined(__linux__) && defined(__ppc64__) */ + +#endif /* TMMINTRIN_H_ */ diff --git a/lib/include/ppc_wrappers/xmmintrin.h b/lib/include/ppc_wrappers/xmmintrin.h index 1b322b6651..0f429fa040 100644 --- a/lib/include/ppc_wrappers/xmmintrin.h +++ b/lib/include/ppc_wrappers/xmmintrin.h @@ -34,6 +34,8 @@ #ifndef _XMMINTRIN_H_INCLUDED #define _XMMINTRIN_H_INCLUDED +#if defined(__linux__) && defined(__ppc64__) + /* Define four value permute mask */ #define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z)) @@ -1835,4 +1837,8 @@ do { \ /* For backward source compatibility. */ //# include <emmintrin.h> +#else +#include_next <xmmintrin.h> +#endif /* defined(__linux__) && defined(__ppc64__) */ + #endif /* _XMMINTRIN_H_INCLUDED */ |
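The pmmintrin.h comment above notes that the horizontal add/sub operations need the data pairs permuted into even/odd alignment, and that addsub flips the sign of only the even-numbered elements. For clarity, here is the standard SSE3 behaviour those wrappers reproduce, written out as a plain-C reference; the helper names are made up for this sketch and are not part of the commit:

```c
/* Plain-C reference for the SSE3 semantics the wrappers implement. */
static void addsub_ps_ref(const float a[4], const float b[4], float r[4]) {
    r[0] = a[0] - b[0]; /* even-indexed lanes: subtract (sign-flipped add) */
    r[1] = a[1] + b[1]; /* odd-indexed lanes: add */
    r[2] = a[2] - b[2];
    r[3] = a[3] + b[3];
}

static void hadd_ps_ref(const float a[4], const float b[4], float r[4]) {
    /* Horizontal add sums adjacent pairs within each source, which is why
     * the VSX implementation first permutes even/odd elements apart
     * (the xform1/xform2 masks) before a single vertical vec_add. */
    r[0] = a[0] + a[1];
    r[1] = a[2] + a[3];
    r[2] = b[0] + b[1];
    r[3] = b[2] + b[3];
}
```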
