/*===---- pmmintrin.h - Implementation of SSE3 intrinsics on PowerPC -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0. */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.
   It is the user's responsibility to determine if the results are
   acceptable and to make additional changes as necessary.
   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.

   In the specific case of the X86 SSE3 intrinsics, the PowerPC VMX/VSX ISA
   is a good match for most SIMD operations.  However, the horizontal
   add/sub operations require the data pairs to be permuted into separate
   registers with vertical even/odd alignment before the operation, and
   the addsub operation requires that the sign of only the even-numbered
   elements be flipped (XORed with -0.0).
   For larger blocks of code using these intrinsic implementations,
   the compiler should be able to schedule instructions to avoid
   additional latency.

   In the specific case of the monitor and mwait instructions, there is
   no direct equivalent in the PowerISA at this time, so those
   intrinsics are not implemented. */
#error                                                                         \
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this warning."
#endif

#ifndef PMMINTRIN_H_
#define PMMINTRIN_H_

#if defined(__powerpc64__) &&                                                  \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

/* We need definitions from the SSE2 and SSE header files. */
#include <emmintrin.h>

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_addsub_ps(__m128 __X, __m128 __Y) {
  const __v4sf __even_n0 = {-0.0, 0.0, -0.0, 0.0};
  __v4sf __even_neg_Y = vec_xor(__Y, __even_n0);
  return (__m128)vec_add(__X, __even_neg_Y);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_addsub_pd(__m128d __X, __m128d __Y) {
  const __v2df __even_n0 = {-0.0, 0.0};
  __v2df __even_neg_Y = vec_xor(__Y, __even_n0);
  return (__m128d)vec_add(__X, __even_neg_Y);
}
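/* Illustrative note (a minimal sketch, not part of the original header):
   _mm_addsub_ps subtracts in the even (0-based) lanes and adds in the odd
   lanes, which is why XORing __Y with {-0.0, 0.0, -0.0, 0.0} above lets a
   single vec_add reproduce the x86 ADDSUBPS result.  With hypothetical
   values:

     __m128 __x = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);     // lanes {1, 2, 3, 4}
     __m128 __y = _mm_set_ps(40.0f, 30.0f, 20.0f, 10.0f); // lanes {10, 20, 30, 40}
     __m128 __r = _mm_addsub_ps(__x, __y);
     // lanes of __r: {1 - 10, 2 + 20, 3 - 30, 4 + 40} = {-9, 22, -27, 44}

   _mm_addsub_pd behaves the same way on the two double-precision lanes. */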

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hadd_ps(__m128 __X, __m128 __Y) {
  __vector unsigned char __xform2 = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09,
                                     0x0A, 0x0B, 0x10, 0x11, 0x12, 0x13,
                                     0x18, 0x19, 0x1A, 0x1B};
  __vector unsigned char __xform1 = {0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D,
                                     0x0E, 0x0F, 0x14, 0x15, 0x16, 0x17,
                                     0x1C, 0x1D, 0x1E, 0x1F};
  return (__m128)vec_add(vec_perm((__v4sf)__X, (__v4sf)__Y, __xform2),
                         vec_perm((__v4sf)__X, (__v4sf)__Y, __xform1));
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsub_ps(__m128 __X, __m128 __Y) {
  __vector unsigned char __xform2 = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09,
                                     0x0A, 0x0B, 0x10, 0x11, 0x12, 0x13,
                                     0x18, 0x19, 0x1A, 0x1B};
  __vector unsigned char __xform1 = {0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D,
                                     0x0E, 0x0F, 0x14, 0x15, 0x16, 0x17,
                                     0x1C, 0x1D, 0x1E, 0x1F};
  return (__m128)vec_sub(vec_perm((__v4sf)__X, (__v4sf)__Y, __xform2),
                         vec_perm((__v4sf)__X, (__v4sf)__Y, __xform1));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hadd_pd(__m128d __X, __m128d __Y) {
  return (__m128d)vec_add(vec_mergeh((__v2df)__X, (__v2df)__Y),
                          vec_mergel((__v2df)__X, (__v2df)__Y));
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsub_pd(__m128d __X, __m128d __Y) {
  return (__m128d)vec_sub(vec_mergeh((__v2df)__X, (__v2df)__Y),
                          vec_mergel((__v2df)__X, (__v2df)__Y));
}
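/* Illustrative note (a minimal sketch, not part of the original header):
   the horizontal operations above add or subtract adjacent lane pairs of
   each source operand.  With hypothetical values:

     __m128 __x = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);     // lanes {1, 2, 3, 4}
     __m128 __y = _mm_set_ps(40.0f, 30.0f, 20.0f, 10.0f); // lanes {10, 20, 30, 40}
     __m128 __r = _mm_hadd_ps(__x, __y);
     // lanes of __r: {1 + 2, 3 + 4, 10 + 20, 30 + 40} = {3, 7, 30, 70}

   As described in the comment at the top of this file, the two vec_perm
   calls gather the even-numbered and odd-numbered lanes of the two
   arguments into separate registers so that a single vertical vec_add
   (or vec_sub for the hsub forms) produces the result. */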

#ifdef _ARCH_PWR8
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movehdup_ps(__m128 __X) {
  return (__m128)vec_mergeo((__v4su)__X, (__v4su)__X);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_moveldup_ps(__m128 __X) {
  return (__m128)vec_mergee((__v4su)__X, (__v4su)__X);
}
#endif

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_loaddup_pd(double const *__P) {
  return (__m128d)vec_splats(*__P);
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_movedup_pd(__m128d __X) {
  return _mm_shuffle_pd(__X, __X, _MM_SHUFFLE2(0, 0));
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_lddqu_si128(__m128i const *__P) {
  return (__m128i)(vec_vsx_ld(0, (signed int const *)__P));
}

/* POWER8 / POWER9 have no equivalent for _mm_monitor nor _mm_mwait. */

#else
#include_next <pmmintrin.h>
#endif /* defined(__powerpc64__) &&                                            \
        * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* PMMINTRIN_H_ */
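/* Porting sketch (illustrative only, not part of the original header):
   code written against the x86 SSE3 intrinsics can typically be rebuilt
   for powerpc64le without source changes once the warning above has been
   acknowledged.  The translation unit below is hypothetical and assumes a
   POWER8-or-newer target with a compiler invocation along the lines of
   `clang -O2 -mcpu=power8 -c dot2.c`:

     // dot2.c -- hypothetical example, not part of this header
     #define NO_WARN_X86_INTRINSICS 1
     #include <pmmintrin.h>

     double dot2(const double *a, const double *b) {
       __m128d va = _mm_loadu_pd(a);          // SSE2 load from emmintrin.h
       __m128d vb = _mm_loadu_pd(b);
       __m128d prod = _mm_mul_pd(va, vb);     // {a0*b0, a1*b1}
       __m128d sum = _mm_hadd_pd(prod, prod); // both lanes = a0*b0 + a1*b1
       return _mm_cvtsd_f64(sum);             // extract lane 0
     }
*/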