1 /*===---- emmintrin.h - Implementation of SSE2 intrinsics on PowerPC -------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 /* Implemented from the specification included in the Intel C++ Compiler
11    User Guide and Reference, version 9.0.  */
12 
13 #ifndef NO_WARN_X86_INTRINSICS
14 /* This header file is intended to help port code that explicitly uses
15    Intel intrinsics from x86_64 to powerpc64/powerpc64le.
16 
17    Since the X86 SSE2 intrinsics mainly handle the __m128i and __m128d
18    types, the PowerPC VMX/VSX ISA is a good match for vector float SIMD
19    operations. However, scalar float operations in vector (XMM) registers
20    require the POWER8 VSX ISA (2.07) level. There are differences in data
21    format and placement of float scalars in the vector register, which
22    require extra steps to match SSE2 scalar float semantics on POWER.
23 
24    It should be noted that X86_64's MXCSR and PowerISA's FPSCR/VSCR
25    registers differ significantly. It is recommended to use the portable
26    <fenv.h> facilities instead of accessing the MXCSR directly (see below).
27 
28    Most SSE2 scalar float intrinsic operations can be performed more
29    efficiently as C language float scalar operations or optimized to
30    use vector SIMD operations. We recommend this for new applications.
31 */
32 #error                                                                         \
33     "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
34 #endif
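
/* For example, a port can usually replace direct MXCSR accesses such as
   _MM_SET_ROUNDING_MODE (_MM_ROUND_TOWARD_ZERO) with the portable <fenv.h>
   equivalent:

     #include <fenv.h>
     fesetround (FE_TOWARDZERO);

   and then build with -DNO_WARN_X86_INTRINSICS to accept the advisory
   above.  */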
35 
36 #ifndef EMMINTRIN_H_
37 #define EMMINTRIN_H_
38 
39 #if defined(__powerpc64__) &&                                                  \
40     (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
41 
42 #include <altivec.h>
43 
44 /* We need definitions from the SSE header files.  */
45 #include <xmmintrin.h>
46 
47 /* SSE2 */
48 typedef __vector double __v2df;
49 typedef __vector float __v4f;
50 typedef __vector long long __v2di;
51 typedef __vector unsigned long long __v2du;
52 typedef __vector int __v4si;
53 typedef __vector unsigned int __v4su;
54 typedef __vector short __v8hi;
55 typedef __vector unsigned short __v8hu;
56 typedef __vector signed char __v16qi;
57 typedef __vector unsigned char __v16qu;
58 
59 /* The Intel API is flexible enough that we must allow aliasing with other
60    vector types, and their scalar components.  */
61 typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
62 typedef double __m128d __attribute__((__vector_size__(16), __may_alias__));
63 
64 /* Unaligned version of the same types.  */
65 typedef long long __m128i_u
66     __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
67 typedef double __m128d_u
68     __attribute__((__vector_size__(16), __may_alias__, __aligned__(1)));
69 
70 /* Define two value permute mask.  */
71 #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
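
/* For example, _MM_SHUFFLE2 (1, 0) yields the mask 2; used with
   _mm_shuffle_pd below, it selects element 0 of the first operand for the
   low result element and element 1 of the second operand for the high
   result element, i.e. { __a[0], __b[1] }.  */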
72 
73 /* Create a vector with element 0 as F and the rest zero.  */
74 extern __inline __m128d
75     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
76     _mm_set_sd(double __F) {
77   return __extension__(__m128d){__F, 0.0};
78 }
79 
80 /* Create a vector with both elements equal to F.  */
81 extern __inline __m128d
82     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
83     _mm_set1_pd(double __F) {
84   return __extension__(__m128d){__F, __F};
85 }
86 
87 extern __inline __m128d
88     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
89     _mm_set_pd1(double __F) {
90   return _mm_set1_pd(__F);
91 }
92 
93 /* Create a vector with the lower value X and upper value W.  */
94 extern __inline __m128d
95     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
96     _mm_set_pd(double __W, double __X) {
97   return __extension__(__m128d){__X, __W};
98 }
99 
100 /* Create a vector with the lower value W and upper value X.  */
101 extern __inline __m128d
102     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
103     _mm_setr_pd(double __W, double __X) {
104   return __extension__(__m128d){__W, __X};
105 }
106 
107 /* Create an undefined vector.  */
108 extern __inline __m128d
109     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
110     _mm_undefined_pd(void) {
111   __m128d __Y = __Y;
112   return __Y;
113 }
114 
115 /* Create a vector of zeros.  */
116 extern __inline __m128d
117     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
118     _mm_setzero_pd(void) {
119   return (__m128d)vec_splats(0);
120 }
121 
122 /* Sets the low DPFP value of A from the low value of B.  */
123 extern __inline __m128d
124     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
125     _mm_move_sd(__m128d __A, __m128d __B) {
126   __v2df __result = (__v2df)__A;
127   __result[0] = ((__v2df)__B)[0];
128   return (__m128d)__result;
129 }
130 
131 /* Load two DPFP values from P.  The address must be 16-byte aligned.  */
132 extern __inline __m128d
133     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
134     _mm_load_pd(double const *__P) {
135   return ((__m128d)vec_ld(0, (__v16qu *)__P));
136 }
137 
138 /* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
139 extern __inline __m128d
140     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
141     _mm_loadu_pd(double const *__P) {
142   return (vec_vsx_ld(0, __P));
143 }
144 
145 /* Create a vector with both elements equal to *P.  */
146 extern __inline __m128d
147     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
148     _mm_load1_pd(double const *__P) {
149   return (vec_splats(*__P));
150 }
151 
152 /* Create a vector with element 0 as *P and the rest zero.  */
153 extern __inline __m128d
154     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
155     _mm_load_sd(double const *__P) {
156   return _mm_set_sd(*__P);
157 }
158 
159 extern __inline __m128d
160     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
161     _mm_load_pd1(double const *__P) {
162   return _mm_load1_pd(__P);
163 }
164 
165 /* Load two DPFP values in reverse order.  The address must be aligned.  */
166 extern __inline __m128d
167     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
168     _mm_loadr_pd(double const *__P) {
169   __v2df __tmp = _mm_load_pd(__P);
170   return (__m128d)vec_xxpermdi(__tmp, __tmp, 2);
171 }
172 
173 /* Store two DPFP values.  The address must be 16-byte aligned.  */
174 extern __inline void
175     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
176     _mm_store_pd(double *__P, __m128d __A) {
177   vec_st((__v16qu)__A, 0, (__v16qu *)__P);
178 }
179 
180 /* Store two DPFP values.  The address need not be 16-byte aligned.  */
181 extern __inline void
182     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
183     _mm_storeu_pd(double *__P, __m128d __A) {
184   *(__m128d_u *)__P = __A;
185 }
186 
187 /* Stores the lower DPFP value.  */
188 extern __inline void
189     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
190     _mm_store_sd(double *__P, __m128d __A) {
191   *__P = ((__v2df)__A)[0];
192 }
193 
194 extern __inline double
195     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
196     _mm_cvtsd_f64(__m128d __A) {
197   return ((__v2df)__A)[0];
198 }
199 
200 extern __inline void
201     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
202     _mm_storel_pd(double *__P, __m128d __A) {
203   _mm_store_sd(__P, __A);
204 }
205 
206 /* Stores the upper DPFP value.  */
207 extern __inline void
208     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
209     _mm_storeh_pd(double *__P, __m128d __A) {
210   *__P = ((__v2df)__A)[1];
211 }
212 /* Store the lower DPFP value across two words.
213    The address must be 16-byte aligned.  */
214 extern __inline void
215     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
216     _mm_store1_pd(double *__P, __m128d __A) {
217   _mm_store_pd(__P, vec_splat(__A, 0));
218 }
219 
220 extern __inline void
221     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
222     _mm_store_pd1(double *__P, __m128d __A) {
223   _mm_store1_pd(__P, __A);
224 }
225 
226 /* Store two DPFP values in reverse order.  The address must be aligned.  */
227 extern __inline void
228     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
229     _mm_storer_pd(double *__P, __m128d __A) {
230   _mm_store_pd(__P, vec_xxpermdi(__A, __A, 2));
231 }
232 
233 /* Intel intrinsic.  */
234 extern __inline long long
235     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
236     _mm_cvtsi128_si64(__m128i __A) {
237   return ((__v2di)__A)[0];
238 }
239 
240 /* Microsoft intrinsic.  */
241 extern __inline long long
242     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
243     _mm_cvtsi128_si64x(__m128i __A) {
244   return ((__v2di)__A)[0];
245 }
246 
247 extern __inline __m128d
248     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
249     _mm_add_pd(__m128d __A, __m128d __B) {
250   return (__m128d)((__v2df)__A + (__v2df)__B);
251 }
252 
253 /* Add the lower double-precision (64-bit) floating-point element in
254    a and b, store the result in the lower element of dst, and copy
255    the upper element from a to the upper element of dst. */
256 extern __inline __m128d
257     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
258     _mm_add_sd(__m128d __A, __m128d __B) {
259   __A[0] = __A[0] + __B[0];
260   return (__A);
261 }
262 
263 extern __inline __m128d
264     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
265     _mm_sub_pd(__m128d __A, __m128d __B) {
266   return (__m128d)((__v2df)__A - (__v2df)__B);
267 }
268 
269 extern __inline __m128d
270     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
271     _mm_sub_sd(__m128d __A, __m128d __B) {
272   __A[0] = __A[0] - __B[0];
273   return (__A);
274 }
275 
276 extern __inline __m128d
277     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
278     _mm_mul_pd(__m128d __A, __m128d __B) {
279   return (__m128d)((__v2df)__A * (__v2df)__B);
280 }
281 
282 extern __inline __m128d
283     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
284     _mm_mul_sd(__m128d __A, __m128d __B) {
285   __A[0] = __A[0] * __B[0];
286   return (__A);
287 }
288 
289 extern __inline __m128d
290     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
291     _mm_div_pd(__m128d __A, __m128d __B) {
292   return (__m128d)((__v2df)__A / (__v2df)__B);
293 }
294 
295 extern __inline __m128d
296     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
297     _mm_div_sd(__m128d __A, __m128d __B) {
298   __A[0] = __A[0] / __B[0];
299   return (__A);
300 }
301 
302 extern __inline __m128d
303     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
304     _mm_sqrt_pd(__m128d __A) {
305   return (vec_sqrt(__A));
306 }
307 
308 /* Return pair {sqrt (B[0]), A[1]}.  */
309 extern __inline __m128d
310     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
311     _mm_sqrt_sd(__m128d __A, __m128d __B) {
312   __v2df __c;
313   __c = vec_sqrt((__v2df)_mm_set1_pd(__B[0]));
314   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
315 }
316 
317 extern __inline __m128d
318     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
319     _mm_min_pd(__m128d __A, __m128d __B) {
320   return (vec_min(__A, __B));
321 }
322 
323 extern __inline __m128d
324     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
325     _mm_min_sd(__m128d __A, __m128d __B) {
326   __v2df __a, __b, __c;
327   __a = vec_splats(__A[0]);
328   __b = vec_splats(__B[0]);
329   __c = vec_min(__a, __b);
330   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
331 }
332 
333 extern __inline __m128d
334     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
335     _mm_max_pd(__m128d __A, __m128d __B) {
336   return (vec_max(__A, __B));
337 }
338 
339 extern __inline __m128d
340     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
341     _mm_max_sd(__m128d __A, __m128d __B) {
342   __v2df __a, __b, __c;
343   __a = vec_splats(__A[0]);
344   __b = vec_splats(__B[0]);
345   __c = vec_max(__a, __b);
346   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
347 }
348 
349 extern __inline __m128d
350     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
351     _mm_cmpeq_pd(__m128d __A, __m128d __B) {
352   return ((__m128d)vec_cmpeq((__v2df)__A, (__v2df)__B));
353 }
354 
355 extern __inline __m128d
356     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
357     _mm_cmplt_pd(__m128d __A, __m128d __B) {
358   return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
359 }
360 
361 extern __inline __m128d
362     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
363     _mm_cmple_pd(__m128d __A, __m128d __B) {
364   return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
365 }
366 
367 extern __inline __m128d
368     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
369     _mm_cmpgt_pd(__m128d __A, __m128d __B) {
370   return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
371 }
372 
373 extern __inline __m128d
374     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
375     _mm_cmpge_pd(__m128d __A, __m128d __B) {
376   return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
377 }
378 
379 extern __inline __m128d
380     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
381     _mm_cmpneq_pd(__m128d __A, __m128d __B) {
382   __v2df __temp = (__v2df)vec_cmpeq((__v2df)__A, (__v2df)__B);
383   return ((__m128d)vec_nor(__temp, __temp));
384 }
385 
386 extern __inline __m128d
387     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
388     _mm_cmpnlt_pd(__m128d __A, __m128d __B) {
389   return ((__m128d)vec_cmpge((__v2df)__A, (__v2df)__B));
390 }
391 
392 extern __inline __m128d
393     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
394     _mm_cmpnle_pd(__m128d __A, __m128d __B) {
395   return ((__m128d)vec_cmpgt((__v2df)__A, (__v2df)__B));
396 }
397 
398 extern __inline __m128d
399     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
400     _mm_cmpngt_pd(__m128d __A, __m128d __B) {
401   return ((__m128d)vec_cmple((__v2df)__A, (__v2df)__B));
402 }
403 
404 extern __inline __m128d
405     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
406     _mm_cmpnge_pd(__m128d __A, __m128d __B) {
407   return ((__m128d)vec_cmplt((__v2df)__A, (__v2df)__B));
408 }
409 
410 extern __inline __m128d
411     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
412     _mm_cmpord_pd(__m128d __A, __m128d __B) {
413   __v2du __c, __d;
414   /* Compare against self will return false (0's) if NAN.  */
415   __c = (__v2du)vec_cmpeq(__A, __A);
416   __d = (__v2du)vec_cmpeq(__B, __B);
417   /* A != NAN and B != NAN.  */
418   return ((__m128d)vec_and(__c, __d));
419 }
420 
421 extern __inline __m128d
422     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
423     _mm_cmpunord_pd(__m128d __A, __m128d __B) {
424 #if _ARCH_PWR8
425   __v2du __c, __d;
426   /* Compare against self will return false (0's) if NAN.  */
427   __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
428   __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
429   /* A == NAN OR B == NAN converts to:
430      NOT(A != NAN) OR NOT(B != NAN).  */
431   __c = vec_nor(__c, __c);
432   return ((__m128d)vec_orc(__c, __d));
433 #else
434   __v2du __c, __d;
435   /* Compare against self will return false (0's) if NAN.  */
436   __c = (__v2du)vec_cmpeq((__v2df)__A, (__v2df)__A);
437   __d = (__v2du)vec_cmpeq((__v2df)__B, (__v2df)__B);
438   /* Invert the results so that true ('1's) indicates NAN.  */
439   __c = vec_nor(__c, __c);
440   __d = vec_nor(__d, __d);
441   return ((__m128d)vec_or(__c, __d));
442 #endif
443 }
444 
445 extern __inline __m128d
446     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
447     _mm_cmpeq_sd(__m128d __A, __m128d __B) {
448   __v2df __a, __b, __c;
449   /* PowerISA VSX does not allow partial (for just lower double)
450      results. So to ensure we don't generate spurious exceptions
451      (from the upper double values) we splat the lower double
452      before we do the operation. */
453   __a = vec_splats(__A[0]);
454   __b = vec_splats(__B[0]);
455   __c = (__v2df)vec_cmpeq(__a, __b);
456   /* Then we merge the lower double result with the original upper
457      double from __A.  */
458   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
459 }
460 
461 extern __inline __m128d
462     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
463     _mm_cmplt_sd(__m128d __A, __m128d __B) {
464   __v2df __a, __b, __c;
465   __a = vec_splats(__A[0]);
466   __b = vec_splats(__B[0]);
467   __c = (__v2df)vec_cmplt(__a, __b);
468   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
469 }
470 
471 extern __inline __m128d
472     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
473     _mm_cmple_sd(__m128d __A, __m128d __B) {
474   __v2df __a, __b, __c;
475   __a = vec_splats(__A[0]);
476   __b = vec_splats(__B[0]);
477   __c = (__v2df)vec_cmple(__a, __b);
478   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
479 }
480 
481 extern __inline __m128d
482     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
483     _mm_cmpgt_sd(__m128d __A, __m128d __B) {
484   __v2df __a, __b, __c;
485   __a = vec_splats(__A[0]);
486   __b = vec_splats(__B[0]);
487   __c = (__v2df)vec_cmpgt(__a, __b);
488   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
489 }
490 
491 extern __inline __m128d
492     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
493     _mm_cmpge_sd(__m128d __A, __m128d __B) {
494   __v2df __a, __b, __c;
495   __a = vec_splats(__A[0]);
496   __b = vec_splats(__B[0]);
497   __c = (__v2df)vec_cmpge(__a, __b);
498   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
499 }
500 
501 extern __inline __m128d
502     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
503     _mm_cmpneq_sd(__m128d __A, __m128d __B) {
504   __v2df __a, __b, __c;
505   __a = vec_splats(__A[0]);
506   __b = vec_splats(__B[0]);
507   __c = (__v2df)vec_cmpeq(__a, __b);
508   __c = vec_nor(__c, __c);
509   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
510 }
511 
512 extern __inline __m128d
513     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
514     _mm_cmpnlt_sd(__m128d __A, __m128d __B) {
515   __v2df __a, __b, __c;
516   __a = vec_splats(__A[0]);
517   __b = vec_splats(__B[0]);
518   /* Not less than is just greater than or equal.  */
519   __c = (__v2df)vec_cmpge(__a, __b);
520   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
521 }
522 
523 extern __inline __m128d
524     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
525     _mm_cmpnle_sd(__m128d __A, __m128d __B) {
526   __v2df __a, __b, __c;
527   __a = vec_splats(__A[0]);
528   __b = vec_splats(__B[0]);
529   /* Not less than or equal is just greater than.  */
530   __c = (__v2df)vec_cmpgt(__a, __b);
531   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
532 }
533 
534 extern __inline __m128d
535     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
536     _mm_cmpngt_sd(__m128d __A, __m128d __B) {
537   __v2df __a, __b, __c;
538   __a = vec_splats(__A[0]);
539   __b = vec_splats(__B[0]);
540   /* Not greater than is just less than or equal.  */
541   __c = (__v2df)vec_cmple(__a, __b);
542   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
543 }
544 
545 extern __inline __m128d
546     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
547     _mm_cmpnge_sd(__m128d __A, __m128d __B) {
548   __v2df __a, __b, __c;
549   __a = vec_splats(__A[0]);
550   __b = vec_splats(__B[0]);
551   /* Not greater than or equal is just less than.  */
552   __c = (__v2df)vec_cmplt(__a, __b);
553   return (__m128d)_mm_setr_pd(__c[0], __A[1]);
554 }
555 
556 extern __inline __m128d
557     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
558     _mm_cmpord_sd(__m128d __A, __m128d __B) {
559   __v2df __r;
560   __r = (__v2df)_mm_cmpord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
561   return (__m128d)_mm_setr_pd(__r[0], ((__v2df)__A)[1]);
562 }
563 
564 extern __inline __m128d
565     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
566     _mm_cmpunord_sd(__m128d __A, __m128d __B) {
567   __v2df __r;
568   __r = _mm_cmpunord_pd(vec_splats(__A[0]), vec_splats(__B[0]));
569   return (__m128d)_mm_setr_pd(__r[0], __A[1]);
570 }
571 
572 /* FIXME
573    The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
574    exactly the same because GCC for PowerPC only generates unordered
575    compares (scalar and vector).
576    Technically _mm_comieq_sd et al. should be using the ordered
577    compare and signal for QNaNs.  The _mm_ucomieq_sd et al. should
578    be OK.  */
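/* For example, on x86 _mm_comilt_sd is specified to raise the invalid
   exception when either operand is a QNaN, while _mm_ucomilt_sd is not;
   with the implementations below the two families behave identically.  */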
579 extern __inline int
580     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
581     _mm_comieq_sd(__m128d __A, __m128d __B) {
582   return (__A[0] == __B[0]);
583 }
584 
585 extern __inline int
586     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
587     _mm_comilt_sd(__m128d __A, __m128d __B) {
588   return (__A[0] < __B[0]);
589 }
590 
591 extern __inline int
592     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
593     _mm_comile_sd(__m128d __A, __m128d __B) {
594   return (__A[0] <= __B[0]);
595 }
596 
597 extern __inline int
598     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
599     _mm_comigt_sd(__m128d __A, __m128d __B) {
600   return (__A[0] > __B[0]);
601 }
602 
603 extern __inline int
604     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
605     _mm_comige_sd(__m128d __A, __m128d __B) {
606   return (__A[0] >= __B[0]);
607 }
608 
609 extern __inline int
610     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
611     _mm_comineq_sd(__m128d __A, __m128d __B) {
612   return (__A[0] != __B[0]);
613 }
614 
615 extern __inline int
616     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
617     _mm_ucomieq_sd(__m128d __A, __m128d __B) {
618   return (__A[0] == __B[0]);
619 }
620 
621 extern __inline int
622     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
623     _mm_ucomilt_sd(__m128d __A, __m128d __B) {
624   return (__A[0] < __B[0]);
625 }
626 
627 extern __inline int
628     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
629     _mm_ucomile_sd(__m128d __A, __m128d __B) {
630   return (__A[0] <= __B[0]);
631 }
632 
633 extern __inline int
634     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
635     _mm_ucomigt_sd(__m128d __A, __m128d __B) {
636   return (__A[0] > __B[0]);
637 }
638 
639 extern __inline int
640     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
641     _mm_ucomige_sd(__m128d __A, __m128d __B) {
642   return (__A[0] >= __B[0]);
643 }
644 
645 extern __inline int
646     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
647     _mm_ucomineq_sd(__m128d __A, __m128d __B) {
648   return (__A[0] != __B[0]);
649 }
650 
651 /* Create a vector of Qi, where i is the element number.  */
652 extern __inline __m128i
653     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
654     _mm_set_epi64x(long long __q1, long long __q0) {
655   return __extension__(__m128i)(__v2di){__q0, __q1};
656 }
657 
658 extern __inline __m128i
659     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
660     _mm_set_epi64(__m64 __q1, __m64 __q0) {
661   return _mm_set_epi64x((long long)__q1, (long long)__q0);
662 }
663 
664 extern __inline __m128i
665     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
666     _mm_set_epi32(int __q3, int __q2, int __q1, int __q0) {
667   return __extension__(__m128i)(__v4si){__q0, __q1, __q2, __q3};
668 }
669 
670 extern __inline __m128i
671     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
672     _mm_set_epi16(short __q7, short __q6, short __q5, short __q4, short __q3,
673                   short __q2, short __q1, short __q0) {
674   return __extension__(__m128i)(__v8hi){__q0, __q1, __q2, __q3,
675                                         __q4, __q5, __q6, __q7};
676 }
677 
678 extern __inline __m128i
679     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
680     _mm_set_epi8(char __q15, char __q14, char __q13, char __q12, char __q11,
681                  char __q10, char __q09, char __q08, char __q07, char __q06,
682                  char __q05, char __q04, char __q03, char __q02, char __q01,
683                  char __q00) {
684   return __extension__(__m128i)(__v16qi){
685       __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
686       __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15};
687 }
688 
689 /* Set all of the elements of the vector to A.  */
690 extern __inline __m128i
691     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
692     _mm_set1_epi64x(long long __A) {
693   return _mm_set_epi64x(__A, __A);
694 }
695 
696 extern __inline __m128i
697     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
698     _mm_set1_epi64(__m64 __A) {
699   return _mm_set_epi64(__A, __A);
700 }
701 
702 extern __inline __m128i
703     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
704     _mm_set1_epi32(int __A) {
705   return _mm_set_epi32(__A, __A, __A, __A);
706 }
707 
708 extern __inline __m128i
709     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
710     _mm_set1_epi16(short __A) {
711   return _mm_set_epi16(__A, __A, __A, __A, __A, __A, __A, __A);
712 }
713 
714 extern __inline __m128i
715     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
716     _mm_set1_epi8(char __A) {
717   return _mm_set_epi8(__A, __A, __A, __A, __A, __A, __A, __A, __A, __A, __A,
718                       __A, __A, __A, __A, __A);
719 }
720 
721 /* Create a vector of Qi, where i is the element number.
722    The parameter order is reversed from the _mm_set_epi* functions.  */
723 extern __inline __m128i
724     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
725     _mm_setr_epi64(__m64 __q0, __m64 __q1) {
726   return _mm_set_epi64(__q1, __q0);
727 }
728 
729 extern __inline __m128i
730     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
731     _mm_setr_epi32(int __q0, int __q1, int __q2, int __q3) {
732   return _mm_set_epi32(__q3, __q2, __q1, __q0);
733 }
734 
735 extern __inline __m128i
736     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
737     _mm_setr_epi16(short __q0, short __q1, short __q2, short __q3, short __q4,
738                    short __q5, short __q6, short __q7) {
739   return _mm_set_epi16(__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
740 }
741 
742 extern __inline __m128i
743     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
744     _mm_setr_epi8(char __q00, char __q01, char __q02, char __q03, char __q04,
745                   char __q05, char __q06, char __q07, char __q08, char __q09,
746                   char __q10, char __q11, char __q12, char __q13, char __q14,
747                   char __q15) {
748   return _mm_set_epi8(__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
749                       __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
750 }
751 
752 /* Load 128 bits of integer data.  The address must be 16-byte aligned.  */
753 extern __inline __m128i
754     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
755     _mm_load_si128(__m128i const *__P) {
756   return *__P;
757 }
758 
759 extern __inline __m128i
760     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
761     _mm_loadu_si128(__m128i_u const *__P) {
762   return (__m128i)(vec_vsx_ld(0, (signed int const *)__P));
763 }
764 
765 extern __inline __m128i
766     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
767     _mm_loadl_epi64(__m128i_u const *__P) {
768   return _mm_set_epi64((__m64)0LL, *(__m64 *)__P);
769 }
770 
771 extern __inline void
772     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
773     _mm_store_si128(__m128i *__P, __m128i __B) {
774   vec_st((__v16qu)__B, 0, (__v16qu *)__P);
775 }
776 
777 extern __inline void
778     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
779     _mm_storeu_si128(__m128i_u *__P, __m128i __B) {
780   *__P = __B;
781 }
782 
783 extern __inline void
784     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
785     _mm_storel_epi64(__m128i_u *__P, __m128i __B) {
786   *(long long *)__P = ((__v2di)__B)[0];
787 }
788 
789 extern __inline __m64
790     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
791     _mm_movepi64_pi64(__m128i_u __B) {
792   return (__m64)((__v2di)__B)[0];
793 }
794 
795 extern __inline __m128i
796     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
797     _mm_movpi64_epi64(__m64 __A) {
798   return _mm_set_epi64((__m64)0LL, __A);
799 }
800 
801 extern __inline __m128i
802     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
803     _mm_move_epi64(__m128i __A) {
804   return _mm_set_epi64((__m64)0LL, (__m64)__A[0]);
805 }
806 
807 /* Create an undefined vector.  */
808 extern __inline __m128i
809     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
810     _mm_undefined_si128(void) {
811   __m128i __Y = __Y;
812   return __Y;
813 }
814 
815 /* Create a vector of zeros.  */
816 extern __inline __m128i
817     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
818     _mm_setzero_si128(void) {
819   return __extension__(__m128i)(__v4si){0, 0, 0, 0};
820 }
821 
822 #ifdef _ARCH_PWR8
823 extern __inline __m128d
824     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
825     _mm_cvtepi32_pd(__m128i __A) {
826   __v2di __val;
827   /* For LE we need to generate the Vector Unpack Low Signed Word
828      operation, which is produced by vec_unpackh here.  */
829   __val = (__v2di)vec_unpackh((__v4si)__A);
830 
831   return (__m128d)vec_ctf(__val, 0);
832 }
833 #endif
834 
835 extern __inline __m128
836     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
837     _mm_cvtepi32_ps(__m128i __A) {
838   return ((__m128)vec_ctf((__v4si)__A, 0));
839 }
840 
841 extern __inline __m128i
842     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
843     _mm_cvtpd_epi32(__m128d __A) {
844   __v2df __rounded = vec_rint(__A);
845   __v4si __result, __temp;
846   const __v4si __vzero = {0, 0, 0, 0};
847 
848   /* VSX Vector truncate Double-Precision to integer and Convert to
849    Signed Integer Word format with Saturate.  */
850   __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__rounded) :);
851 
852 #ifdef _ARCH_PWR8
853 #ifdef __LITTLE_ENDIAN__
854   __temp = vec_mergeo(__temp, __temp);
855 #else
856   __temp = vec_mergee(__temp, __temp);
857 #endif
858   __result = (__v4si)vec_vpkudum((__vector long long)__temp,
859                                  (__vector long long)__vzero);
860 #else
861   {
862     const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
863                               0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
864     __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
865   }
866 #endif
867   return (__m128i)__result;
868 }
869 
870 extern __inline __m64
871     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
872     _mm_cvtpd_pi32(__m128d __A) {
873   __m128i __result = _mm_cvtpd_epi32(__A);
874 
875   return (__m64)__result[0];
876 }
877 
878 extern __inline __m128
879     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
880     _mm_cvtpd_ps(__m128d __A) {
881   __v4sf __result;
882   __v4si __temp;
883   const __v4si __vzero = {0, 0, 0, 0};
884 
885   __asm__("xvcvdpsp %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);
886 
887 #ifdef _ARCH_PWR8
888 #ifdef __LITTLE_ENDIAN__
889   __temp = vec_mergeo(__temp, __temp);
890 #else
891   __temp = vec_mergee(__temp, __temp);
892 #endif
893   __result = (__v4sf)vec_vpkudum((__vector long long)__temp,
894                                  (__vector long long)__vzero);
895 #else
896   {
897     const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
898                               0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
899     __result = (__v4sf)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
900   }
901 #endif
902   return ((__m128)__result);
903 }
904 
905 extern __inline __m128i
906     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
907     _mm_cvttpd_epi32(__m128d __A) {
908   __v4si __result;
909   __v4si __temp;
910   const __v4si __vzero = {0, 0, 0, 0};
911 
912   /* VSX Vector truncate Double-Precision to integer and Convert to
913    Signed Integer Word format with Saturate.  */
914   __asm__("xvcvdpsxws %x0,%x1" : "=wa"(__temp) : "wa"(__A) :);
915 
916 #ifdef _ARCH_PWR8
917 #ifdef __LITTLE_ENDIAN__
918   __temp = vec_mergeo(__temp, __temp);
919 #else
920   __temp = vec_mergee(__temp, __temp);
921 #endif
922   __result = (__v4si)vec_vpkudum((__vector long long)__temp,
923                                  (__vector long long)__vzero);
924 #else
925   {
926     const __v16qu __pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
927                               0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f};
928     __result = (__v4si)vec_perm((__v16qu)__temp, (__v16qu)__vzero, __pkperm);
929   }
930 #endif
931 
932   return ((__m128i)__result);
933 }
934 
935 extern __inline __m64
936     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
937     _mm_cvttpd_pi32(__m128d __A) {
938   __m128i __result = _mm_cvttpd_epi32(__A);
939 
940   return (__m64)__result[0];
941 }
942 
943 extern __inline int
944     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
945     _mm_cvtsi128_si32(__m128i __A) {
946   return ((__v4si)__A)[0];
947 }
948 
949 #ifdef _ARCH_PWR8
950 extern __inline __m128d
951     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
952     _mm_cvtpi32_pd(__m64 __A) {
953   __v4si __temp;
954   __v2di __tmp2;
955   __v4f __result;
956 
957   __temp = (__v4si)vec_splats(__A);
958   __tmp2 = (__v2di)vec_unpackl(__temp);
959   __result = vec_ctf((__vector signed long long)__tmp2, 0);
960   return (__m128d)__result;
961 }
962 #endif
963 
964 extern __inline __m128i
965     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
966     _mm_cvtps_epi32(__m128 __A) {
967   __v4sf __rounded;
968   __v4si __result;
969 
970   __rounded = vec_rint((__v4sf)__A);
971   __result = vec_cts(__rounded, 0);
972   return (__m128i)__result;
973 }
974 
975 extern __inline __m128i
976     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
977     _mm_cvttps_epi32(__m128 __A) {
978   __v4si __result;
979 
980   __result = vec_cts((__v4sf)__A, 0);
981   return (__m128i)__result;
982 }
983 
984 extern __inline __m128d
985     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
986     _mm_cvtps_pd(__m128 __A) {
987   /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */
988 #ifdef vec_doubleh
989   return (__m128d)vec_doubleh((__v4sf)__A);
990 #else
991   /* Otherwise the compiler is not current and so we need to generate the
992      equivalent code.  */
993   __v4sf __a = (__v4sf)__A;
994   __v4sf __temp;
995   __v2df __result;
996 #ifdef __LITTLE_ENDIAN__
997   /* The input float values are in elements {[0], [1]} but the convert
998      instruction needs them in elements {[1], [3]}, so we use two
999      shift left double vector word immediates to get the elements
1000      lined up.  */
1001   __temp = __builtin_vsx_xxsldwi(__a, __a, 3);
1002   __temp = __builtin_vsx_xxsldwi(__a, __temp, 2);
1003 #else
1004   /* The input float values are in elements {[0], [1]} but the convert
1005      instruction needs them in elements {[0], [2]}, so we use two
1006      shift left double vector word immediates to get the elements
1007      lined up.  */
1008   __temp = vec_vmrghw(__a, __a);
1009 #endif
1010   __asm__(" xvcvspdp %x0,%x1" : "=wa"(__result) : "wa"(__temp) :);
1011   return (__m128d)__result;
1012 #endif
1013 }
1014 
1015 extern __inline int
1016     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1017     _mm_cvtsd_si32(__m128d __A) {
1018   __v2df __rounded = vec_rint((__v2df)__A);
1019   int __result = ((__v2df)__rounded)[0];
1020 
1021   return __result;
1022 }
1023 /* Intel intrinsic.  */
1024 extern __inline long long
1025     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1026     _mm_cvtsd_si64(__m128d __A) {
1027   __v2df __rounded = vec_rint((__v2df)__A);
1028   long long __result = ((__v2df)__rounded)[0];
1029 
1030   return __result;
1031 }
1032 
1033 /* Microsoft intrinsic.  */
1034 extern __inline long long
1035     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1036     _mm_cvtsd_si64x(__m128d __A) {
1037   return _mm_cvtsd_si64((__v2df)__A);
1038 }
1039 
1040 extern __inline int
1041     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1042     _mm_cvttsd_si32(__m128d __A) {
1043   int __result = ((__v2df)__A)[0];
1044 
1045   return __result;
1046 }
1047 
1048 /* Intel intrinsic.  */
1049 extern __inline long long
1050     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1051     _mm_cvttsd_si64(__m128d __A) {
1052   long long __result = ((__v2df)__A)[0];
1053 
1054   return __result;
1055 }
1056 
1057 /* Microsoft intrinsic.  */
1058 extern __inline long long
1059     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1060     _mm_cvttsd_si64x(__m128d __A) {
1061   return _mm_cvttsd_si64(__A);
1062 }
1063 
1064 extern __inline __m128
1065     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1066     _mm_cvtsd_ss(__m128 __A, __m128d __B) {
1067   __v4sf __result = (__v4sf)__A;
1068 
1069 #ifdef __LITTLE_ENDIAN__
1070   __v4sf __temp_s;
1071   /* Copy double element[0] to element [1] for conversion.  */
1072   __v2df __temp_b = vec_splat((__v2df)__B, 0);
1073 
1074   /* Pre-rotate __A left 3 (logically right 1) elements.  */
1075   __result = __builtin_vsx_xxsldwi(__result, __result, 3);
1076   /* Convert double to single float scalar in a vector.  */
1077   __asm__("xscvdpsp %x0,%x1" : "=wa"(__temp_s) : "wa"(__temp_b) :);
1078   /* Shift the resulting scalar into vector element [0].  */
1079   __result = __builtin_vsx_xxsldwi(__result, __temp_s, 1);
1080 #else
1081   __result[0] = ((__v2df)__B)[0];
1082 #endif
1083   return (__m128)__result;
1084 }
1085 
1086 extern __inline __m128d
1087     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1088     _mm_cvtsi32_sd(__m128d __A, int __B) {
1089   __v2df __result = (__v2df)__A;
1090   double __db = __B;
1091   __result[0] = __db;
1092   return (__m128d)__result;
1093 }
1094 
1095 /* Intel intrinsic.  */
1096 extern __inline __m128d
1097     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1098     _mm_cvtsi64_sd(__m128d __A, long long __B) {
1099   __v2df __result = (__v2df)__A;
1100   double __db = __B;
1101   __result[0] = __db;
1102   return (__m128d)__result;
1103 }
1104 
1105 /* Microsoft intrinsic.  */
1106 extern __inline __m128d
1107     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1108     _mm_cvtsi64x_sd(__m128d __A, long long __B) {
1109   return _mm_cvtsi64_sd(__A, __B);
1110 }
1111 
1112 extern __inline __m128d
1113     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1114     _mm_cvtss_sd(__m128d __A, __m128 __B) {
1115 #ifdef __LITTLE_ENDIAN__
1116   /* Use splat to move element [0] into position for the convert. */
1117   __v4sf __temp = vec_splat((__v4sf)__B, 0);
1118   __v2df __res;
1119   /* Convert single float scalar to double in a vector.  */
1120   __asm__("xscvspdp %x0,%x1" : "=wa"(__res) : "wa"(__temp) :);
1121   return (__m128d)vec_mergel(__res, (__v2df)__A);
1122 #else
1123   __v2df __res = (__v2df)__A;
1124   __res[0] = ((__v4sf)__B)[0];
1125   return (__m128d)__res;
1126 #endif
1127 }
1128 
1129 extern __inline __m128d
1130     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1131     _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) {
1132   __vector double __result;
1133   const int __litmsk = __mask & 0x3;
1134 
1135   if (__litmsk == 0)
1136     __result = vec_mergeh(__A, __B);
1137 #if __GNUC__ < 6
1138   else if (__litmsk == 1)
1139     __result = vec_xxpermdi(__B, __A, 2);
1140   else if (__litmsk == 2)
1141     __result = vec_xxpermdi(__B, __A, 1);
1142 #else
1143   else if (__litmsk == 1)
1144     __result = vec_xxpermdi(__A, __B, 2);
1145   else if (__litmsk == 2)
1146     __result = vec_xxpermdi(__A, __B, 1);
1147 #endif
1148   else
1149     __result = vec_mergel(__A, __B);
1150 
1151   return __result;
1152 }
1153 
1154 extern __inline __m128d
1155     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1156     _mm_unpackhi_pd(__m128d __A, __m128d __B) {
1157   return (__m128d)vec_mergel((__v2df)__A, (__v2df)__B);
1158 }
1159 
1160 extern __inline __m128d
1161     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1162     _mm_unpacklo_pd(__m128d __A, __m128d __B) {
1163   return (__m128d)vec_mergeh((__v2df)__A, (__v2df)__B);
1164 }
1165 
1166 extern __inline __m128d
1167     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1168     _mm_loadh_pd(__m128d __A, double const *__B) {
1169   __v2df __result = (__v2df)__A;
1170   __result[1] = *__B;
1171   return (__m128d)__result;
1172 }
1173 
1174 extern __inline __m128d
1175     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1176     _mm_loadl_pd(__m128d __A, double const *__B) {
1177   __v2df __result = (__v2df)__A;
1178   __result[0] = *__B;
1179   return (__m128d)__result;
1180 }
1181 
1182 #ifdef _ARCH_PWR8
1183 /* Intrinsic functions that require PowerISA 2.07 minimum.  */
1184 
1185 /* Creates a 2-bit mask from the most significant bits of the DPFP values.  */
1186 extern __inline int
1187     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1188     _mm_movemask_pd(__m128d __A) {
1189 #ifdef _ARCH_PWR10
1190   return vec_extractm((__v2du)__A);
1191 #else
1192   __vector unsigned long long __result;
1193   static const __vector unsigned int __perm_mask = {
1194 #ifdef __LITTLE_ENDIAN__
1195       0x80800040, 0x80808080, 0x80808080, 0x80808080
1196 #else
1197       0x80808080, 0x80808080, 0x80808080, 0x80804000
1198 #endif
1199   };
1200 
1201   __result = ((__vector unsigned long long)vec_vbpermq(
1202       (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));
1203 
1204 #ifdef __LITTLE_ENDIAN__
1205   return __result[1];
1206 #else
1207   return __result[0];
1208 #endif
1209 #endif /* !_ARCH_PWR10 */
1210 }
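
/* For example, with __A = { -2.0, 1.0 } only element 0 has its sign bit
   set, so _mm_movemask_pd returns 1.  */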
1211 #endif /* _ARCH_PWR8 */
1212 
1213 extern __inline __m128i
1214     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1215     _mm_packs_epi16(__m128i __A, __m128i __B) {
1216   return (__m128i)vec_packs((__v8hi)__A, (__v8hi)__B);
1217 }
1218 
1219 extern __inline __m128i
1220     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1221     _mm_packs_epi32(__m128i __A, __m128i __B) {
1222   return (__m128i)vec_packs((__v4si)__A, (__v4si)__B);
1223 }
1224 
1225 extern __inline __m128i
1226     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1227     _mm_packus_epi16(__m128i __A, __m128i __B) {
1228   return (__m128i)vec_packsu((__v8hi)__A, (__v8hi)__B);
1229 }
1230 
1231 extern __inline __m128i
1232     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1233     _mm_unpackhi_epi8(__m128i __A, __m128i __B) {
1234   return (__m128i)vec_mergel((__v16qu)__A, (__v16qu)__B);
1235 }
1236 
1237 extern __inline __m128i
1238     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1239     _mm_unpackhi_epi16(__m128i __A, __m128i __B) {
1240   return (__m128i)vec_mergel((__v8hu)__A, (__v8hu)__B);
1241 }
1242 
1243 extern __inline __m128i
1244     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1245     _mm_unpackhi_epi32(__m128i __A, __m128i __B) {
1246   return (__m128i)vec_mergel((__v4su)__A, (__v4su)__B);
1247 }
1248 
1249 extern __inline __m128i
1250     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1251     _mm_unpackhi_epi64(__m128i __A, __m128i __B) {
1252   return (__m128i)vec_mergel((__vector long long)__A, (__vector long long)__B);
1253 }
1254 
1255 extern __inline __m128i
1256     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1257     _mm_unpacklo_epi8(__m128i __A, __m128i __B) {
1258   return (__m128i)vec_mergeh((__v16qu)__A, (__v16qu)__B);
1259 }
1260 
1261 extern __inline __m128i
1262     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1263     _mm_unpacklo_epi16(__m128i __A, __m128i __B) {
1264   return (__m128i)vec_mergeh((__v8hi)__A, (__v8hi)__B);
1265 }
1266 
1267 extern __inline __m128i
1268     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1269     _mm_unpacklo_epi32(__m128i __A, __m128i __B) {
1270   return (__m128i)vec_mergeh((__v4si)__A, (__v4si)__B);
1271 }
1272 
1273 extern __inline __m128i
1274     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1275     _mm_unpacklo_epi64(__m128i __A, __m128i __B) {
1276   return (__m128i)vec_mergeh((__vector long long)__A, (__vector long long)__B);
1277 }
1278 
1279 extern __inline __m128i
1280     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1281     _mm_add_epi8(__m128i __A, __m128i __B) {
1282   return (__m128i)((__v16qu)__A + (__v16qu)__B);
1283 }
1284 
1285 extern __inline __m128i
1286     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1287     _mm_add_epi16(__m128i __A, __m128i __B) {
1288   return (__m128i)((__v8hu)__A + (__v8hu)__B);
1289 }
1290 
1291 extern __inline __m128i
1292     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1293     _mm_add_epi32(__m128i __A, __m128i __B) {
1294   return (__m128i)((__v4su)__A + (__v4su)__B);
1295 }
1296 
1297 extern __inline __m128i
1298     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1299     _mm_add_epi64(__m128i __A, __m128i __B) {
1300   return (__m128i)((__v2du)__A + (__v2du)__B);
1301 }
1302 
1303 extern __inline __m128i
1304     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1305     _mm_adds_epi8(__m128i __A, __m128i __B) {
1306   return (__m128i)vec_adds((__v16qi)__A, (__v16qi)__B);
1307 }
1308 
1309 extern __inline __m128i
1310     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1311     _mm_adds_epi16(__m128i __A, __m128i __B) {
1312   return (__m128i)vec_adds((__v8hi)__A, (__v8hi)__B);
1313 }
1314 
1315 extern __inline __m128i
1316     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1317     _mm_adds_epu8(__m128i __A, __m128i __B) {
1318   return (__m128i)vec_adds((__v16qu)__A, (__v16qu)__B);
1319 }
1320 
1321 extern __inline __m128i
1322     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1323     _mm_adds_epu16(__m128i __A, __m128i __B) {
1324   return (__m128i)vec_adds((__v8hu)__A, (__v8hu)__B);
1325 }
1326 
1327 extern __inline __m128i
1328     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1329     _mm_sub_epi8(__m128i __A, __m128i __B) {
1330   return (__m128i)((__v16qu)__A - (__v16qu)__B);
1331 }
1332 
1333 extern __inline __m128i
1334     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1335     _mm_sub_epi16(__m128i __A, __m128i __B) {
1336   return (__m128i)((__v8hu)__A - (__v8hu)__B);
1337 }
1338 
1339 extern __inline __m128i
1340     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1341     _mm_sub_epi32(__m128i __A, __m128i __B) {
1342   return (__m128i)((__v4su)__A - (__v4su)__B);
1343 }
1344 
1345 extern __inline __m128i
1346     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1347     _mm_sub_epi64(__m128i __A, __m128i __B) {
1348   return (__m128i)((__v2du)__A - (__v2du)__B);
1349 }
1350 
1351 extern __inline __m128i
1352     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1353     _mm_subs_epi8(__m128i __A, __m128i __B) {
1354   return (__m128i)vec_subs((__v16qi)__A, (__v16qi)__B);
1355 }
1356 
1357 extern __inline __m128i
1358     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1359     _mm_subs_epi16(__m128i __A, __m128i __B) {
1360   return (__m128i)vec_subs((__v8hi)__A, (__v8hi)__B);
1361 }
1362 
1363 extern __inline __m128i
1364     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1365     _mm_subs_epu8(__m128i __A, __m128i __B) {
1366   return (__m128i)vec_subs((__v16qu)__A, (__v16qu)__B);
1367 }
1368 
1369 extern __inline __m128i
1370     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1371     _mm_subs_epu16(__m128i __A, __m128i __B) {
1372   return (__m128i)vec_subs((__v8hu)__A, (__v8hu)__B);
1373 }
1374 
1375 extern __inline __m128i
1376     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1377     _mm_madd_epi16(__m128i __A, __m128i __B) {
1378   __vector signed int __zero = {0, 0, 0, 0};
1379 
1380   return (__m128i)vec_vmsumshm((__v8hi)__A, (__v8hi)__B, __zero);
1381 }
1382 
1383 extern __inline __m128i
1384     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1385     _mm_mulhi_epi16(__m128i __A, __m128i __B) {
1386   __vector signed int __w0, __w1;
1387 
1388   __vector unsigned char __xform1 = {
1389 #ifdef __LITTLE_ENDIAN__
1390       0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
1391       0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1392 #else
1393       0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
1394       0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
1395 #endif
1396   };
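  /* vec_vmulesh/vec_vmulosh below produce the full 32-bit products of
     alternating halfword lanes; __xform1 then picks the high 16 bits of
     each product and restores the original lane order, hence the
     endian-specific byte indices above.  */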
1397 
1398   __w0 = vec_vmulesh((__v8hi)__A, (__v8hi)__B);
1399   __w1 = vec_vmulosh((__v8hi)__A, (__v8hi)__B);
1400   return (__m128i)vec_perm(__w0, __w1, __xform1);
1401 }
1402 
1403 extern __inline __m128i
1404     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1405     _mm_mullo_epi16(__m128i __A, __m128i __B) {
1406   return (__m128i)((__v8hi)__A * (__v8hi)__B);
1407 }
1408 
1409 extern __inline __m64
1410     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1411     _mm_mul_su32(__m64 __A, __m64 __B) {
1412   unsigned int __a = __A;
1413   unsigned int __b = __B;
1414 
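  /* Truncate each operand to its low 32 bits, then multiply as 64-bit
     integers so the full 64-bit product is returned in the __m64.  */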
1415   return ((__m64)__a * (__m64)__b);
1416 }
1417 
1418 #ifdef _ARCH_PWR8
1419 extern __inline __m128i
1420     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1421     _mm_mul_epu32(__m128i __A, __m128i __B) {
1422 #if __GNUC__ < 8
1423   __v2du __result;
1424 
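  /* SSE2 pmuludq multiplies the low 32-bit element of each 64-bit lane.
     In the instruction's word numbering that element is the odd word on
     little-endian and the even word on big-endian, hence the choice of
     vmulouw versus vmuleuw below.  */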
1425 #ifdef __LITTLE_ENDIAN__
1426   /* VMX Vector Multiply Odd Unsigned Word.  */
1427   __asm__("vmulouw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :);
1428 #else
1429   /* VMX Vector Multiply Even Unsigned Word.  */
1430   __asm__("vmuleuw %0,%1,%2" : "=v"(__result) : "v"(__A), "v"(__B) :);
1431 #endif
1432   return (__m128i)__result;
1433 #else
1434   return (__m128i)vec_mule((__v4su)__A, (__v4su)__B);
1435 #endif
1436 }
1437 #endif
1438 
1439 extern __inline __m128i
1440     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1441     _mm_slli_epi16(__m128i __A, int __B) {
1442   __v8hu __lshift;
1443   __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};
1444 
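  /* Counts outside 0..15 leave the all-zero __result, as SSE2 requires.
     When __B is a compile-time constant below 16 it fits the signed
     5-bit literal of vec_splat_s16, avoiding a splat from a GPR.  */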
1445   if (__B >= 0 && __B < 16) {
1446     if (__builtin_constant_p(__B))
1447       __lshift = (__v8hu)vec_splat_s16(__B);
1448     else
1449       __lshift = vec_splats((unsigned short)__B);
1450 
1451     __result = vec_sl((__v8hi)__A, __lshift);
1452   }
1453 
1454   return (__m128i)__result;
1455 }
1456 
1457 extern __inline __m128i
1458     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1459     _mm_slli_epi32(__m128i __A, int __B) {
1460   __v4su __lshift;
1461   __v4si __result = {0, 0, 0, 0};
1462 
1463   if (__B >= 0 && __B < 32) {
1464     if (__builtin_constant_p(__B) && __B < 16)
1465       __lshift = (__v4su)vec_splat_s32(__B);
1466     else
1467       __lshift = vec_splats((unsigned int)__B);
1468 
1469     __result = vec_sl((__v4si)__A, __lshift);
1470   }
1471 
1472   return (__m128i)__result;
1473 }
1474 
1475 #ifdef _ARCH_PWR8
1476 extern __inline __m128i
1477     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1478     _mm_slli_epi64(__m128i __A, int __B) {
1479   __v2du __lshift;
1480   __v2di __result = {0, 0};
1481 
1482   if (__B >= 0 && __B < 64) {
1483     if (__builtin_constant_p(__B) && __B < 16)
1484       __lshift = (__v2du)vec_splat_s32(__B);
1485     else
1486       __lshift = (__v2du)vec_splats((unsigned int)__B);
1487 
1488     __result = vec_sl((__v2di)__A, __lshift);
1489   }
1490 
1491   return (__m128i)__result;
1492 }
1493 #endif
1494 
1495 extern __inline __m128i
1496     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1497     _mm_srai_epi16(__m128i __A, int __B) {
1498   __v8hu __rshift = {15, 15, 15, 15, 15, 15, 15, 15};
1499   __v8hi __result;
1500 
1501   if (__B < 16) {
1502     if (__builtin_constant_p(__B))
1503       __rshift = (__v8hu)vec_splat_s16(__B);
1504     else
1505       __rshift = vec_splats((unsigned short)__B);
1506   }
1507   __result = vec_sra((__v8hi)__A, __rshift);
1508 
1509   return (__m128i)__result;
1510 }
1511 
1512 extern __inline __m128i
1513     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1514     _mm_srai_epi32(__m128i __A, int __B) {
1515   __v4su __rshift = {31, 31, 31, 31};
1516   __v4si __result;
1517 
1518   if (__B < 32) {
1519     if (__builtin_constant_p(__B)) {
1520       if (__B < 16)
1521         __rshift = (__v4su)vec_splat_s32(__B);
1522       else
1523         __rshift = (__v4su)vec_splats((unsigned int)__B);
1524     } else
1525       __rshift = vec_splats((unsigned int)__B);
1526   }
1527   __result = vec_sra((__v4si)__A, __rshift);
1528 
1529   return (__m128i)__result;
1530 }
1531 
1532 extern __inline __m128i
1533     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1534     _mm_bslli_si128(__m128i __A, const int __N) {
1535   __v16qu __result;
1536   const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1537 
1538   if (__N < 16)
1539     __result = vec_sld((__v16qu)__A, __zeros, __N);
1540   else
1541     __result = __zeros;
1542 
1543   return (__m128i)__result;
1544 }
1545 
1546 extern __inline __m128i
1547     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1548     _mm_bsrli_si128(__m128i __A, const int __N) {
1549   __v16qu __result;
1550   const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1551 
1552   if (__N < 16)
1553 #ifdef __LITTLE_ENDIAN__
1554     if (__builtin_constant_p(__N))
1555       /* Would like to use Vector Shift Left Double by Octet
1556          Immediate here to use the immediate form and avoid
1557          load of __N * 8 value into a separate VR.  */
1558       __result = vec_sld(__zeros, (__v16qu)__A, (16 - __N));
1559     else
1560 #endif
1561     {
1562       __v16qu __shift = vec_splats((unsigned char)(__N * 8));
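      /* vec_sro / vec_slo take the octet count from a bit-offset field,
         so the byte count __N is scaled by 8 and splatted across the
         shift vector; the shift direction differs by endianness because
         the intrinsics operate on the big-endian register image.  */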
1563 #ifdef __LITTLE_ENDIAN__
1564       __result = vec_sro((__v16qu)__A, __shift);
1565 #else
1566     __result = vec_slo((__v16qu)__A, __shift);
1567 #endif
1568     }
1569   else
1570     __result = __zeros;
1571 
1572   return (__m128i)__result;
1573 }
1574 
1575 extern __inline __m128i
1576     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1577     _mm_srli_si128(__m128i __A, const int __N) {
1578   return _mm_bsrli_si128(__A, __N);
1579 }
1580 
1581 extern __inline __m128i
1582     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1583     _mm_slli_si128(__m128i __A, const int _imm5) {
1584   __v16qu __result;
1585   const __v16qu __zeros = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
1586 
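  /* vec_sld concatenates its operands in big-endian order, so the
     operand order and shift count are swapped between endiannesses to
     shift __A by _imm5 bytes toward its more significant end.  */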
1587   if (_imm5 < 16)
1588 #ifdef __LITTLE_ENDIAN__
1589     __result = vec_sld((__v16qu)__A, __zeros, _imm5);
1590 #else
1591     __result = vec_sld(__zeros, (__v16qu)__A, (16 - _imm5));
1592 #endif
1593   else
1594     __result = __zeros;
1595 
1596   return (__m128i)__result;
1597 }
1598 
1599 extern __inline __m128i
1600     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1601 
1602     _mm_srli_epi16(__m128i __A, int __B) {
1603   __v8hu __rshift;
1604   __v8hi __result = {0, 0, 0, 0, 0, 0, 0, 0};
1605 
1606   if (__B < 16) {
1607     if (__builtin_constant_p(__B))
1608       __rshift = (__v8hu)vec_splat_s16(__B);
1609     else
1610       __rshift = vec_splats((unsigned short)__B);
1611 
1612     __result = vec_sr((__v8hi)__A, __rshift);
1613   }
1614 
1615   return (__m128i)__result;
1616 }
1617 
1618 extern __inline __m128i
1619     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1620     _mm_srli_epi32(__m128i __A, int __B) {
1621   __v4su __rshift;
1622   __v4si __result = {0, 0, 0, 0};
1623 
1624   if (__B < 32) {
1625     if (__builtin_constant_p(__B)) {
1626       if (__B < 16)
1627         __rshift = (__v4su)vec_splat_s32(__B);
1628       else
1629         __rshift = (__v4su)vec_splats((unsigned int)__B);
1630     } else
1631       __rshift = vec_splats((unsigned int)__B);
1632 
1633     __result = vec_sr((__v4si)__A, __rshift);
1634   }
1635 
1636   return (__m128i)__result;
1637 }
1638 
1639 #ifdef _ARCH_PWR8
1640 extern __inline __m128i
1641     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1642     _mm_srli_epi64(__m128i __A, int __B) {
1643   __v2du __rshift;
1644   __v2di __result = {0, 0};
1645 
1646   if (__B < 64) {
1647     if (__builtin_constant_p(__B)) {
1648       if (__B < 16)
1649         __rshift = (__v2du)vec_splat_s32(__B);
1650       else
1651         __rshift = (__v2du)vec_splats((unsigned long long)__B);
1652     } else
1653       __rshift = (__v2du)vec_splats((unsigned int)__B);
1654 
1655     __result = vec_sr((__v2di)__A, __rshift);
1656   }
1657 
1658   return (__m128i)__result;
1659 }
1660 #endif
1661 
1662 extern __inline __m128i
1663     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1664     _mm_sll_epi16(__m128i __A, __m128i __B) {
1665   __v8hu __lshift;
1666   __vector __bool short __shmask;
1667   const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
1668   __v8hu __result;
1669 
1670 #ifdef __LITTLE_ENDIAN__
1671   __lshift = vec_splat((__v8hu)__B, 0);
1672 #else
1673   __lshift = vec_splat((__v8hu)__B, 3);
1674 #endif
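  /* The splat above extracts the least significant halfword of the
     64-bit shift count (element 0 on little-endian, element 3 on
     big-endian).  __shmask is all-ones only when the count is in range;
     it also serves as the all-zero source operand of vec_sel, so
     out-of-range counts produce zero, matching SSE2 semantics.  */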
1675   __shmask = vec_cmple(__lshift, __shmax);
1676   __result = vec_sl((__v8hu)__A, __lshift);
1677   __result = vec_sel((__v8hu)__shmask, __result, __shmask);
1678 
1679   return (__m128i)__result;
1680 }
1681 
1682 extern __inline __m128i
1683     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1684     _mm_sll_epi32(__m128i __A, __m128i __B) {
1685   __v4su __lshift;
1686   __vector __bool int __shmask;
1687   const __v4su __shmax = {32, 32, 32, 32};
1688   __v4su __result;
1689 #ifdef __LITTLE_ENDIAN__
1690   __lshift = vec_splat((__v4su)__B, 0);
1691 #else
1692   __lshift = vec_splat((__v4su)__B, 1);
1693 #endif
1694   __shmask = vec_cmplt(__lshift, __shmax);
1695   __result = vec_sl((__v4su)__A, __lshift);
1696   __result = vec_sel((__v4su)__shmask, __result, __shmask);
1697 
1698   return (__m128i)__result;
1699 }
1700 
1701 #ifdef _ARCH_PWR8
1702 extern __inline __m128i
1703     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1704     _mm_sll_epi64(__m128i __A, __m128i __B) {
1705   __v2du __lshift;
1706   __vector __bool long long __shmask;
1707   const __v2du __shmax = {64, 64};
1708   __v2du __result;
1709 
1710   __lshift = vec_splat((__v2du)__B, 0);
1711   __shmask = vec_cmplt(__lshift, __shmax);
1712   __result = vec_sl((__v2du)__A, __lshift);
1713   __result = vec_sel((__v2du)__shmask, __result, __shmask);
1714 
1715   return (__m128i)__result;
1716 }
1717 #endif
1718 
1719 extern __inline __m128i
1720     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1721     _mm_sra_epi16(__m128i __A, __m128i __B) {
1722   const __v8hu __rshmax = {15, 15, 15, 15, 15, 15, 15, 15};
1723   __v8hu __rshift;
1724   __v8hi __result;
1725 
1726 #ifdef __LITTLE_ENDIAN__
1727   __rshift = vec_splat((__v8hu)__B, 0);
1728 #else
1729   __rshift = vec_splat((__v8hu)__B, 3);
1730 #endif
1731   __rshift = vec_min(__rshift, __rshmax);
1732   __result = vec_sra((__v8hi)__A, __rshift);
1733 
1734   return (__m128i)__result;
1735 }
1736 
1737 extern __inline __m128i
1738     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1739     _mm_sra_epi32(__m128i __A, __m128i __B) {
1740   const __v4su __rshmax = {31, 31, 31, 31};
1741   __v4su __rshift;
1742   __v4si __result;
1743 
1744 #ifdef __LITTLE_ENDIAN__
1745   __rshift = vec_splat((__v4su)__B, 0);
1746 #else
1747   __rshift = vec_splat((__v4su)__B, 1);
1748 #endif
1749   __rshift = vec_min(__rshift, __rshmax);
1750   __result = vec_sra((__v4si)__A, __rshift);
1751 
1752   return (__m128i)__result;
1753 }
1754 
1755 extern __inline __m128i
1756     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1757     _mm_srl_epi16(__m128i __A, __m128i __B) {
1758   __v8hu __rshift;
1759   __vector __bool short __shmask;
1760   const __v8hu __shmax = {15, 15, 15, 15, 15, 15, 15, 15};
1761   __v8hu __result;
1762 
1763 #ifdef __LITTLE_ENDIAN__
1764   __rshift = vec_splat((__v8hu)__B, 0);
1765 #else
1766   __rshift = vec_splat((__v8hu)__B, 3);
1767 #endif
1768   __shmask = vec_cmple(__rshift, __shmax);
1769   __result = vec_sr((__v8hu)__A, __rshift);
1770   __result = vec_sel((__v8hu)__shmask, __result, __shmask);
1771 
1772   return (__m128i)__result;
1773 }
1774 
1775 extern __inline __m128i
1776     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1777     _mm_srl_epi32(__m128i __A, __m128i __B) {
1778   __v4su __rshift;
1779   __vector __bool int __shmask;
1780   const __v4su __shmax = {32, 32, 32, 32};
1781   __v4su __result;
1782 
1783 #ifdef __LITTLE_ENDIAN__
1784   __rshift = vec_splat((__v4su)__B, 0);
1785 #else
1786   __rshift = vec_splat((__v4su)__B, 1);
1787 #endif
1788   __shmask = vec_cmplt(__rshift, __shmax);
1789   __result = vec_sr((__v4su)__A, __rshift);
1790   __result = vec_sel((__v4su)__shmask, __result, __shmask);
1791 
1792   return (__m128i)__result;
1793 }
1794 
1795 #ifdef _ARCH_PWR8
1796 extern __inline __m128i
1797     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1798     _mm_srl_epi64(__m128i __A, __m128i __B) {
1799   __v2du __rshift;
1800   __vector __bool long long __shmask;
1801   const __v2du __shmax = {64, 64};
1802   __v2du __result;
1803 
1804   __rshift = vec_splat((__v2du)__B, 0);
1805   __shmask = vec_cmplt(__rshift, __shmax);
1806   __result = vec_sr((__v2du)__A, __rshift);
1807   __result = vec_sel((__v2du)__shmask, __result, __shmask);
1808 
1809   return (__m128i)__result;
1810 }
1811 #endif
1812 
1813 extern __inline __m128d
1814     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1815     _mm_and_pd(__m128d __A, __m128d __B) {
1816   return (vec_and((__v2df)__A, (__v2df)__B));
1817 }
1818 
1819 extern __inline __m128d
1820     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1821     _mm_andnot_pd(__m128d __A, __m128d __B) {
1822   return (vec_andc((__v2df)__B, (__v2df)__A));
1823 }
1824 
1825 extern __inline __m128d
1826     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1827     _mm_or_pd(__m128d __A, __m128d __B) {
1828   return (vec_or((__v2df)__A, (__v2df)__B));
1829 }
1830 
1831 extern __inline __m128d
1832     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1833     _mm_xor_pd(__m128d __A, __m128d __B) {
1834   return (vec_xor((__v2df)__A, (__v2df)__B));
1835 }
1836 
1837 extern __inline __m128i
1838     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1839     _mm_and_si128(__m128i __A, __m128i __B) {
1840   return (__m128i)vec_and((__v2di)__A, (__v2di)__B);
1841 }
1842 
1843 extern __inline __m128i
1844     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1845     _mm_andnot_si128(__m128i __A, __m128i __B) {
1846   return (__m128i)vec_andc((__v2di)__B, (__v2di)__A);
1847 }
1848 
1849 extern __inline __m128i
1850     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1851     _mm_or_si128(__m128i __A, __m128i __B) {
1852   return (__m128i)vec_or((__v2di)__A, (__v2di)__B);
1853 }
1854 
1855 extern __inline __m128i
1856     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1857     _mm_xor_si128(__m128i __A, __m128i __B) {
1858   return (__m128i)vec_xor((__v2di)__A, (__v2di)__B);
1859 }
1860 
1861 extern __inline __m128i
1862     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1863     _mm_cmpeq_epi8(__m128i __A, __m128i __B) {
1864   return (__m128i)vec_cmpeq((__v16qi)__A, (__v16qi)__B);
1865 }
1866 
1867 extern __inline __m128i
1868     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1869     _mm_cmpeq_epi16(__m128i __A, __m128i __B) {
1870   return (__m128i)vec_cmpeq((__v8hi)__A, (__v8hi)__B);
1871 }
1872 
1873 extern __inline __m128i
1874     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1875     _mm_cmpeq_epi32(__m128i __A, __m128i __B) {
1876   return (__m128i)vec_cmpeq((__v4si)__A, (__v4si)__B);
1877 }
1878 
1879 extern __inline __m128i
1880     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1881     _mm_cmplt_epi8(__m128i __A, __m128i __B) {
1882   return (__m128i)vec_cmplt((__v16qi)__A, (__v16qi)__B);
1883 }
1884 
1885 extern __inline __m128i
1886     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1887     _mm_cmplt_epi16(__m128i __A, __m128i __B) {
1888   return (__m128i)vec_cmplt((__v8hi)__A, (__v8hi)__B);
1889 }
1890 
1891 extern __inline __m128i
1892     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1893     _mm_cmplt_epi32(__m128i __A, __m128i __B) {
1894   return (__m128i)vec_cmplt((__v4si)__A, (__v4si)__B);
1895 }
1896 
1897 extern __inline __m128i
1898     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1899     _mm_cmpgt_epi8(__m128i __A, __m128i __B) {
1900   return (__m128i)vec_cmpgt((__v16qi)__A, (__v16qi)__B);
1901 }
1902 
1903 extern __inline __m128i
1904     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1905     _mm_cmpgt_epi16(__m128i __A, __m128i __B) {
1906   return (__m128i)vec_cmpgt((__v8hi)__A, (__v8hi)__B);
1907 }
1908 
1909 extern __inline __m128i
1910     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1911     _mm_cmpgt_epi32(__m128i __A, __m128i __B) {
1912   return (__m128i)vec_cmpgt((__v4si)__A, (__v4si)__B);
1913 }
1914 
1915 extern __inline int
1916     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1917     _mm_extract_epi16(__m128i const __A, int const __N) {
1918   return (unsigned short)((__v8hi)__A)[__N & 7];
1919 }
1920 
1921 extern __inline __m128i
1922     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1923     _mm_insert_epi16(__m128i const __A, int const __D, int const __N) {
1924   __v8hi __result = (__v8hi)__A;
1925 
1926   __result[(__N & 7)] = __D;
1927 
1928   return (__m128i)__result;
1929 }
1930 
1931 extern __inline __m128i
1932     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1933     _mm_max_epi16(__m128i __A, __m128i __B) {
1934   return (__m128i)vec_max((__v8hi)__A, (__v8hi)__B);
1935 }
1936 
1937 extern __inline __m128i
1938     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1939     _mm_max_epu8(__m128i __A, __m128i __B) {
1940   return (__m128i)vec_max((__v16qu)__A, (__v16qu)__B);
1941 }
1942 
1943 extern __inline __m128i
1944     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1945     _mm_min_epi16(__m128i __A, __m128i __B) {
1946   return (__m128i)vec_min((__v8hi)__A, (__v8hi)__B);
1947 }
1948 
1949 extern __inline __m128i
1950     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1951     _mm_min_epu8(__m128i __A, __m128i __B) {
1952   return (__m128i)vec_min((__v16qu)__A, (__v16qu)__B);
1953 }
1954 
1955 #ifdef _ARCH_PWR8
1956 /* Intrinsic functions that require PowerISA 2.07 minimum.  */
1957 
1958 /* Return a mask created from the most significant bit of each 8-bit
1959    element in A.  */
1960 extern __inline int
1961     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1962     _mm_movemask_epi8(__m128i __A) {
1963 #ifdef _ARCH_PWR10
1964   return vec_extractm((__v16qu)__A);
1965 #else
1966   __vector unsigned long long __result;
1967   static const __vector unsigned char __perm_mask = {
1968       0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
1969       0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00};
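  /* vec_vbpermq gathers one bit from __A per byte of __perm_mask, using
     each mask byte as a bit index; the indices select the sign bit of
     every byte in an order that reproduces pmovmskb's result.  The
     16-bit mask lands in the doubleword read out below (element 1 on
     little-endian, element 0 on big-endian).  */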
1970 
1971   __result = ((__vector unsigned long long)vec_vbpermq(
1972       (__vector unsigned char)__A, (__vector unsigned char)__perm_mask));
1973 
1974 #ifdef __LITTLE_ENDIAN__
1975   return __result[1];
1976 #else
1977   return __result[0];
1978 #endif
1979 #endif /* !_ARCH_PWR10 */
1980 }
1981 #endif /* _ARCH_PWR8 */
1982 
1983 extern __inline __m128i
1984     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1985     _mm_mulhi_epu16(__m128i __A, __m128i __B) {
1986   __v4su __w0, __w1;
1987   __v16qu __xform1 = {
1988 #ifdef __LITTLE_ENDIAN__
1989       0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
1990       0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1991 #else
1992       0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x08,
1993       0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
1994 #endif
1995   };
1996 
1997   __w0 = vec_vmuleuh((__v8hu)__A, (__v8hu)__B);
1998   __w1 = vec_vmulouh((__v8hu)__A, (__v8hu)__B);
1999   return (__m128i)vec_perm(__w0, __w1, __xform1);
2000 }
2001 
2002 extern __inline __m128i
2003     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2004     _mm_shufflehi_epi16(__m128i __A, const int __mask) {
2005   unsigned long __element_selector_98 = __mask & 0x03;
2006   unsigned long __element_selector_BA = (__mask >> 2) & 0x03;
2007   unsigned long __element_selector_DC = (__mask >> 4) & 0x03;
2008   unsigned long __element_selector_FE = (__mask >> 6) & 0x03;
2009   static const unsigned short __permute_selectors[4] = {
2010 #ifdef __LITTLE_ENDIAN__
2011       0x0908, 0x0B0A, 0x0D0C, 0x0F0E
2012 #else
2013       0x0809, 0x0A0B, 0x0C0D, 0x0E0F
2014 #endif
2015   };
2016   __v2du __pmask =
2017 #ifdef __LITTLE_ENDIAN__
2018       {0x1716151413121110UL, 0UL};
2019 #else
2020       {0x1011121314151617UL, 0UL};
2021 #endif
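  /* __pmask starts as an identity selector for the low 64 bits, which
     _mm_shufflehi_epi16 leaves untouched; the other half of the permute
     control is filled in below from the two-bit selectors in __mask.  */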
2022   __m64_union __t;
2023   __v2du __a, __r;
2024 
2025   __t.as_short[0] = __permute_selectors[__element_selector_98];
2026   __t.as_short[1] = __permute_selectors[__element_selector_BA];
2027   __t.as_short[2] = __permute_selectors[__element_selector_DC];
2028   __t.as_short[3] = __permute_selectors[__element_selector_FE];
2029   __pmask[1] = __t.as_m64;
2030   __a = (__v2du)__A;
2031   __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
2032   return (__m128i)__r;
2033 }
2034 
2035 extern __inline __m128i
2036     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2037     _mm_shufflelo_epi16(__m128i __A, const int __mask) {
2038   unsigned long __element_selector_10 = __mask & 0x03;
2039   unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
2040   unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
2041   unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
2042   static const unsigned short __permute_selectors[4] = {
2043 #ifdef __LITTLE_ENDIAN__
2044       0x0100, 0x0302, 0x0504, 0x0706
2045 #else
2046       0x0001, 0x0203, 0x0405, 0x0607
2047 #endif
2048   };
2049   __v2du __pmask =
2050 #ifdef __LITTLE_ENDIAN__
2051       {0UL, 0x1f1e1d1c1b1a1918UL};
2052 #else
2053       {0UL, 0x18191a1b1c1d1e1fUL};
2054 #endif
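  /* Mirror of _mm_shufflehi_epi16: here the identity selector covers the
     untouched high 64 bits, and the computed halfword selectors replace
     the half of the permute control for the low 64 bits.  */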
2055   __m64_union __t;
2056   __v2du __a, __r;
2057   __t.as_short[0] = __permute_selectors[__element_selector_10];
2058   __t.as_short[1] = __permute_selectors[__element_selector_32];
2059   __t.as_short[2] = __permute_selectors[__element_selector_54];
2060   __t.as_short[3] = __permute_selectors[__element_selector_76];
2061   __pmask[0] = __t.as_m64;
2062   __a = (__v2du)__A;
2063   __r = vec_perm(__a, __a, (__vector unsigned char)__pmask);
2064   return (__m128i)__r;
2065 }
2066 
2067 extern __inline __m128i
2068     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2069     _mm_shuffle_epi32(__m128i __A, const int __mask) {
2070   unsigned long __element_selector_10 = __mask & 0x03;
2071   unsigned long __element_selector_32 = (__mask >> 2) & 0x03;
2072   unsigned long __element_selector_54 = (__mask >> 4) & 0x03;
2073   unsigned long __element_selector_76 = (__mask >> 6) & 0x03;
2074   static const unsigned int __permute_selectors[4] = {
2075 #ifdef __LITTLE_ENDIAN__
2076       0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
2077 #else
2078       0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
2079 #endif
2080   };
2081   __v4su __t;
2082 
2083   __t[0] = __permute_selectors[__element_selector_10];
2084   __t[1] = __permute_selectors[__element_selector_32];
2085   __t[2] = __permute_selectors[__element_selector_54] + 0x10101010;
2086   __t[3] = __permute_selectors[__element_selector_76] + 0x10101010;
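  /* Adding 0x10101010 moves the selector bytes for words 2 and 3 into
     the range that names the second vec_perm operand; both operands are
     __A, so the same source data is selected either way.  */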
2087   return (__m128i)vec_perm((__v4si)__A, (__v4si)__A,
2088                            (__vector unsigned char)__t);
2089 }
2090 
2091 extern __inline void
2092     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2093     _mm_maskmoveu_si128(__m128i __A, __m128i __B, char *__C) {
2094   __v2du __hibit = {0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
2095   __v16qu __mask, __tmp;
2096   __m128i_u *__p = (__m128i_u *)__C;
2097 
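  /* An unsigned byte compares greater than 0x7f exactly when its most
     significant bit is set, so __mask selects the lanes whose control
     byte in __B has the MSB set; vec_sel then merges those bytes of __A
     into the data read back from the (possibly unaligned) destination.  */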
2098   __tmp = (__v16qu)_mm_loadu_si128(__p);
2099   __mask = (__v16qu)vec_cmpgt((__v16qu)__B, (__v16qu)__hibit);
2100   __tmp = vec_sel(__tmp, (__v16qu)__A, __mask);
2101   _mm_storeu_si128(__p, (__m128i)__tmp);
2102 }
2103 
2104 extern __inline __m128i
2105     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2106     _mm_avg_epu8(__m128i __A, __m128i __B) {
2107   return (__m128i)vec_avg((__v16qu)__A, (__v16qu)__B);
2108 }
2109 
2110 extern __inline __m128i
2111     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2112     _mm_avg_epu16(__m128i __A, __m128i __B) {
2113   return (__m128i)vec_avg((__v8hu)__A, (__v8hu)__B);
2114 }
2115 
2116 extern __inline __m128i
2117     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2118     _mm_sad_epu8(__m128i __A, __m128i __B) {
2119   __v16qu __a, __b;
2120   __v16qu __vabsdiff;
2121   __v4si __vsum;
2122   const __v4su __zero = {0, 0, 0, 0};
2123   __v4si __result;
2124 
2125   __a = (__v16qu)__A;
2126   __b = (__v16qu)__B;
2127 #ifndef _ARCH_PWR9
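  /* Without vec_absd (PowerISA 3.0), the unsigned absolute difference is
     computed as max(__a, __b) - min(__a, __b).  */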
2128   __v16qu __vmin = vec_min(__a, __b);
2129   __v16qu __vmax = vec_max(__a, __b);
2130   __vabsdiff = vec_sub(__vmax, __vmin);
2131 #else
2132   __vabsdiff = vec_absd(__a, __b);
2133 #endif
2134   /* Sum four groups of bytes into integers.  */
2135   __vsum = (__vector signed int)vec_sum4s(__vabsdiff, __zero);
2136 #ifdef __LITTLE_ENDIAN__
2137   /* Sum across four integers with two integer results.  */
2138   __asm__("vsum2sws %0,%1,%2" : "=v"(__result) : "v"(__vsum), "v"(__zero));
2139   /* Note: vec_sum2s could be used here, but on little-endian, vector
2140      shifts are added that are not needed for this use-case.
2141      A vector shift to correctly position the 32-bit integer results
2142      (currently at [0] and [2]) to [1] and [3] would then need to be
2143      swapped back again since the desired results are two 64-bit
2144      integers ([1]|[0] and [3]|[2]).  Thus, no shift is performed.  */
2145 #else
2146   /* Sum across four integers with two integer results.  */
2147   __result = vec_sum2s(__vsum, (__vector signed int)__zero);
2148   /* Rotate the sums into the correct position.  */
2149   __result = vec_sld(__result, __result, 6);
2150 #endif
2151   return (__m128i)__result;
2152 }
2153 
2154 extern __inline void
2155     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2156     _mm_stream_si32(int *__A, int __B) {
2157   /* Use the data cache block touch for store transient.  */
2158   __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
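  /* dcbtstt is only a cache hint marking the block as transient; the
     ordinary store below performs the actual write, since PowerISA has
     no direct equivalent of the x86 non-temporal store.  */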
2159   *__A = __B;
2160 }
2161 
2162 extern __inline void
2163     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2164     _mm_stream_si64(long long int *__A, long long int __B) {
2165   /* Use the data cache block touch for store transient.  */
2166   __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
2167   *__A = __B;
2168 }
2169 
2170 extern __inline void
2171     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2172     _mm_stream_si128(__m128i *__A, __m128i __B) {
2173   /* Use the data cache block touch for store transient.  */
2174   __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
2175   *__A = __B;
2176 }
2177 
2178 extern __inline void
2179     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2180     _mm_stream_pd(double *__A, __m128d __B) {
2181   /* Use the data cache block touch for store transient.  */
2182   __asm__("dcbtstt 0,%0" : : "b"(__A) : "memory");
2183   *(__m128d *)__A = __B;
2184 }
2185 
2186 extern __inline void
2187     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2188     _mm_clflush(void const *__A) {
2189   /* Use the data cache block flush.  */
2190   __asm__("dcbf 0,%0" : : "b"(__A) : "memory");
2191 }
2192 
2193 extern __inline void
2194     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2195     _mm_lfence(void) {
2196   /* Use light weight sync for load to load ordering.  */
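  /* A release fence typically lowers to lwsync on POWER, which orders
     loads against later loads; only store-to-load ordering needs the
     heavier sync emitted for _mm_mfence below.  */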
2197   __atomic_thread_fence(__ATOMIC_RELEASE);
2198 }
2199 
2200 extern __inline void
2201     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2202     _mm_mfence(void) {
2203   /* Use heavy weight sync for any to any ordering.  */
2204   __atomic_thread_fence(__ATOMIC_SEQ_CST);
2205 }
2206 
2207 extern __inline __m128i
2208     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2209     _mm_cvtsi32_si128(int __A) {
2210   return _mm_set_epi32(0, 0, 0, __A);
2211 }
2212 
2213 extern __inline __m128i
2214     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2215     _mm_cvtsi64_si128(long long __A) {
2216   return __extension__(__m128i)(__v2di){__A, 0LL};
2217 }
2218 
2219 /* Microsoft intrinsic.  */
2220 extern __inline __m128i
2221     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2222     _mm_cvtsi64x_si128(long long __A) {
2223   return __extension__(__m128i)(__v2di){__A, 0LL};
2224 }
2225 
2226 /* Casts between various SP, DP, INT vector types.  Note that these do no
2227    conversion of values, they just change the type.  */
2228 extern __inline __m128
2229     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2230     _mm_castpd_ps(__m128d __A) {
2231   return (__m128)__A;
2232 }
2233 
2234 extern __inline __m128i
2235     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2236     _mm_castpd_si128(__m128d __A) {
2237   return (__m128i)__A;
2238 }
2239 
2240 extern __inline __m128d
2241     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2242     _mm_castps_pd(__m128 __A) {
2243   return (__m128d)__A;
2244 }
2245 
2246 extern __inline __m128i
2247     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2248     _mm_castps_si128(__m128 __A) {
2249   return (__m128i)__A;
2250 }
2251 
2252 extern __inline __m128
2253     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2254     _mm_castsi128_ps(__m128i __A) {
2255   return (__m128)__A;
2256 }
2257 
2258 extern __inline __m128d
2259     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2260     _mm_castsi128_pd(__m128i __A) {
2261   return (__m128d)__A;
2262 }
2263 
2264 #else
2265 #include_next <emmintrin.h>
2266 #endif /* defined(__powerpc64__) &&                                            \
2267         *   (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
2268 
2269 #endif /* EMMINTRIN_H_ */
2270