1 /*===---- emmintrin.h - Implementation of SSE2 intrinsics on PowerPC -------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 /* Implemented from the specification included in the Intel C++ Compiler
11    User Guide and Reference, version 9.0.  */
12 
13 #ifndef NO_WARN_X86_INTRINSICS
14 /* This header file is intended to help port code that uses Intel
15    intrinsics explicitly from x86_64 to powerpc64/powerpc64le.
16 
17    Since X86 SSE2 intrinsics mainly handle __m128i and __m128d types,
18    PowerPC VMX/VSX ISA is a good match for vector double SIMD operations.
19    However, scalar double operations in vector (XMM) registers require
20    the POWER8 VSX ISA (2.07) level. There are differences in the data
21    format and placement of double scalars in the vector register, which
22    require extra steps to match SSE2 scalar double semantics on POWER.
23 
24    It should be noted that there are significant differences between
25    X86_64's MXCSR and PowerISA's FPSCR/VSCR registers. It is recommended
26    to use portable <fenv.h> instead of accessing the MXCSR directly.
27 
28    Most SSE2 scalar double intrinsic operations can be performed more
29    efficiently as C language double scalar operations or optimized to
30    use vector SIMD operations. We recommend this for new applications;
31    a brief usage sketch follows this warning block.  */
32 #error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
33 #endif
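
/* A minimal usage sketch (illustration only, not part of the original
   header): existing SSE2 source can be rebuilt for POWER by including
   this wrapper and acknowledging the warning above.  The file name and
   compiler options below are assumptions shown purely as an example.

     // cc -O2 -mcpu=power8 -DNO_WARN_X86_INTRINSICS demo.c
     #include <emmintrin.h>

     void add2 (const double *a, const double *b, double *out)
     {
       __m128d va = _mm_loadu_pd (a);
       __m128d vb = _mm_loadu_pd (b);
       _mm_storeu_pd (out, _mm_add_pd (va, vb));
     }
 */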
34 
35 #ifndef EMMINTRIN_H_
36 #define EMMINTRIN_H_
37 
38 #include <altivec.h>
39 
40 /* We need definitions from the SSE header files.  */
41 #include <xmmintrin.h>
42 
43 /* SSE2 */
44 typedef __vector double __v2df;
45 typedef __vector long long __v2di;
46 typedef __vector unsigned long long __v2du;
47 typedef __vector int __v4si;
48 typedef __vector unsigned int __v4su;
49 typedef __vector short __v8hi;
50 typedef __vector unsigned short __v8hu;
51 typedef __vector signed char __v16qi;
52 typedef __vector unsigned char __v16qu;
53 
54 /* The Intel API is flexible enough that we must allow aliasing with other
55    vector types, and their scalar components.  */
56 typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
57 typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));
58 
59 /* Unaligned versions of the same types.  */
60 typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
61 typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
62 
63 /* Define a two-value permute mask.  */
64 #define _MM_SHUFFLE2(x,y) (((x) << 1) | (y))
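
/* For example, _MM_SHUFFLE2 (1, 0) evaluates to ((1 << 1) | 0) == 2.
   When used with _mm_shuffle_pd below, bit 0 of the mask selects which
   element of the first operand supplies the low result and bit 1
   selects which element of the second operand supplies the high
   result.  */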
65 
66 /* Create a vector with element 0 as F and the rest zero.  */
67 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
68 _mm_set_sd (double __F)
69 {
70   return __extension__ (__m128d){ __F, 0.0 };
71 }
72 
73 /* Create a vector with both elements equal to F.  */
74 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
75 _mm_set1_pd (double __F)
76 {
77   return __extension__ (__m128d){ __F, __F };
78 }
79 
80 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
81 _mm_set_pd1 (double __F)
82 {
83   return _mm_set1_pd (__F);
84 }
85 
86 /* Create a vector with the lower value X and upper value W.  */
87 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
88 _mm_set_pd (double __W, double __X)
89 {
90   return __extension__ (__m128d){ __X, __W };
91 }
92 
93 /* Create a vector with the lower value W and upper value X.  */
94 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
95 _mm_setr_pd (double __W, double __X)
96 {
97   return __extension__ (__m128d){ __W, __X };
98 }
99 
100 /* Create an undefined vector.  */
101 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
102 _mm_undefined_pd (void)
103 {
104   __m128d __Y = __Y;
105   return __Y;
106 }
107 
108 /* Create a vector of zeros.  */
109 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
110 _mm_setzero_pd (void)
111 {
112   return (__m128d) vec_splats (0);
113 }
114 
115 /* Sets the low DPFP value of A from the low value of B.  */
116 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
117 _mm_move_sd (__m128d __A, __m128d __B)
118 {
119   __v2df result = (__v2df) __A;
120   result [0] = ((__v2df) __B)[0];
121   return (__m128d) result;
122 }
123 
124 /* Load two DPFP values from P.  The address must be 16-byte aligned.  */
125 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
126 _mm_load_pd (double const *__P)
127 {
128   return ((__m128d)vec_ld(0, (__v16qu*)__P));
129 }
130 
131 /* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
132 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
133 _mm_loadu_pd (double const *__P)
134 {
135   return (vec_vsx_ld(0, __P));
136 }
137 
138 /* Create a vector with both elements equal to *P.  */
139 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
140 _mm_load1_pd (double const *__P)
141 {
142   return (vec_splats (*__P));
143 }
144 
145 /* Create a vector with element 0 as *P and the rest zero.  */
146 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
147 _mm_load_sd (double const *__P)
148 {
149   return _mm_set_sd (*__P);
150 }
151 
152 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
153 _mm_load_pd1 (double const *__P)
154 {
155   return _mm_load1_pd (__P);
156 }
157 
158 /* Load two DPFP values in reverse order.  The address must be aligned.  */
159 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
160 _mm_loadr_pd (double const *__P)
161 {
162   __v2df __tmp = _mm_load_pd (__P);
163   return (__m128d)vec_xxpermdi (__tmp, __tmp, 2);
164 }
165 
166 /* Store two DPFP values.  The address must be 16-byte aligned.  */
167 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
168 _mm_store_pd (double *__P, __m128d __A)
169 {
170   vec_st((__v16qu)__A, 0, (__v16qu*)__P);
171 }
172 
173 /* Store two DPFP values.  The address need not be 16-byte aligned.  */
174 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
175 _mm_storeu_pd (double *__P, __m128d __A)
176 {
177   *(__m128d_u *)__P = __A;
178 }
179 
180 /* Stores the lower DPFP value.  */
181 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
182 _mm_store_sd (double *__P, __m128d __A)
183 {
184   *__P = ((__v2df)__A)[0];
185 }
186 
187 extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
188 _mm_cvtsd_f64 (__m128d __A)
189 {
190   return ((__v2df)__A)[0];
191 }
192 
193 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
194 _mm_storel_pd (double *__P, __m128d __A)
195 {
196   _mm_store_sd (__P, __A);
197 }
198 
199 /* Stores the upper DPFP value.  */
200 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
201 _mm_storeh_pd (double *__P, __m128d __A)
202 {
203   *__P = ((__v2df)__A)[1];
204 }
205 /* Store the lower DPFP value across two words.
206    The address must be 16-byte aligned.  */
207 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
208 _mm_store1_pd (double *__P, __m128d __A)
209 {
210   _mm_store_pd (__P, vec_splat (__A, 0));
211 }
212 
213 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
214 _mm_store_pd1 (double *__P, __m128d __A)
215 {
216   _mm_store1_pd (__P, __A);
217 }
218 
219 /* Store two DPFP values in reverse order.  The address must be aligned.  */
220 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
221 _mm_storer_pd (double *__P, __m128d __A)
222 {
223   _mm_store_pd (__P, vec_xxpermdi (__A, __A, 2));
224 }
225 
226 /* Intel intrinsic.  */
227 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
228 _mm_cvtsi128_si64 (__m128i __A)
229 {
230   return ((__v2di)__A)[0];
231 }
232 
233 /* Microsoft intrinsic.  */
234 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
235 _mm_cvtsi128_si64x (__m128i __A)
236 {
237   return ((__v2di)__A)[0];
238 }
239 
240 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
241 _mm_add_pd (__m128d __A, __m128d __B)
242 {
243   return (__m128d) ((__v2df)__A + (__v2df)__B);
244 }
245 
246 /* Add the lower double-precision (64-bit) floating-point element in
247    a and b, store the result in the lower element of dst, and copy
248    the upper element from a to the upper element of dst. */
249 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
250 _mm_add_sd (__m128d __A, __m128d __B)
251 {
252   __A[0] = __A[0] + __B[0];
253   return (__A);
254 }
255 
256 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
257 _mm_sub_pd (__m128d __A, __m128d __B)
258 {
259   return (__m128d) ((__v2df)__A - (__v2df)__B);
260 }
261 
262 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
263 _mm_sub_sd (__m128d __A, __m128d __B)
264 {
265   __A[0] = __A[0] - __B[0];
266   return (__A);
267 }
268 
269 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
270 _mm_mul_pd (__m128d __A, __m128d __B)
271 {
272   return (__m128d) ((__v2df)__A * (__v2df)__B);
273 }
274 
275 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
276 _mm_mul_sd (__m128d __A, __m128d __B)
277 {
278   __A[0] = __A[0] * __B[0];
279   return (__A);
280 }
281 
282 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
283 _mm_div_pd (__m128d __A, __m128d __B)
284 {
285   return (__m128d) ((__v2df)__A / (__v2df)__B);
286 }
287 
288 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
289 _mm_div_sd (__m128d __A, __m128d __B)
290 {
291   __A[0] = __A[0] / __B[0];
292   return (__A);
293 }
294 
295 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
296 _mm_sqrt_pd (__m128d __A)
297 {
298   return (vec_sqrt (__A));
299 }
300 
301 /* Return pair {sqrt (B[0]), A[1]}.  */
302 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
303 _mm_sqrt_sd (__m128d __A, __m128d __B)
304 {
305   __v2df c;
306   c = vec_sqrt ((__v2df) _mm_set1_pd (__B[0]));
307   return (__m128d) _mm_setr_pd (c[0], __A[1]);
308 }
309 
310 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
311 _mm_min_pd (__m128d __A, __m128d __B)
312 {
313   return (vec_min (__A, __B));
314 }
315 
316 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
317 _mm_min_sd (__m128d __A, __m128d __B)
318 {
319   __v2df a, b, c;
320   a = vec_splats (__A[0]);
321   b = vec_splats (__B[0]);
322   c = vec_min (a, b);
323   return (__m128d) _mm_setr_pd (c[0], __A[1]);
324 }
325 
326 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
327 _mm_max_pd (__m128d __A, __m128d __B)
328 {
329   return (vec_max (__A, __B));
330 }
331 
332 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
333 _mm_max_sd (__m128d __A, __m128d __B)
334 {
335   __v2df a, b, c;
336   a = vec_splats (__A[0]);
337   b = vec_splats (__B[0]);
338   c = vec_max (a, b);
339   return (__m128d) _mm_setr_pd (c[0], __A[1]);
340 }
341 
342 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
343 _mm_cmpeq_pd (__m128d __A, __m128d __B)
344 {
345   return ((__m128d)vec_cmpeq ((__v2df) __A, (__v2df) __B));
346 }
347 
348 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
349 _mm_cmplt_pd (__m128d __A, __m128d __B)
350 {
351   return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
352 }
353 
354 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
355 _mm_cmple_pd (__m128d __A, __m128d __B)
356 {
357   return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
358 }
359 
360 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
361 _mm_cmpgt_pd (__m128d __A, __m128d __B)
362 {
363   return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
364 }
365 
366 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
367 _mm_cmpge_pd (__m128d __A, __m128d __B)
368 {
369   return ((__m128d)vec_cmpge ((__v2df) __A,(__v2df) __B));
370 }
371 
372 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
373 _mm_cmpneq_pd (__m128d __A, __m128d __B)
374 {
375   __v2df temp = (__v2df) vec_cmpeq ((__v2df) __A, (__v2df)__B);
376   return ((__m128d)vec_nor (temp, temp));
377 }
378 
379 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
380 _mm_cmpnlt_pd (__m128d __A, __m128d __B)
381 {
382   return ((__m128d)vec_cmpge ((__v2df) __A, (__v2df) __B));
383 }
384 
385 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
386 _mm_cmpnle_pd (__m128d __A, __m128d __B)
387 {
388   return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
389 }
390 
391 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
392 _mm_cmpngt_pd (__m128d __A, __m128d __B)
393 {
394   return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
395 }
396 
397 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
398 _mm_cmpnge_pd (__m128d __A, __m128d __B)
399 {
400   return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
401 }
402 
403 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
404 _mm_cmpord_pd (__m128d __A, __m128d __B)
405 {
406 #if _ARCH_PWR8
407   __v2du c, d;
408   /* Comparing a value against itself returns false (0's) if NAN.  */
409   c = (__v2du)vec_cmpeq (__A, __A);
410   d = (__v2du)vec_cmpeq (__B, __B);
411 #else
412   __v2du a, b;
413   __v2du c, d;
414   const __v2du double_exp_mask  = {0x7ff0000000000000, 0x7ff0000000000000};
415   a = (__v2du)vec_abs ((__v2df)__A);
416   b = (__v2du)vec_abs ((__v2df)__B);
417   c = (__v2du)vec_cmpgt (double_exp_mask, a);
418   d = (__v2du)vec_cmpgt (double_exp_mask, b);
419 #endif
420   /* A != NAN and B != NAN.  */
421   return ((__m128d)vec_and(c, d));
422 }
423 
424 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
425 _mm_cmpunord_pd (__m128d __A, __m128d __B)
426 {
427 #if _ARCH_PWR8
428   __v2du c, d;
429   /* Comparing a value against itself returns false (0's) if NAN.  */
430   c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
431   d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
432   /* A == NAN OR B == NAN converts to:
433      NOT(A != NAN) OR NOT(B != NAN).  */
434   c = vec_nor (c, c);
435   return ((__m128d)vec_orc(c, d));
436 #else
437   __v2du c, d;
438   /* Comparing a value against itself returns false (0's) if NAN.  */
439   c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
440   d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
441   /* Invert so that true ('1's) marks a NAN input.  */
442   c = vec_nor (c, c);
443   d = vec_nor (d, d);
444   return ((__m128d)vec_or(c, d));
445 #endif
446 }
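
/* Scalar sketch of the two predicates above (illustration only,
   assuming <math.h> is available):

     int ordered   (double a, double b) { return !isnan (a) && !isnan (b); }
     int unordered (double a, double b) { return  isnan (a) ||  isnan (b); }

   A self-compare (x == x) is false exactly when x is a NaN, which is
   the property the vec_cmpeq forms above rely on.  */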
447 
448 extern __inline  __m128d  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
449 _mm_cmpeq_sd(__m128d  __A, __m128d  __B)
450 {
451   __v2df a, b, c;
452   /* PowerISA VSX does not allow partial (for just lower double)
453      results. So to ensure we don't generate spurious exceptions
454      (from the upper double values) we splat the lower double
455      before we do the operation. */
456   a = vec_splats (__A[0]);
457   b = vec_splats (__B[0]);
458   c = (__v2df) vec_cmpeq(a, b);
459   /* Then we merge the lower double result with the original upper
460      double from __A.  */
461   return (__m128d) _mm_setr_pd (c[0], __A[1]);
462 }
463 
464 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
465 _mm_cmplt_sd (__m128d __A, __m128d __B)
466 {
467   __v2df a, b, c;
468   a = vec_splats (__A[0]);
469   b = vec_splats (__B[0]);
470   c = (__v2df) vec_cmplt(a, b);
471   return (__m128d) _mm_setr_pd (c[0], __A[1]);
472 }
473 
474 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
475 _mm_cmple_sd (__m128d __A, __m128d __B)
476 {
477   __v2df a, b, c;
478   a = vec_splats (__A[0]);
479   b = vec_splats (__B[0]);
480   c = (__v2df) vec_cmple(a, b);
481   return (__m128d) _mm_setr_pd (c[0], __A[1]);
482 }
483 
484 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
485 _mm_cmpgt_sd (__m128d __A, __m128d __B)
486 {
487   __v2df a, b, c;
488   a = vec_splats (__A[0]);
489   b = vec_splats (__B[0]);
490   c = (__v2df) vec_cmpgt(a, b);
491   return (__m128d) _mm_setr_pd (c[0], __A[1]);
492 }
493 
494 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
495 _mm_cmpge_sd (__m128d __A, __m128d __B)
496 {
497   __v2df a, b, c;
498   a = vec_splats (__A[0]);
499   b = vec_splats (__B[0]);
500   c = (__v2df) vec_cmpge(a, b);
501   return (__m128d) _mm_setr_pd (c[0], __A[1]);
502 }
503 
504 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
505 _mm_cmpneq_sd (__m128d __A, __m128d __B)
506 {
507   __v2df a, b, c;
508   a = vec_splats (__A[0]);
509   b = vec_splats (__B[0]);
510   c = (__v2df) vec_cmpeq(a, b);
511   c = vec_nor (c, c);
512   return (__m128d) _mm_setr_pd (c[0], __A[1]);
513 }
514 
515 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
516 _mm_cmpnlt_sd (__m128d __A, __m128d __B)
517 {
518   __v2df a, b, c;
519   a = vec_splats (__A[0]);
520   b = vec_splats (__B[0]);
521   /* Not less than is just greater than or equal.  */
522   c = (__v2df) vec_cmpge(a, b);
523   return (__m128d) _mm_setr_pd (c[0], __A[1]);
524 }
525 
526 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
527 _mm_cmpnle_sd (__m128d __A, __m128d __B)
528 {
529   __v2df a, b, c;
530   a = vec_splats (__A[0]);
531   b = vec_splats (__B[0]);
532   /* Not less than or equal is just greater than.  */
533   c = (__v2df) vec_cmpgt(a, b);
534   return (__m128d) _mm_setr_pd (c[0], __A[1]);
535 }
536 
537 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
538 _mm_cmpngt_sd (__m128d __A, __m128d __B)
539 {
540   __v2df a, b, c;
541   a = vec_splats (__A[0]);
542   b = vec_splats (__B[0]);
543   /* Not greater than is just less than or equal.  */
544   c = (__v2df) vec_cmple(a, b);
545   return (__m128d) _mm_setr_pd (c[0], __A[1]);
546 }
547 
548 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
549 _mm_cmpnge_sd (__m128d __A, __m128d __B)
550 {
551   __v2df a, b, c;
552   a = vec_splats (__A[0]);
553   b = vec_splats (__B[0]);
554   /* Not greater than or equal is just less than.  */
555   c = (__v2df) vec_cmplt(a, b);
556   return (__m128d) _mm_setr_pd (c[0], __A[1]);
557 }
558 
559 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
560 _mm_cmpord_sd (__m128d __A, __m128d __B)
561 {
562   __v2df r;
563   r = (__v2df)_mm_cmpord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
564   return (__m128d) _mm_setr_pd (r[0], ((__v2df)__A)[1]);
565 }
566 
567 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
568 _mm_cmpunord_sd (__m128d __A, __m128d __B)
569 {
570   __v2df r;
571   r = _mm_cmpunord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
572   return (__m128d) _mm_setr_pd (r[0], __A[1]);
573 }
574 
575 /* FIXME
576    The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
577    exactly the same because GCC for PowerPC only generates unordered
578    compares (scalar and vector).
579    Technically _mm_comieq_sd et al. should be using the ordered
580    compare and signal invalid for QNaNs.  The _mm_ucomieq_sd et al.
581    should be OK.  */
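
/* For reference, the intended difference can be expressed with the
   <math.h> comparison macros (illustration only, not used here):

     int comilt_sd  (double a, double b) { return a < b; }
     int ucomilt_sd (double a, double b) { return isless (a, b); }

   The plain < may raise the invalid exception for any NaN operand,
   while isless() remains quiet for QNaNs, matching the ucomi* intent.  */
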
582 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
583 _mm_comieq_sd (__m128d __A, __m128d __B)
584 {
585   return (__A[0] == __B[0]);
586 }
587 
588 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
589 _mm_comilt_sd (__m128d __A, __m128d __B)
590 {
591   return (__A[0] < __B[0]);
592 }
593 
594 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
595 _mm_comile_sd (__m128d __A, __m128d __B)
596 {
597   return (__A[0] <= __B[0]);
598 }
599 
600 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
601 _mm_comigt_sd (__m128d __A, __m128d __B)
602 {
603   return (__A[0] > __B[0]);
604 }
605 
606 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
607 _mm_comige_sd (__m128d __A, __m128d __B)
608 {
609   return (__A[0] >= __B[0]);
610 }
611 
612 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
613 _mm_comineq_sd (__m128d __A, __m128d __B)
614 {
615   return (__A[0] != __B[0]);
616 }
617 
618 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
619 _mm_ucomieq_sd (__m128d __A, __m128d __B)
620 {
621   return (__A[0] == __B[0]);
622 }
623 
624 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
625 _mm_ucomilt_sd (__m128d __A, __m128d __B)
626 {
627   return (__A[0] < __B[0]);
628 }
629 
630 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
631 _mm_ucomile_sd (__m128d __A, __m128d __B)
632 {
633   return (__A[0] <= __B[0]);
634 }
635 
636 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
637 _mm_ucomigt_sd (__m128d __A, __m128d __B)
638 {
639   return (__A[0] > __B[0]);
640 }
641 
642 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
643 _mm_ucomige_sd (__m128d __A, __m128d __B)
644 {
645   return (__A[0] >= __B[0]);
646 }
647 
648 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
649 _mm_ucomineq_sd (__m128d __A, __m128d __B)
650 {
651   return (__A[0] != __B[0]);
652 }
653 
654 /* Create a vector of Qi, where i is the element number.  */
655 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
656 _mm_set_epi64x (long long __q1, long long __q0)
657 {
658   return __extension__ (__m128i)(__v2di){ __q0, __q1 };
659 }
660 
661 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
662 _mm_set_epi64 (__m64 __q1,  __m64 __q0)
663 {
664   return _mm_set_epi64x ((long long)__q1, (long long)__q0);
665 }
666 
667 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
668 _mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
669 {
670   return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
671 }
672 
673 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
674 _mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
675 	       short __q3, short __q2, short __q1, short __q0)
676 {
677   return __extension__ (__m128i)(__v8hi){
678     __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
679 }
680 
681 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
682 _mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
683 	      char __q11, char __q10, char __q09, char __q08,
684 	      char __q07, char __q06, char __q05, char __q04,
685 	      char __q03, char __q02, char __q01, char __q00)
686 {
687   return __extension__ (__m128i)(__v16qi){
688     __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
689     __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
690   };
691 }
692 
693 /* Set all of the elements of the vector to A.  */
694 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
695 _mm_set1_epi64x (long long __A)
696 {
697   return _mm_set_epi64x (__A, __A);
698 }
699 
700 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
701 _mm_set1_epi64 (__m64 __A)
702 {
703   return _mm_set_epi64 (__A, __A);
704 }
705 
706 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
707 _mm_set1_epi32 (int __A)
708 {
709   return _mm_set_epi32 (__A, __A, __A, __A);
710 }
711 
712 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
713 _mm_set1_epi16 (short __A)
714 {
715   return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
716 }
717 
718 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
719 _mm_set1_epi8 (char __A)
720 {
721   return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
722 		       __A, __A, __A, __A, __A, __A, __A, __A);
723 }
724 
725 /* Create a vector of Qi, where i is the element number.
726    The parameter order is reversed from the _mm_set_epi* functions.  */
727 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
728 _mm_setr_epi64 (__m64 __q0, __m64 __q1)
729 {
730   return _mm_set_epi64 (__q1, __q0);
731 }
732 
733 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
734 _mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
735 {
736   return _mm_set_epi32 (__q3, __q2, __q1, __q0);
737 }
738 
739 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
740 _mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
741 	        short __q4, short __q5, short __q6, short __q7)
742 {
743   return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
744 }
745 
746 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
747 _mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
748 	       char __q04, char __q05, char __q06, char __q07,
749 	       char __q08, char __q09, char __q10, char __q11,
750 	       char __q12, char __q13, char __q14, char __q15)
751 {
752   return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
753 		       __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
754 }
755 
756 /* Create a vector with element 0 as *P and the rest zero.  */
757 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
758 _mm_load_si128 (__m128i const *__P)
759 {
760   return *__P;
761 }
762 
763 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
764 _mm_loadu_si128 (__m128i_u const *__P)
765 {
766   return (__m128i) (vec_vsx_ld(0, (signed int const *)__P));
767 }
768 
769 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
770 _mm_loadl_epi64 (__m128i_u const *__P)
771 {
772   return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P);
773 }
774 
775 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
776 _mm_store_si128 (__m128i *__P, __m128i __B)
777 {
778   vec_st ((__v16qu) __B, 0, (__v16qu*)__P);
779 }
780 
781 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
782 _mm_storeu_si128 (__m128i_u *__P, __m128i __B)
783 {
784   *__P = __B;
785 }
786 
787 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
788 _mm_storel_epi64 (__m128i_u *__P, __m128i __B)
789 {
790   *(long long *)__P = ((__v2di)__B)[0];
791 }
792 
793 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
794 _mm_movepi64_pi64 (__m128i_u __B)
795 {
796   return (__m64) ((__v2di)__B)[0];
797 }
798 
799 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
800 _mm_movpi64_epi64 (__m64 __A)
801 {
802   return _mm_set_epi64 ((__m64)0LL, __A);
803 }
804 
805 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
806 _mm_move_epi64 (__m128i __A)
807 {
808   return _mm_set_epi64 ((__m64)0LL, (__m64)__A[0]);
809 }
810 
811 /* Create an undefined vector.  */
812 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
813 _mm_undefined_si128 (void)
814 {
815   __m128i __Y = __Y;
816   return __Y;
817 }
818 
819 /* Create a vector of zeros.  */
820 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
821 _mm_setzero_si128 (void)
822 {
823   return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 };
824 }
825 
826 #ifdef _ARCH_PWR8
827 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
828 _mm_cvtepi32_pd (__m128i __A)
829 {
830   __v2di val;
831   /* For LE we need the Vector Unpack Low Signed Word operation,
832      which vec_unpackh generates here.  */
833   val = (__v2di)vec_unpackh ((__v4si)__A);
834 
835   return (__m128d)vec_ctf (val, 0);
836 }
837 #endif
838 
839 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
840 _mm_cvtepi32_ps (__m128i __A)
841 {
842   return ((__m128)vec_ctf((__v4si)__A, 0));
843 }
844 
845 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
846 _mm_cvtpd_epi32 (__m128d __A)
847 {
848   __v2df rounded = vec_rint (__A);
849   __v4si result, temp;
850   const __v4si vzero =
851     { 0, 0, 0, 0 };
852 
853   /* VSX Vector truncate Double-Precision to integer and Convert to
854    Signed Integer Word format with Saturate.  */
855   __asm__(
856       "xvcvdpsxws %x0,%x1"
857       : "=wa" (temp)
858       : "wa" (rounded)
859       : );
860 
861 #ifdef _ARCH_PWR8
862   temp = vec_mergeo (temp, temp);
863   result = (__v4si) vec_vpkudum ((__vector long long) temp,
864 				 (__vector long long) vzero);
865 #else
866   {
867     const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
868 	0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
869     result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
870   }
871 #endif
872   return (__m128i) result;
873 }
874 
875 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
876 _mm_cvtpd_pi32 (__m128d __A)
877 {
878   __m128i result = _mm_cvtpd_epi32(__A);
879 
880   return (__m64) result[0];
881 }
882 
883 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
884 _mm_cvtpd_ps (__m128d __A)
885 {
886   __v4sf result;
887   __v4si temp;
888   const __v4si vzero = { 0, 0, 0, 0 };
889 
890   __asm__(
891       "xvcvdpsp %x0,%x1"
892       : "=wa" (temp)
893       : "wa" (__A)
894       : );
895 
896 #ifdef _ARCH_PWR8
897   temp = vec_mergeo (temp, temp);
898   result = (__v4sf) vec_vpkudum ((__vector long long) temp,
899 				 (__vector long long) vzero);
900 #else
901   {
902     const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
903 	0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
904     result = (__v4sf) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
905   }
906 #endif
907   return ((__m128)result);
908 }
909 
910 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
911 _mm_cvttpd_epi32 (__m128d __A)
912 {
913   __v4si result;
914   __v4si temp;
915   const __v4si vzero = { 0, 0, 0, 0 };
916 
917   /* VSX Vector truncate Double-Precision to integer and Convert to
918    Signed Integer Word format with Saturate.  */
919   __asm__(
920       "xvcvdpsxws %x0,%x1"
921       : "=wa" (temp)
922       : "wa" (__A)
923       : );
924 
925 #ifdef _ARCH_PWR8
926   temp = vec_mergeo (temp, temp);
927   result = (__v4si) vec_vpkudum ((__vector long long) temp,
928 				 (__vector long long) vzero);
929 #else
930   {
931     const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
932 	0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
933     result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
934   }
935 #endif
936 
937   return ((__m128i) result);
938 }
939 
940 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
941 _mm_cvttpd_pi32 (__m128d __A)
942 {
943   __m128i result = _mm_cvttpd_epi32 (__A);
944 
945   return (__m64) result[0];
946 }
947 
948 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
949 _mm_cvtsi128_si32 (__m128i __A)
950 {
951   return ((__v4si)__A)[0];
952 }
953 
954 #ifdef _ARCH_PWR8
955 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
956 _mm_cvtpi32_pd (__m64 __A)
957 {
958   __v4si temp;
959   __v2di tmp2;
960   __v2df result;
961 
962   temp = (__v4si)vec_splats (__A);
963   tmp2 = (__v2di)vec_unpackl (temp);
964   result = vec_ctf ((__vector signed long long) tmp2, 0);
965   return (__m128d)result;
966 }
967 #endif
968 
969 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
970 _mm_cvtps_epi32 (__m128 __A)
971 {
972   __v4sf rounded;
973   __v4si result;
974 
975   rounded = vec_rint((__v4sf) __A);
976   result = vec_cts (rounded, 0);
977   return (__m128i) result;
978 }
979 
980 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
981 _mm_cvttps_epi32 (__m128 __A)
982 {
983   __v4si result;
984 
985   result = vec_cts ((__v4sf) __A, 0);
986   return (__m128i) result;
987 }
988 
989 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
990 _mm_cvtps_pd (__m128 __A)
991 {
992   /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */
993 #ifdef vec_doubleh
994   return (__m128d) vec_doubleh ((__v4sf)__A);
995 #else
996   /* Otherwise the compiler does not provide vec_doubleh, so we
997      generate the equivalent code here.  */
998   __v4sf a = (__v4sf)__A;
999   __v4sf temp;
1000   __v2df result;
1001 #ifdef __LITTLE_ENDIAN__
1002   /* The input float values are in elements {[0], [1]} but the convert
1003      instruction needs them in elements {[1], [3]}, so we use two
1004      shift left double vector word immediates to get the elements
1005      lined up.  */
1006   temp = __builtin_vsx_xxsldwi (a, a, 3);
1007   temp = __builtin_vsx_xxsldwi (a, temp, 2);
1008 #else
1009   /* The input float values are in elements {[0], [1]} but the convert
1010      instruction needs them in elements {[0], [2]}, so we use a
1011      vector merge-high word operation to get the elements
1012      lined up.  */
1013   temp = vec_vmrghw (a, a);
1014 #endif
1015   __asm__(
1016       " xvcvspdp %x0,%x1"
1017       : "=wa" (result)
1018       : "wa" (temp)
1019       : );
1020   return (__m128d) result;
1021 #endif
1022 }
1023 
1024 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1025 _mm_cvtsd_si32 (__m128d __A)
1026 {
1027   __v2df rounded = vec_rint((__v2df) __A);
1028   int result = ((__v2df)rounded)[0];
1029 
1030   return result;
1031 }
1032 /* Intel intrinsic.  */
1033 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1034 _mm_cvtsd_si64 (__m128d __A)
1035 {
1036   __v2df rounded = vec_rint ((__v2df) __A );
1037   long long result = ((__v2df) rounded)[0];
1038 
1039   return result;
1040 }
1041 
1042 /* Microsoft intrinsic.  */
1043 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1044 _mm_cvtsd_si64x (__m128d __A)
1045 {
1046   return _mm_cvtsd_si64 ((__v2df)__A);
1047 }
1048 
1049 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1050 _mm_cvttsd_si32 (__m128d __A)
1051 {
1052   int result = ((__v2df)__A)[0];
1053 
1054   return result;
1055 }
1056 
1057 /* Intel intrinsic.  */
1058 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1059 _mm_cvttsd_si64 (__m128d __A)
1060 {
1061   long long result = ((__v2df)__A)[0];
1062 
1063   return result;
1064 }
1065 
1066 /* Microsoft intrinsic.  */
1067 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1068 _mm_cvttsd_si64x (__m128d __A)
1069 {
1070   return _mm_cvttsd_si64 (__A);
1071 }
1072 
1073 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1074 _mm_cvtsd_ss (__m128 __A, __m128d __B)
1075 {
1076   __v4sf result = (__v4sf)__A;
1077 
1078 #ifdef __LITTLE_ENDIAN__
1079   __v4sf temp_s;
1080   /* Copy double element[0] to element [1] for conversion.  */
1081   __v2df temp_b = vec_splat((__v2df)__B, 0);
1082 
1083   /* Pre-rotate __A left 3 (logically right 1) elements.  */
1084   result = __builtin_vsx_xxsldwi (result, result, 3);
1085   /* Convert double to single float scalar in a vector.  */
1086   __asm__(
1087       "xscvdpsp %x0,%x1"
1088       : "=wa" (temp_s)
1089       : "wa" (temp_b)
1090       : );
1091   /* Shift the resulting scalar into vector element [0].  */
1092   result = __builtin_vsx_xxsldwi (result, temp_s, 1);
1093 #else
1094   result [0] = ((__v2df)__B)[0];
1095 #endif
1096   return (__m128) result;
1097 }
1098 
1099 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1100 _mm_cvtsi32_sd (__m128d __A, int __B)
1101 {
1102   __v2df result = (__v2df)__A;
1103   double db = __B;
1104   result [0] = db;
1105   return (__m128d)result;
1106 }
1107 
1108 /* Intel intrinsic.  */
1109 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1110 _mm_cvtsi64_sd (__m128d __A, long long __B)
1111 {
1112   __v2df result = (__v2df)__A;
1113   double db = __B;
1114   result [0] = db;
1115   return (__m128d)result;
1116 }
1117 
1118 /* Microsoft intrinsic.  */
1119 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1120 _mm_cvtsi64x_sd (__m128d __A, long long __B)
1121 {
1122   return _mm_cvtsi64_sd (__A, __B);
1123 }
1124 
1125 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1126 _mm_cvtss_sd (__m128d __A, __m128 __B)
1127 {
1128 #ifdef __LITTLE_ENDIAN__
1129   /* Use splat to move element [0] into position for the convert. */
1130   __v4sf temp = vec_splat ((__v4sf)__B, 0);
1131   __v2df res;
1132   /* Convert single float scalar to double in a vector.  */
1133   __asm__(
1134       "xscvspdp %x0,%x1"
1135       : "=wa" (res)
1136       : "wa" (temp)
1137       : );
1138   return (__m128d) vec_mergel (res, (__v2df)__A);
1139 #else
1140   __v2df res = (__v2df)__A;
1141   res [0] = ((__v4sf)__B) [0];
1142   return (__m128d) res;
1143 #endif
1144 }
1145 
1146 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1147 _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask)
1148 {
1149   __vector double result;
1150   const int litmsk = __mask & 0x3;
1151 
1152   if (litmsk == 0)
1153     result = vec_mergeh (__A, __B);
1154 #if __GNUC__ < 6
1155   else if (litmsk == 1)
1156     result = vec_xxpermdi (__B, __A, 2);
1157   else if (litmsk == 2)
1158     result = vec_xxpermdi (__B, __A, 1);
1159 #else
1160   else if (litmsk == 1)
1161     result = vec_xxpermdi (__A, __B, 2);
1162   else if (litmsk == 2)
1163     result = vec_xxpermdi (__A, __B, 1);
1164 #endif
1165   else
1166     result = vec_mergel (__A, __B);
1167 
1168   return result;
1169 }
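
/* Written out as a scalar sketch, the selection implemented above is:

     result[0] = (__mask & 1) ? __A[1] : __A[0];
     result[1] = (__mask & 2) ? __B[1] : __B[0];

   Only the low two bits of the mask (litmsk) participate.  */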
1170 
1171 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1172 _mm_unpackhi_pd (__m128d __A, __m128d __B)
1173 {
1174   return (__m128d) vec_mergel ((__v2df)__A, (__v2df)__B);
1175 }
1176 
1177 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1178 _mm_unpacklo_pd (__m128d __A, __m128d __B)
1179 {
1180   return (__m128d) vec_mergeh ((__v2df)__A, (__v2df)__B);
1181 }
1182 
1183 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1184 _mm_loadh_pd (__m128d __A, double const *__B)
1185 {
1186   __v2df result = (__v2df)__A;
1187   result [1] = *__B;
1188   return (__m128d)result;
1189 }
1190 
1191 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1192 _mm_loadl_pd (__m128d __A, double const *__B)
1193 {
1194   __v2df result = (__v2df)__A;
1195   result [0] = *__B;
1196   return (__m128d)result;
1197 }
1198 
1199 #ifdef _ARCH_PWR8
1200 /* Intrinsic functions that require PowerISA 2.07 minimum.  */
1201 
1202 /* Creates a 2-bit mask from the most significant bits of the DPFP values.  */
1203 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1204 _mm_movemask_pd (__m128d  __A)
1205 {
1206   __vector unsigned long long result;
1207   static const __vector unsigned int perm_mask =
1208     {
1209 #ifdef __LITTLE_ENDIAN__
1210 	0x80800040, 0x80808080, 0x80808080, 0x80808080
1211 #else
1212       0x80808080, 0x80808080, 0x80808080, 0x80804000
1213 #endif
1214     };
1215 
1216   result = ((__vector unsigned long long)
1217 	    vec_vbpermq ((__vector unsigned char) __A,
1218 			 (__vector unsigned char) perm_mask));
1219 
1220 #ifdef __LITTLE_ENDIAN__
1221   return result[1];
1222 #else
1223   return result[0];
1224 #endif
1225 }
1226 #endif /* _ARCH_PWR8 */
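
/* The 2-bit result gathers the sign bits: bit 0 holds the sign of
   element 0 and bit 1 the sign of element 1, so, for example,
   _mm_movemask_pd (_mm_set_pd (-1.0, 2.0)) returns 0x2.  */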
1227 
1228 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1229 _mm_packs_epi16 (__m128i __A, __m128i __B)
1230 {
1231   return (__m128i) vec_packs ((__v8hi) __A, (__v8hi)__B);
1232 }
1233 
1234 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1235 _mm_packs_epi32 (__m128i __A, __m128i __B)
1236 {
1237   return (__m128i) vec_packs ((__v4si)__A, (__v4si)__B);
1238 }
1239 
1240 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1241 _mm_packus_epi16 (__m128i __A, __m128i __B)
1242 {
1243   return (__m128i) vec_packsu ((__v8hi) __A, (__v8hi)__B);
1244 }
1245 
1246 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1247 _mm_unpackhi_epi8 (__m128i __A, __m128i __B)
1248 {
1249   return (__m128i) vec_mergel ((__v16qu)__A, (__v16qu)__B);
1250 }
1251 
1252 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1253 _mm_unpackhi_epi16 (__m128i __A, __m128i __B)
1254 {
1255   return (__m128i) vec_mergel ((__v8hu)__A, (__v8hu)__B);
1256 }
1257 
1258 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1259 _mm_unpackhi_epi32 (__m128i __A, __m128i __B)
1260 {
1261   return (__m128i) vec_mergel ((__v4su)__A, (__v4su)__B);
1262 }
1263 
1264 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1265 _mm_unpackhi_epi64 (__m128i __A, __m128i __B)
1266 {
1267   return (__m128i) vec_mergel ((__vector long long) __A,
1268 			       (__vector long long) __B);
1269 }
1270 
1271 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1272 _mm_unpacklo_epi8 (__m128i __A, __m128i __B)
1273 {
1274   return (__m128i) vec_mergeh ((__v16qu)__A, (__v16qu)__B);
1275 }
1276 
1277 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1278 _mm_unpacklo_epi16 (__m128i __A, __m128i __B)
1279 {
1280   return (__m128i) vec_mergeh ((__v8hi)__A, (__v8hi)__B);
1281 }
1282 
1283 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1284 _mm_unpacklo_epi32 (__m128i __A, __m128i __B)
1285 {
1286   return (__m128i) vec_mergeh ((__v4si)__A, (__v4si)__B);
1287 }
1288 
1289 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1290 _mm_unpacklo_epi64 (__m128i __A, __m128i __B)
1291 {
1292   return (__m128i) vec_mergeh ((__vector long long) __A,
1293 			       (__vector long long) __B);
1294 }
1295 
1296 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1297 _mm_add_epi8 (__m128i __A, __m128i __B)
1298 {
1299   return (__m128i) ((__v16qu)__A + (__v16qu)__B);
1300 }
1301 
1302 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1303 _mm_add_epi16 (__m128i __A, __m128i __B)
1304 {
1305   return (__m128i) ((__v8hu)__A + (__v8hu)__B);
1306 }
1307 
1308 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1309 _mm_add_epi32 (__m128i __A, __m128i __B)
1310 {
1311   return (__m128i) ((__v4su)__A + (__v4su)__B);
1312 }
1313 
1314 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1315 _mm_add_epi64 (__m128i __A, __m128i __B)
1316 {
1317   return (__m128i) ((__v2du)__A + (__v2du)__B);
1318 }
1319 
1320 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1321 _mm_adds_epi8 (__m128i __A, __m128i __B)
1322 {
1323   return (__m128i) vec_adds ((__v16qi)__A, (__v16qi)__B);
1324 }
1325 
1326 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1327 _mm_adds_epi16 (__m128i __A, __m128i __B)
1328 {
1329   return (__m128i) vec_adds ((__v8hi)__A, (__v8hi)__B);
1330 }
1331 
1332 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1333 _mm_adds_epu8 (__m128i __A, __m128i __B)
1334 {
1335   return (__m128i) vec_adds ((__v16qu)__A, (__v16qu)__B);
1336 }
1337 
1338 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1339 _mm_adds_epu16 (__m128i __A, __m128i __B)
1340 {
1341   return (__m128i) vec_adds ((__v8hu)__A, (__v8hu)__B);
1342 }
1343 
1344 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1345 _mm_sub_epi8 (__m128i __A, __m128i __B)
1346 {
1347   return (__m128i) ((__v16qu)__A - (__v16qu)__B);
1348 }
1349 
1350 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1351 _mm_sub_epi16 (__m128i __A, __m128i __B)
1352 {
1353   return (__m128i) ((__v8hu)__A - (__v8hu)__B);
1354 }
1355 
1356 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1357 _mm_sub_epi32 (__m128i __A, __m128i __B)
1358 {
1359   return (__m128i) ((__v4su)__A - (__v4su)__B);
1360 }
1361 
1362 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1363 _mm_sub_epi64 (__m128i __A, __m128i __B)
1364 {
1365   return (__m128i) ((__v2du)__A - (__v2du)__B);
1366 }
1367 
1368 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1369 _mm_subs_epi8 (__m128i __A, __m128i __B)
1370 {
1371   return (__m128i) vec_subs ((__v16qi)__A, (__v16qi)__B);
1372 }
1373 
1374 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1375 _mm_subs_epi16 (__m128i __A, __m128i __B)
1376 {
1377   return (__m128i) vec_subs ((__v8hi)__A, (__v8hi)__B);
1378 }
1379 
1380 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1381 _mm_subs_epu8 (__m128i __A, __m128i __B)
1382 {
1383   return (__m128i) vec_subs ((__v16qu)__A, (__v16qu)__B);
1384 }
1385 
1386 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1387 _mm_subs_epu16 (__m128i __A, __m128i __B)
1388 {
1389   return (__m128i) vec_subs ((__v8hu)__A, (__v8hu)__B);
1390 }
1391 
1392 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1393 _mm_madd_epi16 (__m128i __A, __m128i __B)
1394 {
1395   __vector signed int zero = {0, 0, 0, 0};
1396 
1397   return (__m128i) vec_vmsumshm ((__v8hi)__A, (__v8hi)__B, zero);
1398 }
1399 
1400 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1401 _mm_mulhi_epi16 (__m128i __A, __m128i __B)
1402 {
1403   __vector signed int w0, w1;
1404 
1405   __vector unsigned char xform1 = {
1406 #ifdef __LITTLE_ENDIAN__
1407       0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
1408       0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
1409 #else
1410       0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
1411       0x08, 0x09, 0x18, 0x19,  0x0C, 0x0D, 0x1C, 0x1D
1412 #endif
1413     };
1414 
1415   w0 = vec_vmulesh ((__v8hi)__A, (__v8hi)__B);
1416   w1 = vec_vmulosh ((__v8hi)__A, (__v8hi)__B);
1417   return (__m128i) vec_perm (w0, w1, xform1);
1418 }
1419 
1420 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1421 _mm_mullo_epi16 (__m128i __A, __m128i __B)
1422 {
1423   return (__m128i) ((__v8hi)__A * (__v8hi)__B);
1424 }
1425 
1426 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1427 _mm_mul_su32 (__m64 __A, __m64 __B)
1428 {
1429   unsigned int a = __A;
1430   unsigned int b = __B;
1431 
1432   return ((__m64)a * (__m64)b);
1433 }
1434 
1435 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1436 _mm_mul_epu32 (__m128i __A, __m128i __B)
1437 {
1438 #if __GNUC__ < 8
1439   __v2du result;
1440 
1441 #ifdef __LITTLE_ENDIAN__
1442   /* VMX Vector Multiply Odd Unsigned Word.  */
1443   __asm__(
1444       "vmulouw %0,%1,%2"
1445       : "=v" (result)
1446       : "v" (__A), "v" (__B)
1447       : );
1448 #else
1449   /* VMX Vector Multiply Even Unsigned Word.  */
1450   __asm__(
1451       "vmuleuw %0,%1,%2"
1452       : "=v" (result)
1453       : "v" (__A), "v" (__B)
1454       : );
1455 #endif
1456   return (__m128i) result;
1457 #else
1458   return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B);
1459 #endif
1460 }
1461 
1462 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1463 _mm_slli_epi16 (__m128i __A, int __B)
1464 {
1465   __v8hu lshift;
1466   __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };
1467 
1468   if (__B >= 0 && __B < 16)
1469     {
1470       if (__builtin_constant_p(__B))
1471 	lshift = (__v8hu) vec_splat_s16(__B);
1472       else
1473 	lshift = vec_splats ((unsigned short) __B);
1474 
1475       result = vec_sl ((__v8hi) __A, lshift);
1476     }
1477 
1478   return (__m128i) result;
1479 }
1480 
1481 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1482 _mm_slli_epi32 (__m128i __A, int __B)
1483 {
1484   __v4su lshift;
1485   __v4si result = { 0, 0, 0, 0 };
1486 
1487   if (__B >= 0 && __B < 32)
1488     {
1489       if (__builtin_constant_p(__B) && __B < 16)
1490 	lshift = (__v4su) vec_splat_s32(__B);
1491       else
1492 	lshift = vec_splats ((unsigned int) __B);
1493 
1494       result = vec_sl ((__v4si) __A, lshift);
1495     }
1496 
1497   return (__m128i) result;
1498 }
1499 
1500 #ifdef _ARCH_PWR8
1501 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1502 _mm_slli_epi64 (__m128i __A, int __B)
1503 {
1504   __v2du lshift;
1505   __v2di result = { 0, 0 };
1506 
1507   if (__B >= 0 && __B < 64)
1508     {
1509       if (__builtin_constant_p(__B) && __B < 16)
1510 	lshift = (__v2du) vec_splat_s32(__B);
1511       else
1512 	lshift = (__v2du) vec_splats ((unsigned int) __B);
1513 
1514       result = vec_sl ((__v2di) __A, lshift);
1515     }
1516 
1517   return (__m128i) result;
1518 }
1519 #endif
1520 
1521 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1522 _mm_srai_epi16 (__m128i __A, int __B)
1523 {
1524   __v8hu rshift = { 15, 15, 15, 15, 15, 15, 15, 15 };
1525   __v8hi result;
1526 
1527   if (__B < 16)
1528     {
1529       if (__builtin_constant_p(__B))
1530 	rshift = (__v8hu) vec_splat_s16(__B);
1531       else
1532 	rshift = vec_splats ((unsigned short) __B);
1533     }
1534   result = vec_sra ((__v8hi) __A, rshift);
1535 
1536   return (__m128i) result;
1537 }
1538 
1539 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1540 _mm_srai_epi32 (__m128i __A, int __B)
1541 {
1542   __v4su rshift = { 31, 31, 31, 31 };
1543   __v4si result;
1544 
1545   if (__B < 32)
1546     {
1547       if (__builtin_constant_p(__B))
1548 	{
1549 	  if (__B < 16)
1550 	      rshift = (__v4su) vec_splat_s32(__B);
1551 	    else
1552 	      rshift = (__v4su) vec_splats((unsigned int)__B);
1553 	}
1554       else
1555 	rshift = vec_splats ((unsigned int) __B);
1556     }
1557   result = vec_sra ((__v4si) __A, rshift);
1558 
1559   return (__m128i) result;
1560 }
1561 
1562 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1563 _mm_bslli_si128 (__m128i __A, const int __N)
1564 {
1565   __v16qu result;
1566   const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1567 
1568   if (__N < 16)
1569     result = vec_sld ((__v16qu) __A, zeros, __N);
1570   else
1571     result = zeros;
1572 
1573   return (__m128i) result;
1574 }
1575 
1576 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1577 _mm_bsrli_si128 (__m128i __A, const int __N)
1578 {
1579   __v16qu result;
1580   const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1581 
1582   if (__N < 16)
1583 #ifdef __LITTLE_ENDIAN__
1584     if (__builtin_constant_p(__N))
1585       /* Would like to use Vector Shift Left Double by Octet
1586 	 Immediate here to use the immediate form and avoid
1587 	 load of __N * 8 value into a separate VR.  */
1588       result = vec_sld (zeros, (__v16qu) __A, (16 - __N));
1589     else
1590 #endif
1591       {
1592 	__v16qu shift = vec_splats((unsigned char)(__N*8));
1593 #ifdef __LITTLE_ENDIAN__
1594 	result = vec_sro ((__v16qu)__A, shift);
1595 #else
1596 	result = vec_slo ((__v16qu)__A, shift);
1597 #endif
1598       }
1599   else
1600     result = zeros;
1601 
1602   return (__m128i) result;
1603 }
1604 
1605 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1606 _mm_srli_si128 (__m128i __A, const int __N)
1607 {
1608   return _mm_bsrli_si128 (__A, __N);
1609 }
1610 
1611 extern __inline  __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1612 _mm_slli_si128 (__m128i __A, const int _imm5)
1613 {
1614   __v16qu result;
1615   const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1616 
1617   if (_imm5 < 16)
1618 #ifdef __LITTLE_ENDIAN__
1619     result = vec_sld ((__v16qu) __A, zeros, _imm5);
1620 #else
1621     result = vec_sld (zeros, (__v16qu) __A, (16 - _imm5));
1622 #endif
1623   else
1624     result = zeros;
1625 
1626   return (__m128i) result;
1627 }
1628 
1629 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1630 
1631 _mm_srli_epi16 (__m128i  __A, int __B)
1632 {
1633   __v8hu rshift;
1634   __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };
1635 
1636   if (__B < 16)
1637     {
1638       if (__builtin_constant_p(__B))
1639 	rshift = (__v8hu) vec_splat_s16(__B);
1640       else
1641 	rshift = vec_splats ((unsigned short) __B);
1642 
1643       result = vec_sr ((__v8hi) __A, rshift);
1644     }
1645 
1646   return (__m128i) result;
1647 }
1648 
1649 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1650 _mm_srli_epi32 (__m128i __A, int __B)
1651 {
1652   __v4su rshift;
1653   __v4si result = { 0, 0, 0, 0 };
1654 
1655   if (__B < 32)
1656     {
1657       if (__builtin_constant_p(__B))
1658 	{
1659 	  if (__B < 16)
1660 	      rshift = (__v4su) vec_splat_s32(__B);
1661 	    else
1662 	      rshift = (__v4su) vec_splats((unsigned int)__B);
1663 	}
1664       else
1665 	rshift = vec_splats ((unsigned int) __B);
1666 
1667       result = vec_sr ((__v4si) __A, rshift);
1668     }
1669 
1670   return (__m128i) result;
1671 }
1672 
1673 #ifdef _ARCH_PWR8
1674 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1675 _mm_srli_epi64 (__m128i __A, int __B)
1676 {
1677   __v2du rshift;
1678   __v2di result = { 0, 0 };
1679 
1680   if (__B < 64)
1681     {
1682       if (__builtin_constant_p(__B))
1683 	{
1684 	  if (__B < 16)
1685 	    rshift = (__v2du) vec_splat_s32(__B);
1686 	  else
1687 	    rshift = (__v2du) vec_splats((unsigned long long)__B);
1688 	}
1689       else
1690 	rshift = (__v2du) vec_splats ((unsigned int) __B);
1691 
1692       result = vec_sr ((__v2di) __A, rshift);
1693     }
1694 
1695   return (__m128i) result;
1696 }
1697 #endif
1698 
1699 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1700 _mm_sll_epi16 (__m128i __A, __m128i __B)
1701 {
1702   __v8hu lshift;
1703   __vector __bool short shmask;
1704   const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
1705   __v8hu result;
1706 
1707 #ifdef __LITTLE_ENDIAN__
1708   lshift = vec_splat ((__v8hu) __B, 0);
1709 #else
1710   lshift = vec_splat ((__v8hu) __B, 3);
1711 #endif
1712   shmask = vec_cmple (lshift, shmax);
1713   result = vec_sl ((__v8hu) __A, lshift);
1714   result = vec_sel ((__v8hu) shmask, result, shmask);
1715 
1716   return (__m128i) result;
1717 }
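
/* A minimal sketch of the saturating behaviour above, not part of the
   SSE2 API.  It assumes _mm_set_epi32 is defined earlier in this header.
   A shift count of 16 or more (taken from the low 64 bits of the count
   operand) is not reduced modulo the element width; the shmask select
   above forces every lane to zero instead.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__example_sll_epi16_oversized_count (__m128i __A)
{
  return _mm_sll_epi16 (__A, _mm_set_epi32 (0, 0, 0, 16));  /* all lanes 0 */
}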
1718 
1719 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1720 _mm_sll_epi32 (__m128i __A, __m128i __B)
1721 {
1722   __v4su lshift;
1723   __vector __bool int shmask;
1724   const __v4su shmax = { 32, 32, 32, 32 };
1725   __v4su result;
1726 #ifdef __LITTLE_ENDIAN__
1727   lshift = vec_splat ((__v4su) __B, 0);
1728 #else
1729   lshift = vec_splat ((__v4su) __B, 1);
1730 #endif
1731   shmask = vec_cmplt (lshift, shmax);
1732   result = vec_sl ((__v4su) __A, lshift);
1733   result = vec_sel ((__v4su) shmask, result, shmask);
1734 
1735   return (__m128i) result;
1736 }
1737 
1738 #ifdef _ARCH_PWR8
1739 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1740 _mm_sll_epi64 (__m128i __A, __m128i __B)
1741 {
1742   __v2du lshift;
1743   __vector __bool long long shmask;
1744   const __v2du shmax = { 64, 64 };
1745   __v2du result;
1746 
1747   lshift = vec_splat ((__v2du) __B, 0);
1748   shmask = vec_cmplt (lshift, shmax);
1749   result = vec_sl ((__v2du) __A, lshift);
1750   result = vec_sel ((__v2du) shmask, result, shmask);
1751 
1752   return (__m128i) result;
1753 }
1754 #endif
1755 
1756 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1757 _mm_sra_epi16 (__m128i __A, __m128i __B)
1758 {
1759   const __v8hu rshmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
1760   __v8hu rshift;
1761   __v8hi result;
1762 
1763 #ifdef __LITTLE_ENDIAN__
1764   rshift = vec_splat ((__v8hu)__B, 0);
1765 #else
1766   rshift = vec_splat ((__v8hu)__B, 3);
1767 #endif
1768   rshift = vec_min (rshift, rshmax);
1769   result = vec_sra ((__v8hi) __A, rshift);
1770 
1771   return (__m128i) result;
1772 }
1773 
1774 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1775 _mm_sra_epi32 (__m128i __A, __m128i __B)
1776 {
1777   const __v4su rshmax = { 31, 31, 31, 31 };
1778   __v4su rshift;
1779   __v4si result;
1780 
1781 #ifdef __LITTLE_ENDIAN__
1782   rshift = vec_splat ((__v4su)__B, 0);
1783 #else
1784   rshift = vec_splat ((__v4su)__B, 1);
1785 #endif
1786   rshift = vec_min (rshift, rshmax);
1787   result = vec_sra ((__v4si) __A, rshift);
1788 
1789   return (__m128i) result;
1790 }
1791 
1792 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1793 _mm_srl_epi16 (__m128i __A, __m128i __B)
1794 {
1795   __v8hu rshift;
1796   __vector __bool short shmask;
1797   const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
1798   __v8hu result;
1799 
1800 #ifdef __LITTLE_ENDIAN__
1801   rshift = vec_splat ((__v8hu) __B, 0);
1802 #else
1803   rshift = vec_splat ((__v8hu) __B, 3);
1804 #endif
1805   shmask = vec_cmple (rshift, shmax);
1806   result = vec_sr ((__v8hu) __A, rshift);
1807   result = vec_sel ((__v8hu) shmask, result, shmask);
1808 
1809   return (__m128i) result;
1810 }
1811 
1812 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1813 _mm_srl_epi32 (__m128i __A, __m128i __B)
1814 {
1815   __v4su rshift;
1816   __vector __bool int shmask;
1817   const __v4su shmax = { 32, 32, 32, 32 };
1818   __v4su result;
1819 
1820 #ifdef __LITTLE_ENDIAN__
1821   rshift = vec_splat ((__v4su) __B, 0);
1822 #else
1823   rshift = vec_splat ((__v4su) __B, 1);
1824 #endif
1825   shmask = vec_cmplt (rshift, shmax);
1826   result = vec_sr ((__v4su) __A, rshift);
1827   result = vec_sel ((__v4su) shmask, result, shmask);
1828 
1829   return (__m128i) result;
1830 }
1831 
1832 #ifdef _ARCH_PWR8
1833 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1834 _mm_srl_epi64 (__m128i __A, __m128i __B)
1835 {
1836   __v2du rshift;
1837   __vector __bool long long shmask;
1838   const __v2du shmax = { 64, 64 };
1839   __v2du result;
1840 
1841   rshift = vec_splat ((__v2du) __B, 0);
1842   shmask = vec_cmplt (rshift, shmax);
1843   result = vec_sr ((__v2du) __A, rshift);
1844   result = vec_sel ((__v2du) shmask, result, shmask);
1845 
1846   return (__m128i) result;
1847 }
1848 #endif
1849 
1850 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1851 _mm_and_pd (__m128d __A, __m128d __B)
1852 {
1853   return (vec_and ((__v2df) __A, (__v2df) __B));
1854 }
1855 
1856 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1857 _mm_andnot_pd (__m128d __A, __m128d __B)
1858 {
1859   return (vec_andc ((__v2df) __B, (__v2df) __A));
1860 }
1861 
1862 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1863 _mm_or_pd (__m128d __A, __m128d __B)
1864 {
1865   return (vec_or ((__v2df) __A, (__v2df) __B));
1866 }
1867 
1868 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1869 _mm_xor_pd (__m128d __A, __m128d __B)
1870 {
1871   return (vec_xor ((__v2df) __A, (__v2df) __B));
1872 }
1873 
1874 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1875 _mm_and_si128 (__m128i __A, __m128i __B)
1876 {
1877   return (__m128i)vec_and ((__v2di) __A, (__v2di) __B);
1878 }
1879 
1880 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1881 _mm_andnot_si128 (__m128i __A, __m128i __B)
1882 {
1883   return (__m128i)vec_andc ((__v2di) __B, (__v2di) __A);
1884 }
1885 
1886 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1887 _mm_or_si128 (__m128i __A, __m128i __B)
1888 {
1889   return (__m128i)vec_or ((__v2di) __A, (__v2di) __B);
1890 }
1891 
1892 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1893 _mm_xor_si128 (__m128i __A, __m128i __B)
1894 {
1895   return (__m128i)vec_xor ((__v2di) __A, (__v2di) __B);
1896 }
1897 
1898 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1899 _mm_cmpeq_epi8 (__m128i __A, __m128i __B)
1900 {
1901   return (__m128i) vec_cmpeq ((__v16qi) __A, (__v16qi)__B);
1902 }
1903 
1904 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1905 _mm_cmpeq_epi16 (__m128i __A, __m128i __B)
1906 {
1907   return (__m128i) vec_cmpeq ((__v8hi) __A, (__v8hi)__B);
1908 }
1909 
1910 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1911 _mm_cmpeq_epi32 (__m128i __A, __m128i __B)
1912 {
1913   return (__m128i) vec_cmpeq ((__v4si) __A, (__v4si)__B);
1914 }
1915 
1916 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1917 _mm_cmplt_epi8 (__m128i __A, __m128i __B)
1918 {
1919   return (__m128i) vec_cmplt ((__v16qi) __A, (__v16qi)__B);
1920 }
1921 
1922 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1923 _mm_cmplt_epi16 (__m128i __A, __m128i __B)
1924 {
1925   return (__m128i) vec_cmplt ((__v8hi) __A, (__v8hi)__B);
1926 }
1927 
1928 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1929 _mm_cmplt_epi32 (__m128i __A, __m128i __B)
1930 {
1931   return (__m128i) vec_cmplt ((__v4si) __A, (__v4si)__B);
1932 }
1933 
1934 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1935 _mm_cmpgt_epi8 (__m128i __A, __m128i __B)
1936 {
1937   return (__m128i) vec_cmpgt ((__v16qi) __A, (__v16qi)__B);
1938 }
1939 
1940 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1941 _mm_cmpgt_epi16 (__m128i __A, __m128i __B)
1942 {
1943   return (__m128i) vec_cmpgt ((__v8hi) __A, (__v8hi)__B);
1944 }
1945 
1946 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1947 _mm_cmpgt_epi32 (__m128i __A, __m128i __B)
1948 {
1949   return (__m128i) vec_cmpgt ((__v4si) __A, (__v4si)__B);
1950 }
1951 
1952 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1953 _mm_extract_epi16 (__m128i const __A, int const __N)
1954 {
1955   return (unsigned short) ((__v8hi)__A)[__N & 7];
1956 }
1957 
1958 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1959 _mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
1960 {
1961   __v8hi result = (__v8hi)__A;
1962 
1963   result [(__N & 7)] = __D;
1964 
1965   return (__m128i) result;
1966 }
1967 
1968 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1969 _mm_max_epi16 (__m128i __A, __m128i __B)
1970 {
1971   return (__m128i) vec_max ((__v8hi)__A, (__v8hi)__B);
1972 }
1973 
1974 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1975 _mm_max_epu8 (__m128i __A, __m128i __B)
1976 {
1977   return (__m128i) vec_max ((__v16qu) __A, (__v16qu)__B);
1978 }
1979 
1980 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1981 _mm_min_epi16 (__m128i __A, __m128i __B)
1982 {
1983   return (__m128i) vec_min ((__v8hi) __A, (__v8hi)__B);
1984 }
1985 
1986 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1987 _mm_min_epu8 (__m128i __A, __m128i __B)
1988 {
1989   return (__m128i) vec_min ((__v16qu) __A, (__v16qu)__B);
1990 }
1991 
1992 
1993 #ifdef _ARCH_PWR8
1994 /* Intrinsic functions that require PowerISA 2.07 minimum.  */
1995 
1996 /* Creates a 16-bit mask from the most significant bits of the 16 bytes.  */
1997 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1998 _mm_movemask_epi8 (__m128i __A)
1999 {
2000   __vector unsigned long long result;
2001   static const __vector unsigned char perm_mask =
2002     {
2003 	0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
2004 	0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00
2005     };
2006 
2007   result = ((__vector unsigned long long)
2008 	    vec_vbpermq ((__vector unsigned char) __A,
2009 			 (__vector unsigned char) perm_mask));
2010 
2011 #ifdef __LITTLE_ENDIAN__
2012   return result[1];
2013 #else
2014   return result[0];
2015 #endif
2016 }
2017 #endif /* _ARCH_PWR8 */
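
#ifdef _ARCH_PWR8
/* A minimal usage sketch, not part of the SSE2 API and only meaningful
   where the vbpermq-based _mm_movemask_epi8 above is available: each of
   the 16 returned mask bits is the most significant bit of one byte, so
   a non-zero result means at least one byte has its top bit set.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__example_any_byte_msb_set (__m128i __A)
{
  return _mm_movemask_epi8 (__A) != 0;
}
#endif /* _ARCH_PWR8 */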
2018 
2019 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2020 _mm_mulhi_epu16 (__m128i __A, __m128i __B)
2021 {
2022   __v4su w0, w1;
2023   __v16qu xform1 = {
2024 #ifdef __LITTLE_ENDIAN__
2025       0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
2026       0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
2027 #else
2028       0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
2029       0x08, 0x09, 0x18, 0x19,  0x0C, 0x0D, 0x1C, 0x1D
2030 #endif
2031     };
2032 
2033   w0 = vec_vmuleuh ((__v8hu)__A, (__v8hu)__B);
2034   w1 = vec_vmulouh ((__v8hu)__A, (__v8hu)__B);
2035   return (__m128i) vec_perm (w0, w1, xform1);
2036 }
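
/* A minimal usage sketch, not part of the SSE2 API.  It assumes
   _mm_set1_epi16 is defined earlier in this header.  Multiplying by a
   0.16 fixed-point factor and keeping only the high halves of the 32-bit
   products (as the even/odd multiplies plus vec_perm above do) scales
   each unsigned 16-bit element by __scale / 65536.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__example_scale_q16_epu16 (__m128i __A, unsigned short __scale)
{
  return _mm_mulhi_epu16 (__A, _mm_set1_epi16 ((short) __scale));
}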
2037 
2038 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2039 _mm_shufflehi_epi16 (__m128i __A, const int __mask)
2040 {
2041   unsigned long element_selector_98 = __mask & 0x03;
2042   unsigned long element_selector_BA = (__mask >> 2) & 0x03;
2043   unsigned long element_selector_DC = (__mask >> 4) & 0x03;
2044   unsigned long element_selector_FE = (__mask >> 6) & 0x03;
2045   static const unsigned short permute_selectors[4] =
2046     {
2047 #ifdef __LITTLE_ENDIAN__
2048 	      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
2049 #else
2050 	      0x0809, 0x0A0B, 0x0C0D, 0x0E0F
2051 #endif
2052     };
2053   __v2du pmask =
2054 #ifdef __LITTLE_ENDIAN__
2055       { 0x1716151413121110UL,  0UL};
2056 #else
2057       { 0x1011121314151617UL,  0UL};
2058 #endif
2059   __m64_union t;
2060   __v2du a, r;
2061 
2062   t.as_short[0] = permute_selectors[element_selector_98];
2063   t.as_short[1] = permute_selectors[element_selector_BA];
2064   t.as_short[2] = permute_selectors[element_selector_DC];
2065   t.as_short[3] = permute_selectors[element_selector_FE];
2066   pmask[1] = t.as_m64;
2067   a = (__v2du)__A;
2068   r = vec_perm (a, a, (__vector unsigned char)pmask);
2069   return (__m128i) r;
2070 }
2071 
2072 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2073 _mm_shufflelo_epi16 (__m128i __A, const int __mask)
2074 {
2075   unsigned long element_selector_10 = __mask & 0x03;
2076   unsigned long element_selector_32 = (__mask >> 2) & 0x03;
2077   unsigned long element_selector_54 = (__mask >> 4) & 0x03;
2078   unsigned long element_selector_76 = (__mask >> 6) & 0x03;
2079   static const unsigned short permute_selectors[4] =
2080     {
2081 #ifdef __LITTLE_ENDIAN__
2082 	      0x0100, 0x0302, 0x0504, 0x0706
2083 #else
2084 	      0x0001, 0x0203, 0x0405, 0x0607
2085 #endif
2086     };
2087   __v2du pmask =
2088 #ifdef __LITTLE_ENDIAN__
2089                  { 0UL,  0x1f1e1d1c1b1a1918UL};
2090 #else
2091                  { 0UL,  0x18191a1b1c1d1e1fUL};
2092 #endif
2093   __m64_union t;
2094   __v2du a, r;
2095   t.as_short[0] = permute_selectors[element_selector_10];
2096   t.as_short[1] = permute_selectors[element_selector_32];
2097   t.as_short[2] = permute_selectors[element_selector_54];
2098   t.as_short[3] = permute_selectors[element_selector_76];
2099   pmask[0] = t.as_m64;
2100   a = (__v2du)__A;
2101   r = vec_perm (a, a, (__vector unsigned char)pmask);
2102   return (__m128i) r;
2103 }
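
/* A minimal usage sketch, not part of the SSE2 API: broadcast halfword 0
   across the four low halfwords while leaving the high quadword alone.
   The selector 0 is what the _MM_SHUFFLE macro from <xmmintrin.h> yields
   for _MM_SHUFFLE (0, 0, 0, 0).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__example_splat_low_halfword (__m128i __A)
{
  return _mm_shufflelo_epi16 (__A, 0x00);
}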
2104 
2105 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2106 _mm_shuffle_epi32 (__m128i __A, const int __mask)
2107 {
2108   unsigned long element_selector_10 = __mask & 0x03;
2109   unsigned long element_selector_32 = (__mask >> 2) & 0x03;
2110   unsigned long element_selector_54 = (__mask >> 4) & 0x03;
2111   unsigned long element_selector_76 = (__mask >> 6) & 0x03;
2112   static const unsigned int permute_selectors[4] =
2113     {
2114 #ifdef __LITTLE_ENDIAN__
2115 	0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
2116 #else
2117       0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
2118 #endif
2119     };
2120   __v4su t;
2121 
2122   t[0] = permute_selectors[element_selector_10];
2123   t[1] = permute_selectors[element_selector_32];
2124   t[2] = permute_selectors[element_selector_54] + 0x10101010;
2125   t[3] = permute_selectors[element_selector_76] + 0x10101010;
2126   return (__m128i)vec_perm ((__v4si) __A, (__v4si)__A, (__vector unsigned char)t);
2127 }
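
/* A minimal usage sketch, not part of the SSE2 API: reverse the four
   32-bit elements.  The selector 0x1B (_MM_SHUFFLE (0, 1, 2, 3) with the
   macro from <xmmintrin.h>) places element 3 in position 0, element 2 in
   position 1, and so on, via the vec_perm table built above.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__example_reverse_epi32 (__m128i __A)
{
  return _mm_shuffle_epi32 (__A, 0x1B);
}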
2128 
2129 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2130 _mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
2131 {
2132   __v2du hibit = { 0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
2133   __v16qu mask, tmp;
2134   __m128i_u *p = (__m128i_u*)__C;
2135 
2136   tmp = (__v16qu)_mm_loadu_si128(p);
2137   mask = (__v16qu)vec_cmpgt ((__v16qu)__B, (__v16qu)hibit);
2138   tmp = vec_sel (tmp, (__v16qu)__A, mask);
2139   _mm_storeu_si128 (p, (__m128i)tmp);
2140 }
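
/* A minimal usage sketch, not part of the SSE2 API.  It assumes
   _mm_setzero_si128 is defined earlier in this header.  Only mask bytes
   with their most significant bit set are written, so inverting a
   compare-with-zero stores just the non-zero bytes of __A and leaves the
   other bytes of the destination untouched.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__example_store_nonzero_bytes (__m128i __A, char *__P)
{
  __m128i __zero_bytes = _mm_cmpeq_epi8 (__A, _mm_setzero_si128 ());
  __m128i __all_ones = _mm_cmpeq_epi8 (__zero_bytes, __zero_bytes);
  _mm_maskmoveu_si128 (__A, _mm_xor_si128 (__zero_bytes, __all_ones), __P);
}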
2141 
2142 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2143 _mm_avg_epu8 (__m128i __A, __m128i __B)
2144 {
2145   return (__m128i) vec_avg ((__v16qu)__A, (__v16qu)__B);
2146 }
2147 
2148 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2149 _mm_avg_epu16 (__m128i __A, __m128i __B)
2150 {
2151   return (__m128i) vec_avg ((__v8hu)__A, (__v8hu)__B);
2152 }
2153 
2154 
2155 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2156 _mm_sad_epu8 (__m128i __A, __m128i __B)
2157 {
2158   __v16qu a, b;
2159   __v16qu vmin, vmax, vabsdiff;
2160   __v4si vsum;
2161   const __v4su zero = { 0, 0, 0, 0 };
2162   __v4si result;
2163 
2164   a = (__v16qu) __A;
2165   b = (__v16qu) __B;
2166   vmin = vec_min (a, b);
2167   vmax = vec_max (a, b);
2168   vabsdiff = vec_sub (vmax, vmin);
2169   /* Sum four groups of bytes into integers.  */
2170   vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
2171   /* Sum across four integers with two integer results.  */
2172   result = vec_sum2s (vsum, (__vector signed int) zero);
2173   /* Rotate the sums into the correct position.  */
2174 #ifdef __LITTLE_ENDIAN__
2175   result = vec_sld (result, result, 4);
2176 #else
2177   result = vec_sld (result, result, 6);
2178 #endif
2180   return (__m128i) result;
2181 }
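
/* A minimal usage sketch, not part of the SSE2 API: the total sum of
   absolute byte differences.  _mm_sad_epu8 leaves one partial sum (at
   most 8 * 255, so it fits in 16 bits) in the low half of each 64-bit
   lane; halfword lanes 0 and 4 therefore hold the two partial sums.  */
extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__example_sad_total (__m128i __A, __m128i __B)
{
  __m128i __sums = _mm_sad_epu8 (__A, __B);
  return (unsigned int) _mm_extract_epi16 (__sums, 0)
         + (unsigned int) _mm_extract_epi16 (__sums, 4);
}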
2182 
2183 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2184 _mm_stream_si32 (int *__A, int __B)
2185 {
2186   /* Use the data cache block touch for store transient.  */
2187   __asm__ (
2188     "dcbtstt 0,%0"
2189     :
2190     : "b" (__A)
2191     : "memory"
2192   );
2193   *__A = __B;
2194 }
2195 
2196 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2197 _mm_stream_si64 (long long int *__A, long long int __B)
2198 {
2199   /* Use the data cache block touch for store transient.  */
2200   __asm__ (
2201     "dcbtstt 0,%0"
2202     :
2203     : "b" (__A)
2204     : "memory"
2205   );
2206   *__A = __B;
2207 }
2208 
2209 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2210 _mm_stream_si128 (__m128i *__A, __m128i __B)
2211 {
2212   /* Use the data cache block touch for store transient.  */
2213   __asm__ (
2214     "dcbtstt 0,%0"
2215     :
2216     : "b" (__A)
2217     : "memory"
2218   );
2219   *__A = __B;
2220 }
2221 
2222 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2223 _mm_stream_pd (double *__A, __m128d __B)
2224 {
2225   /* Use the data cache block touch for store transient.  */
2226   __asm__ (
2227     "dcbtstt 0,%0"
2228     :
2229     : "b" (__A)
2230     : "memory"
2231   );
2232   *(__m128d*)__A = __B;
2233 }
2234 
2235 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2236 _mm_clflush (void const *__A)
2237 {
2238   /* Use the data cache block flush.  */
2239   __asm__ (
2240     "dcbf 0,%0"
2241     :
2242     : "b" (__A)
2243     : "memory"
2244   );
2245 }
2246 
2247 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2248 _mm_lfence (void)
2249 {
2250   /* Use light weight sync for load to load ordering.  */
2251   __atomic_thread_fence (__ATOMIC_RELEASE);
2252 }
2253 
2254 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2255 _mm_mfence (void)
2256 {
2257   /* Use heavy weight sync for any to any ordering.  */
2258   __atomic_thread_fence (__ATOMIC_SEQ_CST);
2259 }
2260 
2261 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2262 _mm_cvtsi32_si128 (int __A)
2263 {
2264   return _mm_set_epi32 (0, 0, 0, __A);
2265 }
2266 
2267 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2268 _mm_cvtsi64_si128 (long long __A)
2269 {
2270   return __extension__ (__m128i)(__v2di){ __A, 0LL };
2271 }
2272 
2273 /* Microsoft intrinsic.  */
2274 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2275 _mm_cvtsi64x_si128 (long long __A)
2276 {
2277   return __extension__ (__m128i)(__v2di){ __A, 0LL };
2278 }
2279 
2280 /* Casts between various SP, DP, INT vector types.  Note that these do no
2281    conversion of values, they just change the type.  */
2282 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2283 _mm_castpd_ps(__m128d __A)
2284 {
2285   return (__m128) __A;
2286 }
2287 
2288 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2289 _mm_castpd_si128(__m128d __A)
2290 {
2291   return (__m128i) __A;
2292 }
2293 
2294 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2295 _mm_castps_pd(__m128 __A)
2296 {
2297   return (__m128d) __A;
2298 }
2299 
2300 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2301 _mm_castps_si128(__m128 __A)
2302 {
2303   return (__m128i) __A;
2304 }
2305 
2306 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2307 _mm_castsi128_ps(__m128i __A)
2308 {
2309   return (__m128) __A;
2310 }
2311 
2312 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2313 _mm_castsi128_pd(__m128i __A)
2314 {
2315   return (__m128d) __A;
2316 }
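
/* A minimal sketch of the note above, not part of the SSE2 API.  It
   assumes _mm_set_epi32 is defined earlier in this header.  The casts
   move no data, so clearing the sign bit of each double can be done
   entirely with the integer AND defined above.  */
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
__example_abs_pd (__m128d __A)
{
  const __m128i __sign_off = _mm_set_epi32 (0x7fffffff, -1, 0x7fffffff, -1);
  return _mm_castsi128_pd (_mm_and_si128 (_mm_castpd_si128 (__A), __sign_off));
}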
2317 
2318 #endif /* EMMINTRIN_H_ */
2319