xref: /freebsd/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/xmmintrin.h (revision 6ba2210ee039f2f12878c217bcf058e9c8b26b29)
1 /*===---- xmmintrin.h - Implementation of SSE intrinsics on PowerPC --------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 /* Implemented from the specification included in the Intel C++ Compiler
11    User Guide and Reference, version 9.0.  */
12 
13 #ifndef NO_WARN_X86_INTRINSICS
14 /* This header file is to help porting code using Intel intrinsics
15    explicitly from x86_64 to powerpc64/powerpc64le.
16 
17    Since X86 SSE intrinsics mainly handles __m128 type, PowerPC
18    VMX/VSX ISA is a good match for vector float SIMD operations.
19    However scalar float operations in vector (XMM) registers require
20    the POWER8 VSX ISA (2.07) level. There are differences for data
21    format and placement of float scalars in the vector register, which
22    require extra steps to match SSE scalar float semantics on POWER.
23 
   It should be noted that there are significant differences between
   X86_64's MXCSR and PowerISA's FPSCR/VSCR registers. It's recommended
   to use portable <fenv.h> instead of accessing MXCSR directly.
27 
28    Most SSE scalar float intrinsic operations can be performed more
29    efficiently as C language float scalar operations or optimized to
30    use vector SIMD operations. We recommend this for new applications. */
31 #error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
32 #endif
33 
34 #ifndef _XMMINTRIN_H_INCLUDED
35 #define _XMMINTRIN_H_INCLUDED
36 
37 #if defined(__linux__) && defined(__ppc64__)
38 
/* Build the 8-bit immediate used by the four-element shuffle intrinsics.
   Each argument is a 2-bit element selector; the first argument occupies
   bits 7:6 and the last argument bits 1:0.  */
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
  (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
41 
42 #include <altivec.h>
43 
44 /* Avoid collisions between altivec.h and strict adherence to C++ and
45    C11 standards.  This should eventually be done inside altivec.h itself,
46    but only after testing a full distro build.  */
47 #if defined(__STRICT_ANSI__) && (defined(__cplusplus) || \
48 				 (defined(__STDC_VERSION__) &&	\
49 				  __STDC_VERSION__ >= 201112L))
50 #undef vector
51 #undef pixel
52 #undef bool
53 #endif
54 
55 /* We need type definitions from the MMX header file.  */
56 #include <mmintrin.h>
57 
58 /* Get _mm_malloc () and _mm_free ().  */
59 #if __STDC_HOSTED__
60 #include <mm_malloc.h>
61 #endif
62 
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  __m128 is the public SSE
   type: a vector of floats that is allowed to alias other types.  */
typedef vector float __m128 __attribute__((__may_alias__));

/* Unaligned version of the same type: same element type and aliasing
   rules, but declared with 1-byte alignment.  */
typedef vector float __m128_u __attribute__((__may_alias__, __aligned__(1)));

/* Internal data types for implementing the intrinsics.  __v4sf is the
   plain vector-of-float type, without the may_alias attribute.  */
typedef vector float __v4sf;
72 
73 /* Create an undefined vector.  */
74 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
75 _mm_undefined_ps (void)
76 {
77   __m128 __Y = __Y;
78   return __Y;
79 }
80 
81 /* Create a vector of zeros.  */
82 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
83 _mm_setzero_ps (void)
84 {
85   return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
86 }
87 
88 /* Load four SPFP values from P.  The address must be 16-byte aligned.  */
89 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
90 _mm_load_ps (float const *__P)
91 {
92   return ((__m128)vec_ld(0, (__v4sf*)__P));
93 }
94 
95 /* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
96 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
97 _mm_loadu_ps (float const *__P)
98 {
99   return (vec_vsx_ld(0, __P));
100 }
101 
102 /* Load four SPFP values in reverse order.  The address must be aligned.  */
103 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
104 _mm_loadr_ps (float const *__P)
105 {
106   __v4sf   __tmp;
107   __m128 result;
108   static const __vector unsigned char permute_vector =
109     { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
110 	0x17, 0x10, 0x11, 0x12, 0x13 };
111 
112   __tmp = vec_ld (0, (__v4sf *) __P);
113   result = (__m128) vec_perm (__tmp, __tmp, permute_vector);
114   return result;
115 }
116 
117 /* Create a vector with all four elements equal to F.  */
118 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
119 _mm_set1_ps (float __F)
120 {
121   return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
122 }
123 
124 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
125 _mm_set_ps1 (float __F)
126 {
127   return _mm_set1_ps (__F);
128 }
129 
130 /* Create the vector [Z Y X W].  */
131 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
132 _mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
133 {
134   return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
135 }
136 
137 /* Create the vector [W X Y Z].  */
138 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
139 _mm_setr_ps (float __Z, float __Y, float __X, float __W)
140 {
141   return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
142 }
143 
144 /* Store four SPFP values.  The address must be 16-byte aligned.  */
145 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
146 _mm_store_ps (float *__P, __m128 __A)
147 {
148   vec_st((__v4sf)__A, 0, (__v4sf*)__P);
149 }
150 
151 /* Store four SPFP values.  The address need not be 16-byte aligned.  */
152 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
153 _mm_storeu_ps (float *__P, __m128 __A)
154 {
155   *(__m128_u *)__P = __A;
156 }
157 
158 /* Store four SPFP values in reverse order.  The address must be aligned.  */
159 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
160 _mm_storer_ps (float *__P, __m128 __A)
161 {
162   __v4sf   __tmp;
163   static const __vector unsigned char permute_vector =
164     { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
165 	0x17, 0x10, 0x11, 0x12, 0x13 };
166 
167   __tmp = (__m128) vec_perm (__A, __A, permute_vector);
168 
169   _mm_store_ps (__P, __tmp);
170 }
171 
172 /* Store the lower SPFP value across four words.  */
173 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
174 _mm_store1_ps (float *__P, __m128 __A)
175 {
176   __v4sf __va = vec_splat((__v4sf)__A, 0);
177   _mm_store_ps (__P, __va);
178 }
179 
180 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
181 _mm_store_ps1 (float *__P, __m128 __A)
182 {
183   _mm_store1_ps (__P, __A);
184 }
185 
186 /* Create a vector with element 0 as F and the rest zero.  */
187 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
188 _mm_set_ss (float __F)
189 {
190   return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
191 }
192 
193 /* Sets the low SPFP value of A from the low value of B.  */
194 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
195 _mm_move_ss (__m128 __A, __m128 __B)
196 {
197   static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
198 
199   return (vec_sel ((__v4sf)__A, (__v4sf)__B, mask));
200 }
201 
202 /* Create a vector with element 0 as *P and the rest zero.  */
203 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
204 _mm_load_ss (float const *__P)
205 {
206   return _mm_set_ss (*__P);
207 }
208 
209 /* Stores the lower SPFP value.  */
210 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
211 _mm_store_ss (float *__P, __m128 __A)
212 {
213   *__P = ((__v4sf)__A)[0];
214 }
215 
216 /* Perform the respective operation on the lower SPFP (single-precision
217    floating-point) values of A and B; the upper three SPFP values are
218    passed through from A.  */
219 
220 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
221 _mm_add_ss (__m128 __A, __m128 __B)
222 {
223 #ifdef _ARCH_PWR7
224   __m128 a, b, c;
225   static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
226   /* PowerISA VSX does not allow partial (for just lower double)
227      results. So to insure we don't generate spurious exceptions
228      (from the upper double values) we splat the lower double
229      before we to the operation.  */
230   a = vec_splat (__A, 0);
231   b = vec_splat (__B, 0);
232   c = a + b;
233   /* Then we merge the lower float result with the original upper
234      float elements from __A.  */
235   return (vec_sel (__A, c, mask));
236 #else
237   __A[0] = __A[0] + __B[0];
238   return (__A);
239 #endif
240 }
241 
242 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
243 _mm_sub_ss (__m128 __A, __m128 __B)
244 {
245 #ifdef _ARCH_PWR7
246   __m128 a, b, c;
247   static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
248   /* PowerISA VSX does not allow partial (for just lower double)
249      results. So to insure we don't generate spurious exceptions
250      (from the upper double values) we splat the lower double
251      before we to the operation.  */
252   a = vec_splat (__A, 0);
253   b = vec_splat (__B, 0);
254   c = a - b;
255   /* Then we merge the lower float result with the original upper
256      float elements from __A.  */
257   return (vec_sel (__A, c, mask));
258 #else
259   __A[0] = __A[0] - __B[0];
260   return (__A);
261 #endif
262 }
263 
264 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
265 _mm_mul_ss (__m128 __A, __m128 __B)
266 {
267 #ifdef _ARCH_PWR7
268   __m128 a, b, c;
269   static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
270   /* PowerISA VSX does not allow partial (for just lower double)
271      results. So to insure we don't generate spurious exceptions
272      (from the upper double values) we splat the lower double
273      before we to the operation.  */
274   a = vec_splat (__A, 0);
275   b = vec_splat (__B, 0);
276   c = a * b;
277   /* Then we merge the lower float result with the original upper
278      float elements from __A.  */
279   return (vec_sel (__A, c, mask));
280 #else
281   __A[0] = __A[0] * __B[0];
282   return (__A);
283 #endif
284 }
285 
286 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
287 _mm_div_ss (__m128 __A, __m128 __B)
288 {
289 #ifdef _ARCH_PWR7
290   __m128 a, b, c;
291   static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
292   /* PowerISA VSX does not allow partial (for just lower double)
293      results. So to insure we don't generate spurious exceptions
294      (from the upper double values) we splat the lower double
295      before we to the operation.  */
296   a = vec_splat (__A, 0);
297   b = vec_splat (__B, 0);
298   c = a / b;
299   /* Then we merge the lower float result with the original upper
300      float elements from __A.  */
301   return (vec_sel (__A, c, mask));
302 #else
303   __A[0] = __A[0] / __B[0];
304   return (__A);
305 #endif
306 }
307 
308 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
309 _mm_sqrt_ss (__m128 __A)
310 {
311   __m128 a, c;
312   static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
313   /* PowerISA VSX does not allow partial (for just lower double)
314    * results. So to insure we don't generate spurious exceptions
315    * (from the upper double values) we splat the lower double
316    * before we to the operation. */
317   a = vec_splat (__A, 0);
318   c = vec_sqrt (a);
319   /* Then we merge the lower float result with the original upper
320    * float elements from __A.  */
321   return (vec_sel (__A, c, mask));
322 }
323 
324 /* Perform the respective operation on the four SPFP values in A and B.  */
325 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
326 _mm_add_ps (__m128 __A, __m128 __B)
327 {
328   return (__m128) ((__v4sf)__A + (__v4sf)__B);
329 }
330 
331 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
332 _mm_sub_ps (__m128 __A, __m128 __B)
333 {
334   return (__m128) ((__v4sf)__A - (__v4sf)__B);
335 }
336 
337 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
338 _mm_mul_ps (__m128 __A, __m128 __B)
339 {
340   return (__m128) ((__v4sf)__A * (__v4sf)__B);
341 }
342 
343 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
344 _mm_div_ps (__m128 __A, __m128 __B)
345 {
346   return (__m128) ((__v4sf)__A / (__v4sf)__B);
347 }
348 
349 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
350 _mm_sqrt_ps (__m128 __A)
351 {
352   return (vec_sqrt ((__v4sf)__A));
353 }
354 
355 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
356 _mm_rcp_ps (__m128 __A)
357 {
358   return (vec_re ((__v4sf)__A));
359 }
360 
361 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
362 _mm_rsqrt_ps (__m128 __A)
363 {
364   return (vec_rsqrte (__A));
365 }
366 
367 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
368 _mm_rcp_ss (__m128 __A)
369 {
370   __m128 a, c;
371   static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
372   /* PowerISA VSX does not allow partial (for just lower double)
373    * results. So to insure we don't generate spurious exceptions
374    * (from the upper double values) we splat the lower double
375    * before we to the operation. */
376   a = vec_splat (__A, 0);
377   c = _mm_rcp_ps (a);
378   /* Then we merge the lower float result with the original upper
379    * float elements from __A.  */
380   return (vec_sel (__A, c, mask));
381 }
382 
383 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
384 _mm_rsqrt_ss (__m128 __A)
385 {
386   __m128 a, c;
387   static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
388   /* PowerISA VSX does not allow partial (for just lower double)
389    * results. So to insure we don't generate spurious exceptions
390    * (from the upper double values) we splat the lower double
391    * before we to the operation. */
392   a = vec_splat (__A, 0);
393   c = vec_rsqrte (a);
394   /* Then we merge the lower float result with the original upper
395    * float elements from __A.  */
396   return (vec_sel (__A, c, mask));
397 }
398 
399 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
400 _mm_min_ss (__m128 __A, __m128 __B)
401 {
402   __v4sf a, b, c;
403   static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
404   /* PowerISA VSX does not allow partial (for just lower float)
405    * results. So to insure we don't generate spurious exceptions
406    * (from the upper float values) we splat the lower float
407    * before we to the operation. */
408   a = vec_splat ((__v4sf)__A, 0);
409   b = vec_splat ((__v4sf)__B, 0);
410   c = vec_min (a, b);
411   /* Then we merge the lower float result with the original upper
412    * float elements from __A.  */
413   return (vec_sel ((__v4sf)__A, c, mask));
414 }
415 
416 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
417 _mm_max_ss (__m128 __A, __m128 __B)
418 {
419   __v4sf a, b, c;
420   static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
421   /* PowerISA VSX does not allow partial (for just lower float)
422    * results. So to insure we don't generate spurious exceptions
423    * (from the upper float values) we splat the lower float
424    * before we to the operation. */
425   a = vec_splat (__A, 0);
426   b = vec_splat (__B, 0);
427   c = vec_max (a, b);
428   /* Then we merge the lower float result with the original upper
429    * float elements from __A.  */
430   return (vec_sel ((__v4sf)__A, c, mask));
431 }
432 
433 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
434 _mm_min_ps (__m128 __A, __m128 __B)
435 {
436   __vector __bool int m = vec_cmpgt ((__v4sf) __B, (__v4sf) __A);
437   return vec_sel (__B, __A, m);
438 }
439 
440 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
441 _mm_max_ps (__m128 __A, __m128 __B)
442 {
443   __vector __bool int m = vec_cmpgt ((__v4sf) __A, (__v4sf) __B);
444   return vec_sel (__B, __A, m);
445 }
446 
447 /* Perform logical bit-wise operations on 128-bit values.  */
448 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
449 _mm_and_ps (__m128 __A, __m128 __B)
450 {
451   return ((__m128)vec_and ((__v4sf)__A, (__v4sf)__B));
452 //  return __builtin_ia32_andps (__A, __B);
453 }
454 
455 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
456 _mm_andnot_ps (__m128 __A, __m128 __B)
457 {
458   return ((__m128)vec_andc ((__v4sf)__B, (__v4sf)__A));
459 }
460 
461 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
462 _mm_or_ps (__m128 __A, __m128 __B)
463 {
464   return ((__m128)vec_or ((__v4sf)__A, (__v4sf)__B));
465 }
466 
467 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
468 _mm_xor_ps (__m128 __A, __m128 __B)
469 {
470   return ((__m128)vec_xor ((__v4sf)__A, (__v4sf)__B));
471 }
472 
473 /* Perform a comparison on the four SPFP values of A and B.  For each
474    element, if the comparison is true, place a mask of all ones in the
475    result, otherwise a mask of zeros.  */
476 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
477 _mm_cmpeq_ps (__m128 __A, __m128 __B)
478 {
479   return ((__m128)vec_cmpeq ((__v4sf)__A,(__v4sf) __B));
480 }
481 
482 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
483 _mm_cmplt_ps (__m128 __A, __m128 __B)
484 {
485   return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
486 }
487 
488 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
489 _mm_cmple_ps (__m128 __A, __m128 __B)
490 {
491   return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
492 }
493 
494 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
495 _mm_cmpgt_ps (__m128 __A, __m128 __B)
496 {
497   return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
498 }
499 
500 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
501 _mm_cmpge_ps (__m128 __A, __m128 __B)
502 {
503   return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
504 }
505 
506 extern __inline  __m128  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
507 _mm_cmpneq_ps (__m128  __A, __m128  __B)
508 {
509   __v4sf temp = (__v4sf ) vec_cmpeq ((__v4sf) __A, (__v4sf)__B);
510   return ((__m128)vec_nor (temp, temp));
511 }
512 
513 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
514 _mm_cmpnlt_ps (__m128 __A, __m128 __B)
515 {
516   return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
517 }
518 
519 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
520 _mm_cmpnle_ps (__m128 __A, __m128 __B)
521 {
522   return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
523 }
524 
525 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
526 _mm_cmpngt_ps (__m128 __A, __m128 __B)
527 {
528   return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
529 }
530 
531 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
532 _mm_cmpnge_ps (__m128 __A, __m128 __B)
533 {
534   return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
535 }
536 
537 extern __inline  __m128  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
538 _mm_cmpord_ps (__m128  __A, __m128  __B)
539 {
540   __vector unsigned int a, b;
541   __vector unsigned int c, d;
542   static const __vector unsigned int float_exp_mask =
543     { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
544 
545   a = (__vector unsigned int) vec_abs ((__v4sf)__A);
546   b = (__vector unsigned int) vec_abs ((__v4sf)__B);
547   c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
548   d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
549   return ((__m128 ) vec_and (c, d));
550 }
551 
552 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
553 _mm_cmpunord_ps (__m128 __A, __m128 __B)
554 {
555   __vector unsigned int a, b;
556   __vector unsigned int c, d;
557   static const __vector unsigned int float_exp_mask =
558     { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
559 
560   a = (__vector unsigned int) vec_abs ((__v4sf)__A);
561   b = (__vector unsigned int) vec_abs ((__v4sf)__B);
562   c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
563   d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
564   return ((__m128 ) vec_or (c, d));
565 }
566 
567 /* Perform a comparison on the lower SPFP values of A and B.  If the
568    comparison is true, place a mask of all ones in the result, otherwise a
569    mask of zeros.  The upper three SPFP values are passed through from A.  */
570 extern __inline  __m128  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
571 _mm_cmpeq_ss (__m128  __A, __m128  __B)
572 {
573   static const __vector unsigned int mask =
574     { 0xffffffff, 0, 0, 0 };
575   __v4sf a, b, c;
576   /* PowerISA VMX does not allow partial (for just element 0)
577    * results. So to insure we don't generate spurious exceptions
578    * (from the upper elements) we splat the lower float
579    * before we to the operation. */
580   a = vec_splat ((__v4sf) __A, 0);
581   b = vec_splat ((__v4sf) __B, 0);
582   c = (__v4sf) vec_cmpeq(a, b);
583   /* Then we merge the lower float result with the original upper
584    * float elements from __A.  */
585   return ((__m128)vec_sel ((__v4sf)__A, c, mask));
586 }
587 
588 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
589 _mm_cmplt_ss (__m128 __A, __m128 __B)
590 {
591   static const __vector unsigned int mask =
592     { 0xffffffff, 0, 0, 0 };
593   __v4sf a, b, c;
594   /* PowerISA VMX does not allow partial (for just element 0)
595    * results. So to insure we don't generate spurious exceptions
596    * (from the upper elements) we splat the lower float
597    * before we to the operation. */
598   a = vec_splat ((__v4sf) __A, 0);
599   b = vec_splat ((__v4sf) __B, 0);
600   c = (__v4sf) vec_cmplt(a, b);
601   /* Then we merge the lower float result with the original upper
602    * float elements from __A.  */
603   return ((__m128)vec_sel ((__v4sf)__A, c, mask));
604 }
605 
606 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
607 _mm_cmple_ss (__m128 __A, __m128 __B)
608 {
609   static const __vector unsigned int mask =
610     { 0xffffffff, 0, 0, 0 };
611   __v4sf a, b, c;
612   /* PowerISA VMX does not allow partial (for just element 0)
613    * results. So to insure we don't generate spurious exceptions
614    * (from the upper elements) we splat the lower float
615    * before we to the operation. */
616   a = vec_splat ((__v4sf) __A, 0);
617   b = vec_splat ((__v4sf) __B, 0);
618   c = (__v4sf) vec_cmple(a, b);
619   /* Then we merge the lower float result with the original upper
620    * float elements from __A.  */
621   return ((__m128)vec_sel ((__v4sf)__A, c, mask));
622 }
623 
624 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
625 _mm_cmpgt_ss (__m128 __A, __m128 __B)
626 {
627   static const __vector unsigned int mask =
628     { 0xffffffff, 0, 0, 0 };
629   __v4sf a, b, c;
630   /* PowerISA VMX does not allow partial (for just element 0)
631    * results. So to insure we don't generate spurious exceptions
632    * (from the upper elements) we splat the lower float
633    * before we to the operation. */
634   a = vec_splat ((__v4sf) __A, 0);
635   b = vec_splat ((__v4sf) __B, 0);
636   c = (__v4sf) vec_cmpgt(a, b);
637   /* Then we merge the lower float result with the original upper
638    * float elements from __A.  */
639   return ((__m128)vec_sel ((__v4sf)__A, c, mask));
640 }
641 
642 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
643 _mm_cmpge_ss (__m128 __A, __m128 __B)
644 {
645   static const __vector unsigned int mask =
646     { 0xffffffff, 0, 0, 0 };
647   __v4sf a, b, c;
648   /* PowerISA VMX does not allow partial (for just element 0)
649    * results. So to insure we don't generate spurious exceptions
650    * (from the upper elements) we splat the lower float
651    * before we to the operation. */
652   a = vec_splat ((__v4sf) __A, 0);
653   b = vec_splat ((__v4sf) __B, 0);
654   c = (__v4sf) vec_cmpge(a, b);
655   /* Then we merge the lower float result with the original upper
656    * float elements from __A.  */
657   return ((__m128)vec_sel ((__v4sf)__A, c, mask));
658 }
659 
660 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
661 _mm_cmpneq_ss (__m128 __A, __m128 __B)
662 {
663   static const __vector unsigned int mask =
664     { 0xffffffff, 0, 0, 0 };
665   __v4sf a, b, c;
666   /* PowerISA VMX does not allow partial (for just element 0)
667    * results. So to insure we don't generate spurious exceptions
668    * (from the upper elements) we splat the lower float
669    * before we to the operation. */
670   a = vec_splat ((__v4sf) __A, 0);
671   b = vec_splat ((__v4sf) __B, 0);
672   c = (__v4sf) vec_cmpeq(a, b);
673   c = vec_nor (c, c);
674   /* Then we merge the lower float result with the original upper
675    * float elements from __A.  */
676   return ((__m128)vec_sel ((__v4sf)__A, c, mask));
677 }
678 
679 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
680 _mm_cmpnlt_ss (__m128 __A, __m128 __B)
681 {
682   static const __vector unsigned int mask =
683     { 0xffffffff, 0, 0, 0 };
684   __v4sf a, b, c;
685   /* PowerISA VMX does not allow partial (for just element 0)
686    * results. So to insure we don't generate spurious exceptions
687    * (from the upper elements) we splat the lower float
688    * before we to the operation. */
689   a = vec_splat ((__v4sf) __A, 0);
690   b = vec_splat ((__v4sf) __B, 0);
691   c = (__v4sf) vec_cmpge(a, b);
692   /* Then we merge the lower float result with the original upper
693    * float elements from __A.  */
694   return ((__m128)vec_sel ((__v4sf)__A, c, mask));
695 }
696 
697 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
698 _mm_cmpnle_ss (__m128 __A, __m128 __B)
699 {
700   static const __vector unsigned int mask =
701     { 0xffffffff, 0, 0, 0 };
702   __v4sf a, b, c;
703   /* PowerISA VMX does not allow partial (for just element 0)
704    * results. So to insure we don't generate spurious exceptions
705    * (from the upper elements) we splat the lower float
706    * before we to the operation. */
707   a = vec_splat ((__v4sf) __A, 0);
708   b = vec_splat ((__v4sf) __B, 0);
709   c = (__v4sf) vec_cmpgt(a, b);
710   /* Then we merge the lower float result with the original upper
711    * float elements from __A.  */
712   return ((__m128)vec_sel ((__v4sf)__A, c, mask));
713 }
714 
715 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
716 _mm_cmpngt_ss (__m128 __A, __m128 __B)
717 {
718   static const __vector unsigned int mask =
719     { 0xffffffff, 0, 0, 0 };
720   __v4sf a, b, c;
721   /* PowerISA VMX does not allow partial (for just element 0)
722    * results. So to insure we don't generate spurious exceptions
723    * (from the upper elements) we splat the lower float
724    * before we to the operation. */
725   a = vec_splat ((__v4sf) __A, 0);
726   b = vec_splat ((__v4sf) __B, 0);
727   c = (__v4sf) vec_cmple(a, b);
728   /* Then we merge the lower float result with the original upper
729    * float elements from __A.  */
730   return ((__m128)vec_sel ((__v4sf)__A, c, mask));
731 }
732 
733 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
734 _mm_cmpnge_ss (__m128 __A, __m128 __B)
735 {
736   static const __vector unsigned int mask =
737     { 0xffffffff, 0, 0, 0 };
738   __v4sf a, b, c;
739   /* PowerISA VMX does not allow partial (for just element 0)
740    * results. So to insure we don't generate spurious exceptions
741    * (from the upper elements) we splat the lower float
742    * before we do the operation. */
743   a = vec_splat ((__v4sf) __A, 0);
744   b = vec_splat ((__v4sf) __B, 0);
745   c = (__v4sf) vec_cmplt(a, b);
746   /* Then we merge the lower float result with the original upper
747    * float elements from __A.  */
748   return ((__m128)vec_sel ((__v4sf)__A, c, mask));
749 }
750 
751 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
752 _mm_cmpord_ss (__m128 __A, __m128 __B)
753 {
754   __vector unsigned int a, b;
755   __vector unsigned int c, d;
756   static const __vector unsigned int float_exp_mask =
757     { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
758   static const __vector unsigned int mask =
759     { 0xffffffff, 0, 0, 0 };
760 
761   a = (__vector unsigned int) vec_abs ((__v4sf)__A);
762   b = (__vector unsigned int) vec_abs ((__v4sf)__B);
763   c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
764   d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
765   c = vec_and (c, d);
766   /* Then we merge the lower float result with the original upper
767    * float elements from __A.  */
768   return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
769 }
770 
771 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
772 _mm_cmpunord_ss (__m128 __A, __m128 __B)
773 {
774   __vector unsigned int a, b;
775   __vector unsigned int c, d;
776   static const __vector unsigned int float_exp_mask =
777     { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
778   static const __vector unsigned int mask =
779     { 0xffffffff, 0, 0, 0 };
780 
781   a = (__vector unsigned int) vec_abs ((__v4sf)__A);
782   b = (__vector unsigned int) vec_abs ((__v4sf)__B);
783   c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
784   d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
785   c = vec_or (c, d);
786   /* Then we merge the lower float result with the original upper
787    * float elements from __A.  */
788   return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
789 }
790 
791 /* Compare the lower SPFP values of A and B and return 1 if true
792    and 0 if false.  */
793 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
794 _mm_comieq_ss (__m128 __A, __m128 __B)
795 {
796   return (__A[0] == __B[0]);
797 }
798 
799 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
800 _mm_comilt_ss (__m128 __A, __m128 __B)
801 {
802   return (__A[0] < __B[0]);
803 }
804 
805 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
806 _mm_comile_ss (__m128 __A, __m128 __B)
807 {
808   return (__A[0] <= __B[0]);
809 }
810 
811 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
812 _mm_comigt_ss (__m128 __A, __m128 __B)
813 {
814   return (__A[0] > __B[0]);
815 }
816 
817 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
818 _mm_comige_ss (__m128 __A, __m128 __B)
819 {
820   return (__A[0] >= __B[0]);
821 }
822 
823 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
824 _mm_comineq_ss (__m128 __A, __m128 __B)
825 {
826   return (__A[0] != __B[0]);
827 }
828 
/* FIXME
 * The _mm_ucomi??_ss implementations below are exactly the same as
 * _mm_comi??_ss because GCC for PowerPC only generates unordered
 * compares (scalar and vector).
 * Technically _mm_comieq_ss et al. should be using the ordered
 * compare and signal for QNaNs.
 * The _mm_ucomieq_ss et al. should be OK, as is.
 */
837 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
838 _mm_ucomieq_ss (__m128 __A, __m128 __B)
839 {
840   return (__A[0] == __B[0]);
841 }
842 
843 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
844 _mm_ucomilt_ss (__m128 __A, __m128 __B)
845 {
846   return (__A[0] < __B[0]);
847 }
848 
849 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
850 _mm_ucomile_ss (__m128 __A, __m128 __B)
851 {
852   return (__A[0] <= __B[0]);
853 }
854 
855 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
856 _mm_ucomigt_ss (__m128 __A, __m128 __B)
857 {
858   return (__A[0] > __B[0]);
859 }
860 
861 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
862 _mm_ucomige_ss (__m128 __A, __m128 __B)
863 {
864   return (__A[0] >= __B[0]);
865 }
866 
867 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
868 _mm_ucomineq_ss (__m128 __A, __m128 __B)
869 {
870   return (__A[0] != __B[0]);
871 }
872 
873 extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
874 _mm_cvtss_f32 (__m128 __A)
875 {
876   return ((__v4sf)__A)[0];
877 }
878 
879 /* Convert the lower SPFP value to a 32-bit integer according to the current
880    rounding mode.  */
881 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
882 _mm_cvtss_si32 (__m128 __A)
883 {
884   __m64 res = 0;
885 #ifdef _ARCH_PWR8
886   double dtmp;
887   __asm__(
888 #ifdef __LITTLE_ENDIAN__
889       "xxsldwi %x0,%x0,%x0,3;\n"
890 #endif
891       "xscvspdp %x2,%x0;\n"
892       "fctiw  %2,%2;\n"
893       "mfvsrd  %1,%x2;\n"
894       : "+wa" (__A),
895         "=r" (res),
896         "=f" (dtmp)
897       : );
898 #else
899   res = __builtin_rint(__A[0]);
900 #endif
901   return (res);
902 }
903 
904 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
905 _mm_cvt_ss2si (__m128 __A)
906 {
907   return _mm_cvtss_si32 (__A);
908 }
909 
910 /* Convert the lower SPFP value to a 32-bit integer according to the
911    current rounding mode.  */
912 
913 /* Intel intrinsic.  */
914 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
915 _mm_cvtss_si64 (__m128 __A)
916 {
917   __m64 res = 0;
918 #ifdef _ARCH_PWR8
919   double dtmp;
920   __asm__(
921 #ifdef __LITTLE_ENDIAN__
922       "xxsldwi %x0,%x0,%x0,3;\n"
923 #endif
924       "xscvspdp %x2,%x0;\n"
925       "fctid  %2,%2;\n"
926       "mfvsrd  %1,%x2;\n"
927       : "+wa" (__A),
928         "=r" (res),
929         "=f" (dtmp)
930       : );
931 #else
932   res = __builtin_llrint(__A[0]);
933 #endif
934   return (res);
935 }
936 
937 /* Microsoft intrinsic.  */
938 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
939 _mm_cvtss_si64x (__m128 __A)
940 {
941   return _mm_cvtss_si64 ((__v4sf) __A);
942 }
943 
/* Constants for use with _mm_prefetch.  */
enum _mm_hint
{
  _MM_HINT_NTA = 0,
  _MM_HINT_T2 = 1,
  _MM_HINT_T1 = 2,
  _MM_HINT_T0 = 3,
  /* Each _MM_HINT_ETn is the matching _MM_HINT_Tn with the third bit
     (value 4) set.  */
  _MM_HINT_ET1 = 6,
  _MM_HINT_ET0 = 7
};
955 
956 /* Loads one cache line from address P to a location "closer" to the
957    processor.  The selector I specifies the type of prefetch operation.  */
958 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
959 _mm_prefetch (const void *__P, enum _mm_hint __I)
960 {
961   /* Current PowerPC will ignores the hint parameters.  */
962   __builtin_prefetch (__P);
963 }
964 
965 /* Convert the two lower SPFP values to 32-bit integers according to the
966    current rounding mode.  Return the integers in packed form.  */
967 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
968 _mm_cvtps_pi32 (__m128 __A)
969 {
970   /* Splat two lower SPFP values to both halves.  */
971   __v4sf temp, rounded;
972   __vector unsigned long long result;
973 
974   /* Splat two lower SPFP values to both halves.  */
975   temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
976   rounded = vec_rint(temp);
977   result = (__vector unsigned long long) vec_cts (rounded, 0);
978 
979   return (__m64) ((__vector long long) result)[0];
980 }
981 
982 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
983 _mm_cvt_ps2pi (__m128 __A)
984 {
985   return _mm_cvtps_pi32 (__A);
986 }
987 
988 /* Truncate the lower SPFP value to a 32-bit integer.  */
989 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
990 _mm_cvttss_si32 (__m128 __A)
991 {
992   /* Extract the lower float element.  */
993   float temp = __A[0];
994   /* truncate to 32-bit integer and return.  */
995   return temp;
996 }
997 
998 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
999 _mm_cvtt_ss2si (__m128 __A)
1000 {
1001   return _mm_cvttss_si32 (__A);
1002 }
1003 
1004 /* Intel intrinsic.  */
1005 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1006 _mm_cvttss_si64 (__m128 __A)
1007 {
1008   /* Extract the lower float element.  */
1009   float temp = __A[0];
1010   /* truncate to 32-bit integer and return.  */
1011   return temp;
1012 }
1013 
1014 /* Microsoft intrinsic.  */
1015 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1016 _mm_cvttss_si64x (__m128 __A)
1017 {
1018   /* Extract the lower float element.  */
1019   float temp = __A[0];
1020   /* truncate to 32-bit integer and return.  */
1021   return temp;
1022 }
1023 
1024 /* Truncate the two lower SPFP values to 32-bit integers.  Return the
1025    integers in packed form.  */
1026 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1027 _mm_cvttps_pi32 (__m128 __A)
1028 {
1029   __v4sf temp;
1030   __vector unsigned long long result;
1031 
1032   /* Splat two lower SPFP values to both halves.  */
1033   temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
1034   result = (__vector unsigned long long) vec_cts (temp, 0);
1035 
1036   return (__m64) ((__vector long long) result)[0];
1037 }
1038 
1039 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1040 _mm_cvtt_ps2pi (__m128 __A)
1041 {
1042   return _mm_cvttps_pi32 (__A);
1043 }
1044 
1045 /* Convert B to a SPFP value and insert it as element zero in A.  */
1046 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1047 _mm_cvtsi32_ss (__m128 __A, int __B)
1048 {
1049   float temp = __B;
1050   __A[0] = temp;
1051 
1052   return __A;
1053 }
1054 
1055 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1056 _mm_cvt_si2ss (__m128 __A, int __B)
1057 {
1058   return _mm_cvtsi32_ss (__A, __B);
1059 }
1060 
1061 /* Convert B to a SPFP value and insert it as element zero in A.  */
1062 /* Intel intrinsic.  */
1063 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1064 _mm_cvtsi64_ss (__m128 __A, long long __B)
1065 {
1066   float temp = __B;
1067   __A[0] = temp;
1068 
1069   return __A;
1070 }
1071 
1072 /* Microsoft intrinsic.  */
1073 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1074 _mm_cvtsi64x_ss (__m128 __A, long long __B)
1075 {
1076   return _mm_cvtsi64_ss (__A, __B);
1077 }
1078 
1079 /* Convert the two 32-bit values in B to SPFP form and insert them
1080    as the two lower elements in A.  */
1081 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1082 _mm_cvtpi32_ps (__m128        __A, __m64        __B)
1083 {
1084   __vector signed int vm1;
1085   __vector float vf1;
1086 
1087   vm1 = (__vector signed int) (__vector unsigned long long) {__B, __B};
1088   vf1 = (__vector float) vec_ctf (vm1, 0);
1089 
1090   return ((__m128) (__vector unsigned long long)
1091     { ((__vector unsigned long long)vf1) [0],
1092 	((__vector unsigned long long)__A) [1]});
1093 }
1094 
1095 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1096 _mm_cvt_pi2ps (__m128 __A, __m64 __B)
1097 {
1098   return _mm_cvtpi32_ps (__A, __B);
1099 }
1100 
1101 /* Convert the four signed 16-bit values in A to SPFP form.  */
1102 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1103 _mm_cvtpi16_ps (__m64 __A)
1104 {
1105   __vector signed short vs8;
1106   __vector signed int vi4;
1107   __vector float vf1;
1108 
1109   vs8 = (__vector signed short) (__vector unsigned long long) { __A, __A };
1110   vi4 = vec_vupklsh (vs8);
1111   vf1 = (__vector float) vec_ctf (vi4, 0);
1112 
1113   return (__m128) vf1;
1114 }
1115 
1116 /* Convert the four unsigned 16-bit values in A to SPFP form.  */
1117 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1118 _mm_cvtpu16_ps (__m64 __A)
1119 {
1120   const __vector unsigned short zero =
1121     { 0, 0, 0, 0, 0, 0, 0, 0 };
1122   __vector unsigned short vs8;
1123   __vector unsigned int vi4;
1124   __vector float vf1;
1125 
1126   vs8 = (__vector unsigned short) (__vector unsigned long long) { __A, __A };
1127   vi4 = (__vector unsigned int) vec_mergel
1128 #ifdef __LITTLE_ENDIAN__
1129                                            (vs8, zero);
1130 #else
1131                                            (zero, vs8);
1132 #endif
1133   vf1 = (__vector float) vec_ctf (vi4, 0);
1134 
1135   return (__m128) vf1;
1136 }
1137 
1138 /* Convert the low four signed 8-bit values in A to SPFP form.  */
1139 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1140 _mm_cvtpi8_ps (__m64 __A)
1141 {
1142   __vector signed char vc16;
1143   __vector signed short vs8;
1144   __vector signed int vi4;
1145   __vector float vf1;
1146 
1147   vc16 = (__vector signed char) (__vector unsigned long long) { __A, __A };
1148   vs8 = vec_vupkhsb (vc16);
1149   vi4 = vec_vupkhsh (vs8);
1150   vf1 = (__vector float) vec_ctf (vi4, 0);
1151 
1152   return (__m128) vf1;
1153 }
1154 
1155 /* Convert the low four unsigned 8-bit values in A to SPFP form.  */
1156 extern __inline  __m128  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1157 
1158 _mm_cvtpu8_ps (__m64  __A)
1159 {
1160   const __vector unsigned char zero =
1161     { 0, 0, 0, 0, 0, 0, 0, 0 };
1162   __vector unsigned char vc16;
1163   __vector unsigned short vs8;
1164   __vector unsigned int vi4;
1165   __vector float vf1;
1166 
1167   vc16 = (__vector unsigned char) (__vector unsigned long long) { __A, __A };
1168 #ifdef __LITTLE_ENDIAN__
1169   vs8 = (__vector unsigned short) vec_mergel (vc16, zero);
1170   vi4 = (__vector unsigned int) vec_mergeh (vs8,
1171 					    (__vector unsigned short) zero);
1172 #else
1173   vs8 = (__vector unsigned short) vec_mergel (zero, vc16);
1174   vi4 = (__vector unsigned int) vec_mergeh ((__vector unsigned short) zero,
1175                                             vs8);
1176 #endif
1177   vf1 = (__vector float) vec_ctf (vi4, 0);
1178 
1179   return (__m128) vf1;
1180 }
1181 
1182 /* Convert the four signed 32-bit values in A and B to SPFP form.  */
1183 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1184 _mm_cvtpi32x2_ps (__m64 __A, __m64 __B)
1185 {
1186   __vector signed int vi4;
1187   __vector float vf4;
1188 
1189   vi4 = (__vector signed int) (__vector unsigned long long) { __A, __B };
1190   vf4 = (__vector float) vec_ctf (vi4, 0);
1191   return (__m128) vf4;
1192 }
1193 
1194 /* Convert the four SPFP values in A to four signed 16-bit integers.  */
1195 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1196 _mm_cvtps_pi16 (__m128 __A)
1197 {
1198   __v4sf rounded;
1199   __vector signed int temp;
1200   __vector unsigned long long result;
1201 
1202   rounded = vec_rint(__A);
1203   temp = vec_cts (rounded, 0);
1204   result = (__vector unsigned long long) vec_pack (temp, temp);
1205 
1206   return (__m64) ((__vector long long) result)[0];
1207 }
1208 
1209 /* Convert the four SPFP values in A to four signed 8-bit integers.  */
1210 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1211 _mm_cvtps_pi8 (__m128 __A)
1212 {
1213   __v4sf rounded;
1214   __vector signed int tmp_i;
1215   static const __vector signed int zero = {0, 0, 0, 0};
1216   __vector signed short tmp_s;
1217   __vector signed char res_v;
1218 
1219   rounded = vec_rint(__A);
1220   tmp_i = vec_cts (rounded, 0);
1221   tmp_s = vec_pack (tmp_i, zero);
1222   res_v = vec_pack (tmp_s, tmp_s);
1223   return (__m64) ((__vector long long) res_v)[0];
1224 }
1225 
1226 /* Selects four specific SPFP values from A and B based on MASK.  */
1227 extern __inline  __m128  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1228 
1229 _mm_shuffle_ps (__m128  __A, __m128  __B, int const __mask)
1230 {
1231   unsigned long element_selector_10 = __mask & 0x03;
1232   unsigned long element_selector_32 = (__mask >> 2) & 0x03;
1233   unsigned long element_selector_54 = (__mask >> 4) & 0x03;
1234   unsigned long element_selector_76 = (__mask >> 6) & 0x03;
1235   static const unsigned int permute_selectors[4] =
1236     {
1237 #ifdef __LITTLE_ENDIAN__
1238       0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
1239 #else
1240       0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
1241 #endif
1242     };
1243   __vector unsigned int t;
1244 
1245   t[0] = permute_selectors[element_selector_10];
1246   t[1] = permute_selectors[element_selector_32];
1247   t[2] = permute_selectors[element_selector_54] + 0x10101010;
1248   t[3] = permute_selectors[element_selector_76] + 0x10101010;
1249   return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t);
1250 }
1251 
1252 /* Selects and interleaves the upper two SPFP values from A and B.  */
1253 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1254 _mm_unpackhi_ps (__m128 __A, __m128 __B)
1255 {
1256   return (__m128) vec_vmrglw ((__v4sf) __A, (__v4sf)__B);
1257 }
1258 
1259 /* Selects and interleaves the lower two SPFP values from A and B.  */
1260 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1261 _mm_unpacklo_ps (__m128 __A, __m128 __B)
1262 {
1263   return (__m128) vec_vmrghw ((__v4sf) __A, (__v4sf)__B);
1264 }
1265 
1266 /* Sets the upper two SPFP values with 64-bits of data loaded from P;
1267    the lower two values are passed through from A.  */
1268 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1269 _mm_loadh_pi (__m128 __A, __m64 const *__P)
1270 {
1271   __vector unsigned long long __a = (__vector unsigned long long)__A;
1272   __vector unsigned long long __p = vec_splats(*__P);
1273   __a [1] = __p [1];
1274 
1275   return (__m128)__a;
1276 }
1277 
1278 /* Stores the upper two SPFP values of A into P.  */
1279 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1280 _mm_storeh_pi (__m64 *__P, __m128 __A)
1281 {
1282   __vector unsigned long long __a = (__vector unsigned long long) __A;
1283 
1284   *__P = __a[1];
1285 }
1286 
1287 /* Moves the upper two values of B into the lower two values of A.  */
1288 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1289 _mm_movehl_ps (__m128 __A, __m128 __B)
1290 {
1291   return (__m128) vec_mergel ((__vector unsigned long long)__B,
1292 			      (__vector unsigned long long)__A);
1293 }
1294 
1295 /* Moves the lower two values of B into the upper two values of A.  */
1296 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1297 _mm_movelh_ps (__m128 __A, __m128 __B)
1298 {
1299   return (__m128) vec_mergeh ((__vector unsigned long long)__A,
1300 			      (__vector unsigned long long)__B);
1301 }
1302 
1303 /* Sets the lower two SPFP values with 64-bits of data loaded from P;
1304    the upper two values are passed through from A.  */
1305 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1306 _mm_loadl_pi (__m128 __A, __m64 const *__P)
1307 {
1308   __vector unsigned long long __a = (__vector unsigned long long)__A;
1309   __vector unsigned long long __p = vec_splats(*__P);
1310   __a [0] = __p [0];
1311 
1312   return (__m128)__a;
1313 }
1314 
1315 /* Stores the lower two SPFP values of A into P.  */
1316 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1317 _mm_storel_pi (__m64 *__P, __m128 __A)
1318 {
1319   __vector unsigned long long __a = (__vector unsigned long long) __A;
1320 
1321   *__P = __a[0];
1322 }
1323 
#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum.  */

/* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_ps (__m128 __A)
{
  /* Bit-permute control: one word holds the bit indices 0x00, 0x20, 0x40
     and 0x60 (the leading bit of each 32-bit element); the other bytes
     are 0x80.  Which word carries the indices depends on the byte
     order.  */
  static const __vector unsigned int sign_bit_picker =
    {
#ifdef __LITTLE_ENDIAN__
	0x00204060, 0x80808080, 0x80808080, 0x80808080
#else
      0x80808080, 0x80808080, 0x80808080, 0x00204060
#endif
    };
  const __vector unsigned long long gathered =
    (__vector unsigned long long)
    vec_vbpermq ((__vector unsigned char) __A,
		 (__vector unsigned char) sign_bit_picker);

  /* The gathered bits are read from a different doubleword depending on
     the byte order.  */
#ifdef __LITTLE_ENDIAN__
  return gathered[1];
#else
  return gathered[0];
#endif
}
#endif /* _ARCH_PWR8 */
1352 
1353 /* Create a vector with all four elements equal to *P.  */
1354 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1355 _mm_load1_ps (float const *__P)
1356 {
1357   return _mm_set1_ps (*__P);
1358 }
1359 
1360 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1361 _mm_load_ps1 (float const *__P)
1362 {
1363   return _mm_load1_ps (__P);
1364 }
1365 
1366 /* Extracts one of the four words of A.  The selector N must be immediate.  */
1367 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1368 _mm_extract_pi16 (__m64 const __A, int const __N)
1369 {
1370   unsigned int shiftr = __N & 3;
1371 #ifdef __BIG_ENDIAN__
1372   shiftr = 3 - shiftr;
1373 #endif
1374 
1375   return ((__A >> (shiftr * 16)) & 0xffff);
1376 }
1377 
1378 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1379 _m_pextrw (__m64 const __A, int const __N)
1380 {
1381   return _mm_extract_pi16 (__A, __N);
1382 }
1383 
1384 /* Inserts word D into one of four words of A.  The selector N must be
1385    immediate.  */
1386 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1387 _mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
1388 {
1389   const int shiftl = (__N & 3) * 16;
1390   const __m64 shiftD = (const __m64) __D << shiftl;
1391   const __m64 mask = 0xffffUL << shiftl;
1392   __m64 result = (__A & (~mask)) | (shiftD & mask);
1393 
1394   return (result);
1395 }
1396 
1397 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1398 _m_pinsrw (__m64 const __A, int const __D, int const __N)
1399 {
1400   return _mm_insert_pi16 (__A, __D, __N);
1401 }
1402 
1403 /* Compute the element-wise maximum of signed 16-bit values.  */
1404 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1405 
1406 _mm_max_pi16 (__m64 __A, __m64 __B)
1407 {
1408 #if _ARCH_PWR8
1409   __vector signed short a, b, r;
1410   __vector __bool short c;
1411 
1412   a = (__vector signed short)vec_splats (__A);
1413   b = (__vector signed short)vec_splats (__B);
1414   c = (__vector __bool short)vec_cmpgt (a, b);
1415   r = vec_sel (b, a, c);
1416   return (__m64) ((__vector long long) r)[0];
1417 #else
1418   __m64_union m1, m2, res;
1419 
1420   m1.as_m64 = __A;
1421   m2.as_m64 = __B;
1422 
1423   res.as_short[0] =
1424       (m1.as_short[0] > m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1425   res.as_short[1] =
1426       (m1.as_short[1] > m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1427   res.as_short[2] =
1428       (m1.as_short[2] > m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1429   res.as_short[3] =
1430       (m1.as_short[3] > m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1431 
1432   return (__m64) res.as_m64;
1433 #endif
1434 }
1435 
1436 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1437 _m_pmaxsw (__m64 __A, __m64 __B)
1438 {
1439   return _mm_max_pi16 (__A, __B);
1440 }
1441 
1442 /* Compute the element-wise maximum of unsigned 8-bit values.  */
1443 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1444 _mm_max_pu8 (__m64 __A, __m64 __B)
1445 {
1446 #if _ARCH_PWR8
1447   __vector unsigned char a, b, r;
1448   __vector __bool char c;
1449 
1450   a = (__vector unsigned char)vec_splats (__A);
1451   b = (__vector unsigned char)vec_splats (__B);
1452   c = (__vector __bool char)vec_cmpgt (a, b);
1453   r = vec_sel (b, a, c);
1454   return (__m64) ((__vector long long) r)[0];
1455 #else
1456   __m64_union m1, m2, res;
1457   long i;
1458 
1459   m1.as_m64 = __A;
1460   m2.as_m64 = __B;
1461 
1462 
1463   for (i = 0; i < 8; i++)
1464   res.as_char[i] =
1465       ((unsigned char) m1.as_char[i] > (unsigned char) m2.as_char[i]) ?
1466 	  m1.as_char[i] : m2.as_char[i];
1467 
1468   return (__m64) res.as_m64;
1469 #endif
1470 }
1471 
1472 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1473 _m_pmaxub (__m64 __A, __m64 __B)
1474 {
1475   return _mm_max_pu8 (__A, __B);
1476 }
1477 
1478 /* Compute the element-wise minimum of signed 16-bit values.  */
1479 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1480 _mm_min_pi16 (__m64 __A, __m64 __B)
1481 {
1482 #if _ARCH_PWR8
1483   __vector signed short a, b, r;
1484   __vector __bool short c;
1485 
1486   a = (__vector signed short)vec_splats (__A);
1487   b = (__vector signed short)vec_splats (__B);
1488   c = (__vector __bool short)vec_cmplt (a, b);
1489   r = vec_sel (b, a, c);
1490   return (__m64) ((__vector long long) r)[0];
1491 #else
1492   __m64_union m1, m2, res;
1493 
1494   m1.as_m64 = __A;
1495   m2.as_m64 = __B;
1496 
1497   res.as_short[0] =
1498       (m1.as_short[0] < m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1499   res.as_short[1] =
1500       (m1.as_short[1] < m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1501   res.as_short[2] =
1502       (m1.as_short[2] < m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1503   res.as_short[3] =
1504       (m1.as_short[3] < m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1505 
1506   return (__m64) res.as_m64;
1507 #endif
1508 }
1509 
1510 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1511 _m_pminsw (__m64 __A, __m64 __B)
1512 {
1513   return _mm_min_pi16 (__A, __B);
1514 }
1515 
1516 /* Compute the element-wise minimum of unsigned 8-bit values.  */
1517 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1518 _mm_min_pu8 (__m64 __A, __m64 __B)
1519 {
1520 #if _ARCH_PWR8
1521   __vector unsigned char a, b, r;
1522   __vector __bool char c;
1523 
1524   a = (__vector unsigned char)vec_splats (__A);
1525   b = (__vector unsigned char)vec_splats (__B);
1526   c = (__vector __bool char)vec_cmplt (a, b);
1527   r = vec_sel (b, a, c);
1528   return (__m64) ((__vector long long) r)[0];
1529 #else
1530   __m64_union m1, m2, res;
1531   long i;
1532 
1533   m1.as_m64 = __A;
1534   m2.as_m64 = __B;
1535 
1536 
1537   for (i = 0; i < 8; i++)
1538   res.as_char[i] =
1539       ((unsigned char) m1.as_char[i] < (unsigned char) m2.as_char[i]) ?
1540 	  m1.as_char[i] : m2.as_char[i];
1541 
1542   return (__m64) res.as_m64;
1543 #endif
1544 }
1545 
1546 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1547 _m_pminub (__m64 __A, __m64 __B)
1548 {
1549   return _mm_min_pu8 (__A, __B);
1550 }
1551 
1552 /* Create an 8-bit mask of the signs of 8-bit values.  */
1553 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1554 _mm_movemask_pi8 (__m64 __A)
1555 {
1556   unsigned long long p =
1557 #ifdef __LITTLE_ENDIAN__
1558                          0x0008101820283038UL; // permute control for sign bits
1559 #else
1560                          0x3830282018100800UL; // permute control for sign bits
1561 #endif
1562   return __builtin_bpermd (p, __A);
1563 }
1564 
1565 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1566 _m_pmovmskb (__m64 __A)
1567 {
1568   return _mm_movemask_pi8 (__A);
1569 }
1570 
1571 /* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
1572    in B and produce the high 16 bits of the 32-bit results.  */
1573 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1574 _mm_mulhi_pu16 (__m64 __A, __m64 __B)
1575 {
1576   __vector unsigned short a, b;
1577   __vector unsigned short c;
1578   __vector unsigned int w0, w1;
1579   __vector unsigned char xform1 = {
1580 #ifdef __LITTLE_ENDIAN__
1581       0x02, 0x03, 0x12, 0x13,  0x06, 0x07, 0x16, 0x17,
1582       0x0A, 0x0B, 0x1A, 0x1B,  0x0E, 0x0F, 0x1E, 0x1F
1583 #else
1584       0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15,
1585       0x00, 0x01, 0x10, 0x11,  0x04, 0x05, 0x14, 0x15
1586 #endif
1587     };
1588 
1589   a = (__vector unsigned short)vec_splats (__A);
1590   b = (__vector unsigned short)vec_splats (__B);
1591 
1592   w0 = vec_vmuleuh (a, b);
1593   w1 = vec_vmulouh (a, b);
1594   c = (__vector unsigned short)vec_perm (w0, w1, xform1);
1595 
1596   return (__m64) ((__vector long long) c)[0];
1597 }
1598 
1599 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1600 _m_pmulhuw (__m64 __A, __m64 __B)
1601 {
1602   return _mm_mulhi_pu16 (__A, __B);
1603 }
1604 
1605 /* Return a combination of the four 16-bit values in A.  The selector
1606    must be an immediate.  */
1607 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1608 _mm_shuffle_pi16 (__m64 __A, int const __N)
1609 {
1610   unsigned long element_selector_10 = __N & 0x03;
1611   unsigned long element_selector_32 = (__N >> 2) & 0x03;
1612   unsigned long element_selector_54 = (__N >> 4) & 0x03;
1613   unsigned long element_selector_76 = (__N >> 6) & 0x03;
1614   static const unsigned short permute_selectors[4] =
1615     {
1616 #ifdef __LITTLE_ENDIAN__
1617 	      0x0908, 0x0B0A, 0x0D0C, 0x0F0E
1618 #else
1619 	      0x0607, 0x0405, 0x0203, 0x0001
1620 #endif
1621     };
1622   __m64_union t;
1623   __vector unsigned long long a, p, r;
1624 
1625 #ifdef __LITTLE_ENDIAN__
1626   t.as_short[0] = permute_selectors[element_selector_10];
1627   t.as_short[1] = permute_selectors[element_selector_32];
1628   t.as_short[2] = permute_selectors[element_selector_54];
1629   t.as_short[3] = permute_selectors[element_selector_76];
1630 #else
1631   t.as_short[3] = permute_selectors[element_selector_10];
1632   t.as_short[2] = permute_selectors[element_selector_32];
1633   t.as_short[1] = permute_selectors[element_selector_54];
1634   t.as_short[0] = permute_selectors[element_selector_76];
1635 #endif
1636   p = vec_splats (t.as_m64);
1637   a = vec_splats (__A);
1638   r = vec_perm (a, a, (__vector unsigned char)p);
1639   return (__m64) ((__vector long long) r)[0];
1640 }
1641 
1642 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1643 _m_pshufw (__m64 __A, int const __N)
1644 {
1645   return _mm_shuffle_pi16 (__A, __N);
1646 }
1647 
1648 /* Conditionally store byte elements of A into P.  The high bit of each
1649    byte in the selector N determines whether the corresponding byte from
1650    A is stored.  */
1651 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1652 _mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
1653 {
1654   __m64 hibit = 0x8080808080808080UL;
1655   __m64 mask, tmp;
1656   __m64 *p = (__m64*)__P;
1657 
1658   tmp = *p;
1659   mask = _mm_cmpeq_pi8 ((__N & hibit), hibit);
1660   tmp = (tmp & (~mask)) | (__A & mask);
1661   *p = tmp;
1662 }
1663 
1664 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1665 _m_maskmovq (__m64 __A, __m64 __N, char *__P)
1666 {
1667   _mm_maskmove_si64 (__A, __N, __P);
1668 }
1669 
1670 /* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
1671 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1672 _mm_avg_pu8 (__m64 __A, __m64 __B)
1673 {
1674   __vector unsigned char a, b, c;
1675 
1676   a = (__vector unsigned char)vec_splats (__A);
1677   b = (__vector unsigned char)vec_splats (__B);
1678   c = vec_avg (a, b);
1679   return (__m64) ((__vector long long) c)[0];
1680 }
1681 
1682 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1683 _m_pavgb (__m64 __A, __m64 __B)
1684 {
1685   return _mm_avg_pu8 (__A, __B);
1686 }
1687 
1688 /* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
1689 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1690 _mm_avg_pu16 (__m64 __A, __m64 __B)
1691 {
1692   __vector unsigned short a, b, c;
1693 
1694   a = (__vector unsigned short)vec_splats (__A);
1695   b = (__vector unsigned short)vec_splats (__B);
1696   c = vec_avg (a, b);
1697   return (__m64) ((__vector long long) c)[0];
1698 }
1699 
1700 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1701 _m_pavgw (__m64 __A, __m64 __B)
1702 {
1703   return _mm_avg_pu16 (__A, __B);
1704 }
1705 
1706 /* Compute the sum of the absolute differences of the unsigned 8-bit
1707    values in A and B.  Return the value in the lower 16-bit word; the
1708    upper words are cleared.  */
1709 extern __inline    __m64    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1710 _mm_sad_pu8 (__m64  __A, __m64  __B)
1711 {
1712   __vector unsigned char a, b;
1713   __vector unsigned char vmin, vmax, vabsdiff;
1714   __vector signed int vsum;
1715   const __vector unsigned int zero =
1716     { 0, 0, 0, 0 };
1717   __m64_union result = {0};
1718 
1719   a = (__vector unsigned char) (__vector unsigned long long) { 0UL, __A };
1720   b = (__vector unsigned char) (__vector unsigned long long) { 0UL, __B };
1721   vmin = vec_min (a, b);
1722   vmax = vec_max (a, b);
1723   vabsdiff = vec_sub (vmax, vmin);
1724   /* Sum four groups of bytes into integers.  */
1725   vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
1726   /* Sum across four integers with integer result.  */
1727   vsum = vec_sums (vsum, (__vector signed int) zero);
1728   /* The sum is in the right most 32-bits of the vector result.
1729      Transfer to a GPR and truncate to 16 bits.  */
1730   result.as_short[0] = vsum[3];
1731   return result.as_m64;
1732 }
1733 
1734 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1735 _m_psadbw (__m64 __A, __m64 __B)
1736 {
1737   return _mm_sad_pu8 (__A, __B);
1738 }
1739 
1740 /* Stores the data in A to the address P without polluting the caches.  */
1741 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1742 _mm_stream_pi (__m64 *__P, __m64 __A)
1743 {
1744   /* Use the data cache block touch for store transient.  */
1745   __asm__ (
1746     "	dcbtstt	0,%0"
1747     :
1748     : "b" (__P)
1749     : "memory"
1750   );
1751   *__P = __A;
1752 }
1753 
1754 /* Likewise.  The address must be 16-byte aligned.  */
1755 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1756 _mm_stream_ps (float *__P, __m128 __A)
1757 {
1758   /* Use the data cache block touch for store transient.  */
1759   __asm__ (
1760     "	dcbtstt	0,%0"
1761     :
1762     : "b" (__P)
1763     : "memory"
1764   );
1765   _mm_store_ps (__P, __A);
1766 }
1767 
/* Store fence: every store issued before this call becomes globally
   visible before any store issued after it.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sfence (void)
{
  /* A release fence is sufficient here; it is emitted as a
     light-weight sync.  */
  __atomic_thread_fence (__ATOMIC_RELEASE);
}
1776 
/* Spin-wait hint: execution of the next instruction is delayed by an
   implementation-specific amount of time.  The architectural state is
   not modified.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_pause (void)
{
  /* PowerPC has no exact equivalent of this construct; the sequences
     below come close to the desired effect.  */
#if _ARCH_PWR8
  /* POWER8 and later provide the Program Priority Register (PPR).
     The current priority of this thread is unknown, so the sequence is:
       1) save the current PPR contents into a local GPR;
       2) drop to "very low" priority via the special
	  "or 31,31,31" encoding;
       3) issue an isync so that the priority change takes effect
	  before any further instruction executes;
       4) execute an lwsync (release barrier) while running at
	  "very low" priority;
       5) restore the saved priority and continue.  */
  unsigned long __saved_ppr;

  __asm__ __volatile__ (
    "	mfppr	%0;"
    "   or 31,31,31;"
    "   isync;"
    "   lwsync;"
    "   isync;"
    "   mtppr	%0;"
    : "=r" (__saved_ppr)
    :
    : "memory"
  );
#else /* !_ARCH_PWR8 */
  /* Older processors may lack program priority controls altogether,
     so the only dependable option is a heavy-weight sync.  */
  __atomic_thread_fence (__ATOMIC_SEQ_CST);
#endif /* _ARCH_PWR8 */
}
1817 
/* Transpose, in place, the 4x4 float matrix whose rows are row0..row3.
   Every argument is first read into a temporary and later assigned
   to, so each one must be an assignable __v4sf expression.  */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)			\
do {									\
  /* Snapshot the inputs before any row is overwritten.  */		\
  __v4sf __in0 = (row0);						\
  __v4sf __in1 = (row1);						\
  __v4sf __in2 = (row2);						\
  __v4sf __in3 = (row3);						\
  /* Step 1: word-wise merges of the row pairs (0,1) and (2,3).  */	\
  __v4sf __hi01 = vec_vmrghw (__in0, __in1);				\
  __v4sf __hi23 = vec_vmrghw (__in2, __in3);				\
  __v4sf __lo01 = vec_vmrglw (__in0, __in1);				\
  __v4sf __lo23 = vec_vmrglw (__in2, __in3);				\
  /* Step 2: doubleword-wise merges produce the output rows.  */	\
  (row0) = (__v4sf) vec_mergeh ((__vector long long) __hi01,		\
				(__vector long long) __hi23);		\
  (row1) = (__v4sf) vec_mergel ((__vector long long) __hi01,		\
				(__vector long long) __hi23);		\
  (row2) = (__v4sf) vec_mergeh ((__vector long long) __lo01,		\
				(__vector long long) __lo23);		\
  (row3) = (__v4sf) vec_mergel ((__vector long long) __lo01,		\
				(__vector long long) __lo23);		\
} while (0)
1835 
1836 /* For backward source compatibility.  */
1837 //# include <emmintrin.h>
1838 
1839 #else
1840 #include_next <xmmintrin.h>
1841 #endif /* defined(__linux__) && defined(__ppc64__) */
1842 
1843 #endif /* _XMMINTRIN_H_INCLUDED */
1844