xref: /freebsd/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/mmintrin.h (revision 04eeddc0aa8e0a417a16eaf9d7d095207f4a8623)
1 /*===---- mmintrin.h - Implementation of MMX intrinsics on PowerPC ---------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 /* Implemented from the specification included in the Intel C++ Compiler
11    User Guide and Reference, version 9.0.  */
12 
13 #ifndef NO_WARN_X86_INTRINSICS
14 /* This header file is to help porting code using Intel intrinsics
15    explicitly from x86_64 to powerpc64/powerpc64le.
16 
17    Since PowerPC target doesn't support native 64-bit vector type, we
18    typedef __m64 to 64-bit unsigned long long in MMX intrinsics, which
19    works well for _si64 and some _pi32 operations.
20 
21    For _pi16 and _pi8 operations, it's better to transfer __m64 into
22    128-bit PowerPC vector first. Power8 introduced direct register
23    move instructions which helps for more efficient implementation.
24 
25    It's user's responsibility to determine if the results of such port
26    are acceptable or further changes are needed. Please note that much
27    code using Intel intrinsics CAN BE REWRITTEN in more portable and
28    efficient standard C or GNU C extensions with 64-bit scalar
29    operations, or 128-bit SSE/Altivec operations, which are more
30    recommended. */
31 #error                                                                         \
32     "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
33 #endif
34 
35 #ifndef _MMINTRIN_H_INCLUDED
36 #define _MMINTRIN_H_INCLUDED
37 
38 #if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))
39 
40 #include <altivec.h>
/* __m64 is represented as an 8-byte-aligned 64-bit unsigned scalar.  The
   Intel API is flexible enough that aliasing with other vector types, and
   with their scalar components, must be allowed.  */
typedef unsigned long long __m64 __attribute__((__aligned__(8)));

/* Overlay used to reach the individual elements of an __m64.  Every member
   occupies the same eight bytes.  */
typedef union {
  __m64 as_m64;                  /* the whole value */
  char as_char[8];               /* eight plain-char bytes */
  signed char as_signed_char[8]; /* eight explicitly signed bytes */
  short as_short[4];             /* four 16-bit elements */
  int as_int[2];                 /* two 32-bit elements */
  long long as_long_long;        /* one signed 64-bit element */
  float as_float[2];             /* two single-precision elements */
  double as_double;              /* one double-precision element */
} __attribute__((__aligned__(8))) __m64_union;
55 
/* Empty the multimedia state.  This is a no-op: there is nothing to do on
   PowerPC.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_empty(void) {
  return; /* nothing to do on PowerPC.  */
}
62 
/* Alternate name for emptying the multimedia state; likewise a no-op.  */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_empty(void) {
  return; /* nothing to do on PowerPC.  */
}
68 
69 /* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
70 extern __inline __m64
71     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
72     _mm_cvtsi32_si64(int __i) {
73   return (__m64)(unsigned int)__i;
74 }
75 
76 extern __inline __m64
77     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
78     _m_from_int(int __i) {
79   return _mm_cvtsi32_si64(__i);
80 }
81 
82 /* Convert the lower 32 bits of the __m64 object into an integer.  */
83 extern __inline int
84     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
85     _mm_cvtsi64_si32(__m64 __i) {
86   return ((int)__i);
87 }
88 
89 extern __inline int
90     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
91     _m_to_int(__m64 __i) {
92   return _mm_cvtsi64_si32(__i);
93 }
94 
95 /* Convert I to a __m64 object.  */
96 
97 /* Intel intrinsic.  */
98 extern __inline __m64
99     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
100     _m_from_int64(long long __i) {
101   return (__m64)__i;
102 }
103 
104 extern __inline __m64
105     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
106     _mm_cvtsi64_m64(long long __i) {
107   return (__m64)__i;
108 }
109 
110 /* Microsoft intrinsic.  */
111 extern __inline __m64
112     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
113     _mm_cvtsi64x_si64(long long __i) {
114   return (__m64)__i;
115 }
116 
117 extern __inline __m64
118     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
119     _mm_set_pi64x(long long __i) {
120   return (__m64)__i;
121 }
122 
123 /* Convert the __m64 object to a 64bit integer.  */
124 
125 /* Intel intrinsic.  */
126 extern __inline long long
127     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
128     _m_to_int64(__m64 __i) {
129   return (long long)__i;
130 }
131 
132 extern __inline long long
133     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
134     _mm_cvtm64_si64(__m64 __i) {
135   return (long long)__i;
136 }
137 
138 /* Microsoft intrinsic.  */
139 extern __inline long long
140     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
141     _mm_cvtsi64_si64x(__m64 __i) {
142   return (long long)__i;
143 }
144 
145 #ifdef _ARCH_PWR8
146 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
147    the result, and the four 16-bit values from M2 into the upper four 8-bit
148    values of the result, all with signed saturation.  */
149 extern __inline __m64
150     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
151     _mm_packs_pi16(__m64 __m1, __m64 __m2) {
152   __vector signed short vm1;
153   __vector signed char vresult;
154 
155   vm1 = (__vector signed short)(__vector unsigned long long)
156 #ifdef __LITTLE_ENDIAN__
157       {__m1, __m2};
158 #else
159       {__m2, __m1};
160 #endif
161   vresult = vec_packs(vm1, vm1);
162   return (__m64)((__vector long long)vresult)[0];
163 }
164 
165 extern __inline __m64
166     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
167     _m_packsswb(__m64 __m1, __m64 __m2) {
168   return _mm_packs_pi16(__m1, __m2);
169 }
170 
171 /* Pack the two 32-bit values from M1 in to the lower two 16-bit values of
172    the result, and the two 32-bit values from M2 into the upper two 16-bit
173    values of the result, all with signed saturation.  */
174 extern __inline __m64
175     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
176     _mm_packs_pi32(__m64 __m1, __m64 __m2) {
177   __vector signed int vm1;
178   __vector signed short vresult;
179 
180   vm1 = (__vector signed int)(__vector unsigned long long)
181 #ifdef __LITTLE_ENDIAN__
182       {__m1, __m2};
183 #else
184       {__m2, __m1};
185 #endif
186   vresult = vec_packs(vm1, vm1);
187   return (__m64)((__vector long long)vresult)[0];
188 }
189 
190 extern __inline __m64
191     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
192     _m_packssdw(__m64 __m1, __m64 __m2) {
193   return _mm_packs_pi32(__m1, __m2);
194 }
195 
196 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of
197    the result, and the four 16-bit values from M2 into the upper four 8-bit
198    values of the result, all with unsigned saturation.  */
199 extern __inline __m64
200     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
201     _mm_packs_pu16(__m64 __m1, __m64 __m2) {
202   __vector unsigned char r;
203   __vector signed short vm1 = (__vector signed short)(__vector long long)
204 #ifdef __LITTLE_ENDIAN__
205       {__m1, __m2};
206 #else
207       {__m2, __m1};
208 #endif
209   const __vector signed short __zero = {0};
210   __vector __bool short __select = vec_cmplt(vm1, __zero);
211   r = vec_packs((__vector unsigned short)vm1, (__vector unsigned short)vm1);
212   __vector __bool char packsel = vec_pack(__select, __select);
213   r = vec_sel(r, (const __vector unsigned char)__zero, packsel);
214   return (__m64)((__vector long long)r)[0];
215 }
216 
217 extern __inline __m64
218     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
219     _m_packuswb(__m64 __m1, __m64 __m2) {
220   return _mm_packs_pu16(__m1, __m2);
221 }
222 #endif /* end ARCH_PWR8 */
223 
224 /* Interleave the four 8-bit values from the high half of M1 with the four
225    8-bit values from the high half of M2.  */
226 extern __inline __m64
227     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
228     _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) {
229 #if _ARCH_PWR8
230   __vector unsigned char a, b, c;
231 
232   a = (__vector unsigned char)vec_splats(__m1);
233   b = (__vector unsigned char)vec_splats(__m2);
234   c = vec_mergel(a, b);
235   return (__m64)((__vector long long)c)[1];
236 #else
237   __m64_union m1, m2, res;
238 
239   m1.as_m64 = __m1;
240   m2.as_m64 = __m2;
241 
242   res.as_char[0] = m1.as_char[4];
243   res.as_char[1] = m2.as_char[4];
244   res.as_char[2] = m1.as_char[5];
245   res.as_char[3] = m2.as_char[5];
246   res.as_char[4] = m1.as_char[6];
247   res.as_char[5] = m2.as_char[6];
248   res.as_char[6] = m1.as_char[7];
249   res.as_char[7] = m2.as_char[7];
250 
251   return (__m64)res.as_m64;
252 #endif
253 }
254 
255 extern __inline __m64
256     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
257     _m_punpckhbw(__m64 __m1, __m64 __m2) {
258   return _mm_unpackhi_pi8(__m1, __m2);
259 }
260 
261 /* Interleave the two 16-bit values from the high half of M1 with the two
262    16-bit values from the high half of M2.  */
263 extern __inline __m64
264     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
265     _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) {
266   __m64_union m1, m2, res;
267 
268   m1.as_m64 = __m1;
269   m2.as_m64 = __m2;
270 
271   res.as_short[0] = m1.as_short[2];
272   res.as_short[1] = m2.as_short[2];
273   res.as_short[2] = m1.as_short[3];
274   res.as_short[3] = m2.as_short[3];
275 
276   return (__m64)res.as_m64;
277 }
278 
279 extern __inline __m64
280     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
281     _m_punpckhwd(__m64 __m1, __m64 __m2) {
282   return _mm_unpackhi_pi16(__m1, __m2);
283 }
284 /* Interleave the 32-bit value from the high half of M1 with the 32-bit
285    value from the high half of M2.  */
286 extern __inline __m64
287     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
288     _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) {
289   __m64_union m1, m2, res;
290 
291   m1.as_m64 = __m1;
292   m2.as_m64 = __m2;
293 
294   res.as_int[0] = m1.as_int[1];
295   res.as_int[1] = m2.as_int[1];
296 
297   return (__m64)res.as_m64;
298 }
299 
300 extern __inline __m64
301     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
302     _m_punpckhdq(__m64 __m1, __m64 __m2) {
303   return _mm_unpackhi_pi32(__m1, __m2);
304 }
305 /* Interleave the four 8-bit values from the low half of M1 with the four
306    8-bit values from the low half of M2.  */
307 extern __inline __m64
308     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
309     _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) {
310 #if _ARCH_PWR8
311   __vector unsigned char a, b, c;
312 
313   a = (__vector unsigned char)vec_splats(__m1);
314   b = (__vector unsigned char)vec_splats(__m2);
315   c = vec_mergel(a, b);
316   return (__m64)((__vector long long)c)[0];
317 #else
318   __m64_union m1, m2, res;
319 
320   m1.as_m64 = __m1;
321   m2.as_m64 = __m2;
322 
323   res.as_char[0] = m1.as_char[0];
324   res.as_char[1] = m2.as_char[0];
325   res.as_char[2] = m1.as_char[1];
326   res.as_char[3] = m2.as_char[1];
327   res.as_char[4] = m1.as_char[2];
328   res.as_char[5] = m2.as_char[2];
329   res.as_char[6] = m1.as_char[3];
330   res.as_char[7] = m2.as_char[3];
331 
332   return (__m64)res.as_m64;
333 #endif
334 }
335 
336 extern __inline __m64
337     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
338     _m_punpcklbw(__m64 __m1, __m64 __m2) {
339   return _mm_unpacklo_pi8(__m1, __m2);
340 }
341 /* Interleave the two 16-bit values from the low half of M1 with the two
342    16-bit values from the low half of M2.  */
343 extern __inline __m64
344     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
345     _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) {
346   __m64_union m1, m2, res;
347 
348   m1.as_m64 = __m1;
349   m2.as_m64 = __m2;
350 
351   res.as_short[0] = m1.as_short[0];
352   res.as_short[1] = m2.as_short[0];
353   res.as_short[2] = m1.as_short[1];
354   res.as_short[3] = m2.as_short[1];
355 
356   return (__m64)res.as_m64;
357 }
358 
359 extern __inline __m64
360     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
361     _m_punpcklwd(__m64 __m1, __m64 __m2) {
362   return _mm_unpacklo_pi16(__m1, __m2);
363 }
364 
365 /* Interleave the 32-bit value from the low half of M1 with the 32-bit
366    value from the low half of M2.  */
367 extern __inline __m64
368     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
369     _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) {
370   __m64_union m1, m2, res;
371 
372   m1.as_m64 = __m1;
373   m2.as_m64 = __m2;
374 
375   res.as_int[0] = m1.as_int[0];
376   res.as_int[1] = m2.as_int[0];
377 
378   return (__m64)res.as_m64;
379 }
380 
381 extern __inline __m64
382     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
383     _m_punpckldq(__m64 __m1, __m64 __m2) {
384   return _mm_unpacklo_pi32(__m1, __m2);
385 }
386 
387 /* Add the 8-bit values in M1 to the 8-bit values in M2.  */
388 extern __inline __m64
389     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
390     _mm_add_pi8(__m64 __m1, __m64 __m2) {
391 #if _ARCH_PWR8
392   __vector signed char a, b, c;
393 
394   a = (__vector signed char)vec_splats(__m1);
395   b = (__vector signed char)vec_splats(__m2);
396   c = vec_add(a, b);
397   return (__m64)((__vector long long)c)[0];
398 #else
399   __m64_union m1, m2, res;
400 
401   m1.as_m64 = __m1;
402   m2.as_m64 = __m2;
403 
404   res.as_char[0] = m1.as_char[0] + m2.as_char[0];
405   res.as_char[1] = m1.as_char[1] + m2.as_char[1];
406   res.as_char[2] = m1.as_char[2] + m2.as_char[2];
407   res.as_char[3] = m1.as_char[3] + m2.as_char[3];
408   res.as_char[4] = m1.as_char[4] + m2.as_char[4];
409   res.as_char[5] = m1.as_char[5] + m2.as_char[5];
410   res.as_char[6] = m1.as_char[6] + m2.as_char[6];
411   res.as_char[7] = m1.as_char[7] + m2.as_char[7];
412 
413   return (__m64)res.as_m64;
414 #endif
415 }
416 
417 extern __inline __m64
418     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
419     _m_paddb(__m64 __m1, __m64 __m2) {
420   return _mm_add_pi8(__m1, __m2);
421 }
422 
423 /* Add the 16-bit values in M1 to the 16-bit values in M2.  */
424 extern __inline __m64
425     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
426     _mm_add_pi16(__m64 __m1, __m64 __m2) {
427 #if _ARCH_PWR8
428   __vector signed short a, b, c;
429 
430   a = (__vector signed short)vec_splats(__m1);
431   b = (__vector signed short)vec_splats(__m2);
432   c = vec_add(a, b);
433   return (__m64)((__vector long long)c)[0];
434 #else
435   __m64_union m1, m2, res;
436 
437   m1.as_m64 = __m1;
438   m2.as_m64 = __m2;
439 
440   res.as_short[0] = m1.as_short[0] + m2.as_short[0];
441   res.as_short[1] = m1.as_short[1] + m2.as_short[1];
442   res.as_short[2] = m1.as_short[2] + m2.as_short[2];
443   res.as_short[3] = m1.as_short[3] + m2.as_short[3];
444 
445   return (__m64)res.as_m64;
446 #endif
447 }
448 
449 extern __inline __m64
450     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
451     _m_paddw(__m64 __m1, __m64 __m2) {
452   return _mm_add_pi16(__m1, __m2);
453 }
454 
455 /* Add the 32-bit values in M1 to the 32-bit values in M2.  */
456 extern __inline __m64
457     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
458     _mm_add_pi32(__m64 __m1, __m64 __m2) {
459 #if _ARCH_PWR9
460   __vector signed int a, b, c;
461 
462   a = (__vector signed int)vec_splats(__m1);
463   b = (__vector signed int)vec_splats(__m2);
464   c = vec_add(a, b);
465   return (__m64)((__vector long long)c)[0];
466 #else
467   __m64_union m1, m2, res;
468 
469   m1.as_m64 = __m1;
470   m2.as_m64 = __m2;
471 
472   res.as_int[0] = m1.as_int[0] + m2.as_int[0];
473   res.as_int[1] = m1.as_int[1] + m2.as_int[1];
474 
475   return (__m64)res.as_m64;
476 #endif
477 }
478 
479 extern __inline __m64
480     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
481     _m_paddd(__m64 __m1, __m64 __m2) {
482   return _mm_add_pi32(__m1, __m2);
483 }
484 
485 /* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
486 extern __inline __m64
487     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
488     _mm_sub_pi8(__m64 __m1, __m64 __m2) {
489 #if _ARCH_PWR8
490   __vector signed char a, b, c;
491 
492   a = (__vector signed char)vec_splats(__m1);
493   b = (__vector signed char)vec_splats(__m2);
494   c = vec_sub(a, b);
495   return (__m64)((__vector long long)c)[0];
496 #else
497   __m64_union m1, m2, res;
498 
499   m1.as_m64 = __m1;
500   m2.as_m64 = __m2;
501 
502   res.as_char[0] = m1.as_char[0] - m2.as_char[0];
503   res.as_char[1] = m1.as_char[1] - m2.as_char[1];
504   res.as_char[2] = m1.as_char[2] - m2.as_char[2];
505   res.as_char[3] = m1.as_char[3] - m2.as_char[3];
506   res.as_char[4] = m1.as_char[4] - m2.as_char[4];
507   res.as_char[5] = m1.as_char[5] - m2.as_char[5];
508   res.as_char[6] = m1.as_char[6] - m2.as_char[6];
509   res.as_char[7] = m1.as_char[7] - m2.as_char[7];
510 
511   return (__m64)res.as_m64;
512 #endif
513 }
514 
515 extern __inline __m64
516     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
517     _m_psubb(__m64 __m1, __m64 __m2) {
518   return _mm_sub_pi8(__m1, __m2);
519 }
520 
521 /* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
522 extern __inline __m64
523     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
524     _mm_sub_pi16(__m64 __m1, __m64 __m2) {
525 #if _ARCH_PWR8
526   __vector signed short a, b, c;
527 
528   a = (__vector signed short)vec_splats(__m1);
529   b = (__vector signed short)vec_splats(__m2);
530   c = vec_sub(a, b);
531   return (__m64)((__vector long long)c)[0];
532 #else
533   __m64_union m1, m2, res;
534 
535   m1.as_m64 = __m1;
536   m2.as_m64 = __m2;
537 
538   res.as_short[0] = m1.as_short[0] - m2.as_short[0];
539   res.as_short[1] = m1.as_short[1] - m2.as_short[1];
540   res.as_short[2] = m1.as_short[2] - m2.as_short[2];
541   res.as_short[3] = m1.as_short[3] - m2.as_short[3];
542 
543   return (__m64)res.as_m64;
544 #endif
545 }
546 
547 extern __inline __m64
548     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
549     _m_psubw(__m64 __m1, __m64 __m2) {
550   return _mm_sub_pi16(__m1, __m2);
551 }
552 
553 /* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
554 extern __inline __m64
555     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
556     _mm_sub_pi32(__m64 __m1, __m64 __m2) {
557 #if _ARCH_PWR9
558   __vector signed int a, b, c;
559 
560   a = (__vector signed int)vec_splats(__m1);
561   b = (__vector signed int)vec_splats(__m2);
562   c = vec_sub(a, b);
563   return (__m64)((__vector long long)c)[0];
564 #else
565   __m64_union m1, m2, res;
566 
567   m1.as_m64 = __m1;
568   m2.as_m64 = __m2;
569 
570   res.as_int[0] = m1.as_int[0] - m2.as_int[0];
571   res.as_int[1] = m1.as_int[1] - m2.as_int[1];
572 
573   return (__m64)res.as_m64;
574 #endif
575 }
576 
577 extern __inline __m64
578     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
579     _m_psubd(__m64 __m1, __m64 __m2) {
580   return _mm_sub_pi32(__m1, __m2);
581 }
582 
583 extern __inline __m64
584     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
585     _mm_add_si64(__m64 __m1, __m64 __m2) {
586   return (__m1 + __m2);
587 }
588 
589 extern __inline __m64
590     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
591     _mm_sub_si64(__m64 __m1, __m64 __m2) {
592   return (__m1 - __m2);
593 }
594 
595 /* Shift the 64-bit value in M left by COUNT.  */
596 extern __inline __m64
597     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
598     _mm_sll_si64(__m64 __m, __m64 __count) {
599   return (__m << __count);
600 }
601 
602 extern __inline __m64
603     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
604     _m_psllq(__m64 __m, __m64 __count) {
605   return _mm_sll_si64(__m, __count);
606 }
607 
608 extern __inline __m64
609     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
610     _mm_slli_si64(__m64 __m, const int __count) {
611   return (__m << __count);
612 }
613 
614 extern __inline __m64
615     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
616     _m_psllqi(__m64 __m, const int __count) {
617   return _mm_slli_si64(__m, __count);
618 }
619 
620 /* Shift the 64-bit value in M left by COUNT; shift in zeros.  */
621 extern __inline __m64
622     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
623     _mm_srl_si64(__m64 __m, __m64 __count) {
624   return (__m >> __count);
625 }
626 
627 extern __inline __m64
628     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
629     _m_psrlq(__m64 __m, __m64 __count) {
630   return _mm_srl_si64(__m, __count);
631 }
632 
633 extern __inline __m64
634     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
635     _mm_srli_si64(__m64 __m, const int __count) {
636   return (__m >> __count);
637 }
638 
639 extern __inline __m64
640     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
641     _m_psrlqi(__m64 __m, const int __count) {
642   return _mm_srli_si64(__m, __count);
643 }
644 
645 /* Bit-wise AND the 64-bit values in M1 and M2.  */
646 extern __inline __m64
647     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
648     _mm_and_si64(__m64 __m1, __m64 __m2) {
649   return (__m1 & __m2);
650 }
651 
652 extern __inline __m64
653     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
654     _m_pand(__m64 __m1, __m64 __m2) {
655   return _mm_and_si64(__m1, __m2);
656 }
657 
658 /* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
659    64-bit value in M2.  */
660 extern __inline __m64
661     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
662     _mm_andnot_si64(__m64 __m1, __m64 __m2) {
663   return (~__m1 & __m2);
664 }
665 
666 extern __inline __m64
667     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
668     _m_pandn(__m64 __m1, __m64 __m2) {
669   return _mm_andnot_si64(__m1, __m2);
670 }
671 
672 /* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
673 extern __inline __m64
674     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
675     _mm_or_si64(__m64 __m1, __m64 __m2) {
676   return (__m1 | __m2);
677 }
678 
679 extern __inline __m64
680     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
681     _m_por(__m64 __m1, __m64 __m2) {
682   return _mm_or_si64(__m1, __m2);
683 }
684 
685 /* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
686 extern __inline __m64
687     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
688     _mm_xor_si64(__m64 __m1, __m64 __m2) {
689   return (__m1 ^ __m2);
690 }
691 
692 extern __inline __m64
693     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
694     _m_pxor(__m64 __m1, __m64 __m2) {
695   return _mm_xor_si64(__m1, __m2);
696 }
697 
698 /* Creates a 64-bit zero.  */
699 extern __inline __m64
700     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
701     _mm_setzero_si64(void) {
702   return (__m64)0;
703 }
704 
705 /* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
706    test is true and zero if false.  */
707 extern __inline __m64
708     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
709     _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) {
710 #if defined(_ARCH_PWR6) && defined(__powerpc64__)
711   __m64 res;
712   __asm__("cmpb %0,%1,%2;\n" : "=r"(res) : "r"(__m1), "r"(__m2) :);
713   return (res);
714 #else
715   __m64_union m1, m2, res;
716 
717   m1.as_m64 = __m1;
718   m2.as_m64 = __m2;
719 
720   res.as_char[0] = (m1.as_char[0] == m2.as_char[0]) ? -1 : 0;
721   res.as_char[1] = (m1.as_char[1] == m2.as_char[1]) ? -1 : 0;
722   res.as_char[2] = (m1.as_char[2] == m2.as_char[2]) ? -1 : 0;
723   res.as_char[3] = (m1.as_char[3] == m2.as_char[3]) ? -1 : 0;
724   res.as_char[4] = (m1.as_char[4] == m2.as_char[4]) ? -1 : 0;
725   res.as_char[5] = (m1.as_char[5] == m2.as_char[5]) ? -1 : 0;
726   res.as_char[6] = (m1.as_char[6] == m2.as_char[6]) ? -1 : 0;
727   res.as_char[7] = (m1.as_char[7] == m2.as_char[7]) ? -1 : 0;
728 
729   return (__m64)res.as_m64;
730 #endif
731 }
732 
733 extern __inline __m64
734     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
735     _m_pcmpeqb(__m64 __m1, __m64 __m2) {
736   return _mm_cmpeq_pi8(__m1, __m2);
737 }
738 
739 extern __inline __m64
740     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
741     _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) {
742 #if _ARCH_PWR8
743   __vector signed char a, b, c;
744 
745   a = (__vector signed char)vec_splats(__m1);
746   b = (__vector signed char)vec_splats(__m2);
747   c = (__vector signed char)vec_cmpgt(a, b);
748   return (__m64)((__vector long long)c)[0];
749 #else
750   __m64_union m1, m2, res;
751 
752   m1.as_m64 = __m1;
753   m2.as_m64 = __m2;
754 
755   res.as_char[0] = (m1.as_char[0] > m2.as_char[0]) ? -1 : 0;
756   res.as_char[1] = (m1.as_char[1] > m2.as_char[1]) ? -1 : 0;
757   res.as_char[2] = (m1.as_char[2] > m2.as_char[2]) ? -1 : 0;
758   res.as_char[3] = (m1.as_char[3] > m2.as_char[3]) ? -1 : 0;
759   res.as_char[4] = (m1.as_char[4] > m2.as_char[4]) ? -1 : 0;
760   res.as_char[5] = (m1.as_char[5] > m2.as_char[5]) ? -1 : 0;
761   res.as_char[6] = (m1.as_char[6] > m2.as_char[6]) ? -1 : 0;
762   res.as_char[7] = (m1.as_char[7] > m2.as_char[7]) ? -1 : 0;
763 
764   return (__m64)res.as_m64;
765 #endif
766 }
767 
768 extern __inline __m64
769     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
770     _m_pcmpgtb(__m64 __m1, __m64 __m2) {
771   return _mm_cmpgt_pi8(__m1, __m2);
772 }
773 
774 /* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
775    the test is true and zero if false.  */
776 extern __inline __m64
777     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
778     _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) {
779 #if _ARCH_PWR8
780   __vector signed short a, b, c;
781 
782   a = (__vector signed short)vec_splats(__m1);
783   b = (__vector signed short)vec_splats(__m2);
784   c = (__vector signed short)vec_cmpeq(a, b);
785   return (__m64)((__vector long long)c)[0];
786 #else
787   __m64_union m1, m2, res;
788 
789   m1.as_m64 = __m1;
790   m2.as_m64 = __m2;
791 
792   res.as_short[0] = (m1.as_short[0] == m2.as_short[0]) ? -1 : 0;
793   res.as_short[1] = (m1.as_short[1] == m2.as_short[1]) ? -1 : 0;
794   res.as_short[2] = (m1.as_short[2] == m2.as_short[2]) ? -1 : 0;
795   res.as_short[3] = (m1.as_short[3] == m2.as_short[3]) ? -1 : 0;
796 
797   return (__m64)res.as_m64;
798 #endif
799 }
800 
801 extern __inline __m64
802     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
803     _m_pcmpeqw(__m64 __m1, __m64 __m2) {
804   return _mm_cmpeq_pi16(__m1, __m2);
805 }
806 
807 extern __inline __m64
808     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
809     _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) {
810 #if _ARCH_PWR8
811   __vector signed short a, b, c;
812 
813   a = (__vector signed short)vec_splats(__m1);
814   b = (__vector signed short)vec_splats(__m2);
815   c = (__vector signed short)vec_cmpgt(a, b);
816   return (__m64)((__vector long long)c)[0];
817 #else
818   __m64_union m1, m2, res;
819 
820   m1.as_m64 = __m1;
821   m2.as_m64 = __m2;
822 
823   res.as_short[0] = (m1.as_short[0] > m2.as_short[0]) ? -1 : 0;
824   res.as_short[1] = (m1.as_short[1] > m2.as_short[1]) ? -1 : 0;
825   res.as_short[2] = (m1.as_short[2] > m2.as_short[2]) ? -1 : 0;
826   res.as_short[3] = (m1.as_short[3] > m2.as_short[3]) ? -1 : 0;
827 
828   return (__m64)res.as_m64;
829 #endif
830 }
831 
832 extern __inline __m64
833     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
834     _m_pcmpgtw(__m64 __m1, __m64 __m2) {
835   return _mm_cmpgt_pi16(__m1, __m2);
836 }
837 
838 /* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
839    the test is true and zero if false.  */
840 extern __inline __m64
841     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
842     _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) {
843 #if _ARCH_PWR9
844   __vector signed int a, b, c;
845 
846   a = (__vector signed int)vec_splats(__m1);
847   b = (__vector signed int)vec_splats(__m2);
848   c = (__vector signed int)vec_cmpeq(a, b);
849   return (__m64)((__vector long long)c)[0];
850 #else
851   __m64_union m1, m2, res;
852 
853   m1.as_m64 = __m1;
854   m2.as_m64 = __m2;
855 
856   res.as_int[0] = (m1.as_int[0] == m2.as_int[0]) ? -1 : 0;
857   res.as_int[1] = (m1.as_int[1] == m2.as_int[1]) ? -1 : 0;
858 
859   return (__m64)res.as_m64;
860 #endif
861 }
862 
863 extern __inline __m64
864     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
865     _m_pcmpeqd(__m64 __m1, __m64 __m2) {
866   return _mm_cmpeq_pi32(__m1, __m2);
867 }
868 
869 extern __inline __m64
870     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
871     _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) {
872 #if _ARCH_PWR9
873   __vector signed int a, b, c;
874 
875   a = (__vector signed int)vec_splats(__m1);
876   b = (__vector signed int)vec_splats(__m2);
877   c = (__vector signed int)vec_cmpgt(a, b);
878   return (__m64)((__vector long long)c)[0];
879 #else
880   __m64_union m1, m2, res;
881 
882   m1.as_m64 = __m1;
883   m2.as_m64 = __m2;
884 
885   res.as_int[0] = (m1.as_int[0] > m2.as_int[0]) ? -1 : 0;
886   res.as_int[1] = (m1.as_int[1] > m2.as_int[1]) ? -1 : 0;
887 
888   return (__m64)res.as_m64;
889 #endif
890 }
891 
892 extern __inline __m64
893     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
894     _m_pcmpgtd(__m64 __m1, __m64 __m2) {
895   return _mm_cmpgt_pi32(__m1, __m2);
896 }
897 
898 #if _ARCH_PWR8
899 /* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
900    saturated arithmetic.  */
901 extern __inline __m64
902     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
903     _mm_adds_pi8(__m64 __m1, __m64 __m2) {
904   __vector signed char a, b, c;
905 
906   a = (__vector signed char)vec_splats(__m1);
907   b = (__vector signed char)vec_splats(__m2);
908   c = vec_adds(a, b);
909   return (__m64)((__vector long long)c)[0];
910 }
911 
912 extern __inline __m64
913     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
914     _m_paddsb(__m64 __m1, __m64 __m2) {
915   return _mm_adds_pi8(__m1, __m2);
916 }
917 /* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
918    saturated arithmetic.  */
919 extern __inline __m64
920     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
921     _mm_adds_pi16(__m64 __m1, __m64 __m2) {
922   __vector signed short a, b, c;
923 
924   a = (__vector signed short)vec_splats(__m1);
925   b = (__vector signed short)vec_splats(__m2);
926   c = vec_adds(a, b);
927   return (__m64)((__vector long long)c)[0];
928 }
929 
930 extern __inline __m64
931     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
932     _m_paddsw(__m64 __m1, __m64 __m2) {
933   return _mm_adds_pi16(__m1, __m2);
934 }
935 /* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
936    saturated arithmetic.  */
937 extern __inline __m64
938     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
939     _mm_adds_pu8(__m64 __m1, __m64 __m2) {
940   __vector unsigned char a, b, c;
941 
942   a = (__vector unsigned char)vec_splats(__m1);
943   b = (__vector unsigned char)vec_splats(__m2);
944   c = vec_adds(a, b);
945   return (__m64)((__vector long long)c)[0];
946 }
947 
948 extern __inline __m64
949     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
950     _m_paddusb(__m64 __m1, __m64 __m2) {
951   return _mm_adds_pu8(__m1, __m2);
952 }
953 
954 /* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
955    saturated arithmetic.  */
956 extern __inline __m64
957     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
958     _mm_adds_pu16(__m64 __m1, __m64 __m2) {
959   __vector unsigned short a, b, c;
960 
961   a = (__vector unsigned short)vec_splats(__m1);
962   b = (__vector unsigned short)vec_splats(__m2);
963   c = vec_adds(a, b);
964   return (__m64)((__vector long long)c)[0];
965 }
966 
967 extern __inline __m64
968     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
969     _m_paddusw(__m64 __m1, __m64 __m2) {
970   return _mm_adds_pu16(__m1, __m2);
971 }
972 
973 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
974    saturating arithmetic.  */
975 extern __inline __m64
976     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
977     _mm_subs_pi8(__m64 __m1, __m64 __m2) {
978   __vector signed char a, b, c;
979 
980   a = (__vector signed char)vec_splats(__m1);
981   b = (__vector signed char)vec_splats(__m2);
982   c = vec_subs(a, b);
983   return (__m64)((__vector long long)c)[0];
984 }
985 
986 extern __inline __m64
987     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
988     _m_psubsb(__m64 __m1, __m64 __m2) {
989   return _mm_subs_pi8(__m1, __m2);
990 }
991 
992 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
993    signed saturating arithmetic.  */
994 extern __inline __m64
995     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
996     _mm_subs_pi16(__m64 __m1, __m64 __m2) {
997   __vector signed short a, b, c;
998 
999   a = (__vector signed short)vec_splats(__m1);
1000   b = (__vector signed short)vec_splats(__m2);
1001   c = vec_subs(a, b);
1002   return (__m64)((__vector long long)c)[0];
1003 }
1004 
1005 extern __inline __m64
1006     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1007     _m_psubsw(__m64 __m1, __m64 __m2) {
1008   return _mm_subs_pi16(__m1, __m2);
1009 }
1010 
1011 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
1012    unsigned saturating arithmetic.  */
1013 extern __inline __m64
1014     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1015     _mm_subs_pu8(__m64 __m1, __m64 __m2) {
1016   __vector unsigned char a, b, c;
1017 
1018   a = (__vector unsigned char)vec_splats(__m1);
1019   b = (__vector unsigned char)vec_splats(__m2);
1020   c = vec_subs(a, b);
1021   return (__m64)((__vector long long)c)[0];
1022 }
1023 
1024 extern __inline __m64
1025     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1026     _m_psubusb(__m64 __m1, __m64 __m2) {
1027   return _mm_subs_pu8(__m1, __m2);
1028 }
1029 
1030 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
1031    unsigned saturating arithmetic.  */
1032 extern __inline __m64
1033     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1034     _mm_subs_pu16(__m64 __m1, __m64 __m2) {
1035   __vector unsigned short a, b, c;
1036 
1037   a = (__vector unsigned short)vec_splats(__m1);
1038   b = (__vector unsigned short)vec_splats(__m2);
1039   c = vec_subs(a, b);
1040   return (__m64)((__vector long long)c)[0];
1041 }
1042 
1043 extern __inline __m64
1044     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1045     _m_psubusw(__m64 __m1, __m64 __m2) {
1046   return _mm_subs_pu16(__m1, __m2);
1047 }
1048 
1049 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
1050    four 32-bit intermediate results, which are then summed by pairs to
1051    produce two 32-bit results.  */
1052 extern __inline __m64
1053     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1054     _mm_madd_pi16(__m64 __m1, __m64 __m2) {
1055   __vector signed short a, b;
1056   __vector signed int c;
1057   __vector signed int zero = {0, 0, 0, 0};
1058 
1059   a = (__vector signed short)vec_splats(__m1);
1060   b = (__vector signed short)vec_splats(__m2);
1061   c = vec_vmsumshm(a, b, zero);
1062   return (__m64)((__vector long long)c)[0];
1063 }
1064 
1065 extern __inline __m64
1066     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1067     _m_pmaddwd(__m64 __m1, __m64 __m2) {
1068   return _mm_madd_pi16(__m1, __m2);
1069 }
1070 /* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
1071    M2 and produce the high 16 bits of the 32-bit results.  */
1072 extern __inline __m64
1073     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1074     _mm_mulhi_pi16(__m64 __m1, __m64 __m2) {
1075   __vector signed short a, b;
1076   __vector signed short c;
1077   __vector signed int w0, w1;
1078   __vector unsigned char xform1 = {
1079 #ifdef __LITTLE_ENDIAN__
1080       0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
1081       0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1082 #else
1083       0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
1084       0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
1085 #endif
1086   };
1087 
1088   a = (__vector signed short)vec_splats(__m1);
1089   b = (__vector signed short)vec_splats(__m2);
1090 
1091   w0 = vec_vmulesh(a, b);
1092   w1 = vec_vmulosh(a, b);
1093   c = (__vector signed short)vec_perm(w0, w1, xform1);
1094 
1095   return (__m64)((__vector long long)c)[0];
1096 }
1097 
1098 extern __inline __m64
1099     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1100     _m_pmulhw(__m64 __m1, __m64 __m2) {
1101   return _mm_mulhi_pi16(__m1, __m2);
1102 }
1103 
1104 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
1105    the low 16 bits of the results.  */
1106 extern __inline __m64
1107     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1108     _mm_mullo_pi16(__m64 __m1, __m64 __m2) {
1109   __vector signed short a, b, c;
1110 
1111   a = (__vector signed short)vec_splats(__m1);
1112   b = (__vector signed short)vec_splats(__m2);
1113   c = a * b;
1114   return (__m64)((__vector long long)c)[0];
1115 }
1116 
1117 extern __inline __m64
1118     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1119     _m_pmullw(__m64 __m1, __m64 __m2) {
1120   return _mm_mullo_pi16(__m1, __m2);
1121 }
1122 
1123 /* Shift four 16-bit values in M left by COUNT.  */
1124 extern __inline __m64
1125     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1126     _mm_sll_pi16(__m64 __m, __m64 __count) {
1127   __vector signed short m, r;
1128   __vector unsigned short c;
1129 
1130   if (__count <= 15) {
1131     m = (__vector signed short)vec_splats(__m);
1132     c = (__vector unsigned short)vec_splats((unsigned short)__count);
1133     r = vec_sl(m, (__vector unsigned short)c);
1134     return (__m64)((__vector long long)r)[0];
1135   } else
1136     return (0);
1137 }
1138 
1139 extern __inline __m64
1140     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1141     _m_psllw(__m64 __m, __m64 __count) {
1142   return _mm_sll_pi16(__m, __count);
1143 }
1144 
1145 extern __inline __m64
1146     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1147     _mm_slli_pi16(__m64 __m, int __count) {
1148   /* Promote int to long then invoke mm_sll_pi16.  */
1149   return _mm_sll_pi16(__m, __count);
1150 }
1151 
1152 extern __inline __m64
1153     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1154     _m_psllwi(__m64 __m, int __count) {
1155   return _mm_slli_pi16(__m, __count);
1156 }
1157 
1158 /* Shift two 32-bit values in M left by COUNT.  */
1159 extern __inline __m64
1160     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1161     _mm_sll_pi32(__m64 __m, __m64 __count) {
1162   __m64_union m, res;
1163 
1164   m.as_m64 = __m;
1165 
1166   res.as_int[0] = m.as_int[0] << __count;
1167   res.as_int[1] = m.as_int[1] << __count;
1168   return (res.as_m64);
1169 }
1170 
1171 extern __inline __m64
1172     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1173     _m_pslld(__m64 __m, __m64 __count) {
1174   return _mm_sll_pi32(__m, __count);
1175 }
1176 
1177 extern __inline __m64
1178     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1179     _mm_slli_pi32(__m64 __m, int __count) {
1180   /* Promote int to long then invoke mm_sll_pi32.  */
1181   return _mm_sll_pi32(__m, __count);
1182 }
1183 
1184 extern __inline __m64
1185     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1186     _m_pslldi(__m64 __m, int __count) {
1187   return _mm_slli_pi32(__m, __count);
1188 }
1189 
1190 /* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
1191 extern __inline __m64
1192     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1193     _mm_sra_pi16(__m64 __m, __m64 __count) {
1194   __vector signed short m, r;
1195   __vector unsigned short c;
1196 
1197   if (__count <= 15) {
1198     m = (__vector signed short)vec_splats(__m);
1199     c = (__vector unsigned short)vec_splats((unsigned short)__count);
1200     r = vec_sra(m, (__vector unsigned short)c);
1201     return (__m64)((__vector long long)r)[0];
1202   } else
1203     return (0);
1204 }
1205 
1206 extern __inline __m64
1207     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1208     _m_psraw(__m64 __m, __m64 __count) {
1209   return _mm_sra_pi16(__m, __count);
1210 }
1211 
1212 extern __inline __m64
1213     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1214     _mm_srai_pi16(__m64 __m, int __count) {
1215   /* Promote int to long then invoke mm_sra_pi32.  */
1216   return _mm_sra_pi16(__m, __count);
1217 }
1218 
1219 extern __inline __m64
1220     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1221     _m_psrawi(__m64 __m, int __count) {
1222   return _mm_srai_pi16(__m, __count);
1223 }
1224 
1225 /* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
1226 extern __inline __m64
1227     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1228     _mm_sra_pi32(__m64 __m, __m64 __count) {
1229   __m64_union m, res;
1230 
1231   m.as_m64 = __m;
1232 
1233   res.as_int[0] = m.as_int[0] >> __count;
1234   res.as_int[1] = m.as_int[1] >> __count;
1235   return (res.as_m64);
1236 }
1237 
1238 extern __inline __m64
1239     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1240     _m_psrad(__m64 __m, __m64 __count) {
1241   return _mm_sra_pi32(__m, __count);
1242 }
1243 
1244 extern __inline __m64
1245     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1246     _mm_srai_pi32(__m64 __m, int __count) {
1247   /* Promote int to long then invoke mm_sra_pi32.  */
1248   return _mm_sra_pi32(__m, __count);
1249 }
1250 
1251 extern __inline __m64
1252     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1253     _m_psradi(__m64 __m, int __count) {
1254   return _mm_srai_pi32(__m, __count);
1255 }
1256 
1257 /* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
1258 extern __inline __m64
1259     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1260     _mm_srl_pi16(__m64 __m, __m64 __count) {
1261   __vector unsigned short m, r;
1262   __vector unsigned short c;
1263 
1264   if (__count <= 15) {
1265     m = (__vector unsigned short)vec_splats(__m);
1266     c = (__vector unsigned short)vec_splats((unsigned short)__count);
1267     r = vec_sr(m, (__vector unsigned short)c);
1268     return (__m64)((__vector long long)r)[0];
1269   } else
1270     return (0);
1271 }
1272 
1273 extern __inline __m64
1274     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1275     _m_psrlw(__m64 __m, __m64 __count) {
1276   return _mm_srl_pi16(__m, __count);
1277 }
1278 
1279 extern __inline __m64
1280     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1281     _mm_srli_pi16(__m64 __m, int __count) {
1282   /* Promote int to long then invoke mm_sra_pi32.  */
1283   return _mm_srl_pi16(__m, __count);
1284 }
1285 
1286 extern __inline __m64
1287     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1288     _m_psrlwi(__m64 __m, int __count) {
1289   return _mm_srli_pi16(__m, __count);
1290 }
1291 
1292 /* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
1293 extern __inline __m64
1294     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1295     _mm_srl_pi32(__m64 __m, __m64 __count) {
1296   __m64_union m, res;
1297 
1298   m.as_m64 = __m;
1299 
1300   res.as_int[0] = (unsigned int)m.as_int[0] >> __count;
1301   res.as_int[1] = (unsigned int)m.as_int[1] >> __count;
1302   return (res.as_m64);
1303 }
1304 
1305 extern __inline __m64
1306     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1307     _m_psrld(__m64 __m, __m64 __count) {
1308   return _mm_srl_pi32(__m, __count);
1309 }
1310 
1311 extern __inline __m64
1312     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1313     _mm_srli_pi32(__m64 __m, int __count) {
1314   /* Promote int to long then invoke mm_srl_pi32.  */
1315   return _mm_srl_pi32(__m, __count);
1316 }
1317 
1318 extern __inline __m64
1319     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1320     _m_psrldi(__m64 __m, int __count) {
1321   return _mm_srli_pi32(__m, __count);
1322 }
1323 #endif /* _ARCH_PWR8 */
1324 
1325 /* Creates a vector of two 32-bit values; I0 is least significant.  */
1326 extern __inline __m64
1327     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1328     _mm_set_pi32(int __i1, int __i0) {
1329   __m64_union res;
1330 
1331   res.as_int[0] = __i0;
1332   res.as_int[1] = __i1;
1333   return (res.as_m64);
1334 }
1335 
1336 /* Creates a vector of four 16-bit values; W0 is least significant.  */
1337 extern __inline __m64
1338     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1339     _mm_set_pi16(short __w3, short __w2, short __w1, short __w0) {
1340   __m64_union res;
1341 
1342   res.as_short[0] = __w0;
1343   res.as_short[1] = __w1;
1344   res.as_short[2] = __w2;
1345   res.as_short[3] = __w3;
1346   return (res.as_m64);
1347 }
1348 
1349 /* Creates a vector of eight 8-bit values; B0 is least significant.  */
1350 extern __inline __m64
1351     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1352     _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3,
1353                 char __b2, char __b1, char __b0) {
1354   __m64_union res;
1355 
1356   res.as_char[0] = __b0;
1357   res.as_char[1] = __b1;
1358   res.as_char[2] = __b2;
1359   res.as_char[3] = __b3;
1360   res.as_char[4] = __b4;
1361   res.as_char[5] = __b5;
1362   res.as_char[6] = __b6;
1363   res.as_char[7] = __b7;
1364   return (res.as_m64);
1365 }
1366 
1367 /* Similar, but with the arguments in reverse order.  */
1368 extern __inline __m64
1369     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1370     _mm_setr_pi32(int __i0, int __i1) {
1371   __m64_union res;
1372 
1373   res.as_int[0] = __i0;
1374   res.as_int[1] = __i1;
1375   return (res.as_m64);
1376 }
1377 
1378 extern __inline __m64
1379     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1380     _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) {
1381   return _mm_set_pi16(__w3, __w2, __w1, __w0);
1382 }
1383 
1384 extern __inline __m64
1385     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1386     _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4,
1387                  char __b5, char __b6, char __b7) {
1388   return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
1389 }
1390 
1391 /* Creates a vector of two 32-bit values, both elements containing I.  */
1392 extern __inline __m64
1393     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1394     _mm_set1_pi32(int __i) {
1395   __m64_union res;
1396 
1397   res.as_int[0] = __i;
1398   res.as_int[1] = __i;
1399   return (res.as_m64);
1400 }
1401 
1402 /* Creates a vector of four 16-bit values, all elements containing W.  */
1403 extern __inline __m64
1404     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1405     _mm_set1_pi16(short __w) {
1406 #if _ARCH_PWR9
1407   __vector signed short w;
1408 
1409   w = (__vector signed short)vec_splats(__w);
1410   return (__m64)((__vector long long)w)[0];
1411 #else
1412   __m64_union res;
1413 
1414   res.as_short[0] = __w;
1415   res.as_short[1] = __w;
1416   res.as_short[2] = __w;
1417   res.as_short[3] = __w;
1418   return (res.as_m64);
1419 #endif
1420 }
1421 
1422 /* Creates a vector of eight 8-bit values, all elements containing B.  */
1423 extern __inline __m64
1424     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1425     _mm_set1_pi8(signed char __b) {
1426 #if _ARCH_PWR8
1427   __vector signed char b;
1428 
1429   b = (__vector signed char)vec_splats(__b);
1430   return (__m64)((__vector long long)b)[0];
1431 #else
1432   __m64_union res;
1433 
1434   res.as_char[0] = __b;
1435   res.as_char[1] = __b;
1436   res.as_char[2] = __b;
1437   res.as_char[3] = __b;
1438   res.as_char[4] = __b;
1439   res.as_char[5] = __b;
1440   res.as_char[6] = __b;
1441   res.as_char[7] = __b;
1442   return (res.as_m64);
1443 #endif
1444 }
1445 
1446 #else
1447 #include_next <mmintrin.h>
1448 #endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))   \
1449         */
1450 
1451 #endif /* _MMINTRIN_H_INCLUDED */
1452