xref: /freebsd/contrib/llvm-project/clang/lib/Headers/ppc_wrappers/smmintrin.h (revision 8f7ed58a15556bf567ff876e1999e4fe4d684e1d)
1 /*===---- smmintrin.h - Implementation of SSE4 intrinsics on PowerPC -------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 /* Implemented from the specification included in the Intel C++ Compiler
11    User Guide and Reference, version 9.0.
12 
13    NOTE: This is NOT a complete implementation of the SSE4 intrinsics!  */
14 
15 #ifndef NO_WARN_X86_INTRINSICS
16 /* This header is distributed to simplify porting x86_64 code that
17    makes explicit use of Intel intrinsics to powerpc64/powerpc64le.
18 
19    It is the user's responsibility to determine if the results are
20    acceptable and make additional changes as necessary.
21 
22    Note that much code that uses Intel intrinsics can be rewritten in
23    standard C or GNU C extensions, which are more portable and better
24    optimized across multiple targets.  */
25 #error                                                                         \
26     "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
27 #endif
28 
29 #ifndef SMMINTRIN_H_
30 #define SMMINTRIN_H_
31 
32 #if defined(__powerpc64__) &&                                                  \
33     (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
34 
35 #include <altivec.h>
36 #include <tmmintrin.h>
37 
/* Rounding-control values accepted by the _mm_round_* functions.  */

/* Rounding direction.  */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_ZERO 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_NEG_INF 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04

/* Exception behaviour, OR'ed onto a rounding direction.  */
#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NO_EXC 0x08

/* Named combinations of a direction and an exception behaviour.  */
#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
54 
55 extern __inline __m128d
56     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
57     _mm_round_pd(__m128d __A, int __rounding) {
58   __v2df __r;
59   union {
60     double __fr;
61     long long __fpscr;
62   } __enables_save, __fpscr_save;
63 
64   if (__rounding & _MM_FROUND_NO_EXC) {
65     /* Save enabled exceptions, disable all exceptions,
66        and preserve the rounding mode.  */
67 #ifdef _ARCH_PWR9
68     __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
69     __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
70 #else
71     __fpscr_save.__fr = __builtin_mffs();
72     __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
73     __fpscr_save.__fpscr &= ~0xf8;
74     __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
75 #endif
76     /* Insert an artificial "read/write" reference to the variable
77        read below, to ensure the compiler does not schedule
78        a read/use of the variable before the FPSCR is modified, above.
79        This can be removed if and when GCC PR102783 is fixed.
80      */
81     __asm__("" : "+wa"(__A));
82   }
83 
84   switch (__rounding) {
85   case _MM_FROUND_TO_NEAREST_INT:
86     __fpscr_save.__fr = __builtin_mffsl();
87     __attribute__((fallthrough));
88   case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
89     __builtin_set_fpscr_rn(0b00);
90     /* Insert an artificial "read/write" reference to the variable
91        read below, to ensure the compiler does not schedule
92        a read/use of the variable before the FPSCR is modified, above.
93        This can be removed if and when GCC PR102783 is fixed.
94      */
95     __asm__("" : "+wa"(__A));
96 
97     __r = vec_rint((__v2df)__A);
98 
99     /* Insert an artificial "read" reference to the variable written
100        above, to ensure the compiler does not schedule the computation
101        of the value after the manipulation of the FPSCR, below.
102        This can be removed if and when GCC PR102783 is fixed.
103      */
104     __asm__("" : : "wa"(__r));
105     __builtin_set_fpscr_rn(__fpscr_save.__fpscr);
106     break;
107   case _MM_FROUND_TO_NEG_INF:
108   case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
109     __r = vec_floor((__v2df)__A);
110     break;
111   case _MM_FROUND_TO_POS_INF:
112   case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
113     __r = vec_ceil((__v2df)__A);
114     break;
115   case _MM_FROUND_TO_ZERO:
116   case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
117     __r = vec_trunc((__v2df)__A);
118     break;
119   case _MM_FROUND_CUR_DIRECTION:
120     __r = vec_rint((__v2df)__A);
121     break;
122   }
123   if (__rounding & _MM_FROUND_NO_EXC) {
124     /* Insert an artificial "read" reference to the variable written
125        above, to ensure the compiler does not schedule the computation
126        of the value after the manipulation of the FPSCR, below.
127        This can be removed if and when GCC PR102783 is fixed.
128      */
129     __asm__("" : : "wa"(__r));
130     /* Restore enabled exceptions.  */
131     __fpscr_save.__fr = __builtin_mffsl();
132     __fpscr_save.__fpscr |= __enables_save.__fpscr;
133     __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
134   }
135   return (__m128d)__r;
136 }
137 
138 extern __inline __m128d
139     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
140     _mm_round_sd(__m128d __A, __m128d __B, int __rounding) {
141   __B = _mm_round_pd(__B, __rounding);
142   __v2df __r = {((__v2df)__B)[0], ((__v2df)__A)[1]};
143   return (__m128d)__r;
144 }
145 
146 extern __inline __m128
147     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
148     _mm_round_ps(__m128 __A, int __rounding) {
149   __v4sf __r;
150   union {
151     double __fr;
152     long long __fpscr;
153   } __enables_save, __fpscr_save;
154 
155   if (__rounding & _MM_FROUND_NO_EXC) {
156     /* Save enabled exceptions, disable all exceptions,
157        and preserve the rounding mode.  */
158 #ifdef _ARCH_PWR9
159     __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
160     __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
161 #else
162     __fpscr_save.__fr = __builtin_mffs();
163     __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
164     __fpscr_save.__fpscr &= ~0xf8;
165     __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
166 #endif
167     /* Insert an artificial "read/write" reference to the variable
168        read below, to ensure the compiler does not schedule
169        a read/use of the variable before the FPSCR is modified, above.
170        This can be removed if and when GCC PR102783 is fixed.
171      */
172     __asm__("" : "+wa"(__A));
173   }
174 
175   switch (__rounding) {
176   case _MM_FROUND_TO_NEAREST_INT:
177     __fpscr_save.__fr = __builtin_mffsl();
178     __attribute__((fallthrough));
179   case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
180     __builtin_set_fpscr_rn(0b00);
181     /* Insert an artificial "read/write" reference to the variable
182        read below, to ensure the compiler does not schedule
183        a read/use of the variable before the FPSCR is modified, above.
184        This can be removed if and when GCC PR102783 is fixed.
185      */
186     __asm__("" : "+wa"(__A));
187 
188     __r = vec_rint((__v4sf)__A);
189 
190     /* Insert an artificial "read" reference to the variable written
191        above, to ensure the compiler does not schedule the computation
192        of the value after the manipulation of the FPSCR, below.
193        This can be removed if and when GCC PR102783 is fixed.
194      */
195     __asm__("" : : "wa"(__r));
196     __builtin_set_fpscr_rn(__fpscr_save.__fpscr);
197     break;
198   case _MM_FROUND_TO_NEG_INF:
199   case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
200     __r = vec_floor((__v4sf)__A);
201     break;
202   case _MM_FROUND_TO_POS_INF:
203   case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
204     __r = vec_ceil((__v4sf)__A);
205     break;
206   case _MM_FROUND_TO_ZERO:
207   case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
208     __r = vec_trunc((__v4sf)__A);
209     break;
210   case _MM_FROUND_CUR_DIRECTION:
211     __r = vec_rint((__v4sf)__A);
212     break;
213   }
214   if (__rounding & _MM_FROUND_NO_EXC) {
215     /* Insert an artificial "read" reference to the variable written
216        above, to ensure the compiler does not schedule the computation
217        of the value after the manipulation of the FPSCR, below.
218        This can be removed if and when GCC PR102783 is fixed.
219      */
220     __asm__("" : : "wa"(__r));
221     /* Restore enabled exceptions.  */
222     __fpscr_save.__fr = __builtin_mffsl();
223     __fpscr_save.__fpscr |= __enables_save.__fpscr;
224     __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
225   }
226   return (__m128)__r;
227 }
228 
229 extern __inline __m128
230     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
231     _mm_round_ss(__m128 __A, __m128 __B, int __rounding) {
232   __B = _mm_round_ps(__B, __rounding);
233   __v4sf __r = (__v4sf)__A;
234   __r[0] = ((__v4sf)__B)[0];
235   return (__m128)__r;
236 }
237 
/* Ceiling / floor shorthands for the _mm_round_* functions.  */

/* Double precision, packed.  */
#define _mm_ceil_pd(X) _mm_round_pd((X), _MM_FROUND_CEIL)
#define _mm_floor_pd(X) _mm_round_pd((X), _MM_FROUND_FLOOR)

/* Double precision, scalar (low element).  */
#define _mm_ceil_sd(A, B) _mm_round_sd((A), (B), _MM_FROUND_CEIL)
#define _mm_floor_sd(A, B) _mm_round_sd((A), (B), _MM_FROUND_FLOOR)

/* Single precision, packed.  */
#define _mm_ceil_ps(X) _mm_round_ps((X), _MM_FROUND_CEIL)
#define _mm_floor_ps(X) _mm_round_ps((X), _MM_FROUND_FLOOR)

/* Single precision, scalar (low element).  */
#define _mm_ceil_ss(A, B) _mm_round_ss((A), (B), _MM_FROUND_CEIL)
#define _mm_floor_ss(A, B) _mm_round_ss((A), (B), _MM_FROUND_FLOOR)
249 
250 extern __inline __m128i
251     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
252     _mm_insert_epi8(__m128i const __A, int const __D, int const __N) {
253   __v16qi __result = (__v16qi)__A;
254 
255   __result[__N & 0xf] = __D;
256 
257   return (__m128i)__result;
258 }
259 
260 extern __inline __m128i
261     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
262     _mm_insert_epi32(__m128i const __A, int const __D, int const __N) {
263   __v4si __result = (__v4si)__A;
264 
265   __result[__N & 3] = __D;
266 
267   return (__m128i)__result;
268 }
269 
270 extern __inline __m128i
271     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
272     _mm_insert_epi64(__m128i const __A, long long const __D, int const __N) {
273   __v2di __result = (__v2di)__A;
274 
275   __result[__N & 1] = __D;
276 
277   return (__m128i)__result;
278 }
279 
280 extern __inline int
281     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
282     _mm_extract_epi8(__m128i __X, const int __N) {
283   return (unsigned char)((__v16qi)__X)[__N & 15];
284 }
285 
286 extern __inline int
287     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
288     _mm_extract_epi32(__m128i __X, const int __N) {
289   return ((__v4si)__X)[__N & 3];
290 }
291 
292 extern __inline int
293     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
294     _mm_extract_epi64(__m128i __X, const int __N) {
295   return ((__v2di)__X)[__N & 1];
296 }
297 
298 extern __inline int
299     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
300     _mm_extract_ps(__m128 __X, const int __N) {
301   return ((__v4si)__X)[__N & 3];
302 }
303 
#ifdef _ARCH_PWR8
/* Per 16-bit lane, select between __A and __B under control of the low
   eight bits of __imm8.  The immediate is splatted to every byte, run
   through vec_gb, and the result is sign-extended to 16-bit lanes to
   form the vec_sel mask (lane order reversed on big-endian).  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) {
  const __v16qu __splat = vec_splats((unsigned char)__imm8);
  const __v16qu __gathered = vec_gb(__splat);
  __v8hu __select = (__v8hu)vec_unpackh((__v16qi)__gathered);
#ifdef __BIG_ENDIAN__
  __select = vec_reve(__select);
#endif
  return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __select);
}
#endif
317 
318 extern __inline __m128i
319     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
320     _mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) {
321 #ifdef _ARCH_PWR10
322   return (__m128i)vec_blendv((__v16qi)__A, (__v16qi)__B, (__v16qu)__mask);
323 #else
324   const __v16qu __seven = vec_splats((unsigned char)0x07);
325   __v16qu __lmask = vec_sra((__v16qu)__mask, __seven);
326   return (__m128i)vec_sel((__v16qi)__A, (__v16qi)__B, __lmask);
327 #endif
328 }
329 
330 extern __inline __m128
331     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
332     _mm_blend_ps(__m128 __A, __m128 __B, const int __imm8) {
333   __v16qu __pcv[] = {
334       {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
335       {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
336       {0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
337       {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
338       {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
339       {16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
340       {0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
341       {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
342       {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
343       {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
344       {0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
345       {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
346       {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
347       {16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
348       {0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
349       {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
350   };
351   __v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
352   return (__m128)__r;
353 }
354 
355 extern __inline __m128
356     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
357     _mm_blendv_ps(__m128 __A, __m128 __B, __m128 __mask) {
358 #ifdef _ARCH_PWR10
359   return (__m128)vec_blendv((__v4sf)__A, (__v4sf)__B, (__v4su)__mask);
360 #else
361   const __v4si __zero = {0};
362   const __vector __bool int __boolmask = vec_cmplt((__v4si)__mask, __zero);
363   return (__m128)vec_sel((__v4su)__A, (__v4su)__B, (__v4su)__boolmask);
364 #endif
365 }
366 
367 extern __inline __m128d
368     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
369     _mm_blend_pd(__m128d __A, __m128d __B, const int __imm8) {
370   __v16qu __pcv[] = {
371       {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
372       {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
373       {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
374       {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}};
375   __v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
376   return (__m128d)__r;
377 }
378 
#ifdef _ARCH_PWR8
/* Per 64-bit lane, select between __A and __B under control of __mask.
   Uses vec_blendv on POWER10; otherwise a lane is taken from __B where
   the mask lane, viewed as a signed long long, compares less than zero.  */
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_pd(__m128d __A, __m128d __B, __m128d __mask) {
#ifdef _ARCH_PWR10
  return (__m128d)vec_blendv((__v2df)__A, (__v2df)__B, (__v2du)__mask);
#else
  const __v2di __zeros = {0};
  const __v2di __signed_mask = (__v2di)__mask;
  const __vector __bool long long __negative =
      vec_cmplt(__signed_mask, __zeros);
  return (__m128d)vec_sel((__v2du)__A, (__v2du)__B, (__v2du)__negative);
#endif
}
#endif
393 
394 extern __inline int
395     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
396     _mm_testz_si128(__m128i __A, __m128i __B) {
397   /* Note: This implementation does NOT set "zero" or "carry" flags.  */
398   const __v16qu __zero = {0};
399   return vec_all_eq(vec_and((__v16qu)__A, (__v16qu)__B), __zero);
400 }
401 
402 extern __inline int
403     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
404     _mm_testc_si128(__m128i __A, __m128i __B) {
405   /* Note: This implementation does NOT set "zero" or "carry" flags.  */
406   const __v16qu __zero = {0};
407   const __v16qu __notA = vec_nor((__v16qu)__A, (__v16qu)__A);
408   return vec_all_eq(vec_and((__v16qu)__notA, (__v16qu)__B), __zero);
409 }
410 
411 extern __inline int
412     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
413     _mm_testnzc_si128(__m128i __A, __m128i __B) {
414   /* Note: This implementation does NOT set "zero" or "carry" flags.  */
415   return _mm_testz_si128(__A, __B) == 0 && _mm_testc_si128(__A, __B) == 0;
416 }
417 
/* Shorthands built on the _mm_test*_si128 functions.  */

/* (MASK & VAL) is all zeros.  */
#define _mm_test_all_zeros(MASK, VAL) _mm_testz_si128((MASK), (VAL))

/* VAL is all ones; compared against an all-ones vector produced by
   comparing VAL with itself.  VAL is evaluated more than once.  */
#define _mm_test_all_ones(VAL)                                                 \
  _mm_testc_si128((VAL), _mm_cmpeq_epi32((VAL), (VAL)))

/* Neither _mm_testz_si128 nor _mm_testc_si128 holds for (MASK, VAL).  */
#define _mm_test_mix_ones_zeros(MASK, VAL) _mm_testnzc_si128((MASK), (VAL))
423 
#ifdef _ARCH_PWR8
/* Lane-wise equality compare of __X and __Y viewed as __v2di, using
   vec_cmpeq.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi64(__m128i __X, __m128i __Y) {
  const __v2di __lhs = (__v2di)__X;
  const __v2di __rhs = (__v2di)__Y;
  return (__m128i)vec_cmpeq(__lhs, __rhs);
}
#endif
431 
432 extern __inline __m128i
433     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
434     _mm_min_epi8(__m128i __X, __m128i __Y) {
435   return (__m128i)vec_min((__v16qi)__X, (__v16qi)__Y);
436 }
437 
438 extern __inline __m128i
439     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
440     _mm_min_epu16(__m128i __X, __m128i __Y) {
441   return (__m128i)vec_min((__v8hu)__X, (__v8hu)__Y);
442 }
443 
444 extern __inline __m128i
445     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
446     _mm_min_epi32(__m128i __X, __m128i __Y) {
447   return (__m128i)vec_min((__v4si)__X, (__v4si)__Y);
448 }
449 
450 extern __inline __m128i
451     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
452     _mm_min_epu32(__m128i __X, __m128i __Y) {
453   return (__m128i)vec_min((__v4su)__X, (__v4su)__Y);
454 }
455 
456 extern __inline __m128i
457     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
458     _mm_max_epi8(__m128i __X, __m128i __Y) {
459   return (__m128i)vec_max((__v16qi)__X, (__v16qi)__Y);
460 }
461 
462 extern __inline __m128i
463     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
464     _mm_max_epu16(__m128i __X, __m128i __Y) {
465   return (__m128i)vec_max((__v8hu)__X, (__v8hu)__Y);
466 }
467 
468 extern __inline __m128i
469     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
470     _mm_max_epi32(__m128i __X, __m128i __Y) {
471   return (__m128i)vec_max((__v4si)__X, (__v4si)__Y);
472 }
473 
474 extern __inline __m128i
475     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
476     _mm_max_epu32(__m128i __X, __m128i __Y) {
477   return (__m128i)vec_max((__v4su)__X, (__v4su)__Y);
478 }
479 
480 extern __inline __m128i
481     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
482     _mm_mullo_epi32(__m128i __X, __m128i __Y) {
483   return (__m128i)vec_mul((__v4su)__X, (__v4su)__Y);
484 }
485 
#ifdef _ARCH_PWR8
/* Widening multiply of __X and __Y viewed as __v4si, using vec_mule
   (multiply even-numbered lanes).  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_epi32(__m128i __X, __m128i __Y) {
  const __v4si __lhs = (__v4si)__X;
  const __v4si __rhs = (__v4si)__Y;
  return (__m128i)vec_mule(__lhs, __rhs);
}
#endif
493 
494 extern __inline __m128i
495     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
496     _mm_cvtepi8_epi16(__m128i __A) {
497   return (__m128i)vec_unpackh((__v16qi)__A);
498 }
499 
500 extern __inline __m128i
501     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
502     _mm_cvtepi8_epi32(__m128i __A) {
503   __A = (__m128i)vec_unpackh((__v16qi)__A);
504   return (__m128i)vec_unpackh((__v8hi)__A);
505 }
506 
#ifdef _ARCH_PWR8
/* Widen 8-bit lanes of __A to 64-bit lanes with three successive
   vec_unpackh steps (8 -> 16 -> 32 -> 64 bits).  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi64(__m128i __A) {
  const __v8hi __halves = (__v8hi)vec_unpackh((__v16qi)__A);
  const __v4si __words = (__v4si)vec_unpackh(__halves);
  return (__m128i)vec_unpackh(__words);
}
#endif
516 
517 extern __inline __m128i
518     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
519     _mm_cvtepi16_epi32(__m128i __A) {
520   return (__m128i)vec_unpackh((__v8hi)__A);
521 }
522 
#ifdef _ARCH_PWR8
/* Widen 16-bit lanes of __A to 64-bit lanes with two successive
   vec_unpackh steps (16 -> 32 -> 64 bits).  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi16_epi64(__m128i __A) {
  const __v4si __words = (__v4si)vec_unpackh((__v8hi)__A);
  return (__m128i)vec_unpackh(__words);
}
#endif
531 
#ifdef _ARCH_PWR8
/* Widen 32-bit lanes of __A to 64-bit lanes with a single vec_unpackh on
   the __v4si view.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi32_epi64(__m128i __A) {
  const __v4si __words = (__v4si)__A;
  return (__m128i)vec_unpackh(__words);
}
#endif
539 
540 extern __inline __m128i
541     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
542     _mm_cvtepu8_epi16(__m128i __A) {
543   const __v16qu __zero = {0};
544 #ifdef __LITTLE_ENDIAN__
545   __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
546 #else  /* __BIG_ENDIAN__.  */
547   __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
548 #endif /* __BIG_ENDIAN__.  */
549   return __A;
550 }
551 
552 extern __inline __m128i
553     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
554     _mm_cvtepu8_epi32(__m128i __A) {
555   const __v16qu __zero = {0};
556 #ifdef __LITTLE_ENDIAN__
557   __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
558   __A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
559 #else  /* __BIG_ENDIAN__.  */
560   __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
561   __A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
562 #endif /* __BIG_ENDIAN__.  */
563   return __A;
564 }
565 
566 extern __inline __m128i
567     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
568     _mm_cvtepu8_epi64(__m128i __A) {
569   const __v16qu __zero = {0};
570 #ifdef __LITTLE_ENDIAN__
571   __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
572   __A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
573   __A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
574 #else  /* __BIG_ENDIAN__.  */
575   __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
576   __A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
577   __A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
578 #endif /* __BIG_ENDIAN__.  */
579   return __A;
580 }
581 
582 extern __inline __m128i
583     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
584     _mm_cvtepu16_epi32(__m128i __A) {
585   const __v8hu __zero = {0};
586 #ifdef __LITTLE_ENDIAN__
587   __A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
588 #else  /* __BIG_ENDIAN__.  */
589   __A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
590 #endif /* __BIG_ENDIAN__.  */
591   return __A;
592 }
593 
594 extern __inline __m128i
595     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
596     _mm_cvtepu16_epi64(__m128i __A) {
597   const __v8hu __zero = {0};
598 #ifdef __LITTLE_ENDIAN__
599   __A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
600   __A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
601 #else  /* __BIG_ENDIAN__.  */
602   __A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
603   __A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
604 #endif /* __BIG_ENDIAN__.  */
605   return __A;
606 }
607 
608 extern __inline __m128i
609     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
610     _mm_cvtepu32_epi64(__m128i __A) {
611   const __v4su __zero = {0};
612 #ifdef __LITTLE_ENDIAN__
613   __A = (__m128i)vec_mergeh((__v4su)__A, __zero);
614 #else  /* __BIG_ENDIAN__.  */
615   __A = (__m128i)vec_mergeh(__zero, (__v4su)__A);
616 #endif /* __BIG_ENDIAN__.  */
617   return __A;
618 }
619 
620 /* Return horizontal packed word minimum and its index in bits [15:0]
621    and bits [18:16] respectively.  */
622 extern __inline __m128i
623     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
624     _mm_minpos_epu16(__m128i __A) {
625   union __u {
626     __m128i __m;
627     __v8hu __uh;
628   };
629   union __u __u = {.__m = __A}, __r = {.__m = {0}};
630   unsigned short __ridx = 0;
631   unsigned short __rmin = __u.__uh[__ridx];
632   unsigned long __i;
633   for (__i = 1; __i < 8; __i++) {
634     if (__u.__uh[__i] < __rmin) {
635       __rmin = __u.__uh[__i];
636       __ridx = __i;
637     }
638   }
639   __r.__uh[0] = __rmin;
640   __r.__uh[1] = __ridx;
641   return __r.__m;
642 }
643 
644 extern __inline __m128i
645     __attribute__((__gnu_inline__, __always_inline__, __artificial__))
646     _mm_packus_epi32(__m128i __X, __m128i __Y) {
647   return (__m128i)vec_packsu((__v4si)__X, (__v4si)__Y);
648 }
649 
#ifdef _ARCH_PWR8
/* Lane-wise greater-than compare of __X and __Y viewed as __v2di, using
   vec_cmpgt.  */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi64(__m128i __X, __m128i __Y) {
  const __v2di __lhs = (__v2di)__X;
  const __v2di __rhs = (__v2di)__Y;
  return (__m128i)vec_cmpgt(__lhs, __rhs);
}
#endif
657 
658 #else
659 #include_next <smmintrin.h>
660 #endif /* defined(__powerpc64__) &&                                            \
661         *   (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
662 
663 #endif /* SMMINTRIN_H_ */
664