/*===---- tmmintrin.h - Implementation of SSSE3 intrinsics on PowerPC ------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.

   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.

   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */
#error                                                                         \
    "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
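
/* Example usage (illustrative only; the source file name and the exact
   flags below are assumptions, not requirements of this header): x86_64
   code that includes <tmmintrin.h> and calls SSSE3 intrinsics such as
   _mm_abs_epi16() can typically be rebuilt for powerpc64le by defining
   NO_WARN_X86_INTRINSICS and targeting a CPU that provides the VSX
   built-ins used below, e.g.

       clang --target=powerpc64le-linux-gnu -mcpu=power8 \
             -DNO_WARN_X86_INTRINSICS -O2 -c demo.c                          */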

#ifndef TMMINTRIN_H_
#define TMMINTRIN_H_

#if defined(__powerpc64__) &&                                                  \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>

/* We need definitions from the SSE header files.  */
#include <pmmintrin.h>

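/* Absolute value of each signed element (16-, 32-, or 8-bit) of __A,
   mapped directly onto the Altivec/VSX vec_abs built-in.  */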
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_abs_epi16(__m128i __A) {
  return (__m128i)vec_abs((__v8hi)__A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_abs_epi32(__m128i __A) {
  return (__m128i)vec_abs((__v4si)__A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_abs_epi8(__m128i __A) {
  return (__m128i)vec_abs((__v16qi)__A);
}

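/* 64-bit (__m64) absolute-value variants: the argument is replicated into
   both halves of a 128-bit vector, vec_abs is applied, and one 64-bit half
   of the result is returned.  */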
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_abs_pi16(__m64 __A) {
  __v8hi __B = (__v8hi)(__v2du){__A, __A};
  return (__m64)((__v2du)vec_abs(__B))[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_abs_pi32(__m64 __A) {
  __v4si __B = (__v4si)(__v2du){__A, __A};
  return (__m64)((__v2du)vec_abs(__B))[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_abs_pi8(__m64 __A) {
  __v16qi __B = (__v16qi)(__v2du){__A, __A};
  return (__m64)((__v2du)vec_abs(__B))[0];
}

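/* _mm_alignr_epi8 (PALIGNR): treat __A:__B as a 32-byte value, shift it
   right by __count bytes, and return the low 16 bytes.  A constant
   __count below 16 maps to a single vec_sld (with byte reversal on
   little-endian); otherwise vec_slo/vec_sro byte shifts are combined, and
   a __count of 32 or more yields zero.  */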
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_alignr_epi8(__m128i __A, __m128i __B, const unsigned int __count) {
  if (__builtin_constant_p(__count) && __count < 16) {
#ifdef __LITTLE_ENDIAN__
    __A = (__m128i)vec_reve((__v16qu)__A);
    __B = (__m128i)vec_reve((__v16qu)__B);
#endif
    __A = (__m128i)vec_sld((__v16qu)__B, (__v16qu)__A, __count);
#ifdef __LITTLE_ENDIAN__
    __A = (__m128i)vec_reve((__v16qu)__A);
#endif
    return __A;
  }

  if (__count == 0)
    return __B;

  if (__count >= 16) {
    if (__count >= 32) {
      const __v16qu __zero = {0};
      return (__m128i)__zero;
    } else {
      const __v16qu __shift = vec_splats((unsigned char)((__count - 16) * 8));
#ifdef __LITTLE_ENDIAN__
      return (__m128i)vec_sro((__v16qu)__A, __shift);
#else
      return (__m128i)vec_slo((__v16qu)__A, __shift);
#endif
    }
  } else {
    const __v16qu __shiftA = vec_splats((unsigned char)((16 - __count) * 8));
    const __v16qu __shiftB = vec_splats((unsigned char)(__count * 8));
#ifdef __LITTLE_ENDIAN__
    __A = (__m128i)vec_slo((__v16qu)__A, __shiftA);
    __B = (__m128i)vec_sro((__v16qu)__B, __shiftB);
#else
    __A = (__m128i)vec_sro((__v16qu)__A, __shiftA);
    __B = (__m128i)vec_slo((__v16qu)__B, __shiftB);
#endif
    return (__m128i)vec_or((__v16qu)__A, (__v16qu)__B);
  }
}

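/* 64-bit variant of PALIGNR: __A:__B forms a 16-byte value that is shifted
   right by __count bytes; a __count of 16 or more returns zero.  */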
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_alignr_pi8(__m64 __A, __m64 __B, unsigned int __count) {
  if (__count < 16) {
    __v2du __C = {__B, __A};
#ifdef __LITTLE_ENDIAN__
    const __v4su __shift = {__count << 3, 0, 0, 0};
    __C = (__v2du)vec_sro((__v16qu)__C, (__v16qu)__shift);
#else
    const __v4su __shift = {0, 0, 0, __count << 3};
    __C = (__v2du)vec_slo((__v16qu)__C, (__v16qu)__shift);
#endif
    return (__m64)__C[0];
  } else {
    const __m64 __zero = {0};
    return __zero;
  }
}

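/* Horizontal add: each result element is the sum of an adjacent pair of
   elements taken from __A and then __B.  The even- and odd-indexed
   elements are gathered with vec_perm and added; the __m64 variants work
   on a single 128-bit vector holding both arguments.  */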
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hadd_epi16(__m128i __A, __m128i __B) {
  const __v16qu __P = {0,  1,  4,  5,  8,  9,  12, 13,
                       16, 17, 20, 21, 24, 25, 28, 29};
  const __v16qu __Q = {2,  3,  6,  7,  10, 11, 14, 15,
                       18, 19, 22, 23, 26, 27, 30, 31};
  __v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
  __v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
  return (__m128i)vec_add(__C, __D);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hadd_epi32(__m128i __A, __m128i __B) {
  const __v16qu __P = {0,  1,  2,  3,  8,  9,  10, 11,
                       16, 17, 18, 19, 24, 25, 26, 27};
  const __v16qu __Q = {4,  5,  6,  7,  12, 13, 14, 15,
                       20, 21, 22, 23, 28, 29, 30, 31};
  __v4si __C = vec_perm((__v4si)__A, (__v4si)__B, __P);
  __v4si __D = vec_perm((__v4si)__A, (__v4si)__B, __Q);
  return (__m128i)vec_add(__C, __D);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hadd_pi16(__m64 __A, __m64 __B) {
  __v8hi __C = (__v8hi)(__v2du){__A, __B};
  const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
  const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
  __v8hi __D = vec_perm(__C, __C, __Q);
  __C = vec_perm(__C, __C, __P);
  __C = vec_add(__C, __D);
  return (__m64)((__v2du)__C)[1];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hadd_pi32(__m64 __A, __m64 __B) {
  __v4si __C = (__v4si)(__v2du){__A, __B};
  const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11};
  const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15};
  __v4si __D = vec_perm(__C, __C, __Q);
  __C = vec_perm(__C, __C, __P);
  __C = vec_add(__C, __D);
  return (__m64)((__v2du)__C)[1];
}

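/* Horizontal add of adjacent 16-bit pairs with signed saturation: the pair
   sums are computed as 32-bit values with vec_sum4s and then packed back
   to 16 bits with a saturating pack.  */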
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hadds_epi16(__m128i __A, __m128i __B) {
  __v4si __C = {0}, __D = {0};
  __C = vec_sum4s((__v8hi)__A, __C);
  __D = vec_sum4s((__v8hi)__B, __D);
  __C = (__v4si)vec_packs(__C, __D);
  return (__m128i)__C;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hadds_pi16(__m64 __A, __m64 __B) {
  const __v4si __zero = {0};
  __v8hi __C = (__v8hi)(__v2du){__A, __B};
  __v4si __D = vec_sum4s(__C, __zero);
  __C = vec_packs(__D, __D);
  return (__m64)((__v2du)__C)[1];
}

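/* Horizontal subtract: for each adjacent pair of elements in __A and __B,
   subtract the odd-indexed element from the even-indexed one.  */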
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsub_epi16(__m128i __A, __m128i __B) {
  const __v16qu __P = {0,  1,  4,  5,  8,  9,  12, 13,
                       16, 17, 20, 21, 24, 25, 28, 29};
  const __v16qu __Q = {2,  3,  6,  7,  10, 11, 14, 15,
                       18, 19, 22, 23, 26, 27, 30, 31};
  __v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
  __v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
  return (__m128i)vec_sub(__C, __D);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsub_epi32(__m128i __A, __m128i __B) {
  const __v16qu __P = {0,  1,  2,  3,  8,  9,  10, 11,
                       16, 17, 18, 19, 24, 25, 26, 27};
  const __v16qu __Q = {4,  5,  6,  7,  12, 13, 14, 15,
                       20, 21, 22, 23, 28, 29, 30, 31};
  __v4si __C = vec_perm((__v4si)__A, (__v4si)__B, __P);
  __v4si __D = vec_perm((__v4si)__A, (__v4si)__B, __Q);
  return (__m128i)vec_sub(__C, __D);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsub_pi16(__m64 __A, __m64 __B) {
  const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
  const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
  __v8hi __C = (__v8hi)(__v2du){__A, __B};
  __v8hi __D = vec_perm(__C, __C, __Q);
  __C = vec_perm(__C, __C, __P);
  __C = vec_sub(__C, __D);
  return (__m64)((__v2du)__C)[1];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsub_pi32(__m64 __A, __m64 __B) {
  const __v16qu __P = {0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11};
  const __v16qu __Q = {4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15};
  __v4si __C = (__v4si)(__v2du){__A, __B};
  __v4si __D = vec_perm(__C, __C, __Q);
  __C = vec_perm(__C, __C, __P);
  __C = vec_sub(__C, __D);
  return (__m64)((__v2du)__C)[1];
}

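/* Horizontal subtract of adjacent 16-bit pairs with signed saturation
   (vec_subs).  */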
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsubs_epi16(__m128i __A, __m128i __B) {
  const __v16qu __P = {0,  1,  4,  5,  8,  9,  12, 13,
                       16, 17, 20, 21, 24, 25, 28, 29};
  const __v16qu __Q = {2,  3,  6,  7,  10, 11, 14, 15,
                       18, 19, 22, 23, 26, 27, 30, 31};
  __v8hi __C = vec_perm((__v8hi)__A, (__v8hi)__B, __P);
  __v8hi __D = vec_perm((__v8hi)__A, (__v8hi)__B, __Q);
  return (__m128i)vec_subs(__C, __D);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_hsubs_pi16(__m64 __A, __m64 __B) {
  const __v16qu __P = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
  const __v16qu __Q = {2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15};
  __v8hi __C = (__v8hi)(__v2du){__A, __B};
  __v8hi __D = vec_perm(__C, __C, __P);
  __v8hi __E = vec_perm(__C, __C, __Q);
  __C = vec_subs(__D, __E);
  return (__m64)((__v2du)__C)[1];
}

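/* PSHUFB semantics: each result byte is a byte of __A chosen by the low
   bits of the corresponding selector byte in __B, or zero when the
   selector byte has its sign bit set.  vec_perm does the byte gather and
   vec_sel merges in the zeros.  */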
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shuffle_epi8(__m128i __A, __m128i __B) {
  const __v16qi __zero = {0};
  __vector __bool char __select = vec_cmplt((__v16qi)__B, __zero);
  __v16qi __C = vec_perm((__v16qi)__A, (__v16qi)__A, (__v16qu)__B);
  return (__m128i)vec_sel(__C, __zero, __select);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_shuffle_pi8(__m64 __A, __m64 __B) {
  const __v16qi __zero = {0};
  __v16qi __C = (__v16qi)(__v2du){__A, __A};
  __v16qi __D = (__v16qi)(__v2du){__B, __B};
  __vector __bool char __select = vec_cmplt((__v16qi)__D, __zero);
  __C = vec_perm((__v16qi)__C, (__v16qi)__C, (__v16qu)__D);
  __C = vec_sel(__C, __zero, __select);
  return (__m64)((__v2du)(__C))[0];
}

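/* _mm_sign_* (PSIGNB/PSIGNW/PSIGND, POWER8 and later): negate, keep, or
   zero each element of __A according to whether the corresponding element
   of __B is negative, positive, or zero.  The two compares build a
   per-element multiplier of -1, +1, or 0 that is applied with vec_mul.  */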
#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_epi8(__m128i __A, __m128i __B) {
  const __v16qi __zero = {0};
  __v16qi __selectneg = (__v16qi)vec_cmplt((__v16qi)__B, __zero);
  __v16qi __selectpos =
      (__v16qi)vec_neg((__v16qi)vec_cmpgt((__v16qi)__B, __zero));
  __v16qi __conv = vec_add(__selectneg, __selectpos);
  return (__m128i)vec_mul((__v16qi)__A, (__v16qi)__conv);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_epi16(__m128i __A, __m128i __B) {
  const __v8hi __zero = {0};
  __v8hi __selectneg = (__v8hi)vec_cmplt((__v8hi)__B, __zero);
  __v8hi __selectpos = (__v8hi)vec_neg((__v8hi)vec_cmpgt((__v8hi)__B, __zero));
  __v8hi __conv = vec_add(__selectneg, __selectpos);
  return (__m128i)vec_mul((__v8hi)__A, (__v8hi)__conv);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_epi32(__m128i __A, __m128i __B) {
  const __v4si __zero = {0};
  __v4si __selectneg = (__v4si)vec_cmplt((__v4si)__B, __zero);
  __v4si __selectpos = (__v4si)vec_neg((__v4si)vec_cmpgt((__v4si)__B, __zero));
  __v4si __conv = vec_add(__selectneg, __selectpos);
  return (__m128i)vec_mul((__v4si)__A, (__v4si)__conv);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_pi8(__m64 __A, __m64 __B) {
  const __v16qi __zero = {0};
  __v16qi __C = (__v16qi)(__v2du){__A, __A};
  __v16qi __D = (__v16qi)(__v2du){__B, __B};
  __C = (__v16qi)_mm_sign_epi8((__m128i)__C, (__m128i)__D);
  return (__m64)((__v2du)(__C))[0];
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_pi16(__m64 __A, __m64 __B) {
  const __v8hi __zero = {0};
  __v8hi __C = (__v8hi)(__v2du){__A, __A};
  __v8hi __D = (__v8hi)(__v2du){__B, __B};
  __C = (__v8hi)_mm_sign_epi16((__m128i)__C, (__m128i)__D);
  return (__m64)((__v2du)(__C))[0];
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sign_pi32(__m64 __A, __m64 __B) {
  const __v4si __zero = {0};
  __v4si __C = (__v4si)(__v2du){__A, __A};
  __v4si __D = (__v4si)(__v2du){__B, __B};
  __C = (__v4si)_mm_sign_epi32((__m128i)__C, (__m128i)__D);
  return (__m64)((__v2du)(__C))[0];
}
#endif

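/* PMADDUBSW: multiply the unsigned bytes of __A by the corresponding
   signed bytes of __B and add adjacent 16-bit products with signed
   saturation.  Both operands are widened to 16 bits first (masking __A
   with 0x00ff so its bytes stay unsigned), then the products are paired
   with vec_perm and combined with vec_adds.  */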
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_maddubs_epi16(__m128i __A, __m128i __B) {
  __v8hi __unsigned = vec_splats((signed short)0x00ff);
  __v8hi __C = vec_and(vec_unpackh((__v16qi)__A), __unsigned);
  __v8hi __D = vec_and(vec_unpackl((__v16qi)__A), __unsigned);
  __v8hi __E = vec_unpackh((__v16qi)__B);
  __v8hi __F = vec_unpackl((__v16qi)__B);
  __C = vec_mul(__C, __E);
  __D = vec_mul(__D, __F);
  const __v16qu __odds = {0,  1,  4,  5,  8,  9,  12, 13,
                          16, 17, 20, 21, 24, 25, 28, 29};
  const __v16qu __evens = {2,  3,  6,  7,  10, 11, 14, 15,
                           18, 19, 22, 23, 26, 27, 30, 31};
  __E = vec_perm(__C, __D, __odds);
  __F = vec_perm(__C, __D, __evens);
  return (__m128i)vec_adds(__E, __F);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_maddubs_pi16(__m64 __A, __m64 __B) {
  __v8hi __C = (__v8hi)(__v2du){__A, __A};
  __C = vec_unpackl((__v16qi)__C);
  const __v8hi __unsigned = vec_splats((signed short)0x00ff);
  __C = vec_and(__C, __unsigned);
  __v8hi __D = (__v8hi)(__v2du){__B, __B};
  __D = vec_unpackl((__v16qi)__D);
  __D = vec_mul(__C, __D);
  const __v16qu __odds = {0,  1,  4,  5,  8,  9,  12, 13,
                          16, 17, 20, 21, 24, 25, 28, 29};
  const __v16qu __evens = {2,  3,  6,  7,  10, 11, 14, 15,
                           18, 19, 22, 23, 26, 27, 30, 31};
  __C = vec_perm(__D, __D, __odds);
  __D = vec_perm(__D, __D, __evens);
  __C = vec_adds(__C, __D);
  return (__m64)((__v2du)(__C))[0];
}

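/* PMULHRSW: multiply signed 16-bit elements into 32-bit products, then
   round each product to 16 bits via ((product >> 14) + 1) >> 1 before
   packing the results back together.  */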
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhrs_epi16(__m128i __A, __m128i __B) {
  __v4si __C = vec_unpackh((__v8hi)__A);
  __v4si __D = vec_unpackh((__v8hi)__B);
  __C = vec_mul(__C, __D);
  __D = vec_unpackl((__v8hi)__A);
  __v4si __E = vec_unpackl((__v8hi)__B);
  __D = vec_mul(__D, __E);
  const __v4su __shift = vec_splats((unsigned int)14);
  __C = vec_sr(__C, __shift);
  __D = vec_sr(__D, __shift);
  const __v4si __ones = vec_splats((signed int)1);
  __C = vec_add(__C, __ones);
  __C = vec_sr(__C, (__v4su)__ones);
  __D = vec_add(__D, __ones);
  __D = vec_sr(__D, (__v4su)__ones);
  return (__m128i)vec_pack(__C, __D);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhrs_pi16(__m64 __A, __m64 __B) {
  __v4si __C = (__v4si)(__v2du){__A, __A};
  __C = vec_unpackh((__v8hi)__C);
  __v4si __D = (__v4si)(__v2du){__B, __B};
  __D = vec_unpackh((__v8hi)__D);
  __C = vec_mul(__C, __D);
  const __v4su __shift = vec_splats((unsigned int)14);
  __C = vec_sr(__C, __shift);
  const __v4si __ones = vec_splats((signed int)1);
  __C = vec_add(__C, __ones);
  __C = vec_sr(__C, (__v4su)__ones);
  __v8hi __E = vec_pack(__C, __D);
  return (__m64)((__v2du)(__E))[0];
}

#else
#include_next <tmmintrin.h>
#endif /* defined(__powerpc64__) &&                                            \
        *   (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* TMMINTRIN_H_ */