xref: /freebsd/contrib/llvm-project/clang/lib/Headers/tmmintrin.h (revision 0fca6ea1d4eea4c934cfff25ac9ee8ad6fe95583)
1 /*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 #ifndef __TMMINTRIN_H
11 #define __TMMINTRIN_H
12 
13 #if !defined(__i386__) && !defined(__x86_64__)
14 #error "This header is only meant to be used on x86 and x64 architecture"
15 #endif
16 
17 #include <pmmintrin.h>
18 
/* Attribute sets shared by the intrinsics defined in this header.
 * __DEFAULT_FN_ATTRS is applied to the functions operating on __m128i;
 * __DEFAULT_FN_ATTRS_MMX is applied to the functions operating on __m64 and
 * additionally lists the "mmx" target feature. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__,                                            \
                 __nodebug__,                                                  \
                 __target__("ssse3,no-evex512"),                               \
                 __min_vector_width__(64)))
#define __DEFAULT_FN_ATTRS_MMX                                                 \
  __attribute__((__always_inline__,                                            \
                 __nodebug__,                                                  \
                 __target__("mmx,ssse3,no-evex512"),                           \
                 __min_vector_width__(64)))
27 
28 /// Computes the absolute value of each of the packed 8-bit signed
29 ///    integers in the source operand and stores the 8-bit unsigned integer
30 ///    results in the destination.
31 ///
32 /// \headerfile <x86intrin.h>
33 ///
34 /// This intrinsic corresponds to the \c PABSB instruction.
35 ///
36 /// \param __a
37 ///    A 64-bit vector of [8 x i8].
38 /// \returns A 64-bit integer vector containing the absolute values of the
39 ///    elements in the operand.
40 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_abs_pi8(__m64 __a)41 _mm_abs_pi8(__m64 __a)
42 {
43     return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
44 }
45 
46 /// Computes the absolute value of each of the packed 8-bit signed
47 ///    integers in the source operand and stores the 8-bit unsigned integer
48 ///    results in the destination.
49 ///
50 /// \headerfile <x86intrin.h>
51 ///
52 /// This intrinsic corresponds to the \c VPABSB instruction.
53 ///
54 /// \param __a
55 ///    A 128-bit vector of [16 x i8].
56 /// \returns A 128-bit integer vector containing the absolute values of the
57 ///    elements in the operand.
58 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_abs_epi8(__m128i __a)59 _mm_abs_epi8(__m128i __a)
60 {
61     return (__m128i)__builtin_elementwise_abs((__v16qs)__a);
62 }
63 
64 /// Computes the absolute value of each of the packed 16-bit signed
65 ///    integers in the source operand and stores the 16-bit unsigned integer
66 ///    results in the destination.
67 ///
68 /// \headerfile <x86intrin.h>
69 ///
70 /// This intrinsic corresponds to the \c PABSW instruction.
71 ///
72 /// \param __a
73 ///    A 64-bit vector of [4 x i16].
74 /// \returns A 64-bit integer vector containing the absolute values of the
75 ///    elements in the operand.
76 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_abs_pi16(__m64 __a)77 _mm_abs_pi16(__m64 __a)
78 {
79     return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
80 }
81 
82 /// Computes the absolute value of each of the packed 16-bit signed
83 ///    integers in the source operand and stores the 16-bit unsigned integer
84 ///    results in the destination.
85 ///
86 /// \headerfile <x86intrin.h>
87 ///
88 /// This intrinsic corresponds to the \c VPABSW instruction.
89 ///
90 /// \param __a
91 ///    A 128-bit vector of [8 x i16].
92 /// \returns A 128-bit integer vector containing the absolute values of the
93 ///    elements in the operand.
94 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_abs_epi16(__m128i __a)95 _mm_abs_epi16(__m128i __a)
96 {
97     return (__m128i)__builtin_elementwise_abs((__v8hi)__a);
98 }
99 
100 /// Computes the absolute value of each of the packed 32-bit signed
101 ///    integers in the source operand and stores the 32-bit unsigned integer
102 ///    results in the destination.
103 ///
104 /// \headerfile <x86intrin.h>
105 ///
106 /// This intrinsic corresponds to the \c PABSD instruction.
107 ///
108 /// \param __a
109 ///    A 64-bit vector of [2 x i32].
110 /// \returns A 64-bit integer vector containing the absolute values of the
111 ///    elements in the operand.
112 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_abs_pi32(__m64 __a)113 _mm_abs_pi32(__m64 __a)
114 {
115     return (__m64)__builtin_ia32_pabsd((__v2si)__a);
116 }
117 
118 /// Computes the absolute value of each of the packed 32-bit signed
119 ///    integers in the source operand and stores the 32-bit unsigned integer
120 ///    results in the destination.
121 ///
122 /// \headerfile <x86intrin.h>
123 ///
124 /// This intrinsic corresponds to the \c VPABSD instruction.
125 ///
126 /// \param __a
127 ///    A 128-bit vector of [4 x i32].
128 /// \returns A 128-bit integer vector containing the absolute values of the
129 ///    elements in the operand.
130 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_abs_epi32(__m128i __a)131 _mm_abs_epi32(__m128i __a)
132 {
133     return (__m128i)__builtin_elementwise_abs((__v4si)__a);
134 }
135 
136 /// Concatenates the two 128-bit integer vector operands, and
137 ///    right-shifts the result by the number of bytes specified in the immediate
138 ///    operand.
139 ///
140 /// \headerfile <x86intrin.h>
141 ///
142 /// \code
143 /// __m128i _mm_alignr_epi8(__m128i a, __m128i b, const int n);
144 /// \endcode
145 ///
146 /// This intrinsic corresponds to the \c PALIGNR instruction.
147 ///
148 /// \param a
149 ///    A 128-bit vector of [16 x i8] containing one of the source operands.
150 /// \param b
151 ///    A 128-bit vector of [16 x i8] containing one of the source operands.
152 /// \param n
153 ///    An immediate operand specifying how many bytes to right-shift the result.
154 /// \returns A 128-bit integer vector containing the concatenated right-shifted
155 ///    value.
/* Macro rather than function: n is documented as an immediate operand. */
#define _mm_alignr_epi8(a, b, n)                                               \
  ((__m128i)__builtin_ia32_palignr128(                                         \
      (__v16qi)(__m128i)(a),                                                   \
      (__v16qi)(__m128i)(b),                                                   \
      (n)))
159 
160 /// Concatenates the two 64-bit integer vector operands, and right-shifts
161 ///    the result by the number of bytes specified in the immediate operand.
162 ///
163 /// \headerfile <x86intrin.h>
164 ///
165 /// \code
166 /// __m64 _mm_alignr_pi8(__m64 a, __m64 b, const int n);
167 /// \endcode
168 ///
169 /// This intrinsic corresponds to the \c PALIGNR instruction.
170 ///
171 /// \param a
172 ///    A 64-bit vector of [8 x i8] containing one of the source operands.
173 /// \param b
174 ///    A 64-bit vector of [8 x i8] containing one of the source operands.
175 /// \param n
176 ///    An immediate operand specifying how many bytes to right-shift the result.
177 /// \returns A 64-bit integer vector containing the concatenated right-shifted
178 ///    value.
/* Macro rather than function: n is documented as an immediate operand. */
#define _mm_alignr_pi8(a, b, n)                                                \
  ((__m64)__builtin_ia32_palignr(                                              \
      (__v8qi)(__m64)(a),                                                      \
      (__v8qi)(__m64)(b),                                                      \
      (n)))
181 
182 /// Horizontally adds the adjacent pairs of values contained in 2 packed
183 ///    128-bit vectors of [8 x i16].
184 ///
185 /// \headerfile <x86intrin.h>
186 ///
187 /// This intrinsic corresponds to the \c VPHADDW instruction.
188 ///
189 /// \param __a
190 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
191 ///    horizontal sums of the values are stored in the lower bits of the
192 ///    destination.
193 /// \param __b
194 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
195 ///    horizontal sums of the values are stored in the upper bits of the
196 ///    destination.
197 /// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of
198 ///    both operands.
199 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_hadd_epi16(__m128i __a,__m128i __b)200 _mm_hadd_epi16(__m128i __a, __m128i __b)
201 {
202     return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
203 }
204 
205 /// Horizontally adds the adjacent pairs of values contained in 2 packed
206 ///    128-bit vectors of [4 x i32].
207 ///
208 /// \headerfile <x86intrin.h>
209 ///
210 /// This intrinsic corresponds to the \c VPHADDD instruction.
211 ///
212 /// \param __a
213 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
214 ///    horizontal sums of the values are stored in the lower bits of the
215 ///    destination.
216 /// \param __b
217 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
218 ///    horizontal sums of the values are stored in the upper bits of the
219 ///    destination.
220 /// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of
221 ///    both operands.
222 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_hadd_epi32(__m128i __a,__m128i __b)223 _mm_hadd_epi32(__m128i __a, __m128i __b)
224 {
225     return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
226 }
227 
228 /// Horizontally adds the adjacent pairs of values contained in 2 packed
229 ///    64-bit vectors of [4 x i16].
230 ///
231 /// \headerfile <x86intrin.h>
232 ///
233 /// This intrinsic corresponds to the \c PHADDW instruction.
234 ///
235 /// \param __a
236 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
237 ///    horizontal sums of the values are stored in the lower bits of the
238 ///    destination.
239 /// \param __b
240 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
241 ///    horizontal sums of the values are stored in the upper bits of the
242 ///    destination.
243 /// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
244 ///    operands.
245 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_hadd_pi16(__m64 __a,__m64 __b)246 _mm_hadd_pi16(__m64 __a, __m64 __b)
247 {
248     return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
249 }
250 
251 /// Horizontally adds the adjacent pairs of values contained in 2 packed
252 ///    64-bit vectors of [2 x i32].
253 ///
254 /// \headerfile <x86intrin.h>
255 ///
256 /// This intrinsic corresponds to the \c PHADDD instruction.
257 ///
258 /// \param __a
259 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
260 ///    horizontal sums of the values are stored in the lower bits of the
261 ///    destination.
262 /// \param __b
263 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
264 ///    horizontal sums of the values are stored in the upper bits of the
265 ///    destination.
266 /// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
267 ///    operands.
268 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_hadd_pi32(__m64 __a,__m64 __b)269 _mm_hadd_pi32(__m64 __a, __m64 __b)
270 {
271     return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
272 }
273 
274 /// Horizontally adds, with saturation, the adjacent pairs of values contained
275 ///    in two packed 128-bit vectors of [8 x i16].
276 ///
277 ///    Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums
278 ///    less than 0x8000 are saturated to 0x8000.
279 ///
280 /// \headerfile <x86intrin.h>
281 ///
282 /// This intrinsic corresponds to the \c VPHADDSW instruction.
283 ///
284 /// \param __a
285 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
286 ///    horizontal sums of the values are stored in the lower bits of the
287 ///    destination.
288 /// \param __b
289 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
290 ///    horizontal sums of the values are stored in the upper bits of the
291 ///    destination.
292 /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
293 ///    sums of both operands.
294 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_hadds_epi16(__m128i __a,__m128i __b)295 _mm_hadds_epi16(__m128i __a, __m128i __b)
296 {
297     return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
298 }
299 
300 /// Horizontally adds, with saturation, the adjacent pairs of values contained
301 ///    in two packed 64-bit vectors of [4 x i16].
302 ///
303 ///    Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums
304 ///    less than 0x8000 are saturated to 0x8000.
305 ///
306 /// \headerfile <x86intrin.h>
307 ///
308 /// This intrinsic corresponds to the \c PHADDSW instruction.
309 ///
310 /// \param __a
311 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
312 ///    horizontal sums of the values are stored in the lower bits of the
313 ///    destination.
314 /// \param __b
315 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
316 ///    horizontal sums of the values are stored in the upper bits of the
317 ///    destination.
318 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
319 ///    sums of both operands.
320 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_hadds_pi16(__m64 __a,__m64 __b)321 _mm_hadds_pi16(__m64 __a, __m64 __b)
322 {
323     return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
324 }
325 
326 /// Horizontally subtracts the adjacent pairs of values contained in 2
327 ///    packed 128-bit vectors of [8 x i16].
328 ///
329 /// \headerfile <x86intrin.h>
330 ///
331 /// This intrinsic corresponds to the \c VPHSUBW instruction.
332 ///
333 /// \param __a
334 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
335 ///    horizontal differences between the values are stored in the lower bits of
336 ///    the destination.
337 /// \param __b
338 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
339 ///    horizontal differences between the values are stored in the upper bits of
340 ///    the destination.
341 /// \returns A 128-bit vector of [8 x i16] containing the horizontal differences
342 ///    of both operands.
343 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_hsub_epi16(__m128i __a,__m128i __b)344 _mm_hsub_epi16(__m128i __a, __m128i __b)
345 {
346     return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
347 }
348 
349 /// Horizontally subtracts the adjacent pairs of values contained in 2
350 ///    packed 128-bit vectors of [4 x i32].
351 ///
352 /// \headerfile <x86intrin.h>
353 ///
354 /// This intrinsic corresponds to the \c VPHSUBD instruction.
355 ///
356 /// \param __a
357 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
358 ///    horizontal differences between the values are stored in the lower bits of
359 ///    the destination.
360 /// \param __b
361 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
362 ///    horizontal differences between the values are stored in the upper bits of
363 ///    the destination.
364 /// \returns A 128-bit vector of [4 x i32] containing the horizontal differences
365 ///    of both operands.
366 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_hsub_epi32(__m128i __a,__m128i __b)367 _mm_hsub_epi32(__m128i __a, __m128i __b)
368 {
369     return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
370 }
371 
372 /// Horizontally subtracts the adjacent pairs of values contained in 2
373 ///    packed 64-bit vectors of [4 x i16].
374 ///
375 /// \headerfile <x86intrin.h>
376 ///
377 /// This intrinsic corresponds to the \c PHSUBW instruction.
378 ///
379 /// \param __a
380 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
381 ///    horizontal differences between the values are stored in the lower bits of
382 ///    the destination.
383 /// \param __b
384 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
385 ///    horizontal differences between the values are stored in the upper bits of
386 ///    the destination.
387 /// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
388 ///    of both operands.
389 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_hsub_pi16(__m64 __a,__m64 __b)390 _mm_hsub_pi16(__m64 __a, __m64 __b)
391 {
392     return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
393 }
394 
395 /// Horizontally subtracts the adjacent pairs of values contained in 2
396 ///    packed 64-bit vectors of [2 x i32].
397 ///
398 /// \headerfile <x86intrin.h>
399 ///
400 /// This intrinsic corresponds to the \c PHSUBD instruction.
401 ///
402 /// \param __a
403 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
404 ///    horizontal differences between the values are stored in the lower bits of
405 ///    the destination.
406 /// \param __b
407 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
408 ///    horizontal differences between the values are stored in the upper bits of
409 ///    the destination.
410 /// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
411 ///    of both operands.
412 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_hsub_pi32(__m64 __a,__m64 __b)413 _mm_hsub_pi32(__m64 __a, __m64 __b)
414 {
415     return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
416 }
417 
418 /// Horizontally subtracts, with saturation, the adjacent pairs of values
419 ///    contained in two packed 128-bit vectors of [8 x i16].
420 ///
421 ///    Positive differences greater than 0x7FFF are saturated to 0x7FFF.
422 ///    Negative differences less than 0x8000 are saturated to 0x8000.
423 ///
424 /// \headerfile <x86intrin.h>
425 ///
426 /// This intrinsic corresponds to the \c VPHSUBSW instruction.
427 ///
428 /// \param __a
429 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
430 ///    horizontal differences between the values are stored in the lower bits of
431 ///    the destination.
432 /// \param __b
433 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
434 ///    horizontal differences between the values are stored in the upper bits of
435 ///    the destination.
436 /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
437 ///    differences of both operands.
438 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_hsubs_epi16(__m128i __a,__m128i __b)439 _mm_hsubs_epi16(__m128i __a, __m128i __b)
440 {
441     return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
442 }
443 
444 /// Horizontally subtracts, with saturation, the adjacent pairs of values
445 ///    contained in two packed 64-bit vectors of [4 x i16].
446 ///
447 ///    Positive differences greater than 0x7FFF are saturated to 0x7FFF.
448 ///    Negative differences less than 0x8000 are saturated to 0x8000.
449 ///
450 /// \headerfile <x86intrin.h>
451 ///
452 /// This intrinsic corresponds to the \c PHSUBSW instruction.
453 ///
454 /// \param __a
455 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
456 ///    horizontal differences between the values are stored in the lower bits of
457 ///    the destination.
458 /// \param __b
459 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
460 ///    horizontal differences between the values are stored in the upper bits of
461 ///    the destination.
462 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
463 ///    differences of both operands.
464 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_hsubs_pi16(__m64 __a,__m64 __b)465 _mm_hsubs_pi16(__m64 __a, __m64 __b)
466 {
467     return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
468 }
469 
470 /// Multiplies corresponding pairs of packed 8-bit unsigned integer
471 ///    values contained in the first source operand and packed 8-bit signed
472 ///    integer values contained in the second source operand, adds pairs of
473 ///    contiguous products with signed saturation, and writes the 16-bit sums to
474 ///    the corresponding bits in the destination.
475 ///
476 ///    For example, bits [7:0] of both operands are multiplied, bits [15:8] of
477 ///    both operands are multiplied, and the sum of both results is written to
478 ///    bits [15:0] of the destination.
479 ///
480 /// \headerfile <x86intrin.h>
481 ///
482 /// This intrinsic corresponds to the \c VPMADDUBSW instruction.
483 ///
484 /// \param __a
485 ///    A 128-bit integer vector containing the first source operand.
486 /// \param __b
487 ///    A 128-bit integer vector containing the second source operand.
488 /// \returns A 128-bit integer vector containing the sums of products of both
489 ///    operands: \n
490 ///    \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
491 ///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
492 ///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
493 ///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) \n
494 ///    \a R4 := (\a __a8 * \a __b8) + (\a __a9 * \a __b9) \n
495 ///    \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
496 ///    \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
497 ///    \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
498 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_maddubs_epi16(__m128i __a,__m128i __b)499 _mm_maddubs_epi16(__m128i __a, __m128i __b)
500 {
501     return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
502 }
503 
504 /// Multiplies corresponding pairs of packed 8-bit unsigned integer
505 ///    values contained in the first source operand and packed 8-bit signed
506 ///    integer values contained in the second source operand, adds pairs of
507 ///    contiguous products with signed saturation, and writes the 16-bit sums to
508 ///    the corresponding bits in the destination.
509 ///
510 ///    For example, bits [7:0] of both operands are multiplied, bits [15:8] of
511 ///    both operands are multiplied, and the sum of both results is written to
512 ///    bits [15:0] of the destination.
513 ///
514 /// \headerfile <x86intrin.h>
515 ///
516 /// This intrinsic corresponds to the \c PMADDUBSW instruction.
517 ///
518 /// \param __a
519 ///    A 64-bit integer vector containing the first source operand.
520 /// \param __b
521 ///    A 64-bit integer vector containing the second source operand.
522 /// \returns A 64-bit integer vector containing the sums of products of both
523 ///    operands: \n
524 ///    \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
525 ///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
526 ///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
527 ///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
528 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_maddubs_pi16(__m64 __a,__m64 __b)529 _mm_maddubs_pi16(__m64 __a, __m64 __b)
530 {
531     return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
532 }
533 
534 /// Multiplies packed 16-bit signed integer values, truncates the 32-bit
535 ///    products to the 18 most significant bits by right-shifting, rounds the
536 ///    truncated value by adding 1, and writes bits [16:1] to the destination.
537 ///
538 /// \headerfile <x86intrin.h>
539 ///
540 /// This intrinsic corresponds to the \c VPMULHRSW instruction.
541 ///
542 /// \param __a
543 ///    A 128-bit vector of [8 x i16] containing one of the source operands.
544 /// \param __b
545 ///    A 128-bit vector of [8 x i16] containing one of the source operands.
546 /// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
547 ///    products of both operands.
548 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mulhrs_epi16(__m128i __a,__m128i __b)549 _mm_mulhrs_epi16(__m128i __a, __m128i __b)
550 {
551     return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
552 }
553 
554 /// Multiplies packed 16-bit signed integer values, truncates the 32-bit
555 ///    products to the 18 most significant bits by right-shifting, rounds the
556 ///    truncated value by adding 1, and writes bits [16:1] to the destination.
557 ///
558 /// \headerfile <x86intrin.h>
559 ///
560 /// This intrinsic corresponds to the \c PMULHRSW instruction.
561 ///
562 /// \param __a
563 ///    A 64-bit vector of [4 x i16] containing one of the source operands.
564 /// \param __b
565 ///    A 64-bit vector of [4 x i16] containing one of the source operands.
566 /// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
567 ///    products of both operands.
568 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_mulhrs_pi16(__m64 __a,__m64 __b)569 _mm_mulhrs_pi16(__m64 __a, __m64 __b)
570 {
571     return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
572 }
573 
574 /// Copies the 8-bit integers from a 128-bit integer vector to the
575 ///    destination or clears 8-bit values in the destination, as specified by
576 ///    the second source operand.
577 ///
578 /// \headerfile <x86intrin.h>
579 ///
580 /// This intrinsic corresponds to the \c VPSHUFB instruction.
581 ///
582 /// \param __a
583 ///    A 128-bit integer vector containing the values to be copied.
584 /// \param __b
585 ///    A 128-bit integer vector containing control bytes corresponding to
586 ///    positions in the destination:
587 ///    Bit 7: \n
588 ///    1: Clear the corresponding byte in the destination. \n
589 ///    0: Copy the selected source byte to the corresponding byte in the
590 ///    destination. \n
591 ///    Bits [6:4] Reserved.  \n
592 ///    Bits [3:0] select the source byte to be copied.
593 /// \returns A 128-bit integer vector containing the copied or cleared values.
594 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_shuffle_epi8(__m128i __a,__m128i __b)595 _mm_shuffle_epi8(__m128i __a, __m128i __b)
596 {
597     return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
598 }
599 
600 /// Copies the 8-bit integers from a 64-bit integer vector to the
601 ///    destination or clears 8-bit values in the destination, as specified by
602 ///    the second source operand.
603 ///
604 /// \headerfile <x86intrin.h>
605 ///
606 /// This intrinsic corresponds to the \c PSHUFB instruction.
607 ///
608 /// \param __a
609 ///    A 64-bit integer vector containing the values to be copied.
610 /// \param __b
611 ///    A 64-bit integer vector containing control bytes corresponding to
612 ///    positions in the destination:
613 ///    Bit 7: \n
614 ///    1: Clear the corresponding byte in the destination. \n
615 ///    0: Copy the selected source byte to the corresponding byte in the
616 ///    destination. \n
///    Bits [2:0] select the source byte to be copied.
618 /// \returns A 64-bit integer vector containing the copied or cleared values.
619 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_shuffle_pi8(__m64 __a,__m64 __b)620 _mm_shuffle_pi8(__m64 __a, __m64 __b)
621 {
622     return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
623 }
624 
625 /// For each 8-bit integer in the first source operand, perform one of
626 ///    the following actions as specified by the second source operand.
627 ///
628 ///    If the byte in the second source is negative, calculate the two's
629 ///    complement of the corresponding byte in the first source, and write that
630 ///    value to the destination. If the byte in the second source is positive,
631 ///    copy the corresponding byte from the first source to the destination. If
632 ///    the byte in the second source is zero, clear the corresponding byte in
633 ///    the destination.
634 ///
635 /// \headerfile <x86intrin.h>
636 ///
637 /// This intrinsic corresponds to the \c VPSIGNB instruction.
638 ///
639 /// \param __a
640 ///    A 128-bit integer vector containing the values to be copied.
641 /// \param __b
642 ///    A 128-bit integer vector containing control bytes corresponding to
643 ///    positions in the destination.
644 /// \returns A 128-bit integer vector containing the resultant values.
645 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sign_epi8(__m128i __a,__m128i __b)646 _mm_sign_epi8(__m128i __a, __m128i __b)
647 {
648     return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
649 }
650 
651 /// For each 16-bit integer in the first source operand, perform one of
652 ///    the following actions as specified by the second source operand.
653 ///
654 ///    If the word in the second source is negative, calculate the two's
655 ///    complement of the corresponding word in the first source, and write that
656 ///    value to the destination. If the word in the second source is positive,
657 ///    copy the corresponding word from the first source to the destination. If
658 ///    the word in the second source is zero, clear the corresponding word in
659 ///    the destination.
660 ///
661 /// \headerfile <x86intrin.h>
662 ///
663 /// This intrinsic corresponds to the \c VPSIGNW instruction.
664 ///
665 /// \param __a
666 ///    A 128-bit integer vector containing the values to be copied.
667 /// \param __b
668 ///    A 128-bit integer vector containing control words corresponding to
669 ///    positions in the destination.
670 /// \returns A 128-bit integer vector containing the resultant values.
671 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sign_epi16(__m128i __a,__m128i __b)672 _mm_sign_epi16(__m128i __a, __m128i __b)
673 {
674     return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
675 }
676 
677 /// For each 32-bit integer in the first source operand, perform one of
678 ///    the following actions as specified by the second source operand.
679 ///
///    If the doubleword in the second source is negative, calculate the two's
///    complement of the corresponding doubleword in the first source, and
///    write that value to the destination. If the doubleword in the second
///    source is positive, copy the corresponding doubleword from the first
///    source to the destination. If the doubleword in the second source is
///    zero, clear the corresponding doubleword in the destination.
686 ///
687 /// \headerfile <x86intrin.h>
688 ///
689 /// This intrinsic corresponds to the \c VPSIGND instruction.
690 ///
691 /// \param __a
692 ///    A 128-bit integer vector containing the values to be copied.
693 /// \param __b
694 ///    A 128-bit integer vector containing control doublewords corresponding to
695 ///    positions in the destination.
696 /// \returns A 128-bit integer vector containing the resultant values.
697 static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sign_epi32(__m128i __a,__m128i __b)698 _mm_sign_epi32(__m128i __a, __m128i __b)
699 {
700     return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
701 }
702 
703 /// For each 8-bit integer in the first source operand, perform one of
704 ///    the following actions as specified by the second source operand.
705 ///
706 ///    If the byte in the second source is negative, calculate the two's
707 ///    complement of the corresponding byte in the first source, and write that
708 ///    value to the destination. If the byte in the second source is positive,
709 ///    copy the corresponding byte from the first source to the destination. If
710 ///    the byte in the second source is zero, clear the corresponding byte in
711 ///    the destination.
712 ///
713 /// \headerfile <x86intrin.h>
714 ///
715 /// This intrinsic corresponds to the \c PSIGNB instruction.
716 ///
717 /// \param __a
718 ///    A 64-bit integer vector containing the values to be copied.
719 /// \param __b
720 ///    A 64-bit integer vector containing control bytes corresponding to
721 ///    positions in the destination.
722 /// \returns A 64-bit integer vector containing the resultant values.
723 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_sign_pi8(__m64 __a,__m64 __b)724 _mm_sign_pi8(__m64 __a, __m64 __b)
725 {
726     return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
727 }
728 
729 /// For each 16-bit integer in the first source operand, perform one of
730 ///    the following actions as specified by the second source operand.
731 ///
732 ///    If the word in the second source is negative, calculate the two's
733 ///    complement of the corresponding word in the first source, and write that
734 ///    value to the destination. If the word in the second source is positive,
735 ///    copy the corresponding word from the first source to the destination. If
736 ///    the word in the second source is zero, clear the corresponding word in
737 ///    the destination.
738 ///
739 /// \headerfile <x86intrin.h>
740 ///
741 /// This intrinsic corresponds to the \c PSIGNW instruction.
742 ///
743 /// \param __a
744 ///    A 64-bit integer vector containing the values to be copied.
745 /// \param __b
746 ///    A 64-bit integer vector containing control words corresponding to
747 ///    positions in the destination.
748 /// \returns A 64-bit integer vector containing the resultant values.
749 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_sign_pi16(__m64 __a,__m64 __b)750 _mm_sign_pi16(__m64 __a, __m64 __b)
751 {
752     return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
753 }
754 
755 /// For each 32-bit integer in the first source operand, perform one of
756 ///    the following actions as specified by the second source operand.
757 ///
758 ///    If the doubleword in the second source is negative, calculate the two's
759 ///    complement of the corresponding doubleword in the first source, and
760 ///    write that value to the destination. If the doubleword in the second
761 ///    source is positive, copy the corresponding doubleword from the first
762 ///    source to the destination. If the doubleword in the second source is
763 ///    zero, clear the corresponding doubleword in the destination.
764 ///
765 /// \headerfile <x86intrin.h>
766 ///
767 /// This intrinsic corresponds to the \c PSIGND instruction.
768 ///
769 /// \param __a
770 ///    A 64-bit integer vector containing the values to be copied.
771 /// \param __b
772 ///    A 64-bit integer vector containing two control doublewords corresponding
773 ///    to positions in the destination.
774 /// \returns A 64-bit integer vector containing the resultant values.
775 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_sign_pi32(__m64 __a,__m64 __b)776 _mm_sign_pi32(__m64 __a, __m64 __b)
777 {
778     return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
779 }
780 
/* The attribute helper macros are local to this header; drop them here. */
#undef __DEFAULT_FN_ATTRS_MMX
#undef __DEFAULT_FN_ATTRS
783 
784 #endif /* __TMMINTRIN_H */
785