xref: /freebsd/contrib/llvm-project/clang/lib/Headers/tmmintrin.h (revision 357378bbdedf24ce2b90e9bd831af4a9db3ec70a)
1 /*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 #ifndef __TMMINTRIN_H
11 #define __TMMINTRIN_H
12 
13 #if !defined(__i386__) && !defined(__x86_64__)
14 #error "This header is only meant to be used on x86 and x64 architecture"
15 #endif
16 
17 #include <pmmintrin.h>
18 
19 /* Define the default attributes for the functions in this file. */
/* Attribute set applied to the __m128i intrinsics in this header. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__,                                            \
                 __nodebug__,                                                  \
                 __target__("ssse3,no-evex512"),                               \
                 __min_vector_width__(64)))
/* Attribute set applied to the __m64 intrinsics in this header; the target
 * string additionally names the mmx feature. */
#define __DEFAULT_FN_ATTRS_MMX                                                 \
  __attribute__((__always_inline__,                                            \
                 __nodebug__,                                                  \
                 __target__("mmx,ssse3,no-evex512"),                           \
                 __min_vector_width__(64)))
27 
28 /// Computes the absolute value of each of the packed 8-bit signed
29 ///    integers in the source operand and stores the 8-bit unsigned integer
30 ///    results in the destination.
31 ///
32 /// \headerfile <x86intrin.h>
33 ///
34 /// This intrinsic corresponds to the \c PABSB instruction.
35 ///
36 /// \param __a
37 ///    A 64-bit vector of [8 x i8].
38 /// \returns A 64-bit integer vector containing the absolute values of the
39 ///    elements in the operand.
40 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
41 _mm_abs_pi8(__m64 __a)
42 {
43     return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
44 }
45 
46 /// Computes the absolute value of each of the packed 8-bit signed
47 ///    integers in the source operand and stores the 8-bit unsigned integer
48 ///    results in the destination.
49 ///
50 /// \headerfile <x86intrin.h>
51 ///
52 /// This intrinsic corresponds to the \c VPABSB instruction.
53 ///
54 /// \param __a
55 ///    A 128-bit vector of [16 x i8].
56 /// \returns A 128-bit integer vector containing the absolute values of the
57 ///    elements in the operand.
58 static __inline__ __m128i __DEFAULT_FN_ATTRS
59 _mm_abs_epi8(__m128i __a)
60 {
61     return (__m128i)__builtin_elementwise_abs((__v16qs)__a);
62 }
63 
64 /// Computes the absolute value of each of the packed 16-bit signed
65 ///    integers in the source operand and stores the 16-bit unsigned integer
66 ///    results in the destination.
67 ///
68 /// \headerfile <x86intrin.h>
69 ///
70 /// This intrinsic corresponds to the \c PABSW instruction.
71 ///
72 /// \param __a
73 ///    A 64-bit vector of [4 x i16].
74 /// \returns A 64-bit integer vector containing the absolute values of the
75 ///    elements in the operand.
76 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
77 _mm_abs_pi16(__m64 __a)
78 {
79     return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
80 }
81 
82 /// Computes the absolute value of each of the packed 16-bit signed
83 ///    integers in the source operand and stores the 16-bit unsigned integer
84 ///    results in the destination.
85 ///
86 /// \headerfile <x86intrin.h>
87 ///
88 /// This intrinsic corresponds to the \c VPABSW instruction.
89 ///
90 /// \param __a
91 ///    A 128-bit vector of [8 x i16].
92 /// \returns A 128-bit integer vector containing the absolute values of the
93 ///    elements in the operand.
94 static __inline__ __m128i __DEFAULT_FN_ATTRS
95 _mm_abs_epi16(__m128i __a)
96 {
97     return (__m128i)__builtin_elementwise_abs((__v8hi)__a);
98 }
99 
100 /// Computes the absolute value of each of the packed 32-bit signed
101 ///    integers in the source operand and stores the 32-bit unsigned integer
102 ///    results in the destination.
103 ///
104 /// \headerfile <x86intrin.h>
105 ///
106 /// This intrinsic corresponds to the \c PABSD instruction.
107 ///
108 /// \param __a
109 ///    A 64-bit vector of [2 x i32].
110 /// \returns A 64-bit integer vector containing the absolute values of the
111 ///    elements in the operand.
112 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
113 _mm_abs_pi32(__m64 __a)
114 {
115     return (__m64)__builtin_ia32_pabsd((__v2si)__a);
116 }
117 
118 /// Computes the absolute value of each of the packed 32-bit signed
119 ///    integers in the source operand and stores the 32-bit unsigned integer
120 ///    results in the destination.
121 ///
122 /// \headerfile <x86intrin.h>
123 ///
124 /// This intrinsic corresponds to the \c VPABSD instruction.
125 ///
126 /// \param __a
127 ///    A 128-bit vector of [4 x i32].
128 /// \returns A 128-bit integer vector containing the absolute values of the
129 ///    elements in the operand.
130 static __inline__ __m128i __DEFAULT_FN_ATTRS
131 _mm_abs_epi32(__m128i __a)
132 {
133     return (__m128i)__builtin_elementwise_abs((__v4si)__a);
134 }
135 
136 /// Concatenates the two 128-bit integer vector operands, and
137 ///    right-shifts the result by the number of bytes specified in the immediate
138 ///    operand.
139 ///
140 /// \headerfile <x86intrin.h>
141 ///
142 /// \code
143 /// __m128i _mm_alignr_epi8(__m128i a, __m128i b, const int n);
144 /// \endcode
145 ///
146 /// This intrinsic corresponds to the \c PALIGNR instruction.
147 ///
148 /// \param a
149 ///    A 128-bit vector of [16 x i8] containing one of the source operands.
150 /// \param b
151 ///    A 128-bit vector of [16 x i8] containing one of the source operands.
152 /// \param n
153 ///    An immediate operand specifying how many bytes to right-shift the result.
154 /// \returns A 128-bit integer vector containing the concatenated right-shifted
155 ///    value.
/* Kept as a macro: n is forwarded to the builtin unchanged, while a and b are
 * first converted to __m128i and then viewed as [16 x i8]. */
#define _mm_alignr_epi8(a, b, n)                                               \
  ((__m128i)__builtin_ia32_palignr128(                                         \
      (__v16qi)(__m128i)(a),                                                   \
      (__v16qi)(__m128i)(b),                                                   \
      (n)))
159 
160 /// Concatenates the two 64-bit integer vector operands, and right-shifts
161 ///    the result by the number of bytes specified in the immediate operand.
162 ///
163 /// \headerfile <x86intrin.h>
164 ///
165 /// \code
166 /// __m64 _mm_alignr_pi8(__m64 a, __m64 b, const int n);
167 /// \endcode
168 ///
169 /// This intrinsic corresponds to the \c PALIGNR instruction.
170 ///
171 /// \param a
172 ///    A 64-bit vector of [8 x i8] containing one of the source operands.
173 /// \param b
174 ///    A 64-bit vector of [8 x i8] containing one of the source operands.
175 /// \param n
176 ///    An immediate operand specifying how many bytes to right-shift the result.
177 /// \returns A 64-bit integer vector containing the concatenated right-shifted
178 ///    value.
/* Kept as a macro: n is forwarded to the builtin unchanged, while a and b are
 * first converted to __m64 and then viewed as [8 x i8]. */
#define _mm_alignr_pi8(a, b, n)                                                \
  ((__m64)__builtin_ia32_palignr(                                              \
      (__v8qi)(__m64)(a),                                                      \
      (__v8qi)(__m64)(b),                                                      \
      (n)))
181 
182 /// Horizontally adds the adjacent pairs of values contained in 2 packed
183 ///    128-bit vectors of [8 x i16].
184 ///
185 /// \headerfile <x86intrin.h>
186 ///
187 /// This intrinsic corresponds to the \c VPHADDW instruction.
188 ///
189 /// \param __a
190 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
191 ///    horizontal sums of the values are stored in the lower bits of the
192 ///    destination.
193 /// \param __b
194 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
195 ///    horizontal sums of the values are stored in the upper bits of the
196 ///    destination.
197 /// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of
198 ///    both operands.
199 static __inline__ __m128i __DEFAULT_FN_ATTRS
200 _mm_hadd_epi16(__m128i __a, __m128i __b)
201 {
202     return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
203 }
204 
205 /// Horizontally adds the adjacent pairs of values contained in 2 packed
206 ///    128-bit vectors of [4 x i32].
207 ///
208 /// \headerfile <x86intrin.h>
209 ///
210 /// This intrinsic corresponds to the \c VPHADDD instruction.
211 ///
212 /// \param __a
213 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
214 ///    horizontal sums of the values are stored in the lower bits of the
215 ///    destination.
216 /// \param __b
217 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
218 ///    horizontal sums of the values are stored in the upper bits of the
219 ///    destination.
220 /// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of
221 ///    both operands.
222 static __inline__ __m128i __DEFAULT_FN_ATTRS
223 _mm_hadd_epi32(__m128i __a, __m128i __b)
224 {
225     return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
226 }
227 
228 /// Horizontally adds the adjacent pairs of values contained in 2 packed
229 ///    64-bit vectors of [4 x i16].
230 ///
231 /// \headerfile <x86intrin.h>
232 ///
233 /// This intrinsic corresponds to the \c PHADDW instruction.
234 ///
235 /// \param __a
236 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
237 ///    horizontal sums of the values are stored in the lower bits of the
238 ///    destination.
239 /// \param __b
240 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
241 ///    horizontal sums of the values are stored in the upper bits of the
242 ///    destination.
243 /// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
244 ///    operands.
245 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
246 _mm_hadd_pi16(__m64 __a, __m64 __b)
247 {
248     return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
249 }
250 
251 /// Horizontally adds the adjacent pairs of values contained in 2 packed
252 ///    64-bit vectors of [2 x i32].
253 ///
254 /// \headerfile <x86intrin.h>
255 ///
256 /// This intrinsic corresponds to the \c PHADDD instruction.
257 ///
258 /// \param __a
259 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
260 ///    horizontal sums of the values are stored in the lower bits of the
261 ///    destination.
262 /// \param __b
263 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
264 ///    horizontal sums of the values are stored in the upper bits of the
265 ///    destination.
266 /// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
267 ///    operands.
268 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
269 _mm_hadd_pi32(__m64 __a, __m64 __b)
270 {
271     return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
272 }
273 
274 /// Horizontally adds the adjacent pairs of values contained in 2 packed
275 ///    128-bit vectors of [8 x i16]. Positive sums greater than 0x7FFF are
276 ///    saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
277 ///    0x8000.
278 ///
279 /// \headerfile <x86intrin.h>
280 ///
281 /// This intrinsic corresponds to the \c VPHADDSW instruction.
282 ///
283 /// \param __a
284 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
285 ///    horizontal sums of the values are stored in the lower bits of the
286 ///    destination.
287 /// \param __b
288 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
289 ///    horizontal sums of the values are stored in the upper bits of the
290 ///    destination.
291 /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
292 ///    sums of both operands.
293 static __inline__ __m128i __DEFAULT_FN_ATTRS
294 _mm_hadds_epi16(__m128i __a, __m128i __b)
295 {
296     return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
297 }
298 
299 /// Horizontally adds the adjacent pairs of values contained in 2 packed
300 ///    64-bit vectors of [4 x i16]. Positive sums greater than 0x7FFF are
301 ///    saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
302 ///    0x8000.
303 ///
304 /// \headerfile <x86intrin.h>
305 ///
306 /// This intrinsic corresponds to the \c PHADDSW instruction.
307 ///
308 /// \param __a
309 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
310 ///    horizontal sums of the values are stored in the lower bits of the
311 ///    destination.
312 /// \param __b
313 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
314 ///    horizontal sums of the values are stored in the upper bits of the
315 ///    destination.
316 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
317 ///    sums of both operands.
318 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
319 _mm_hadds_pi16(__m64 __a, __m64 __b)
320 {
321     return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
322 }
323 
324 /// Horizontally subtracts the adjacent pairs of values contained in 2
325 ///    packed 128-bit vectors of [8 x i16].
326 ///
327 /// \headerfile <x86intrin.h>
328 ///
329 /// This intrinsic corresponds to the \c VPHSUBW instruction.
330 ///
331 /// \param __a
332 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
333 ///    horizontal differences between the values are stored in the lower bits of
334 ///    the destination.
335 /// \param __b
336 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
337 ///    horizontal differences between the values are stored in the upper bits of
338 ///    the destination.
339 /// \returns A 128-bit vector of [8 x i16] containing the horizontal differences
340 ///    of both operands.
341 static __inline__ __m128i __DEFAULT_FN_ATTRS
342 _mm_hsub_epi16(__m128i __a, __m128i __b)
343 {
344     return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
345 }
346 
347 /// Horizontally subtracts the adjacent pairs of values contained in 2
348 ///    packed 128-bit vectors of [4 x i32].
349 ///
350 /// \headerfile <x86intrin.h>
351 ///
352 /// This intrinsic corresponds to the \c VPHSUBD instruction.
353 ///
354 /// \param __a
355 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
356 ///    horizontal differences between the values are stored in the lower bits of
357 ///    the destination.
358 /// \param __b
359 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
360 ///    horizontal differences between the values are stored in the upper bits of
361 ///    the destination.
362 /// \returns A 128-bit vector of [4 x i32] containing the horizontal differences
363 ///    of both operands.
364 static __inline__ __m128i __DEFAULT_FN_ATTRS
365 _mm_hsub_epi32(__m128i __a, __m128i __b)
366 {
367     return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
368 }
369 
370 /// Horizontally subtracts the adjacent pairs of values contained in 2
371 ///    packed 64-bit vectors of [4 x i16].
372 ///
373 /// \headerfile <x86intrin.h>
374 ///
375 /// This intrinsic corresponds to the \c PHSUBW instruction.
376 ///
377 /// \param __a
378 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
379 ///    horizontal differences between the values are stored in the lower bits of
380 ///    the destination.
381 /// \param __b
382 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
383 ///    horizontal differences between the values are stored in the upper bits of
384 ///    the destination.
385 /// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
386 ///    of both operands.
387 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
388 _mm_hsub_pi16(__m64 __a, __m64 __b)
389 {
390     return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
391 }
392 
393 /// Horizontally subtracts the adjacent pairs of values contained in 2
394 ///    packed 64-bit vectors of [2 x i32].
395 ///
396 /// \headerfile <x86intrin.h>
397 ///
398 /// This intrinsic corresponds to the \c PHSUBD instruction.
399 ///
400 /// \param __a
401 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
402 ///    horizontal differences between the values are stored in the lower bits of
403 ///    the destination.
404 /// \param __b
405 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
406 ///    horizontal differences between the values are stored in the upper bits of
407 ///    the destination.
408 /// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
409 ///    of both operands.
410 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
411 _mm_hsub_pi32(__m64 __a, __m64 __b)
412 {
413     return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
414 }
415 
416 /// Horizontally subtracts the adjacent pairs of values contained in 2
417 ///    packed 128-bit vectors of [8 x i16]. Positive differences greater than
418 ///    0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are
419 ///    saturated to 0x8000.
420 ///
421 /// \headerfile <x86intrin.h>
422 ///
423 /// This intrinsic corresponds to the \c VPHSUBSW instruction.
424 ///
425 /// \param __a
426 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
427 ///    horizontal differences between the values are stored in the lower bits of
428 ///    the destination.
429 /// \param __b
430 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
431 ///    horizontal differences between the values are stored in the upper bits of
432 ///    the destination.
433 /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
434 ///    differences of both operands.
435 static __inline__ __m128i __DEFAULT_FN_ATTRS
436 _mm_hsubs_epi16(__m128i __a, __m128i __b)
437 {
438     return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
439 }
440 
441 /// Horizontally subtracts the adjacent pairs of values contained in 2
442 ///    packed 64-bit vectors of [4 x i16]. Positive differences greater than
443 ///    0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are
444 ///    saturated to 0x8000.
445 ///
446 /// \headerfile <x86intrin.h>
447 ///
448 /// This intrinsic corresponds to the \c PHSUBSW instruction.
449 ///
450 /// \param __a
451 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
452 ///    horizontal differences between the values are stored in the lower bits of
453 ///    the destination.
454 /// \param __b
455 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
456 ///    horizontal differences between the values are stored in the upper bits of
457 ///    the destination.
458 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
459 ///    differences of both operands.
460 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
461 _mm_hsubs_pi16(__m64 __a, __m64 __b)
462 {
463     return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
464 }
465 
466 /// Multiplies corresponding pairs of packed 8-bit unsigned integer
467 ///    values contained in the first source operand and packed 8-bit signed
468 ///    integer values contained in the second source operand, adds pairs of
469 ///    contiguous products with signed saturation, and writes the 16-bit sums to
470 ///    the corresponding bits in the destination.
471 ///
472 ///    For example, bits [7:0] of both operands are multiplied, bits [15:8] of
473 ///    both operands are multiplied, and the sum of both results is written to
474 ///    bits [15:0] of the destination.
475 ///
476 /// \headerfile <x86intrin.h>
477 ///
478 /// This intrinsic corresponds to the \c VPMADDUBSW instruction.
479 ///
480 /// \param __a
481 ///    A 128-bit integer vector containing the first source operand.
482 /// \param __b
483 ///    A 128-bit integer vector containing the second source operand.
484 /// \returns A 128-bit integer vector containing the sums of products of both
485 ///    operands: \n
486 ///    \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
487 ///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
488 ///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
489 ///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) \n
490 ///    \a R4 := (\a __a8 * \a __b8) + (\a __a9 * \a __b9) \n
491 ///    \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
492 ///    \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
493 ///    \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
494 static __inline__ __m128i __DEFAULT_FN_ATTRS
495 _mm_maddubs_epi16(__m128i __a, __m128i __b)
496 {
497     return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
498 }
499 
500 /// Multiplies corresponding pairs of packed 8-bit unsigned integer
501 ///    values contained in the first source operand and packed 8-bit signed
502 ///    integer values contained in the second source operand, adds pairs of
503 ///    contiguous products with signed saturation, and writes the 16-bit sums to
504 ///    the corresponding bits in the destination.
505 ///
506 ///    For example, bits [7:0] of both operands are multiplied, bits [15:8] of
507 ///    both operands are multiplied, and the sum of both results is written to
508 ///    bits [15:0] of the destination.
509 ///
510 /// \headerfile <x86intrin.h>
511 ///
512 /// This intrinsic corresponds to the \c PMADDUBSW instruction.
513 ///
514 /// \param __a
515 ///    A 64-bit integer vector containing the first source operand.
516 /// \param __b
517 ///    A 64-bit integer vector containing the second source operand.
518 /// \returns A 64-bit integer vector containing the sums of products of both
519 ///    operands: \n
520 ///    \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
521 ///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
522 ///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
523 ///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
524 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
525 _mm_maddubs_pi16(__m64 __a, __m64 __b)
526 {
527     return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
528 }
529 
530 /// Multiplies packed 16-bit signed integer values, truncates the 32-bit
531 ///    products to the 18 most significant bits by right-shifting, rounds the
532 ///    truncated value by adding 1, and writes bits [16:1] to the destination.
533 ///
534 /// \headerfile <x86intrin.h>
535 ///
536 /// This intrinsic corresponds to the \c VPMULHRSW instruction.
537 ///
538 /// \param __a
539 ///    A 128-bit vector of [8 x i16] containing one of the source operands.
540 /// \param __b
541 ///    A 128-bit vector of [8 x i16] containing one of the source operands.
542 /// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
543 ///    products of both operands.
544 static __inline__ __m128i __DEFAULT_FN_ATTRS
545 _mm_mulhrs_epi16(__m128i __a, __m128i __b)
546 {
547     return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
548 }
549 
550 /// Multiplies packed 16-bit signed integer values, truncates the 32-bit
551 ///    products to the 18 most significant bits by right-shifting, rounds the
552 ///    truncated value by adding 1, and writes bits [16:1] to the destination.
553 ///
554 /// \headerfile <x86intrin.h>
555 ///
556 /// This intrinsic corresponds to the \c PMULHRSW instruction.
557 ///
558 /// \param __a
559 ///    A 64-bit vector of [4 x i16] containing one of the source operands.
560 /// \param __b
561 ///    A 64-bit vector of [4 x i16] containing one of the source operands.
562 /// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
563 ///    products of both operands.
564 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
565 _mm_mulhrs_pi16(__m64 __a, __m64 __b)
566 {
567     return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
568 }
569 
570 /// Copies the 8-bit integers from a 128-bit integer vector to the
571 ///    destination or clears 8-bit values in the destination, as specified by
572 ///    the second source operand.
573 ///
574 /// \headerfile <x86intrin.h>
575 ///
576 /// This intrinsic corresponds to the \c VPSHUFB instruction.
577 ///
578 /// \param __a
579 ///    A 128-bit integer vector containing the values to be copied.
580 /// \param __b
581 ///    A 128-bit integer vector containing control bytes corresponding to
582 ///    positions in the destination:
583 ///    Bit 7: \n
584 ///    1: Clear the corresponding byte in the destination. \n
585 ///    0: Copy the selected source byte to the corresponding byte in the
586 ///    destination. \n
587 ///    Bits [6:4] Reserved.  \n
588 ///    Bits [3:0] select the source byte to be copied.
589 /// \returns A 128-bit integer vector containing the copied or cleared values.
590 static __inline__ __m128i __DEFAULT_FN_ATTRS
591 _mm_shuffle_epi8(__m128i __a, __m128i __b)
592 {
593     return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
594 }
595 
596 /// Copies the 8-bit integers from a 64-bit integer vector to the
597 ///    destination or clears 8-bit values in the destination, as specified by
598 ///    the second source operand.
599 ///
600 /// \headerfile <x86intrin.h>
601 ///
602 /// This intrinsic corresponds to the \c PSHUFB instruction.
603 ///
604 /// \param __a
605 ///    A 64-bit integer vector containing the values to be copied.
606 /// \param __b
607 ///    A 64-bit integer vector containing control bytes corresponding to
608 ///    positions in the destination:
609 ///    Bit 7: \n
610 ///    1: Clear the corresponding byte in the destination. \n
611 ///    0: Copy the selected source byte to the corresponding byte in the
612 ///    destination. \n
613 ///    Bits [2:0] select the source byte to be copied.
614 /// \returns A 64-bit integer vector containing the copied or cleared values.
615 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
616 _mm_shuffle_pi8(__m64 __a, __m64 __b)
617 {
618     return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
619 }
620 
621 /// For each 8-bit integer in the first source operand, perform one of
622 ///    the following actions as specified by the second source operand.
623 ///
624 ///    If the byte in the second source is negative, calculate the two's
625 ///    complement of the corresponding byte in the first source, and write that
626 ///    value to the destination. If the byte in the second source is positive,
627 ///    copy the corresponding byte from the first source to the destination. If
628 ///    the byte in the second source is zero, clear the corresponding byte in
629 ///    the destination.
630 ///
631 /// \headerfile <x86intrin.h>
632 ///
633 /// This intrinsic corresponds to the \c VPSIGNB instruction.
634 ///
635 /// \param __a
636 ///    A 128-bit integer vector containing the values to be copied.
637 /// \param __b
638 ///    A 128-bit integer vector containing control bytes corresponding to
639 ///    positions in the destination.
640 /// \returns A 128-bit integer vector containing the resultant values.
641 static __inline__ __m128i __DEFAULT_FN_ATTRS
642 _mm_sign_epi8(__m128i __a, __m128i __b)
643 {
644     return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
645 }
646 
647 /// For each 16-bit integer in the first source operand, perform one of
648 ///    the following actions as specified by the second source operand.
649 ///
650 ///    If the word in the second source is negative, calculate the two's
651 ///    complement of the corresponding word in the first source, and write that
652 ///    value to the destination. If the word in the second source is positive,
653 ///    copy the corresponding word from the first source to the destination. If
654 ///    the word in the second source is zero, clear the corresponding word in
655 ///    the destination.
656 ///
657 /// \headerfile <x86intrin.h>
658 ///
659 /// This intrinsic corresponds to the \c VPSIGNW instruction.
660 ///
661 /// \param __a
662 ///    A 128-bit integer vector containing the values to be copied.
663 /// \param __b
664 ///    A 128-bit integer vector containing control words corresponding to
665 ///    positions in the destination.
666 /// \returns A 128-bit integer vector containing the resultant values.
667 static __inline__ __m128i __DEFAULT_FN_ATTRS
668 _mm_sign_epi16(__m128i __a, __m128i __b)
669 {
670     return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
671 }
672 
673 /// For each 32-bit integer in the first source operand, perform one of
674 ///    the following actions as specified by the second source operand.
675 ///
676 ///    If the doubleword in the second source is negative, calculate the two's
677 ///    complement of the corresponding doubleword in the first source, and
678 ///    write that value to the destination. If the doubleword in the second
679 ///    source is positive, copy the corresponding doubleword from the first
680 ///    source to the destination. If the doubleword in the second source is
681 ///    zero, clear the corresponding doubleword in the destination.
682 ///
683 /// \headerfile <x86intrin.h>
684 ///
685 /// This intrinsic corresponds to the \c VPSIGND instruction.
686 ///
687 /// \param __a
688 ///    A 128-bit integer vector containing the values to be copied.
689 /// \param __b
690 ///    A 128-bit integer vector containing control doublewords corresponding to
691 ///    positions in the destination.
692 /// \returns A 128-bit integer vector containing the resultant values.
693 static __inline__ __m128i __DEFAULT_FN_ATTRS
694 _mm_sign_epi32(__m128i __a, __m128i __b)
695 {
696     return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
697 }
698 
699 /// For each 8-bit integer in the first source operand, perform one of
700 ///    the following actions as specified by the second source operand.
701 ///
702 ///    If the byte in the second source is negative, calculate the two's
703 ///    complement of the corresponding byte in the first source, and write that
704 ///    value to the destination. If the byte in the second source is positive,
705 ///    copy the corresponding byte from the first source to the destination. If
706 ///    the byte in the second source is zero, clear the corresponding byte in
707 ///    the destination.
708 ///
709 /// \headerfile <x86intrin.h>
710 ///
711 /// This intrinsic corresponds to the \c PSIGNB instruction.
712 ///
713 /// \param __a
714 ///    A 64-bit integer vector containing the values to be copied.
715 /// \param __b
716 ///    A 64-bit integer vector containing control bytes corresponding to
717 ///    positions in the destination.
718 /// \returns A 64-bit integer vector containing the resultant values.
719 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
720 _mm_sign_pi8(__m64 __a, __m64 __b)
721 {
722     return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
723 }
724 
725 /// For each 16-bit integer in the first source operand, perform one of
726 ///    the following actions as specified by the second source operand.
727 ///
728 ///    If the word in the second source is negative, calculate the two's
729 ///    complement of the corresponding word in the first source, and write that
730 ///    value to the destination. If the word in the second source is positive,
731 ///    copy the corresponding word from the first source to the destination. If
732 ///    the word in the second source is zero, clear the corresponding word in
733 ///    the destination.
734 ///
735 /// \headerfile <x86intrin.h>
736 ///
737 /// This intrinsic corresponds to the \c PSIGNW instruction.
738 ///
739 /// \param __a
740 ///    A 64-bit integer vector containing the values to be copied.
741 /// \param __b
742 ///    A 64-bit integer vector containing control words corresponding to
743 ///    positions in the destination.
744 /// \returns A 64-bit integer vector containing the resultant values.
745 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
746 _mm_sign_pi16(__m64 __a, __m64 __b)
747 {
748     return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
749 }
750 
751 /// For each 32-bit integer in the first source operand, perform one of
752 ///    the following actions as specified by the second source operand.
753 ///
754 ///    If the doubleword in the second source is negative, calculate the two's
755 ///    complement of the corresponding doubleword in the first source, and
756 ///    write that value to the destination. If the doubleword in the second
757 ///    source is positive, copy the corresponding doubleword from the first
758 ///    source to the destination. If the doubleword in the second source is
759 ///    zero, clear the corresponding doubleword in the destination.
760 ///
761 /// \headerfile <x86intrin.h>
762 ///
763 /// This intrinsic corresponds to the \c PSIGND instruction.
764 ///
765 /// \param __a
766 ///    A 64-bit integer vector containing the values to be copied.
767 /// \param __b
768 ///    A 64-bit integer vector containing two control doublewords corresponding
769 ///    to positions in the destination.
770 /// \returns A 64-bit integer vector containing the resultant values.
771 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
772 _mm_sign_pi32(__m64 __a, __m64 __b)
773 {
774     return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
775 }
776 
777 #undef __DEFAULT_FN_ATTRS
778 #undef __DEFAULT_FN_ATTRS_MMX
779 
780 #endif /* __TMMINTRIN_H */
781