xref: /freebsd/contrib/llvm-project/clang/lib/Headers/tmmintrin.h (revision ce6a89e27cd190313be39bb479880aeda4778436)
1 /*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 #ifndef __TMMINTRIN_H
11 #define __TMMINTRIN_H
12 
13 #include <pmmintrin.h>
14 
/* Attribute sets applied to every intrinsic defined in this header.
 *
 *   __always_inline__ / __nodebug__  - the wrappers are always inlined into
 *                                      the caller and carry no debug info.
 *   __target__(...)                  - enables the listed ISA extensions for
 *                                      the wrapper itself.
 *   __min_vector_width__(64)         - requested minimum vector width, in bits.
 *
 * __DEFAULT_FN_ATTRS is used by the 128-bit (__m128i) intrinsics and
 * __DEFAULT_FN_ATTRS_MMX by the 64-bit (__m64) ones, which additionally
 * require "mmx".
 */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("ssse3"),          \
                 __min_vector_width__(64)))
#define __DEFAULT_FN_ATTRS_MMX                                                 \
  __attribute__((__always_inline__, __nodebug__, __target__("mmx,ssse3"),      \
                 __min_vector_width__(64)))
18 
19 /// Computes the absolute value of each of the packed 8-bit signed
20 ///    integers in the source operand and stores the 8-bit unsigned integer
21 ///    results in the destination.
22 ///
23 /// \headerfile <x86intrin.h>
24 ///
25 /// This intrinsic corresponds to the \c PABSB instruction.
26 ///
27 /// \param __a
28 ///    A 64-bit vector of [8 x i8].
29 /// \returns A 64-bit integer vector containing the absolute values of the
30 ///    elements in the operand.
31 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
32 _mm_abs_pi8(__m64 __a)
33 {
34     return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
35 }
36 
37 /// Computes the absolute value of each of the packed 8-bit signed
38 ///    integers in the source operand and stores the 8-bit unsigned integer
39 ///    results in the destination.
40 ///
41 /// \headerfile <x86intrin.h>
42 ///
43 /// This intrinsic corresponds to the \c VPABSB instruction.
44 ///
45 /// \param __a
46 ///    A 128-bit vector of [16 x i8].
47 /// \returns A 128-bit integer vector containing the absolute values of the
48 ///    elements in the operand.
49 static __inline__ __m128i __DEFAULT_FN_ATTRS
50 _mm_abs_epi8(__m128i __a)
51 {
52     return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a);
53 }
54 
55 /// Computes the absolute value of each of the packed 16-bit signed
56 ///    integers in the source operand and stores the 16-bit unsigned integer
57 ///    results in the destination.
58 ///
59 /// \headerfile <x86intrin.h>
60 ///
61 /// This intrinsic corresponds to the \c PABSW instruction.
62 ///
63 /// \param __a
64 ///    A 64-bit vector of [4 x i16].
65 /// \returns A 64-bit integer vector containing the absolute values of the
66 ///    elements in the operand.
67 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
68 _mm_abs_pi16(__m64 __a)
69 {
70     return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
71 }
72 
73 /// Computes the absolute value of each of the packed 16-bit signed
74 ///    integers in the source operand and stores the 16-bit unsigned integer
75 ///    results in the destination.
76 ///
77 /// \headerfile <x86intrin.h>
78 ///
79 /// This intrinsic corresponds to the \c VPABSW instruction.
80 ///
81 /// \param __a
82 ///    A 128-bit vector of [8 x i16].
83 /// \returns A 128-bit integer vector containing the absolute values of the
84 ///    elements in the operand.
85 static __inline__ __m128i __DEFAULT_FN_ATTRS
86 _mm_abs_epi16(__m128i __a)
87 {
88     return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a);
89 }
90 
91 /// Computes the absolute value of each of the packed 32-bit signed
92 ///    integers in the source operand and stores the 32-bit unsigned integer
93 ///    results in the destination.
94 ///
95 /// \headerfile <x86intrin.h>
96 ///
97 /// This intrinsic corresponds to the \c PABSD instruction.
98 ///
99 /// \param __a
100 ///    A 64-bit vector of [2 x i32].
101 /// \returns A 64-bit integer vector containing the absolute values of the
102 ///    elements in the operand.
103 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
104 _mm_abs_pi32(__m64 __a)
105 {
106     return (__m64)__builtin_ia32_pabsd((__v2si)__a);
107 }
108 
109 /// Computes the absolute value of each of the packed 32-bit signed
110 ///    integers in the source operand and stores the 32-bit unsigned integer
111 ///    results in the destination.
112 ///
113 /// \headerfile <x86intrin.h>
114 ///
115 /// This intrinsic corresponds to the \c VPABSD instruction.
116 ///
117 /// \param __a
118 ///    A 128-bit vector of [4 x i32].
119 /// \returns A 128-bit integer vector containing the absolute values of the
120 ///    elements in the operand.
121 static __inline__ __m128i __DEFAULT_FN_ATTRS
122 _mm_abs_epi32(__m128i __a)
123 {
124     return (__m128i)__builtin_ia32_pabsd128((__v4si)__a);
125 }
126 
/// Concatenates the two 128-bit integer vector operands, and
///    right-shifts the 256-bit intermediate result by the number of bytes
///    specified in the immediate operand. The low 128 bits of the shifted
///    value are returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_alignr_epi8(__m128i a, __m128i b, const int n);
/// \endcode
///
/// This intrinsic corresponds to the \c PALIGNR instruction.
///
/// \param a
///    A 128-bit vector of [16 x i8] supplying the upper half of the
///    concatenated value.
/// \param b
///    A 128-bit vector of [16 x i8] supplying the lower half of the
///    concatenated value.
/// \param n
///    An immediate operand (compile-time constant) specifying how many bytes
///    to right-shift the concatenated value.
/// \returns A 128-bit integer vector containing the concatenated right-shifted
///    value.
///
/// The whole expansion is parenthesized so that the cast always applies to
/// the builtin's result, even when the macro is used as the operand of a
/// postfix operator such as a vector subscript.
#define _mm_alignr_epi8(a, b, n) \
  ((__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
                                      (__v16qi)(__m128i)(b), (n)))
150 
/// Concatenates the two 64-bit integer vector operands, and right-shifts
///    the 128-bit intermediate result by the number of bytes specified in the
///    immediate operand. The low 64 bits of the shifted value are returned.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_alignr_pi8(__m64 a, __m64 b, const int n);
/// \endcode
///
/// This intrinsic corresponds to the \c PALIGNR instruction.
///
/// \param a
///    A 64-bit vector of [8 x i8] supplying the upper half of the
///    concatenated value.
/// \param b
///    A 64-bit vector of [8 x i8] supplying the lower half of the
///    concatenated value.
/// \param n
///    An immediate operand (compile-time constant) specifying how many bytes
///    to right-shift the concatenated value.
/// \returns A 64-bit integer vector containing the concatenated right-shifted
///    value.
///
/// The whole expansion is parenthesized so that the cast always applies to
/// the builtin's result, even when the macro is used as the operand of a
/// postfix operator such as a vector subscript.
#define _mm_alignr_pi8(a, b, n) \
  ((__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)))
172 
173 /// Horizontally adds the adjacent pairs of values contained in 2 packed
174 ///    128-bit vectors of [8 x i16].
175 ///
176 /// \headerfile <x86intrin.h>
177 ///
178 /// This intrinsic corresponds to the \c VPHADDW instruction.
179 ///
180 /// \param __a
181 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
182 ///    horizontal sums of the values are stored in the lower bits of the
183 ///    destination.
184 /// \param __b
185 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
186 ///    horizontal sums of the values are stored in the upper bits of the
187 ///    destination.
188 /// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of
189 ///    both operands.
190 static __inline__ __m128i __DEFAULT_FN_ATTRS
191 _mm_hadd_epi16(__m128i __a, __m128i __b)
192 {
193     return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
194 }
195 
196 /// Horizontally adds the adjacent pairs of values contained in 2 packed
197 ///    128-bit vectors of [4 x i32].
198 ///
199 /// \headerfile <x86intrin.h>
200 ///
201 /// This intrinsic corresponds to the \c VPHADDD instruction.
202 ///
203 /// \param __a
204 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
205 ///    horizontal sums of the values are stored in the lower bits of the
206 ///    destination.
207 /// \param __b
208 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
209 ///    horizontal sums of the values are stored in the upper bits of the
210 ///    destination.
211 /// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of
212 ///    both operands.
213 static __inline__ __m128i __DEFAULT_FN_ATTRS
214 _mm_hadd_epi32(__m128i __a, __m128i __b)
215 {
216     return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
217 }
218 
219 /// Horizontally adds the adjacent pairs of values contained in 2 packed
220 ///    64-bit vectors of [4 x i16].
221 ///
222 /// \headerfile <x86intrin.h>
223 ///
224 /// This intrinsic corresponds to the \c PHADDW instruction.
225 ///
226 /// \param __a
227 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
228 ///    horizontal sums of the values are stored in the lower bits of the
229 ///    destination.
230 /// \param __b
231 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
232 ///    horizontal sums of the values are stored in the upper bits of the
233 ///    destination.
234 /// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
235 ///    operands.
236 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
237 _mm_hadd_pi16(__m64 __a, __m64 __b)
238 {
239     return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
240 }
241 
242 /// Horizontally adds the adjacent pairs of values contained in 2 packed
243 ///    64-bit vectors of [2 x i32].
244 ///
245 /// \headerfile <x86intrin.h>
246 ///
247 /// This intrinsic corresponds to the \c PHADDD instruction.
248 ///
249 /// \param __a
250 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
251 ///    horizontal sums of the values are stored in the lower bits of the
252 ///    destination.
253 /// \param __b
254 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
255 ///    horizontal sums of the values are stored in the upper bits of the
256 ///    destination.
257 /// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
258 ///    operands.
259 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
260 _mm_hadd_pi32(__m64 __a, __m64 __b)
261 {
262     return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
263 }
264 
265 /// Horizontally adds the adjacent pairs of values contained in 2 packed
266 ///    128-bit vectors of [8 x i16]. Positive sums greater than 0x7FFF are
267 ///    saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
268 ///    0x8000.
269 ///
270 /// \headerfile <x86intrin.h>
271 ///
272 /// This intrinsic corresponds to the \c VPHADDSW instruction.
273 ///
274 /// \param __a
275 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
276 ///    horizontal sums of the values are stored in the lower bits of the
277 ///    destination.
278 /// \param __b
279 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
280 ///    horizontal sums of the values are stored in the upper bits of the
281 ///    destination.
282 /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
283 ///    sums of both operands.
284 static __inline__ __m128i __DEFAULT_FN_ATTRS
285 _mm_hadds_epi16(__m128i __a, __m128i __b)
286 {
287     return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
288 }
289 
290 /// Horizontally adds the adjacent pairs of values contained in 2 packed
291 ///    64-bit vectors of [4 x i16]. Positive sums greater than 0x7FFF are
292 ///    saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
293 ///    0x8000.
294 ///
295 /// \headerfile <x86intrin.h>
296 ///
297 /// This intrinsic corresponds to the \c PHADDSW instruction.
298 ///
299 /// \param __a
300 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
301 ///    horizontal sums of the values are stored in the lower bits of the
302 ///    destination.
303 /// \param __b
304 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
305 ///    horizontal sums of the values are stored in the upper bits of the
306 ///    destination.
307 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
308 ///    sums of both operands.
309 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
310 _mm_hadds_pi16(__m64 __a, __m64 __b)
311 {
312     return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
313 }
314 
315 /// Horizontally subtracts the adjacent pairs of values contained in 2
316 ///    packed 128-bit vectors of [8 x i16].
317 ///
318 /// \headerfile <x86intrin.h>
319 ///
320 /// This intrinsic corresponds to the \c VPHSUBW instruction.
321 ///
322 /// \param __a
323 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
324 ///    horizontal differences between the values are stored in the lower bits of
325 ///    the destination.
326 /// \param __b
327 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
328 ///    horizontal differences between the values are stored in the upper bits of
329 ///    the destination.
330 /// \returns A 128-bit vector of [8 x i16] containing the horizontal differences
331 ///    of both operands.
332 static __inline__ __m128i __DEFAULT_FN_ATTRS
333 _mm_hsub_epi16(__m128i __a, __m128i __b)
334 {
335     return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
336 }
337 
338 /// Horizontally subtracts the adjacent pairs of values contained in 2
339 ///    packed 128-bit vectors of [4 x i32].
340 ///
341 /// \headerfile <x86intrin.h>
342 ///
343 /// This intrinsic corresponds to the \c VPHSUBD instruction.
344 ///
345 /// \param __a
346 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
347 ///    horizontal differences between the values are stored in the lower bits of
348 ///    the destination.
349 /// \param __b
350 ///    A 128-bit vector of [4 x i32] containing one of the source operands. The
351 ///    horizontal differences between the values are stored in the upper bits of
352 ///    the destination.
353 /// \returns A 128-bit vector of [4 x i32] containing the horizontal differences
354 ///    of both operands.
355 static __inline__ __m128i __DEFAULT_FN_ATTRS
356 _mm_hsub_epi32(__m128i __a, __m128i __b)
357 {
358     return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
359 }
360 
361 /// Horizontally subtracts the adjacent pairs of values contained in 2
362 ///    packed 64-bit vectors of [4 x i16].
363 ///
364 /// \headerfile <x86intrin.h>
365 ///
366 /// This intrinsic corresponds to the \c PHSUBW instruction.
367 ///
368 /// \param __a
369 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
370 ///    horizontal differences between the values are stored in the lower bits of
371 ///    the destination.
372 /// \param __b
373 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
374 ///    horizontal differences between the values are stored in the upper bits of
375 ///    the destination.
376 /// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
377 ///    of both operands.
378 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
379 _mm_hsub_pi16(__m64 __a, __m64 __b)
380 {
381     return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
382 }
383 
384 /// Horizontally subtracts the adjacent pairs of values contained in 2
385 ///    packed 64-bit vectors of [2 x i32].
386 ///
387 /// \headerfile <x86intrin.h>
388 ///
389 /// This intrinsic corresponds to the \c PHSUBD instruction.
390 ///
391 /// \param __a
392 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
393 ///    horizontal differences between the values are stored in the lower bits of
394 ///    the destination.
395 /// \param __b
396 ///    A 64-bit vector of [2 x i32] containing one of the source operands. The
397 ///    horizontal differences between the values are stored in the upper bits of
398 ///    the destination.
399 /// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
400 ///    of both operands.
401 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
402 _mm_hsub_pi32(__m64 __a, __m64 __b)
403 {
404     return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
405 }
406 
407 /// Horizontally subtracts the adjacent pairs of values contained in 2
408 ///    packed 128-bit vectors of [8 x i16]. Positive differences greater than
409 ///    0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are
410 ///    saturated to 0x8000.
411 ///
412 /// \headerfile <x86intrin.h>
413 ///
414 /// This intrinsic corresponds to the \c VPHSUBSW instruction.
415 ///
416 /// \param __a
417 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
418 ///    horizontal differences between the values are stored in the lower bits of
419 ///    the destination.
420 /// \param __b
421 ///    A 128-bit vector of [8 x i16] containing one of the source operands. The
422 ///    horizontal differences between the values are stored in the upper bits of
423 ///    the destination.
424 /// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
425 ///    differences of both operands.
426 static __inline__ __m128i __DEFAULT_FN_ATTRS
427 _mm_hsubs_epi16(__m128i __a, __m128i __b)
428 {
429     return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
430 }
431 
432 /// Horizontally subtracts the adjacent pairs of values contained in 2
433 ///    packed 64-bit vectors of [4 x i16]. Positive differences greater than
434 ///    0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are
435 ///    saturated to 0x8000.
436 ///
437 /// \headerfile <x86intrin.h>
438 ///
439 /// This intrinsic corresponds to the \c PHSUBSW instruction.
440 ///
441 /// \param __a
442 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
443 ///    horizontal differences between the values are stored in the lower bits of
444 ///    the destination.
445 /// \param __b
446 ///    A 64-bit vector of [4 x i16] containing one of the source operands. The
447 ///    horizontal differences between the values are stored in the upper bits of
448 ///    the destination.
449 /// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
450 ///    differences of both operands.
451 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
452 _mm_hsubs_pi16(__m64 __a, __m64 __b)
453 {
454     return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
455 }
456 
457 /// Multiplies corresponding pairs of packed 8-bit unsigned integer
458 ///    values contained in the first source operand and packed 8-bit signed
459 ///    integer values contained in the second source operand, adds pairs of
460 ///    contiguous products with signed saturation, and writes the 16-bit sums to
461 ///    the corresponding bits in the destination.
462 ///
463 ///    For example, bits [7:0] of both operands are multiplied, bits [15:8] of
464 ///    both operands are multiplied, and the sum of both results is written to
465 ///    bits [15:0] of the destination.
466 ///
467 /// \headerfile <x86intrin.h>
468 ///
469 /// This intrinsic corresponds to the \c VPMADDUBSW instruction.
470 ///
471 /// \param __a
472 ///    A 128-bit integer vector containing the first source operand.
473 /// \param __b
474 ///    A 128-bit integer vector containing the second source operand.
475 /// \returns A 128-bit integer vector containing the sums of products of both
476 ///    operands: \n
477 ///    \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
478 ///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
479 ///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
480 ///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) \n
481 ///    \a R4 := (\a __a8 * \a __b8) + (\a __a9 * \a __b9) \n
482 ///    \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
483 ///    \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
484 ///    \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
485 static __inline__ __m128i __DEFAULT_FN_ATTRS
486 _mm_maddubs_epi16(__m128i __a, __m128i __b)
487 {
488     return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
489 }
490 
491 /// Multiplies corresponding pairs of packed 8-bit unsigned integer
492 ///    values contained in the first source operand and packed 8-bit signed
493 ///    integer values contained in the second source operand, adds pairs of
494 ///    contiguous products with signed saturation, and writes the 16-bit sums to
495 ///    the corresponding bits in the destination.
496 ///
497 ///    For example, bits [7:0] of both operands are multiplied, bits [15:8] of
498 ///    both operands are multiplied, and the sum of both results is written to
499 ///    bits [15:0] of the destination.
500 ///
501 /// \headerfile <x86intrin.h>
502 ///
503 /// This intrinsic corresponds to the \c PMADDUBSW instruction.
504 ///
505 /// \param __a
506 ///    A 64-bit integer vector containing the first source operand.
507 /// \param __b
508 ///    A 64-bit integer vector containing the second source operand.
509 /// \returns A 64-bit integer vector containing the sums of products of both
510 ///    operands: \n
511 ///    \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
512 ///    \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
513 ///    \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
514 ///    \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
515 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
516 _mm_maddubs_pi16(__m64 __a, __m64 __b)
517 {
518     return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
519 }
520 
521 /// Multiplies packed 16-bit signed integer values, truncates the 32-bit
522 ///    products to the 18 most significant bits by right-shifting, rounds the
523 ///    truncated value by adding 1, and writes bits [16:1] to the destination.
524 ///
525 /// \headerfile <x86intrin.h>
526 ///
527 /// This intrinsic corresponds to the \c VPMULHRSW instruction.
528 ///
529 /// \param __a
530 ///    A 128-bit vector of [8 x i16] containing one of the source operands.
531 /// \param __b
532 ///    A 128-bit vector of [8 x i16] containing one of the source operands.
533 /// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
534 ///    products of both operands.
535 static __inline__ __m128i __DEFAULT_FN_ATTRS
536 _mm_mulhrs_epi16(__m128i __a, __m128i __b)
537 {
538     return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
539 }
540 
541 /// Multiplies packed 16-bit signed integer values, truncates the 32-bit
542 ///    products to the 18 most significant bits by right-shifting, rounds the
543 ///    truncated value by adding 1, and writes bits [16:1] to the destination.
544 ///
545 /// \headerfile <x86intrin.h>
546 ///
547 /// This intrinsic corresponds to the \c PMULHRSW instruction.
548 ///
549 /// \param __a
550 ///    A 64-bit vector of [4 x i16] containing one of the source operands.
551 /// \param __b
552 ///    A 64-bit vector of [4 x i16] containing one of the source operands.
553 /// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
554 ///    products of both operands.
555 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
556 _mm_mulhrs_pi16(__m64 __a, __m64 __b)
557 {
558     return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
559 }
560 
561 /// Copies the 8-bit integers from a 128-bit integer vector to the
562 ///    destination or clears 8-bit values in the destination, as specified by
563 ///    the second source operand.
564 ///
565 /// \headerfile <x86intrin.h>
566 ///
567 /// This intrinsic corresponds to the \c VPSHUFB instruction.
568 ///
569 /// \param __a
570 ///    A 128-bit integer vector containing the values to be copied.
571 /// \param __b
572 ///    A 128-bit integer vector containing control bytes corresponding to
573 ///    positions in the destination:
574 ///    Bit 7: \n
575 ///    1: Clear the corresponding byte in the destination. \n
576 ///    0: Copy the selected source byte to the corresponding byte in the
577 ///    destination. \n
578 ///    Bits [6:4] Reserved.  \n
579 ///    Bits [3:0] select the source byte to be copied.
580 /// \returns A 128-bit integer vector containing the copied or cleared values.
581 static __inline__ __m128i __DEFAULT_FN_ATTRS
582 _mm_shuffle_epi8(__m128i __a, __m128i __b)
583 {
584     return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
585 }
586 
587 /// Copies the 8-bit integers from a 64-bit integer vector to the
588 ///    destination or clears 8-bit values in the destination, as specified by
589 ///    the second source operand.
590 ///
591 /// \headerfile <x86intrin.h>
592 ///
593 /// This intrinsic corresponds to the \c PSHUFB instruction.
594 ///
595 /// \param __a
596 ///    A 64-bit integer vector containing the values to be copied.
597 /// \param __b
598 ///    A 64-bit integer vector containing control bytes corresponding to
599 ///    positions in the destination:
600 ///    Bit 7: \n
601 ///    1: Clear the corresponding byte in the destination. \n
602 ///    0: Copy the selected source byte to the corresponding byte in the
603 ///    destination. \n
604 ///    Bits [3:0] select the source byte to be copied.
605 /// \returns A 64-bit integer vector containing the copied or cleared values.
606 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
607 _mm_shuffle_pi8(__m64 __a, __m64 __b)
608 {
609     return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
610 }
611 
612 /// For each 8-bit integer in the first source operand, perform one of
613 ///    the following actions as specified by the second source operand.
614 ///
615 ///    If the byte in the second source is negative, calculate the two's
616 ///    complement of the corresponding byte in the first source, and write that
617 ///    value to the destination. If the byte in the second source is positive,
618 ///    copy the corresponding byte from the first source to the destination. If
619 ///    the byte in the second source is zero, clear the corresponding byte in
620 ///    the destination.
621 ///
622 /// \headerfile <x86intrin.h>
623 ///
624 /// This intrinsic corresponds to the \c VPSIGNB instruction.
625 ///
626 /// \param __a
627 ///    A 128-bit integer vector containing the values to be copied.
628 /// \param __b
629 ///    A 128-bit integer vector containing control bytes corresponding to
630 ///    positions in the destination.
631 /// \returns A 128-bit integer vector containing the resultant values.
632 static __inline__ __m128i __DEFAULT_FN_ATTRS
633 _mm_sign_epi8(__m128i __a, __m128i __b)
634 {
635     return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
636 }
637 
638 /// For each 16-bit integer in the first source operand, perform one of
639 ///    the following actions as specified by the second source operand.
640 ///
641 ///    If the word in the second source is negative, calculate the two's
642 ///    complement of the corresponding word in the first source, and write that
643 ///    value to the destination. If the word in the second source is positive,
644 ///    copy the corresponding word from the first source to the destination. If
645 ///    the word in the second source is zero, clear the corresponding word in
646 ///    the destination.
647 ///
648 /// \headerfile <x86intrin.h>
649 ///
650 /// This intrinsic corresponds to the \c VPSIGNW instruction.
651 ///
652 /// \param __a
653 ///    A 128-bit integer vector containing the values to be copied.
654 /// \param __b
655 ///    A 128-bit integer vector containing control words corresponding to
656 ///    positions in the destination.
657 /// \returns A 128-bit integer vector containing the resultant values.
658 static __inline__ __m128i __DEFAULT_FN_ATTRS
659 _mm_sign_epi16(__m128i __a, __m128i __b)
660 {
661     return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
662 }
663 
664 /// For each 32-bit integer in the first source operand, perform one of
665 ///    the following actions as specified by the second source operand.
666 ///
667 ///    If the doubleword in the second source is negative, calculate the two's
668 ///    complement of the corresponding word in the first source, and write that
669 ///    value to the destination. If the doubleword in the second source is
670 ///    positive, copy the corresponding word from the first source to the
671 ///    destination. If the doubleword in the second source is zero, clear the
672 ///    corresponding word in the destination.
673 ///
674 /// \headerfile <x86intrin.h>
675 ///
676 /// This intrinsic corresponds to the \c VPSIGND instruction.
677 ///
678 /// \param __a
679 ///    A 128-bit integer vector containing the values to be copied.
680 /// \param __b
681 ///    A 128-bit integer vector containing control doublewords corresponding to
682 ///    positions in the destination.
683 /// \returns A 128-bit integer vector containing the resultant values.
684 static __inline__ __m128i __DEFAULT_FN_ATTRS
685 _mm_sign_epi32(__m128i __a, __m128i __b)
686 {
687     return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
688 }
689 
690 /// For each 8-bit integer in the first source operand, perform one of
691 ///    the following actions as specified by the second source operand.
692 ///
693 ///    If the byte in the second source is negative, calculate the two's
694 ///    complement of the corresponding byte in the first source, and write that
695 ///    value to the destination. If the byte in the second source is positive,
696 ///    copy the corresponding byte from the first source to the destination. If
697 ///    the byte in the second source is zero, clear the corresponding byte in
698 ///    the destination.
699 ///
700 /// \headerfile <x86intrin.h>
701 ///
702 /// This intrinsic corresponds to the \c PSIGNB instruction.
703 ///
704 /// \param __a
705 ///    A 64-bit integer vector containing the values to be copied.
706 /// \param __b
707 ///    A 64-bit integer vector containing control bytes corresponding to
708 ///    positions in the destination.
709 /// \returns A 64-bit integer vector containing the resultant values.
710 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
711 _mm_sign_pi8(__m64 __a, __m64 __b)
712 {
713     return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
714 }
715 
716 /// For each 16-bit integer in the first source operand, perform one of
717 ///    the following actions as specified by the second source operand.
718 ///
719 ///    If the word in the second source is negative, calculate the two's
720 ///    complement of the corresponding word in the first source, and write that
721 ///    value to the destination. If the word in the second source is positive,
722 ///    copy the corresponding word from the first source to the destination. If
723 ///    the word in the second source is zero, clear the corresponding word in
724 ///    the destination.
725 ///
726 /// \headerfile <x86intrin.h>
727 ///
728 /// This intrinsic corresponds to the \c PSIGNW instruction.
729 ///
730 /// \param __a
731 ///    A 64-bit integer vector containing the values to be copied.
732 /// \param __b
733 ///    A 64-bit integer vector containing control words corresponding to
734 ///    positions in the destination.
735 /// \returns A 64-bit integer vector containing the resultant values.
736 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
737 _mm_sign_pi16(__m64 __a, __m64 __b)
738 {
739     return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
740 }
741 
742 /// For each 32-bit integer in the first source operand, perform one of
743 ///    the following actions as specified by the second source operand.
744 ///
745 ///    If the doubleword in the second source is negative, calculate the two's
746 ///    complement of the corresponding doubleword in the first source, and
747 ///    write that value to the destination. If the doubleword in the second
748 ///    source is positive, copy the corresponding doubleword from the first
749 ///    source to the destination. If the doubleword in the second source is
750 ///    zero, clear the corresponding doubleword in the destination.
751 ///
752 /// \headerfile <x86intrin.h>
753 ///
754 /// This intrinsic corresponds to the \c PSIGND instruction.
755 ///
756 /// \param __a
757 ///    A 64-bit integer vector containing the values to be copied.
758 /// \param __b
759 ///    A 64-bit integer vector containing two control doublewords corresponding
760 ///    to positions in the destination.
761 /// \returns A 64-bit integer vector containing the resultant values.
762 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
763 _mm_sign_pi32(__m64 __a, __m64 __b)
764 {
765     return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
766 }
767 
768 #undef __DEFAULT_FN_ATTRS
769 #undef __DEFAULT_FN_ATTRS_MMX
770 
771 #endif /* __TMMINTRIN_H */
772