xref: /freebsd/contrib/llvm-project/clang/lib/Headers/emmintrin.h (revision 370e009188ba90c3290b1479aa06ec98b66e140a)
1 /*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 #ifndef __EMMINTRIN_H
11 #define __EMMINTRIN_H
12 
13 #if !defined(__i386__) && !defined(__x86_64__)
14 #error "This header is only meant to be used on x86 and x64 architecture"
15 #endif
16 
17 #include <xmmintrin.h>
18 
/* 128-bit (16-byte) vector types holding doubles (__m128d) and long longs
 * (__m128i); both are declared with 16-byte alignment. */
typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));

/* Same element types and vector size as above, but declared with 1-byte
 * alignment. */
typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __aligned__(1)));
25 
/* Type defines.  Each one is a 16-byte vector of the named element type:
 * double, long long, short and char. */
typedef double __v2df __attribute__((__vector_size__(16)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

/* Unsigned types: 16-byte vectors of unsigned long long, unsigned short and
 * unsigned char. */
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v16qs __attribute__((__vector_size__(16)));
40 
/* Define the default attributes for the functions in this file:
 * __always_inline__, __nodebug__, the "sse2" target and a minimum vector
 * width of 128. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("sse2"),           \
                 __min_vector_width__(128)))
/* Variant that names the "mmx,sse2" target and a minimum vector width of
 * 64. */
#define __DEFAULT_FN_ATTRS_MMX                                                 \
  __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse2"),       \
                 __min_vector_width__(64)))
48 
49 /// Adds lower double-precision values in both operands and returns the
50 ///    sum in the lower 64 bits of the result. The upper 64 bits of the result
51 ///    are copied from the upper double-precision value of the first operand.
52 ///
53 /// \headerfile <x86intrin.h>
54 ///
55 /// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
56 ///
57 /// \param __a
58 ///    A 128-bit vector of [2 x double] containing one of the source operands.
59 /// \param __b
60 ///    A 128-bit vector of [2 x double] containing one of the source operands.
61 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
62 ///    sum of the lower 64 bits of both operands. The upper 64 bits are copied
63 ///    from the upper 64 bits of the first source operand.
64 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a,
65                                                         __m128d __b) {
66   __a[0] += __b[0];
67   return __a;
68 }
69 
70 /// Adds two 128-bit vectors of [2 x double].
71 ///
72 /// \headerfile <x86intrin.h>
73 ///
74 /// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
75 ///
76 /// \param __a
77 ///    A 128-bit vector of [2 x double] containing one of the source operands.
78 /// \param __b
79 ///    A 128-bit vector of [2 x double] containing one of the source operands.
80 /// \returns A 128-bit vector of [2 x double] containing the sums of both
81 ///    operands.
82 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a,
83                                                         __m128d __b) {
84   return (__m128d)((__v2df)__a + (__v2df)__b);
85 }
86 
87 /// Subtracts the lower double-precision value of the second operand
88 ///    from the lower double-precision value of the first operand and returns
89 ///    the difference in the lower 64 bits of the result. The upper 64 bits of
90 ///    the result are copied from the upper double-precision value of the first
91 ///    operand.
92 ///
93 /// \headerfile <x86intrin.h>
94 ///
95 /// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
96 ///
97 /// \param __a
98 ///    A 128-bit vector of [2 x double] containing the minuend.
99 /// \param __b
100 ///    A 128-bit vector of [2 x double] containing the subtrahend.
101 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
102 ///    difference of the lower 64 bits of both operands. The upper 64 bits are
103 ///    copied from the upper 64 bits of the first source operand.
104 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a,
105                                                         __m128d __b) {
106   __a[0] -= __b[0];
107   return __a;
108 }
109 
110 /// Subtracts two 128-bit vectors of [2 x double].
111 ///
112 /// \headerfile <x86intrin.h>
113 ///
114 /// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
115 ///
116 /// \param __a
117 ///    A 128-bit vector of [2 x double] containing the minuend.
118 /// \param __b
119 ///    A 128-bit vector of [2 x double] containing the subtrahend.
120 /// \returns A 128-bit vector of [2 x double] containing the differences between
121 ///    both operands.
122 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a,
123                                                         __m128d __b) {
124   return (__m128d)((__v2df)__a - (__v2df)__b);
125 }
126 
127 /// Multiplies lower double-precision values in both operands and returns
128 ///    the product in the lower 64 bits of the result. The upper 64 bits of the
129 ///    result are copied from the upper double-precision value of the first
130 ///    operand.
131 ///
132 /// \headerfile <x86intrin.h>
133 ///
134 /// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
135 ///
136 /// \param __a
137 ///    A 128-bit vector of [2 x double] containing one of the source operands.
138 /// \param __b
139 ///    A 128-bit vector of [2 x double] containing one of the source operands.
140 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
141 ///    product of the lower 64 bits of both operands. The upper 64 bits are
142 ///    copied from the upper 64 bits of the first source operand.
143 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a,
144                                                         __m128d __b) {
145   __a[0] *= __b[0];
146   return __a;
147 }
148 
149 /// Multiplies two 128-bit vectors of [2 x double].
150 ///
151 /// \headerfile <x86intrin.h>
152 ///
153 /// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
154 ///
155 /// \param __a
156 ///    A 128-bit vector of [2 x double] containing one of the operands.
157 /// \param __b
158 ///    A 128-bit vector of [2 x double] containing one of the operands.
159 /// \returns A 128-bit vector of [2 x double] containing the products of both
160 ///    operands.
161 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a,
162                                                         __m128d __b) {
163   return (__m128d)((__v2df)__a * (__v2df)__b);
164 }
165 
166 /// Divides the lower double-precision value of the first operand by the
167 ///    lower double-precision value of the second operand and returns the
168 ///    quotient in the lower 64 bits of the result. The upper 64 bits of the
169 ///    result are copied from the upper double-precision value of the first
170 ///    operand.
171 ///
172 /// \headerfile <x86intrin.h>
173 ///
174 /// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
175 ///
176 /// \param __a
177 ///    A 128-bit vector of [2 x double] containing the dividend.
178 /// \param __b
///    A 128-bit vector of [2 x double] containing the divisor.
180 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
181 ///    quotient of the lower 64 bits of both operands. The upper 64 bits are
182 ///    copied from the upper 64 bits of the first source operand.
183 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a,
184                                                         __m128d __b) {
185   __a[0] /= __b[0];
186   return __a;
187 }
188 
189 /// Performs an element-by-element division of two 128-bit vectors of
190 ///    [2 x double].
191 ///
192 /// \headerfile <x86intrin.h>
193 ///
194 /// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
195 ///
196 /// \param __a
197 ///    A 128-bit vector of [2 x double] containing the dividend.
198 /// \param __b
199 ///    A 128-bit vector of [2 x double] containing the divisor.
200 /// \returns A 128-bit vector of [2 x double] containing the quotients of both
201 ///    operands.
202 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a,
203                                                         __m128d __b) {
204   return (__m128d)((__v2df)__a / (__v2df)__b);
205 }
206 
207 /// Calculates the square root of the lower double-precision value of
208 ///    the second operand and returns it in the lower 64 bits of the result.
209 ///    The upper 64 bits of the result are copied from the upper
210 ///    double-precision value of the first operand.
211 ///
212 /// \headerfile <x86intrin.h>
213 ///
214 /// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
215 ///
216 /// \param __a
217 ///    A 128-bit vector of [2 x double] containing one of the operands. The
218 ///    upper 64 bits of this operand are copied to the upper 64 bits of the
219 ///    result.
220 /// \param __b
221 ///    A 128-bit vector of [2 x double] containing one of the operands. The
222 ///    square root is calculated using the lower 64 bits of this operand.
223 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
224 ///    square root of the lower 64 bits of operand \a __b, and whose upper 64
225 ///    bits are copied from the upper 64 bits of operand \a __a.
226 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
227                                                          __m128d __b) {
228   __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
229   return __extension__(__m128d){__c[0], __a[1]};
230 }
231 
/// Calculates the square root of each of the two values stored in a
///    128-bit vector of [2 x double].
234 ///
235 /// \headerfile <x86intrin.h>
236 ///
237 /// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
238 ///
239 /// \param __a
240 ///    A 128-bit vector of [2 x double].
241 /// \returns A 128-bit vector of [2 x double] containing the square roots of the
242 ///    values in the operand.
243 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) {
244   return __builtin_ia32_sqrtpd((__v2df)__a);
245 }
246 
247 /// Compares lower 64-bit double-precision values of both operands, and
248 ///    returns the lesser of the pair of values in the lower 64-bits of the
249 ///    result. The upper 64 bits of the result are copied from the upper
250 ///    double-precision value of the first operand.
251 ///
252 /// \headerfile <x86intrin.h>
253 ///
254 /// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
255 ///
256 /// \param __a
257 ///    A 128-bit vector of [2 x double] containing one of the operands. The
258 ///    lower 64 bits of this operand are used in the comparison.
259 /// \param __b
260 ///    A 128-bit vector of [2 x double] containing one of the operands. The
261 ///    lower 64 bits of this operand are used in the comparison.
262 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
263 ///    minimum value between both operands. The upper 64 bits are copied from
264 ///    the upper 64 bits of the first source operand.
265 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a,
266                                                         __m128d __b) {
267   return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
268 }
269 
270 /// Performs element-by-element comparison of the two 128-bit vectors of
271 ///    [2 x double] and returns the vector containing the lesser of each pair of
272 ///    values.
273 ///
274 /// \headerfile <x86intrin.h>
275 ///
276 /// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
277 ///
278 /// \param __a
279 ///    A 128-bit vector of [2 x double] containing one of the operands.
280 /// \param __b
281 ///    A 128-bit vector of [2 x double] containing one of the operands.
282 /// \returns A 128-bit vector of [2 x double] containing the minimum values
283 ///    between both operands.
284 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a,
285                                                         __m128d __b) {
286   return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
287 }
288 
289 /// Compares lower 64-bit double-precision values of both operands, and
290 ///    returns the greater of the pair of values in the lower 64-bits of the
291 ///    result. The upper 64 bits of the result are copied from the upper
292 ///    double-precision value of the first operand.
293 ///
294 /// \headerfile <x86intrin.h>
295 ///
296 /// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
297 ///
298 /// \param __a
299 ///    A 128-bit vector of [2 x double] containing one of the operands. The
300 ///    lower 64 bits of this operand are used in the comparison.
301 /// \param __b
302 ///    A 128-bit vector of [2 x double] containing one of the operands. The
303 ///    lower 64 bits of this operand are used in the comparison.
304 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
305 ///    maximum value between both operands. The upper 64 bits are copied from
306 ///    the upper 64 bits of the first source operand.
307 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a,
308                                                         __m128d __b) {
309   return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
310 }
311 
312 /// Performs element-by-element comparison of the two 128-bit vectors of
313 ///    [2 x double] and returns the vector containing the greater of each pair
314 ///    of values.
315 ///
316 /// \headerfile <x86intrin.h>
317 ///
318 /// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
319 ///
320 /// \param __a
321 ///    A 128-bit vector of [2 x double] containing one of the operands.
322 /// \param __b
323 ///    A 128-bit vector of [2 x double] containing one of the operands.
324 /// \returns A 128-bit vector of [2 x double] containing the maximum values
325 ///    between both operands.
326 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a,
327                                                         __m128d __b) {
328   return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
329 }
330 
331 /// Performs a bitwise AND of two 128-bit vectors of [2 x double].
332 ///
333 /// \headerfile <x86intrin.h>
334 ///
335 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
336 ///
337 /// \param __a
338 ///    A 128-bit vector of [2 x double] containing one of the source operands.
339 /// \param __b
340 ///    A 128-bit vector of [2 x double] containing one of the source operands.
341 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
342 ///    values between both operands.
343 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a,
344                                                         __m128d __b) {
345   return (__m128d)((__v2du)__a & (__v2du)__b);
346 }
347 
348 /// Performs a bitwise AND of two 128-bit vectors of [2 x double], using
349 ///    the one's complement of the values contained in the first source operand.
350 ///
351 /// \headerfile <x86intrin.h>
352 ///
353 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
354 ///
355 /// \param __a
356 ///    A 128-bit vector of [2 x double] containing the left source operand. The
357 ///    one's complement of this value is used in the bitwise AND.
358 /// \param __b
359 ///    A 128-bit vector of [2 x double] containing the right source operand.
360 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
361 ///    values in the second operand and the one's complement of the first
362 ///    operand.
363 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a,
364                                                            __m128d __b) {
365   return (__m128d)(~(__v2du)__a & (__v2du)__b);
366 }
367 
368 /// Performs a bitwise OR of two 128-bit vectors of [2 x double].
369 ///
370 /// \headerfile <x86intrin.h>
371 ///
372 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
373 ///
374 /// \param __a
375 ///    A 128-bit vector of [2 x double] containing one of the source operands.
376 /// \param __b
377 ///    A 128-bit vector of [2 x double] containing one of the source operands.
378 /// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
379 ///    values between both operands.
380 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a,
381                                                        __m128d __b) {
382   return (__m128d)((__v2du)__a | (__v2du)__b);
383 }
384 
385 /// Performs a bitwise XOR of two 128-bit vectors of [2 x double].
386 ///
387 /// \headerfile <x86intrin.h>
388 ///
389 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
390 ///
391 /// \param __a
392 ///    A 128-bit vector of [2 x double] containing one of the source operands.
393 /// \param __b
394 ///    A 128-bit vector of [2 x double] containing one of the source operands.
395 /// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
396 ///    values between both operands.
397 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a,
398                                                         __m128d __b) {
399   return (__m128d)((__v2du)__a ^ (__v2du)__b);
400 }
401 
402 /// Compares each of the corresponding double-precision values of the
403 ///    128-bit vectors of [2 x double] for equality. Each comparison yields 0x0
404 ///    for false, 0xFFFFFFFFFFFFFFFF for true.
405 ///
406 /// \headerfile <x86intrin.h>
407 ///
408 /// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
409 ///
410 /// \param __a
411 ///    A 128-bit vector of [2 x double].
412 /// \param __b
413 ///    A 128-bit vector of [2 x double].
414 /// \returns A 128-bit vector containing the comparison results.
415 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a,
416                                                           __m128d __b) {
417   return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
418 }
419 
420 /// Compares each of the corresponding double-precision values of the
421 ///    128-bit vectors of [2 x double] to determine if the values in the first
422 ///    operand are less than those in the second operand. Each comparison
423 ///    yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
424 ///
425 /// \headerfile <x86intrin.h>
426 ///
427 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
428 ///
429 /// \param __a
430 ///    A 128-bit vector of [2 x double].
431 /// \param __b
432 ///    A 128-bit vector of [2 x double].
433 /// \returns A 128-bit vector containing the comparison results.
434 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a,
435                                                           __m128d __b) {
436   return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
437 }
438 
439 /// Compares each of the corresponding double-precision values of the
440 ///    128-bit vectors of [2 x double] to determine if the values in the first
441 ///    operand are less than or equal to those in the second operand.
442 ///
443 ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
444 ///
445 /// \headerfile <x86intrin.h>
446 ///
447 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
448 ///
449 /// \param __a
450 ///    A 128-bit vector of [2 x double].
451 /// \param __b
452 ///    A 128-bit vector of [2 x double].
453 /// \returns A 128-bit vector containing the comparison results.
454 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a,
455                                                           __m128d __b) {
456   return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
457 }
458 
459 /// Compares each of the corresponding double-precision values of the
460 ///    128-bit vectors of [2 x double] to determine if the values in the first
461 ///    operand are greater than those in the second operand.
462 ///
463 ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
464 ///
465 /// \headerfile <x86intrin.h>
466 ///
467 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
468 ///
469 /// \param __a
470 ///    A 128-bit vector of [2 x double].
471 /// \param __b
472 ///    A 128-bit vector of [2 x double].
473 /// \returns A 128-bit vector containing the comparison results.
474 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a,
475                                                           __m128d __b) {
476   return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
477 }
478 
479 /// Compares each of the corresponding double-precision values of the
480 ///    128-bit vectors of [2 x double] to determine if the values in the first
481 ///    operand are greater than or equal to those in the second operand.
482 ///
483 ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
484 ///
485 /// \headerfile <x86intrin.h>
486 ///
487 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
488 ///
489 /// \param __a
490 ///    A 128-bit vector of [2 x double].
491 /// \param __b
492 ///    A 128-bit vector of [2 x double].
493 /// \returns A 128-bit vector containing the comparison results.
494 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a,
495                                                           __m128d __b) {
496   return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
497 }
498 
499 /// Compares each of the corresponding double-precision values of the
500 ///    128-bit vectors of [2 x double] to determine if the values in the first
501 ///    operand are ordered with respect to those in the second operand.
502 ///
503 ///    A pair of double-precision values are "ordered" with respect to each
504 ///    other if neither value is a NaN. Each comparison yields 0x0 for false,
505 ///    0xFFFFFFFFFFFFFFFF for true.
506 ///
507 /// \headerfile <x86intrin.h>
508 ///
509 /// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
510 ///
511 /// \param __a
512 ///    A 128-bit vector of [2 x double].
513 /// \param __b
514 ///    A 128-bit vector of [2 x double].
515 /// \returns A 128-bit vector containing the comparison results.
516 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a,
517                                                            __m128d __b) {
518   return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
519 }
520 
521 /// Compares each of the corresponding double-precision values of the
522 ///    128-bit vectors of [2 x double] to determine if the values in the first
523 ///    operand are unordered with respect to those in the second operand.
524 ///
525 ///    A pair of double-precision values are "unordered" with respect to each
526 ///    other if one or both values are NaN. Each comparison yields 0x0 for
527 ///    false, 0xFFFFFFFFFFFFFFFF for true.
528 ///
529 /// \headerfile <x86intrin.h>
530 ///
531 /// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
532 ///   instruction.
533 ///
534 /// \param __a
535 ///    A 128-bit vector of [2 x double].
536 /// \param __b
537 ///    A 128-bit vector of [2 x double].
538 /// \returns A 128-bit vector containing the comparison results.
539 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a,
540                                                              __m128d __b) {
541   return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
542 }
543 
544 /// Compares each of the corresponding double-precision values of the
545 ///    128-bit vectors of [2 x double] to determine if the values in the first
546 ///    operand are unequal to those in the second operand.
547 ///
548 ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
549 ///
550 /// \headerfile <x86intrin.h>
551 ///
552 /// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
553 ///
554 /// \param __a
555 ///    A 128-bit vector of [2 x double].
556 /// \param __b
557 ///    A 128-bit vector of [2 x double].
558 /// \returns A 128-bit vector containing the comparison results.
559 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a,
560                                                            __m128d __b) {
561   return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
562 }
563 
564 /// Compares each of the corresponding double-precision values of the
565 ///    128-bit vectors of [2 x double] to determine if the values in the first
566 ///    operand are not less than those in the second operand.
567 ///
568 ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
569 ///
570 /// \headerfile <x86intrin.h>
571 ///
572 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
573 ///
574 /// \param __a
575 ///    A 128-bit vector of [2 x double].
576 /// \param __b
577 ///    A 128-bit vector of [2 x double].
578 /// \returns A 128-bit vector containing the comparison results.
579 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a,
580                                                            __m128d __b) {
581   return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
582 }
583 
584 /// Compares each of the corresponding double-precision values of the
585 ///    128-bit vectors of [2 x double] to determine if the values in the first
586 ///    operand are not less than or equal to those in the second operand.
587 ///
588 ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
589 ///
590 /// \headerfile <x86intrin.h>
591 ///
592 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
593 ///
594 /// \param __a
595 ///    A 128-bit vector of [2 x double].
596 /// \param __b
597 ///    A 128-bit vector of [2 x double].
598 /// \returns A 128-bit vector containing the comparison results.
599 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a,
600                                                            __m128d __b) {
601   return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
602 }
603 
604 /// Compares each of the corresponding double-precision values of the
605 ///    128-bit vectors of [2 x double] to determine if the values in the first
606 ///    operand are not greater than those in the second operand.
607 ///
608 ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
609 ///
610 /// \headerfile <x86intrin.h>
611 ///
612 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
613 ///
614 /// \param __a
615 ///    A 128-bit vector of [2 x double].
616 /// \param __b
617 ///    A 128-bit vector of [2 x double].
618 /// \returns A 128-bit vector containing the comparison results.
619 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a,
620                                                            __m128d __b) {
621   return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
622 }
623 
624 /// Compares each of the corresponding double-precision values of the
625 ///    128-bit vectors of [2 x double] to determine if the values in the first
626 ///    operand are not greater than or equal to those in the second operand.
627 ///
628 ///    Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
629 ///
630 /// \headerfile <x86intrin.h>
631 ///
632 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
633 ///
634 /// \param __a
635 ///    A 128-bit vector of [2 x double].
636 /// \param __b
637 ///    A 128-bit vector of [2 x double].
638 /// \returns A 128-bit vector containing the comparison results.
639 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a,
640                                                            __m128d __b) {
641   return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
642 }
643 
644 /// Compares the lower double-precision floating-point values in each of
645 ///    the two 128-bit floating-point vectors of [2 x double] for equality.
646 ///
647 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
648 ///
649 /// \headerfile <x86intrin.h>
650 ///
651 /// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
652 ///
653 /// \param __a
654 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
655 ///    compared to the lower double-precision value of \a __b.
656 /// \param __b
657 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
658 ///    compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
661 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a,
662                                                           __m128d __b) {
663   return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
664 }
665 
666 /// Compares the lower double-precision floating-point values in each of
667 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
668 ///    the value in the first parameter is less than the corresponding value in
669 ///    the second parameter.
670 ///
671 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
672 ///
673 /// \headerfile <x86intrin.h>
674 ///
675 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
676 ///
677 /// \param __a
678 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
679 ///    compared to the lower double-precision value of \a __b.
680 /// \param __b
681 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
682 ///    compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
685 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a,
686                                                           __m128d __b) {
687   return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
688 }
689 
690 /// Compares the lower double-precision floating-point values in each of
691 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
692 ///    the value in the first parameter is less than or equal to the
693 ///    corresponding value in the second parameter.
694 ///
695 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
696 ///
697 /// \headerfile <x86intrin.h>
698 ///
699 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
700 ///
701 /// \param __a
702 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
703 ///    compared to the lower double-precision value of \a __b.
704 /// \param __b
705 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
706 ///    compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
709 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a,
710                                                           __m128d __b) {
711   return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
712 }
713 
714 /// Compares the lower double-precision floating-point values in each of
715 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
716 ///    the value in the first parameter is greater than the corresponding value
717 ///    in the second parameter.
718 ///
719 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
720 ///
721 /// \headerfile <x86intrin.h>
722 ///
723 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
724 ///
725 /// \param __a
726 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
727 ///     compared to the lower double-precision value of \a __b.
728 /// \param __b
729 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
730 ///     compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
///     results. The upper 64 bits are copied from the upper 64 bits of \a __a.
733 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a,
734                                                           __m128d __b) {
735   __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
736   return __extension__(__m128d){__c[0], __a[1]};
737 }
738 
739 /// Compares the lower double-precision floating-point values in each of
740 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
741 ///    the value in the first parameter is greater than or equal to the
742 ///    corresponding value in the second parameter.
743 ///
744 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
745 ///
746 /// \headerfile <x86intrin.h>
747 ///
748 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
749 ///
750 /// \param __a
751 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
752 ///    compared to the lower double-precision value of \a __b.
753 /// \param __b
754 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
755 ///    compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
758 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a,
759                                                           __m128d __b) {
760   __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
761   return __extension__(__m128d){__c[0], __a[1]};
762 }
763 
764 /// Compares the lower double-precision floating-point values in each of
765 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
766 ///    the value in the first parameter is "ordered" with respect to the
767 ///    corresponding value in the second parameter.
768 ///
769 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
770 ///    of double-precision values are "ordered" with respect to each other if
771 ///    neither value is a NaN.
772 ///
773 /// \headerfile <x86intrin.h>
774 ///
775 /// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
776 ///
777 /// \param __a
778 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
779 ///    compared to the lower double-precision value of \a __b.
780 /// \param __b
781 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
782 ///    compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
785 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a,
786                                                            __m128d __b) {
787   return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
788 }
789 
790 /// Compares the lower double-precision floating-point values in each of
791 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
792 ///    the value in the first parameter is "unordered" with respect to the
793 ///    corresponding value in the second parameter.
794 ///
795 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
796 ///    of double-precision values are "unordered" with respect to each other if
797 ///    one or both values are NaN.
798 ///
799 /// \headerfile <x86intrin.h>
800 ///
801 /// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
802 ///   instruction.
803 ///
804 /// \param __a
805 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
806 ///    compared to the lower double-precision value of \a __b.
807 /// \param __b
808 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
809 ///    compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
812 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a,
813                                                              __m128d __b) {
814   return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
815 }
816 
817 /// Compares the lower double-precision floating-point values in each of
818 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
819 ///    the value in the first parameter is unequal to the corresponding value in
820 ///    the second parameter.
821 ///
822 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
823 ///
824 /// \headerfile <x86intrin.h>
825 ///
826 /// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
827 ///
828 /// \param __a
829 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
830 ///    compared to the lower double-precision value of \a __b.
831 /// \param __b
832 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
833 ///    compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
836 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a,
837                                                            __m128d __b) {
838   return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
839 }
840 
841 /// Compares the lower double-precision floating-point values in each of
842 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
843 ///    the value in the first parameter is not less than the corresponding
844 ///    value in the second parameter.
845 ///
846 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
847 ///
848 /// \headerfile <x86intrin.h>
849 ///
850 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
851 ///
852 /// \param __a
853 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
854 ///    compared to the lower double-precision value of \a __b.
855 /// \param __b
856 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
857 ///    compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
860 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a,
861                                                            __m128d __b) {
862   return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
863 }
864 
865 /// Compares the lower double-precision floating-point values in each of
866 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
867 ///    the value in the first parameter is not less than or equal to the
868 ///    corresponding value in the second parameter.
869 ///
870 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
871 ///
872 /// \headerfile <x86intrin.h>
873 ///
874 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
875 ///
876 /// \param __a
877 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
878 ///    compared to the lower double-precision value of \a __b.
879 /// \param __b
880 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
881 ///    compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
884 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a,
885                                                            __m128d __b) {
886   return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
887 }
888 
889 /// Compares the lower double-precision floating-point values in each of
890 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
891 ///    the value in the first parameter is not greater than the corresponding
892 ///    value in the second parameter.
893 ///
894 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
895 ///
896 /// \headerfile <x86intrin.h>
897 ///
898 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
899 ///
900 /// \param __a
901 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
902 ///    compared to the lower double-precision value of \a __b.
903 /// \param __b
904 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
905 ///    compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
908 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a,
909                                                            __m128d __b) {
910   __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
911   return __extension__(__m128d){__c[0], __a[1]};
912 }
913 
914 /// Compares the lower double-precision floating-point values in each of
915 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
916 ///    the value in the first parameter is not greater than or equal to the
917 ///    corresponding value in the second parameter.
918 ///
919 ///    The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
920 ///
921 /// \headerfile <x86intrin.h>
922 ///
923 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
924 ///
925 /// \param __a
926 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
927 ///    compared to the lower double-precision value of \a __b.
928 /// \param __b
929 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
930 ///    compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contain the comparison
///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
933 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a,
934                                                            __m128d __b) {
935   __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
936   return __extension__(__m128d){__c[0], __a[1]};
937 }
938 
939 /// Compares the lower double-precision floating-point values in each of
940 ///    the two 128-bit floating-point vectors of [2 x double] for equality.
941 ///
942 ///    The comparison yields 0 for false, 1 for true. If either of the two
943 ///    lower double-precision values is NaN, 0 is returned.
944 ///
945 /// \headerfile <x86intrin.h>
946 ///
947 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
948 ///
949 /// \param __a
950 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
951 ///    compared to the lower double-precision value of \a __b.
952 /// \param __b
953 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
954 ///    compared to the lower double-precision value of \a __a.
955 /// \returns An integer containing the comparison results. If either of the two
956 ///    lower double-precision values is NaN, 0 is returned.
957 static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a,
958                                                        __m128d __b) {
959   return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
960 }
961 
962 /// Compares the lower double-precision floating-point values in each of
963 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
964 ///    the value in the first parameter is less than the corresponding value in
965 ///    the second parameter.
966 ///
967 ///    The comparison yields 0 for false, 1 for true. If either of the two
968 ///    lower double-precision values is NaN, 0 is returned.
969 ///
970 /// \headerfile <x86intrin.h>
971 ///
972 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
973 ///
974 /// \param __a
975 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
976 ///    compared to the lower double-precision value of \a __b.
977 /// \param __b
978 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
979 ///    compared to the lower double-precision value of \a __a.
980 /// \returns An integer containing the comparison results. If either of the two
981 ///     lower double-precision values is NaN, 0 is returned.
982 static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a,
983                                                        __m128d __b) {
984   return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
985 }
986 
987 /// Compares the lower double-precision floating-point values in each of
988 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
989 ///    the value in the first parameter is less than or equal to the
990 ///    corresponding value in the second parameter.
991 ///
992 ///    The comparison yields 0 for false, 1 for true. If either of the two
993 ///    lower double-precision values is NaN, 0 is returned.
994 ///
995 /// \headerfile <x86intrin.h>
996 ///
997 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
998 ///
999 /// \param __a
1000 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1001 ///    compared to the lower double-precision value of \a __b.
1002 /// \param __b
1003 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
1004 ///     compared to the lower double-precision value of \a __a.
1005 /// \returns An integer containing the comparison results. If either of the two
1006 ///     lower double-precision values is NaN, 0 is returned.
1007 static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a,
1008                                                        __m128d __b) {
1009   return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
1010 }
1011 
1012 /// Compares the lower double-precision floating-point values in each of
1013 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1014 ///    the value in the first parameter is greater than the corresponding value
1015 ///    in the second parameter.
1016 ///
1017 ///    The comparison yields 0 for false, 1 for true. If either of the two
1018 ///    lower double-precision values is NaN, 0 is returned.
1019 ///
1020 /// \headerfile <x86intrin.h>
1021 ///
1022 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1023 ///
1024 /// \param __a
1025 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1026 ///    compared to the lower double-precision value of \a __b.
1027 /// \param __b
1028 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1029 ///    compared to the lower double-precision value of \a __a.
1030 /// \returns An integer containing the comparison results. If either of the two
1031 ///     lower double-precision values is NaN, 0 is returned.
1032 static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a,
1033                                                        __m128d __b) {
1034   return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
1035 }
1036 
1037 /// Compares the lower double-precision floating-point values in each of
1038 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1039 ///    the value in the first parameter is greater than or equal to the
1040 ///    corresponding value in the second parameter.
1041 ///
1042 ///    The comparison yields 0 for false, 1 for true. If either of the two
1043 ///    lower double-precision values is NaN, 0 is returned.
1044 ///
1045 /// \headerfile <x86intrin.h>
1046 ///
1047 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1048 ///
1049 /// \param __a
1050 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1051 ///    compared to the lower double-precision value of \a __b.
1052 /// \param __b
1053 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1054 ///    compared to the lower double-precision value of \a __a.
1055 /// \returns An integer containing the comparison results. If either of the two
1056 ///    lower double-precision values is NaN, 0 is returned.
1057 static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a,
1058                                                        __m128d __b) {
1059   return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
1060 }
1061 
1062 /// Compares the lower double-precision floating-point values in each of
1063 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1064 ///    the value in the first parameter is unequal to the corresponding value in
1065 ///    the second parameter.
1066 ///
1067 ///    The comparison yields 0 for false, 1 for true. If either of the two
1068 ///    lower double-precision values is NaN, 1 is returned.
1069 ///
1070 /// \headerfile <x86intrin.h>
1071 ///
1072 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1073 ///
1074 /// \param __a
1075 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1076 ///    compared to the lower double-precision value of \a __b.
1077 /// \param __b
1078 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1079 ///    compared to the lower double-precision value of \a __a.
1080 /// \returns An integer containing the comparison results. If either of the two
1081 ///     lower double-precision values is NaN, 1 is returned.
1082 static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a,
1083                                                         __m128d __b) {
1084   return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
1085 }
1086 
1087 /// Compares the lower double-precision floating-point values in each of
1088 ///    the two 128-bit floating-point vectors of [2 x double] for equality. The
1089 ///    comparison yields 0 for false, 1 for true.
1090 ///
1091 ///    If either of the two lower double-precision values is NaN, 0 is returned.
1092 ///
1093 /// \headerfile <x86intrin.h>
1094 ///
1095 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1096 ///
1097 /// \param __a
1098 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1099 ///    compared to the lower double-precision value of \a __b.
1100 /// \param __b
1101 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1102 ///    compared to the lower double-precision value of \a __a.
1103 /// \returns An integer containing the comparison results. If either of the two
1104 ///    lower double-precision values is NaN, 0 is returned.
1105 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a,
1106                                                         __m128d __b) {
1107   return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
1108 }
1109 
1110 /// Compares the lower double-precision floating-point values in each of
1111 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1112 ///    the value in the first parameter is less than the corresponding value in
1113 ///    the second parameter.
1114 ///
1115 ///    The comparison yields 0 for false, 1 for true. If either of the two lower
1116 ///    double-precision values is NaN, 0 is returned.
1117 ///
1118 /// \headerfile <x86intrin.h>
1119 ///
1120 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1121 ///
1122 /// \param __a
1123 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1124 ///    compared to the lower double-precision value of \a __b.
1125 /// \param __b
1126 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1127 ///    compared to the lower double-precision value of \a __a.
1128 /// \returns An integer containing the comparison results. If either of the two
1129 ///    lower double-precision values is NaN, 0 is returned.
1130 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a,
1131                                                         __m128d __b) {
1132   return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
1133 }
1134 
1135 /// Compares the lower double-precision floating-point values in each of
1136 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1137 ///    the value in the first parameter is less than or equal to the
1138 ///    corresponding value in the second parameter.
1139 ///
1140 ///    The comparison yields 0 for false, 1 for true. If either of the two lower
1141 ///    double-precision values is NaN, 0 is returned.
1142 ///
1143 /// \headerfile <x86intrin.h>
1144 ///
1145 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1146 ///
1147 /// \param __a
1148 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1149 ///    compared to the lower double-precision value of \a __b.
1150 /// \param __b
1151 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
1152 ///     compared to the lower double-precision value of \a __a.
1153 /// \returns An integer containing the comparison results. If either of the two
1154 ///     lower double-precision values is NaN, 0 is returned.
1155 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a,
1156                                                         __m128d __b) {
1157   return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
1158 }
1159 
1160 /// Compares the lower double-precision floating-point values in each of
1161 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1162 ///    the value in the first parameter is greater than the corresponding value
1163 ///    in the second parameter.
1164 ///
1165 ///    The comparison yields 0 for false, 1 for true. If either of the two lower
1166 ///    double-precision values is NaN, 0 is returned.
1167 ///
1168 /// \headerfile <x86intrin.h>
1169 ///
1170 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1171 ///
1172 /// \param __a
1173 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1174 ///    compared to the lower double-precision value of \a __b.
1175 /// \param __b
1176 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
1177 ///     compared to the lower double-precision value of \a __a.
1178 /// \returns An integer containing the comparison results. If either of the two
1179 ///     lower double-precision values is NaN, 0 is returned.
1180 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a,
1181                                                         __m128d __b) {
1182   return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
1183 }
1184 
1185 /// Compares the lower double-precision floating-point values in each of
1186 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1187 ///    the value in the first parameter is greater than or equal to the
1188 ///    corresponding value in the second parameter.
1189 ///
1190 ///    The comparison yields 0 for false, 1 for true.  If either of the two
1191 ///    lower double-precision values is NaN, 0 is returned.
1192 ///
1193 /// \headerfile <x86intrin.h>
1194 ///
1195 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1196 ///
1197 /// \param __a
1198 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1199 ///    compared to the lower double-precision value of \a __b.
1200 /// \param __b
1201 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1202 ///    compared to the lower double-precision value of \a __a.
1203 /// \returns An integer containing the comparison results. If either of the two
1204 ///    lower double-precision values is NaN, 0 is returned.
1205 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a,
1206                                                         __m128d __b) {
1207   return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
1208 }
1209 
1210 /// Compares the lower double-precision floating-point values in each of
1211 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1212 ///    the value in the first parameter is unequal to the corresponding value in
1213 ///    the second parameter.
1214 ///
1215 ///    The comparison yields 0 for false, 1 for true. If either of the two lower
1216 ///    double-precision values is NaN, 1 is returned.
1217 ///
1218 /// \headerfile <x86intrin.h>
1219 ///
1220 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1221 ///
1222 /// \param __a
1223 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1224 ///    compared to the lower double-precision value of \a __b.
1225 /// \param __b
1226 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1227 ///    compared to the lower double-precision value of \a __a.
1228 /// \returns An integer containing the comparison result. If either of the two
1229 ///    lower double-precision values is NaN, 1 is returned.
1230 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a,
1231                                                          __m128d __b) {
1232   return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
1233 }
1234 
1235 /// Converts the two double-precision floating-point elements of a
1236 ///    128-bit vector of [2 x double] into two single-precision floating-point
1237 ///    values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
1238 ///    The upper 64 bits of the result vector are set to zero.
1239 ///
1240 /// \headerfile <x86intrin.h>
1241 ///
1242 /// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
1243 ///
1244 /// \param __a
1245 ///    A 128-bit vector of [2 x double].
1246 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1247 ///    converted values. The upper 64 bits are set to zero.
1248 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a) {
1249   return __builtin_ia32_cvtpd2ps((__v2df)__a);
1250 }
1251 
1252 /// Converts the lower two single-precision floating-point elements of a
1253 ///    128-bit vector of [4 x float] into two double-precision floating-point
1254 ///    values, returned in a 128-bit vector of [2 x double]. The upper two
1255 ///    elements of the input vector are unused.
1256 ///
1257 /// \headerfile <x86intrin.h>
1258 ///
1259 /// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
1260 ///
1261 /// \param __a
1262 ///    A 128-bit vector of [4 x float]. The lower two single-precision
1263 ///    floating-point elements are converted to double-precision values. The
1264 ///    upper two elements are unused.
1265 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1266 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a) {
1267   return (__m128d) __builtin_convertvector(
1268       __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
1269 }
1270 
1271 /// Converts the lower two integer elements of a 128-bit vector of
1272 ///    [4 x i32] into two double-precision floating-point values, returned in a
1273 ///    128-bit vector of [2 x double].
1274 ///
1275 ///    The upper two elements of the input vector are unused.
1276 ///
1277 /// \headerfile <x86intrin.h>
1278 ///
1279 /// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
1280 ///
1281 /// \param __a
1282 ///    A 128-bit integer vector of [4 x i32]. The lower two integer elements are
1283 ///    converted to double-precision values.
1284 ///
1285 ///    The upper two elements are unused.
1286 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1287 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a) {
1288   return (__m128d) __builtin_convertvector(
1289       __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
1290 }
1291 
1292 /// Converts the two double-precision floating-point elements of a
1293 ///    128-bit vector of [2 x double] into two signed 32-bit integer values,
1294 ///    returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
1295 ///    64 bits of the result vector are set to zero.
1296 ///
1297 /// \headerfile <x86intrin.h>
1298 ///
1299 /// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
1300 ///
1301 /// \param __a
1302 ///    A 128-bit vector of [2 x double].
1303 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1304 ///    converted values. The upper 64 bits are set to zero.
1305 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a) {
1306   return __builtin_ia32_cvtpd2dq((__v2df)__a);
1307 }
1308 
1309 /// Converts the low-order element of a 128-bit vector of [2 x double]
1310 ///    into a 32-bit signed integer value.
1311 ///
1312 /// \headerfile <x86intrin.h>
1313 ///
1314 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
1315 ///
1316 /// \param __a
1317 ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1318 ///    conversion.
1319 /// \returns A 32-bit signed integer containing the converted value.
1320 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a) {
1321   return __builtin_ia32_cvtsd2si((__v2df)__a);
1322 }
1323 
1324 /// Converts the lower double-precision floating-point element of a
1325 ///    128-bit vector of [2 x double], in the second parameter, into a
1326 ///    single-precision floating-point value, returned in the lower 32 bits of a
1327 ///    128-bit vector of [4 x float]. The upper 96 bits of the result vector are
1328 ///    copied from the upper 96 bits of the first parameter.
1329 ///
1330 /// \headerfile <x86intrin.h>
1331 ///
1332 /// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
1333 ///
1334 /// \param __a
1335 ///    A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
1336 ///    copied to the upper 96 bits of the result.
1337 /// \param __b
1338 ///    A 128-bit vector of [2 x double]. The lower double-precision
1339 ///    floating-point element is used in the conversion.
1340 /// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
1341 ///    converted value from the second parameter. The upper 96 bits are copied
1342 ///    from the upper 96 bits of the first parameter.
1343 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a,
1344                                                          __m128d __b) {
1345   return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
1346 }
1347 
1348 /// Converts a 32-bit signed integer value, in the second parameter, into
1349 ///    a double-precision floating-point value, returned in the lower 64 bits of
1350 ///    a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1351 ///    are copied from the upper 64 bits of the first parameter.
1352 ///
1353 /// \headerfile <x86intrin.h>
1354 ///
1355 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
1356 ///
1357 /// \param __a
1358 ///    A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1359 ///    copied to the upper 64 bits of the result.
1360 /// \param __b
1361 ///    A 32-bit signed integer containing the value to be converted.
1362 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1363 ///    converted value from the second parameter. The upper 64 bits are copied
1364 ///    from the upper 64 bits of the first parameter.
1365 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a,
1366                                                             int __b) {
1367   __a[0] = __b;
1368   return __a;
1369 }
1370 
1371 /// Converts the lower single-precision floating-point element of a
1372 ///    128-bit vector of [4 x float], in the second parameter, into a
1373 ///    double-precision floating-point value, returned in the lower 64 bits of
1374 ///    a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1375 ///    are copied from the upper 64 bits of the first parameter.
1376 ///
1377 /// \headerfile <x86intrin.h>
1378 ///
1379 /// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
1380 ///
1381 /// \param __a
1382 ///    A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1383 ///    copied to the upper 64 bits of the result.
1384 /// \param __b
1385 ///    A 128-bit vector of [4 x float]. The lower single-precision
1386 ///    floating-point element is used in the conversion.
1387 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1388 ///    converted value from the second parameter. The upper 64 bits are copied
1389 ///    from the upper 64 bits of the first parameter.
1390 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a,
1391                                                           __m128 __b) {
1392   __a[0] = __b[0];
1393   return __a;
1394 }
1395 
1396 /// Converts the two double-precision floating-point elements of a
1397 ///    128-bit vector of [2 x double] into two signed 32-bit integer values,
1398 ///    returned in the lower 64 bits of a 128-bit vector of [4 x i32].
1399 ///
1400 ///    If the result of either conversion is inexact, the result is truncated
1401 ///    (rounded towards zero) regardless of the current MXCSR setting. The upper
1402 ///    64 bits of the result vector are set to zero.
1403 ///
1404 /// \headerfile <x86intrin.h>
1405 ///
1406 /// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
1407 ///   instruction.
1408 ///
1409 /// \param __a
1410 ///    A 128-bit vector of [2 x double].
1411 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1412 ///    converted values. The upper 64 bits are set to zero.
1413 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a) {
1414   return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
1415 }
1416 
1417 /// Converts the low-order element of a [2 x double] vector into a 32-bit
1418 ///    signed integer value, truncating the result when it is inexact.
1419 ///
1420 /// \headerfile <x86intrin.h>
1421 ///
1422 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
1423 ///   instruction.
1424 ///
1425 /// \param __a
1426 ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1427 ///    conversion.
1428 /// \returns A 32-bit signed integer containing the converted value.
1429 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) {
1430   return __builtin_ia32_cvttsd2si((__v2df)__a);
1431 }
1432 
1433 /// Converts the two double-precision floating-point elements of a
1434 ///    128-bit vector of [2 x double] into two signed 32-bit integer values,
1435 ///    returned in a 64-bit vector of [2 x i32].
1436 ///
1437 /// \headerfile <x86intrin.h>
1438 ///
1439 /// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
1440 ///
1441 /// \param __a
1442 ///    A 128-bit vector of [2 x double].
1443 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
1444 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) {
1445   return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
1446 }
1447 
1448 /// Converts the two double-precision floating-point elements of a
1449 ///    128-bit vector of [2 x double] into two signed 32-bit integer values,
1450 ///    returned in a 64-bit vector of [2 x i32].
1451 ///
1452 ///    If the result of either conversion is inexact, the result is truncated
1453 ///    (rounded towards zero) regardless of the current MXCSR setting.
1454 ///
1455 /// \headerfile <x86intrin.h>
1456 ///
1457 /// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
1458 ///
1459 /// \param __a
1460 ///    A 128-bit vector of [2 x double].
1461 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
1462 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a) {
1463   return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
1464 }
1465 
1466 /// Converts the two signed 32-bit integer elements of a 64-bit vector of
1467 ///    [2 x i32] into two double-precision floating-point values, returned in a
1468 ///    128-bit vector of [2 x double].
1469 ///
1470 /// \headerfile <x86intrin.h>
1471 ///
1472 /// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
1473 ///
1474 /// \param __a
1475 ///    A 64-bit vector of [2 x i32].
1476 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1477 static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a) {
1478   return __builtin_ia32_cvtpi2pd((__v2si)__a);
1479 }
1480 
1481 /// Returns the low-order element of a 128-bit vector of [2 x double] as
1482 ///    a double-precision floating-point value.
1483 ///
1484 /// \headerfile <x86intrin.h>
1485 ///
1486 /// This intrinsic has no corresponding instruction.
1487 ///
1488 /// \param __a
1489 ///    A 128-bit vector of [2 x double]. The lower 64 bits are returned.
1490 /// \returns A double-precision floating-point value copied from the lower 64
1491 ///    bits of \a __a.
1492 static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a) {
1493   return __a[0];
1494 }
1495 
1496 /// Loads a 128-bit floating-point vector of [2 x double] from an aligned
1497 ///    memory location.
1498 ///
1499 /// \headerfile <x86intrin.h>
1500 ///
1501 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
1502 ///
1503 /// \param __dp
1504 ///    A pointer to a 128-bit memory location. The address of the memory
1505 ///    location has to be 16-byte aligned.
1506 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
1507 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp) {
1508   return *(const __m128d *)__dp;
1509 }
1510 
1511 /// Loads a double-precision floating-point value from a specified memory
1512 ///    location and duplicates it to both vector elements of a 128-bit vector of
1513 ///    [2 x double].
1514 ///
1515 /// \headerfile <x86intrin.h>
1516 ///
1517 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
1518 ///
1519 /// \param __dp
1520 ///    A pointer to a memory location containing a double-precision value.
1521 /// \returns A 128-bit vector of [2 x double] containing the loaded and
1522 ///    duplicated values.
1523 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp) {
1524   struct __mm_load1_pd_struct {
1525     double __u;
1526   } __attribute__((__packed__, __may_alias__));
1527   double __u = ((const struct __mm_load1_pd_struct *)__dp)->__u;
1528   return __extension__(__m128d){__u, __u};
1529 }
1530 
/* Alternate spelling of _mm_load1_pd(); the argument is forwarded as-is. */
#define _mm_load_pd1(__p) _mm_load1_pd((__p))
1532 
1533 /// Loads two double-precision values, in reverse order, from an aligned
1534 ///    memory location into a 128-bit vector of [2 x double].
1535 ///
1536 /// \headerfile <x86intrin.h>
1537 ///
1538 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
1539 /// needed shuffling instructions. In AVX mode, the shuffling may be combined
1540 /// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
1541 ///
1542 /// \param __dp
1543 ///    A 16-byte aligned pointer to an array of double-precision values to be
1544 ///    loaded in reverse order.
1545 /// \returns A 128-bit vector of [2 x double] containing the reversed loaded
1546 ///    values.
1547 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp) {
1548   __m128d __u = *(const __m128d *)__dp;
1549   return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
1550 }
1551 
1552 /// Loads a 128-bit floating-point vector of [2 x double] from an
1553 ///    unaligned memory location.
1554 ///
1555 /// \headerfile <x86intrin.h>
1556 ///
1557 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1558 ///
1559 /// \param __dp
1560 ///    A pointer to a 128-bit memory location. The address of the memory
1561 ///    location does not have to be aligned.
1562 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
1563 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp) {
1564   struct __loadu_pd {
1565     __m128d_u __v;
1566   } __attribute__((__packed__, __may_alias__));
1567   return ((const struct __loadu_pd *)__dp)->__v;
1568 }
1569 
1570 /// Loads a 64-bit integer value to the low element of a 128-bit integer
1571 ///    vector and clears the upper element.
1572 ///
1573 /// \headerfile <x86intrin.h>
1574 ///
1575 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1576 ///
1577 /// \param __a
1578 ///    A pointer to a 64-bit memory location. The address of the memory
1579 ///    location does not have to be aligned.
1580 /// \returns A 128-bit vector of [2 x i64] containing the loaded value.
1581 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a) {
1582   struct __loadu_si64 {
1583     long long __v;
1584   } __attribute__((__packed__, __may_alias__));
1585   long long __u = ((const struct __loadu_si64 *)__a)->__v;
1586   return __extension__(__m128i)(__v2di){__u, 0LL};
1587 }
1588 
1589 /// Loads a 32-bit integer value to the low element of a 128-bit integer
1590 ///    vector and clears the upper element.
1591 ///
1592 /// \headerfile <x86intrin.h>
1593 ///
1594 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
1595 ///
1596 /// \param __a
1597 ///    A pointer to a 32-bit memory location. The address of the memory
1598 ///    location does not have to be aligned.
1599 /// \returns A 128-bit vector of [4 x i32] containing the loaded value.
1600 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si32(void const *__a) {
1601   struct __loadu_si32 {
1602     int __v;
1603   } __attribute__((__packed__, __may_alias__));
1604   int __u = ((const struct __loadu_si32 *)__a)->__v;
1605   return __extension__(__m128i)(__v4si){__u, 0, 0, 0};
1606 }
1607 
1608 /// Loads a 16-bit integer value to the low element of a 128-bit integer
1609 ///    vector and clears the upper element.
1610 ///
1611 /// \headerfile <x86intrin.h>
1612 ///
1613 /// This intrinsic does not correspond to a specific instruction.
1614 ///
1615 /// \param __a
1616 ///    A pointer to a 16-bit memory location. The address of the memory
1617 ///    location does not have to be aligned.
1618 /// \returns A 128-bit vector of [8 x i16] containing the loaded value.
1619 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si16(void const *__a) {
1620   struct __loadu_si16 {
1621     short __v;
1622   } __attribute__((__packed__, __may_alias__));
1623   short __u = ((const struct __loadu_si16 *)__a)->__v;
1624   return __extension__(__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
1625 }
1626 
1627 /// Loads a 64-bit double-precision value to the low element of a
1628 ///    128-bit integer vector and clears the upper element.
1629 ///
1630 /// \headerfile <x86intrin.h>
1631 ///
1632 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1633 ///
1634 /// \param __dp
1635 ///    A pointer to a memory location containing a double-precision value.
1636 ///    The address of the memory location does not have to be aligned.
1637 /// \returns A 128-bit vector of [2 x double] containing the loaded value.
1638 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp) {
1639   struct __mm_load_sd_struct {
1640     double __u;
1641   } __attribute__((__packed__, __may_alias__));
1642   double __u = ((const struct __mm_load_sd_struct *)__dp)->__u;
1643   return __extension__(__m128d){__u, 0};
1644 }
1645 
1646 /// Loads a double-precision value into the high-order bits of a 128-bit
1647 ///    vector of [2 x double]. The low-order bits are copied from the low-order
1648 ///    bits of the first operand.
1649 ///
1650 /// \headerfile <x86intrin.h>
1651 ///
1652 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1653 ///
1654 /// \param __a
1655 ///    A 128-bit vector of [2 x double]. \n
1656 ///    Bits [63:0] are written to bits [63:0] of the result.
1657 /// \param __dp
1658 ///    A pointer to a 64-bit memory location containing a double-precision
1659 ///    floating-point value that is loaded. The loaded value is written to bits
1660 ///    [127:64] of the result. The address of the memory location does not have
1661 ///    to be aligned.
1662 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1663 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a,
1664                                                           double const *__dp) {
1665   struct __mm_loadh_pd_struct {
1666     double __u;
1667   } __attribute__((__packed__, __may_alias__));
1668   double __u = ((const struct __mm_loadh_pd_struct *)__dp)->__u;
1669   return __extension__(__m128d){__a[0], __u};
1670 }
1671 
1672 /// Loads a double-precision value into the low-order bits of a 128-bit
1673 ///    vector of [2 x double]. The high-order bits are copied from the
1674 ///    high-order bits of the first operand.
1675 ///
1676 /// \headerfile <x86intrin.h>
1677 ///
1678 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1679 ///
1680 /// \param __a
1681 ///    A 128-bit vector of [2 x double]. \n
1682 ///    Bits [127:64] are written to bits [127:64] of the result.
1683 /// \param __dp
1684 ///    A pointer to a 64-bit memory location containing a double-precision
1685 ///    floating-point value that is loaded. The loaded value is written to bits
1686 ///    [63:0] of the result. The address of the memory location does not have to
1687 ///    be aligned.
1688 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1689 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a,
1690                                                           double const *__dp) {
1691   struct __mm_loadl_pd_struct {
1692     double __u;
1693   } __attribute__((__packed__, __may_alias__));
1694   double __u = ((const struct __mm_loadl_pd_struct *)__dp)->__u;
1695   return __extension__(__m128d){__u, __a[1]};
1696 }
1697 
1698 /// Constructs a 128-bit floating-point vector of [2 x double] with
1699 ///    unspecified content. This could be used as an argument to another
1700 ///    intrinsic function where the argument is required but the value is not
1701 ///    actually used.
1702 ///
1703 /// \headerfile <x86intrin.h>
1704 ///
1705 /// This intrinsic has no corresponding instruction.
1706 ///
1707 /// \returns A 128-bit floating-point vector of [2 x double] with unspecified
1708 ///    content.
1709 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void) {
1710   return (__m128d)__builtin_ia32_undef128();
1711 }
1712 
1713 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1714 ///    64 bits of the vector are initialized with the specified double-precision
1715 ///    floating-point value. The upper 64 bits are set to zero.
1716 ///
1717 /// \headerfile <x86intrin.h>
1718 ///
1719 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1720 ///
1721 /// \param __w
1722 ///    A double-precision floating-point value used to initialize the lower 64
1723 ///    bits of the result.
1724 /// \returns An initialized 128-bit floating-point vector of [2 x double]. The
1725 ///    lower 64 bits contain the value of the parameter. The upper 64 bits are
1726 ///    set to zero.
1727 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w) {
1728   return __extension__(__m128d){__w, 0};
1729 }
1730 
1731 /// Constructs a 128-bit floating-point vector of [2 x double], with each
1732 ///    of the two double-precision floating-point vector elements set to the
1733 ///    specified double-precision floating-point value.
1734 ///
1735 /// \headerfile <x86intrin.h>
1736 ///
1737 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1738 ///
1739 /// \param __w
1740 ///    A double-precision floating-point value used to initialize each vector
1741 ///    element of the result.
1742 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1743 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w) {
1744   return __extension__(__m128d){__w, __w};
1745 }
1746 
1747 /// Constructs a 128-bit floating-point vector of [2 x double], with each
1748 ///    of the two double-precision floating-point vector elements set to the
1749 ///    specified double-precision floating-point value.
1750 ///
1751 /// \headerfile <x86intrin.h>
1752 ///
1753 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1754 ///
1755 /// \param __w
1756 ///    A double-precision floating-point value used to initialize each vector
1757 ///    element of the result.
1758 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1759 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w) {
1760   return _mm_set1_pd(__w);
1761 }
1762 
1763 /// Constructs a 128-bit floating-point vector of [2 x double]
1764 ///    initialized with the specified double-precision floating-point values.
1765 ///
1766 /// \headerfile <x86intrin.h>
1767 ///
1768 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1769 ///
1770 /// \param __w
1771 ///    A double-precision floating-point value used to initialize the upper 64
1772 ///    bits of the result.
1773 /// \param __x
1774 ///    A double-precision floating-point value used to initialize the lower 64
1775 ///    bits of the result.
1776 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1777 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w,
1778                                                         double __x) {
1779   return __extension__(__m128d){__x, __w};
1780 }
1781 
1782 /// Constructs a 128-bit floating-point vector of [2 x double],
1783 ///    initialized in reverse order with the specified double-precision
1784 ///    floating-point values.
1785 ///
1786 /// \headerfile <x86intrin.h>
1787 ///
1788 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1789 ///
1790 /// \param __w
1791 ///    A double-precision floating-point value used to initialize the lower 64
1792 ///    bits of the result.
1793 /// \param __x
1794 ///    A double-precision floating-point value used to initialize the upper 64
1795 ///    bits of the result.
1796 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1797 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w,
1798                                                          double __x) {
1799   return __extension__(__m128d){__w, __x};
1800 }
1801 
1802 /// Constructs a 128-bit floating-point vector of [2 x double]
1803 ///    initialized to zero.
1804 ///
1805 /// \headerfile <x86intrin.h>
1806 ///
1807 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1808 ///
1809 /// \returns An initialized 128-bit floating-point vector of [2 x double] with
1810 ///    all elements set to zero.
1811 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void) {
1812   return __extension__(__m128d){0, 0};
1813 }
1814 
1815 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1816 ///    64 bits are set to the lower 64 bits of the second parameter. The upper
1817 ///    64 bits are set to the upper 64 bits of the first parameter.
1818 ///
1819 /// \headerfile <x86intrin.h>
1820 ///
1821 /// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
1822 ///
1823 /// \param __a
1824 ///    A 128-bit vector of [2 x double]. The upper 64 bits are written to the
1825 ///    upper 64 bits of the result.
1826 /// \param __b
1827 ///    A 128-bit vector of [2 x double]. The lower 64 bits are written to the
1828 ///    lower 64 bits of the result.
1829 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1830 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a,
1831                                                          __m128d __b) {
1832   __a[0] = __b[0];
1833   return __a;
1834 }
1835 
1836 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1837 ///    memory location.
1838 ///
1839 /// \headerfile <x86intrin.h>
1840 ///
1841 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1842 ///
1843 /// \param __dp
1844 ///    A pointer to a 64-bit memory location.
1845 /// \param __a
1846 ///    A 128-bit vector of [2 x double] containing the value to be stored.
1847 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp,
1848                                                        __m128d __a) {
1849   struct __mm_store_sd_struct {
1850     double __u;
1851   } __attribute__((__packed__, __may_alias__));
1852   ((struct __mm_store_sd_struct *)__dp)->__u = __a[0];
1853 }
1854 
1855 /// Moves packed double-precision values from a 128-bit vector of
1856 ///    [2 x double] to a memory location.
1857 ///
1858 /// \headerfile <x86intrin.h>
1859 ///
1860 /// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction.
1861 ///
1862 /// \param __dp
1863 ///    A pointer to an aligned memory location that can store two
1864 ///    double-precision values.
1865 /// \param __a
1866 ///    A packed 128-bit vector of [2 x double] containing the values to be
1867 ///    moved.
1868 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp,
1869                                                        __m128d __a) {
1870   *(__m128d *)__dp = __a;
1871 }
1872 
1873 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1874 ///    the upper and lower 64 bits of a memory location.
1875 ///
1876 /// \headerfile <x86intrin.h>
1877 ///
1878 /// This intrinsic corresponds to the
1879 ///   <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1880 ///
1881 /// \param __dp
1882 ///    A pointer to a memory location that can store two double-precision
1883 ///    values.
1884 /// \param __a
1885 ///    A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1886 ///    of the values in \a __dp.
1887 static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp,
1888                                                         __m128d __a) {
1889   __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
1890   _mm_store_pd(__dp, __a);
1891 }
1892 
1893 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1894 ///    the upper and lower 64 bits of a memory location.
1895 ///
1896 /// \headerfile <x86intrin.h>
1897 ///
1898 /// This intrinsic corresponds to the
1899 ///   <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1900 ///
1901 /// \param __dp
1902 ///    A pointer to a memory location that can store two double-precision
1903 ///    values.
1904 /// \param __a
1905 ///    A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1906 ///    of the values in \a __dp.
1907 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp,
1908                                                         __m128d __a) {
1909   _mm_store1_pd(__dp, __a);
1910 }
1911 
1912 /// Stores a 128-bit vector of [2 x double] into an unaligned memory
1913 ///    location.
1914 ///
1915 /// \headerfile <x86intrin.h>
1916 ///
1917 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1918 ///
1919 /// \param __dp
1920 ///    A pointer to a 128-bit memory location. The address of the memory
1921 ///    location does not have to be aligned.
1922 /// \param __a
1923 ///    A 128-bit vector of [2 x double] containing the values to be stored.
1924 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp,
1925                                                         __m128d __a) {
1926   struct __storeu_pd {
1927     __m128d_u __v;
1928   } __attribute__((__packed__, __may_alias__));
1929   ((struct __storeu_pd *)__dp)->__v = __a;
1930 }
1931 
1932 /// Stores two double-precision values, in reverse order, from a 128-bit
1933 ///    vector of [2 x double] to a 16-byte aligned memory location.
1934 ///
1935 /// \headerfile <x86intrin.h>
1936 ///
1937 /// This intrinsic corresponds to a shuffling instruction followed by a
1938 /// <c> VMOVAPD / MOVAPD </c> instruction.
1939 ///
1940 /// \param __dp
1941 ///    A pointer to a 16-byte aligned memory location that can store two
1942 ///    double-precision values.
1943 /// \param __a
1944 ///    A 128-bit vector of [2 x double] containing the values to be reversed and
1945 ///    stored.
1946 static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp,
1947                                                         __m128d __a) {
1948   __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
1949   *(__m128d *)__dp = __a;
1950 }
1951 
1952 /// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
1953 ///    memory location.
1954 ///
1955 /// \headerfile <x86intrin.h>
1956 ///
1957 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1958 ///
1959 /// \param __dp
1960 ///    A pointer to a 64-bit memory location.
1961 /// \param __a
1962 ///    A 128-bit vector of [2 x double] containing the value to be stored.
1963 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp,
1964                                                         __m128d __a) {
1965   struct __mm_storeh_pd_struct {
1966     double __u;
1967   } __attribute__((__packed__, __may_alias__));
1968   ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[1];
1969 }
1970 
1971 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1972 ///    memory location.
1973 ///
1974 /// \headerfile <x86intrin.h>
1975 ///
1976 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1977 ///
1978 /// \param __dp
1979 ///    A pointer to a 64-bit memory location.
1980 /// \param __a
1981 ///    A 128-bit vector of [2 x double] containing the value to be stored.
1982 static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp,
1983                                                         __m128d __a) {
1984   struct __mm_storeh_pd_struct {
1985     double __u;
1986   } __attribute__((__packed__, __may_alias__));
1987   ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[0];
1988 }
1989 
1990 /// Adds the corresponding elements of two 128-bit vectors of [16 x i8],
1991 ///    saving the lower 8 bits of each sum in the corresponding element of a
1992 ///    128-bit result vector of [16 x i8].
1993 ///
1994 ///    The integer elements of both parameters can be either signed or unsigned.
1995 ///
1996 /// \headerfile <x86intrin.h>
1997 ///
1998 /// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
1999 ///
2000 /// \param __a
2001 ///    A 128-bit vector of [16 x i8].
2002 /// \param __b
2003 ///    A 128-bit vector of [16 x i8].
2004 /// \returns A 128-bit vector of [16 x i8] containing the sums of both
2005 ///    parameters.
2006 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a,
2007                                                           __m128i __b) {
2008   return (__m128i)((__v16qu)__a + (__v16qu)__b);
2009 }
2010 
2011 /// Adds the corresponding elements of two 128-bit vectors of [8 x i16],
2012 ///    saving the lower 16 bits of each sum in the corresponding element of a
2013 ///    128-bit result vector of [8 x i16].
2014 ///
2015 ///    The integer elements of both parameters can be either signed or unsigned.
2016 ///
2017 /// \headerfile <x86intrin.h>
2018 ///
2019 /// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
2020 ///
2021 /// \param __a
2022 ///    A 128-bit vector of [8 x i16].
2023 /// \param __b
2024 ///    A 128-bit vector of [8 x i16].
2025 /// \returns A 128-bit vector of [8 x i16] containing the sums of both
2026 ///    parameters.
2027 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a,
2028                                                            __m128i __b) {
2029   return (__m128i)((__v8hu)__a + (__v8hu)__b);
2030 }
2031 
2032 /// Adds the corresponding elements of two 128-bit vectors of [4 x i32],
2033 ///    saving the lower 32 bits of each sum in the corresponding element of a
2034 ///    128-bit result vector of [4 x i32].
2035 ///
2036 ///    The integer elements of both parameters can be either signed or unsigned.
2037 ///
2038 /// \headerfile <x86intrin.h>
2039 ///
2040 /// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
2041 ///
2042 /// \param __a
2043 ///    A 128-bit vector of [4 x i32].
2044 /// \param __b
2045 ///    A 128-bit vector of [4 x i32].
2046 /// \returns A 128-bit vector of [4 x i32] containing the sums of both
2047 ///    parameters.
2048 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a,
2049                                                            __m128i __b) {
2050   return (__m128i)((__v4su)__a + (__v4su)__b);
2051 }
2052 
2053 /// Adds two signed or unsigned 64-bit integer values, returning the
2054 ///    lower 64 bits of the sum.
2055 ///
2056 /// \headerfile <x86intrin.h>
2057 ///
2058 /// This intrinsic corresponds to the <c> PADDQ </c> instruction.
2059 ///
2060 /// \param __a
2061 ///    A 64-bit integer.
2062 /// \param __b
2063 ///    A 64-bit integer.
2064 /// \returns A 64-bit integer containing the sum of both parameters.
2065 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a,
2066                                                             __m64 __b) {
2067   return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
2068 }
2069 
2070 /// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
2071 ///    saving the lower 64 bits of each sum in the corresponding element of a
2072 ///    128-bit result vector of [2 x i64].
2073 ///
2074 ///    The integer elements of both parameters can be either signed or unsigned.
2075 ///
2076 /// \headerfile <x86intrin.h>
2077 ///
2078 /// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
2079 ///
2080 /// \param __a
2081 ///    A 128-bit vector of [2 x i64].
2082 /// \param __b
2083 ///    A 128-bit vector of [2 x i64].
2084 /// \returns A 128-bit vector of [2 x i64] containing the sums of both
2085 ///    parameters.
2086 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a,
2087                                                            __m128i __b) {
2088   return (__m128i)((__v2du)__a + (__v2du)__b);
2089 }
2090 
2091 /// Adds, with saturation, the corresponding elements of two 128-bit
2092 ///    signed [16 x i8] vectors, saving each sum in the corresponding element of
2093 ///    a 128-bit result vector of [16 x i8]. Positive sums greater than 0x7F are
2094 ///    saturated to 0x7F. Negative sums less than 0x80 are saturated to 0x80.
2095 ///
2096 /// \headerfile <x86intrin.h>
2097 ///
2098 /// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
2099 ///
2100 /// \param __a
2101 ///    A 128-bit signed [16 x i8] vector.
2102 /// \param __b
2103 ///    A 128-bit signed [16 x i8] vector.
2104 /// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
2105 ///    both parameters.
2106 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a,
2107                                                            __m128i __b) {
2108   return (__m128i)__builtin_elementwise_add_sat((__v16qs)__a, (__v16qs)__b);
2109 }
2110 
2111 /// Adds, with saturation, the corresponding elements of two 128-bit
2112 ///    signed [8 x i16] vectors, saving each sum in the corresponding element of
2113 ///    a 128-bit result vector of [8 x i16]. Positive sums greater than 0x7FFF
2114 ///    are saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
2115 ///    0x8000.
2116 ///
2117 /// \headerfile <x86intrin.h>
2118 ///
2119 /// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
2120 ///
2121 /// \param __a
2122 ///    A 128-bit signed [8 x i16] vector.
2123 /// \param __b
2124 ///    A 128-bit signed [8 x i16] vector.
2125 /// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
2126 ///    both parameters.
2127 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a,
2128                                                             __m128i __b) {
2129   return (__m128i)__builtin_elementwise_add_sat((__v8hi)__a, (__v8hi)__b);
2130 }
2131 
2132 /// Adds, with saturation, the corresponding elements of two 128-bit
2133 ///    unsigned [16 x i8] vectors, saving each sum in the corresponding element
2134 ///    of a 128-bit result vector of [16 x i8]. Sums greater than 0xFF are
2135 ///    saturated to 0xFF.
2136 ///
2137 /// \headerfile <x86intrin.h>
2138 ///
2139 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2140 ///
2141 /// \param __a
2142 ///    A 128-bit unsigned [16 x i8] vector.
2143 /// \param __b
2144 ///    A 128-bit unsigned [16 x i8] vector.
2145 /// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
2146 ///    of both parameters.
2147 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a,
2148                                                            __m128i __b) {
2149   return (__m128i)__builtin_elementwise_add_sat((__v16qu)__a, (__v16qu)__b);
2150 }
2151 
2152 /// Adds, with saturation, the corresponding elements of two 128-bit
2153 ///    unsigned [8 x i16] vectors, saving each sum in the corresponding element
2154 ///    of a 128-bit result vector of [8 x i16]. Sums greater than 0xFFFF are
2155 ///    saturated to 0xFFFF.
2156 ///
2157 /// \headerfile <x86intrin.h>
2158 ///
2159 /// This intrinsic corresponds to the <c> VPADDUSW / PADDUSW </c> instruction.
2160 ///
2161 /// \param __a
2162 ///    A 128-bit unsigned [8 x i16] vector.
2163 /// \param __b
2164 ///    A 128-bit unsigned [8 x i16] vector.
2165 /// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
2166 ///    of both parameters.
2167 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a,
2168                                                             __m128i __b) {
2169   return (__m128i)__builtin_elementwise_add_sat((__v8hu)__a, (__v8hu)__b);
2170 }
2171 
2172 /// Computes the rounded averages of corresponding elements of two
2173 ///    128-bit unsigned [16 x i8] vectors, saving each result in the
2174 ///    corresponding element of a 128-bit result vector of [16 x i8].
2175 ///
2176 /// \headerfile <x86intrin.h>
2177 ///
2178 /// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
2179 ///
2180 /// \param __a
2181 ///    A 128-bit unsigned [16 x i8] vector.
2182 /// \param __b
2183 ///    A 128-bit unsigned [16 x i8] vector.
2184 /// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
2185 ///    averages of both parameters.
2186 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a,
2187                                                           __m128i __b) {
2188   return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
2189 }
2190 
2191 /// Computes the rounded averages of corresponding elements of two
2192 ///    128-bit unsigned [8 x i16] vectors, saving each result in the
2193 ///    corresponding element of a 128-bit result vector of [8 x i16].
2194 ///
2195 /// \headerfile <x86intrin.h>
2196 ///
2197 /// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
2198 ///
2199 /// \param __a
2200 ///    A 128-bit unsigned [8 x i16] vector.
2201 /// \param __b
2202 ///    A 128-bit unsigned [8 x i16] vector.
2203 /// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
2204 ///    averages of both parameters.
2205 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a,
2206                                                            __m128i __b) {
2207   return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
2208 }
2209 
2210 /// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
2211 ///    vectors, producing eight intermediate 32-bit signed integer products, and
2212 ///    adds the consecutive pairs of 32-bit products to form a 128-bit signed
2213 ///    [4 x i32] vector.
2214 ///
2215 ///    For example, bits [15:0] of both parameters are multiplied producing a
2216 ///    32-bit product, bits [31:16] of both parameters are multiplied producing
2217 ///    a 32-bit product, and the sum of those two products becomes bits [31:0]
2218 ///    of the result.
2219 ///
2220 /// \headerfile <x86intrin.h>
2221 ///
2222 /// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
2223 ///
2224 /// \param __a
2225 ///    A 128-bit signed [8 x i16] vector.
2226 /// \param __b
2227 ///    A 128-bit signed [8 x i16] vector.
2228 /// \returns A 128-bit signed [4 x i32] vector containing the sums of products
2229 ///    of both parameters.
2230 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a,
2231                                                             __m128i __b) {
2232   return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
2233 }
2234 
2235 /// Compares corresponding elements of two 128-bit signed [8 x i16]
2236 ///    vectors, saving the greater value from each comparison in the
2237 ///    corresponding element of a 128-bit result vector of [8 x i16].
2238 ///
2239 /// \headerfile <x86intrin.h>
2240 ///
2241 /// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
2242 ///
2243 /// \param __a
2244 ///    A 128-bit signed [8 x i16] vector.
2245 /// \param __b
2246 ///    A 128-bit signed [8 x i16] vector.
2247 /// \returns A 128-bit signed [8 x i16] vector containing the greater value of
2248 ///    each comparison.
2249 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a,
2250                                                            __m128i __b) {
2251   return (__m128i)__builtin_elementwise_max((__v8hi)__a, (__v8hi)__b);
2252 }
2253 
2254 /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2255 ///    vectors, saving the greater value from each comparison in the
2256 ///    corresponding element of a 128-bit result vector of [16 x i8].
2257 ///
2258 /// \headerfile <x86intrin.h>
2259 ///
2260 /// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
2261 ///
2262 /// \param __a
2263 ///    A 128-bit unsigned [16 x i8] vector.
2264 /// \param __b
2265 ///    A 128-bit unsigned [16 x i8] vector.
2266 /// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
2267 ///    each comparison.
2268 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a,
2269                                                           __m128i __b) {
2270   return (__m128i)__builtin_elementwise_max((__v16qu)__a, (__v16qu)__b);
2271 }
2272 
2273 /// Compares corresponding elements of two 128-bit signed [8 x i16]
2274 ///    vectors, saving the smaller value from each comparison in the
2275 ///    corresponding element of a 128-bit result vector of [8 x i16].
2276 ///
2277 /// \headerfile <x86intrin.h>
2278 ///
2279 /// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
2280 ///
2281 /// \param __a
2282 ///    A 128-bit signed [8 x i16] vector.
2283 /// \param __b
2284 ///    A 128-bit signed [8 x i16] vector.
2285 /// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
2286 ///    each comparison.
2287 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a,
2288                                                            __m128i __b) {
2289   return (__m128i)__builtin_elementwise_min((__v8hi)__a, (__v8hi)__b);
2290 }
2291 
2292 /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2293 ///    vectors, saving the smaller value from each comparison in the
2294 ///    corresponding element of a 128-bit result vector of [16 x i8].
2295 ///
2296 /// \headerfile <x86intrin.h>
2297 ///
2298 /// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
2299 ///
2300 /// \param __a
2301 ///    A 128-bit unsigned [16 x i8] vector.
2302 /// \param __b
2303 ///    A 128-bit unsigned [16 x i8] vector.
2304 /// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
2305 ///    each comparison.
2306 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a,
2307                                                           __m128i __b) {
2308   return (__m128i)__builtin_elementwise_min((__v16qu)__a, (__v16qu)__b);
2309 }
2310 
2311 /// Multiplies the corresponding elements of two signed [8 x i16]
2312 ///    vectors, saving the upper 16 bits of each 32-bit product in the
2313 ///    corresponding element of a 128-bit signed [8 x i16] result vector.
2314 ///
2315 /// \headerfile <x86intrin.h>
2316 ///
2317 /// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
2318 ///
2319 /// \param __a
2320 ///    A 128-bit signed [8 x i16] vector.
2321 /// \param __b
2322 ///    A 128-bit signed [8 x i16] vector.
2323 /// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
2324 ///    each of the eight 32-bit products.
2325 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a,
2326                                                              __m128i __b) {
2327   return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
2328 }
2329 
2330 /// Multiplies the corresponding elements of two unsigned [8 x i16]
2331 ///    vectors, saving the upper 16 bits of each 32-bit product in the
2332 ///    corresponding element of a 128-bit unsigned [8 x i16] result vector.
2333 ///
2334 /// \headerfile <x86intrin.h>
2335 ///
2336 /// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
2337 ///
2338 /// \param __a
2339 ///    A 128-bit unsigned [8 x i16] vector.
2340 /// \param __b
2341 ///    A 128-bit unsigned [8 x i16] vector.
2342 /// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
2343 ///    of each of the eight 32-bit products.
2344 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a,
2345                                                              __m128i __b) {
2346   return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
2347 }
2348 
2349 /// Multiplies the corresponding elements of two signed [8 x i16]
2350 ///    vectors, saving the lower 16 bits of each 32-bit product in the
2351 ///    corresponding element of a 128-bit signed [8 x i16] result vector.
2352 ///
2353 /// \headerfile <x86intrin.h>
2354 ///
2355 /// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
2356 ///
2357 /// \param __a
2358 ///    A 128-bit signed [8 x i16] vector.
2359 /// \param __b
2360 ///    A 128-bit signed [8 x i16] vector.
2361 /// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
2362 ///    each of the eight 32-bit products.
2363 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a,
2364                                                              __m128i __b) {
2365   return (__m128i)((__v8hu)__a * (__v8hu)__b);
2366 }
2367 
2368 /// Multiplies 32-bit unsigned integer values contained in the lower bits
2369 ///    of the two 64-bit integer vectors and returns the 64-bit unsigned
2370 ///    product.
2371 ///
2372 /// \headerfile <x86intrin.h>
2373 ///
2374 /// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
2375 ///
2376 /// \param __a
2377 ///    A 64-bit integer containing one of the source operands.
2378 /// \param __b
2379 ///    A 64-bit integer containing one of the source operands.
2380 /// \returns A 64-bit integer vector containing the product of both operands.
2381 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a,
2382                                                             __m64 __b) {
2383   return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
2384 }
2385 
2386 /// Multiplies 32-bit unsigned integer values contained in the lower
2387 ///    bits of the corresponding elements of two [2 x i64] vectors, and returns
2388 ///    the 64-bit products in the corresponding elements of a [2 x i64] vector.
2389 ///
2390 /// \headerfile <x86intrin.h>
2391 ///
2392 /// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
2393 ///
2394 /// \param __a
2395 ///    A [2 x i64] vector containing one of the source operands.
2396 /// \param __b
2397 ///    A [2 x i64] vector containing one of the source operands.
2398 /// \returns A [2 x i64] vector containing the product of both operands.
2399 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a,
2400                                                            __m128i __b) {
2401   return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
2402 }
2403 
2404 /// Computes the absolute differences of corresponding 8-bit integer
2405 ///    values in two 128-bit vectors. Sums the first 8 absolute differences, and
2406 ///    separately sums the second 8 absolute differences. Packs these two
2407 ///    unsigned 16-bit integer sums into the upper and lower elements of a
2408 ///    [2 x i64] vector.
2409 ///
2410 /// \headerfile <x86intrin.h>
2411 ///
2412 /// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
2413 ///
2414 /// \param __a
2415 ///    A 128-bit integer vector containing one of the source operands.
2416 /// \param __b
2417 ///    A 128-bit integer vector containing one of the source operands.
2418 /// \returns A [2 x i64] vector containing the sums of the sets of absolute
2419 ///    differences between both operands.
2420 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a,
2421                                                           __m128i __b) {
2422   return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
2423 }
2424 
2425 /// Subtracts the corresponding 8-bit integer values in the operands.
2426 ///
2427 /// \headerfile <x86intrin.h>
2428 ///
2429 /// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
2430 ///
2431 /// \param __a
2432 ///    A 128-bit integer vector containing the minuends.
2433 /// \param __b
2434 ///    A 128-bit integer vector containing the subtrahends.
2435 /// \returns A 128-bit integer vector containing the differences of the values
2436 ///    in the operands.
2437 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a,
2438                                                           __m128i __b) {
2439   return (__m128i)((__v16qu)__a - (__v16qu)__b);
2440 }
2441 
2442 /// Subtracts the corresponding 16-bit integer values in the operands.
2443 ///
2444 /// \headerfile <x86intrin.h>
2445 ///
2446 /// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
2447 ///
2448 /// \param __a
2449 ///    A 128-bit integer vector containing the minuends.
2450 /// \param __b
2451 ///    A 128-bit integer vector containing the subtrahends.
2452 /// \returns A 128-bit integer vector containing the differences of the values
2453 ///    in the operands.
2454 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a,
2455                                                            __m128i __b) {
2456   return (__m128i)((__v8hu)__a - (__v8hu)__b);
2457 }
2458 
2459 /// Subtracts the corresponding 32-bit integer values in the operands.
2460 ///
2461 /// \headerfile <x86intrin.h>
2462 ///
2463 /// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
2464 ///
2465 /// \param __a
2466 ///    A 128-bit integer vector containing the minuends.
2467 /// \param __b
2468 ///    A 128-bit integer vector containing the subtrahends.
2469 /// \returns A 128-bit integer vector containing the differences of the values
2470 ///    in the operands.
2471 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a,
2472                                                            __m128i __b) {
2473   return (__m128i)((__v4su)__a - (__v4su)__b);
2474 }
2475 
2476 /// Subtracts signed or unsigned 64-bit integer values and writes the
2477 ///    difference to the corresponding bits in the destination.
2478 ///
2479 /// \headerfile <x86intrin.h>
2480 ///
2481 /// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
2482 ///
2483 /// \param __a
2484 ///    A 64-bit integer vector containing the minuend.
2485 /// \param __b
2486 ///    A 64-bit integer vector containing the subtrahend.
2487 /// \returns A 64-bit integer vector containing the difference of the values in
2488 ///    the operands.
2489 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a,
2490                                                             __m64 __b) {
2491   return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
2492 }
2493 
2494 /// Subtracts the corresponding elements of two [2 x i64] vectors.
2495 ///
2496 /// \headerfile <x86intrin.h>
2497 ///
2498 /// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
2499 ///
2500 /// \param __a
2501 ///    A 128-bit integer vector containing the minuends.
2502 /// \param __b
2503 ///    A 128-bit integer vector containing the subtrahends.
2504 /// \returns A 128-bit integer vector containing the differences of the values
2505 ///    in the operands.
2506 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a,
2507                                                            __m128i __b) {
2508   return (__m128i)((__v2du)__a - (__v2du)__b);
2509 }
2510 
2511 /// Subtracts corresponding 8-bit signed integer values in the input and
2512 ///    returns the differences in the corresponding bytes in the destination.
2513 ///    Differences greater than 0x7F are saturated to 0x7F, and differences less
2514 ///    than 0x80 are saturated to 0x80.
2515 ///
2516 /// \headerfile <x86intrin.h>
2517 ///
2518 /// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
2519 ///
2520 /// \param __a
2521 ///    A 128-bit integer vector containing the minuends.
2522 /// \param __b
2523 ///    A 128-bit integer vector containing the subtrahends.
2524 /// \returns A 128-bit integer vector containing the differences of the values
2525 ///    in the operands.
2526 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a,
2527                                                            __m128i __b) {
2528   return (__m128i)__builtin_elementwise_sub_sat((__v16qs)__a, (__v16qs)__b);
2529 }
2530 
2531 /// Subtracts corresponding 16-bit signed integer values in the input and
2532 ///    returns the differences in the corresponding words in the destination.
2533 ///    Differences greater than 0x7FFF are saturated to 0x7FFF, and values less
2534 ///    than 0x8000 are saturated to 0x8000.
2535 ///
2536 /// \headerfile <x86intrin.h>
2537 ///
2538 /// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
2539 ///
2540 /// \param __a
2541 ///    A 128-bit integer vector containing the minuends.
2542 /// \param __b
2543 ///    A 128-bit integer vector containing the subtrahends.
2544 /// \returns A 128-bit integer vector containing the differences of the values
2545 ///    in the operands.
2546 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a,
2547                                                             __m128i __b) {
2548   return (__m128i)__builtin_elementwise_sub_sat((__v8hi)__a, (__v8hi)__b);
2549 }
2550 
2551 /// Subtracts corresponding 8-bit unsigned integer values in the input
2552 ///    and returns the differences in the corresponding bytes in the
2553 ///    destination. Differences less than 0x00 are saturated to 0x00.
2554 ///
2555 /// \headerfile <x86intrin.h>
2556 ///
2557 /// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
2558 ///
2559 /// \param __a
2560 ///    A 128-bit integer vector containing the minuends.
2561 /// \param __b
2562 ///    A 128-bit integer vector containing the subtrahends.
2563 /// \returns A 128-bit integer vector containing the unsigned integer
2564 ///    differences of the values in the operands.
2565 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a,
2566                                                            __m128i __b) {
2567   return (__m128i)__builtin_elementwise_sub_sat((__v16qu)__a, (__v16qu)__b);
2568 }
2569 
2570 /// Subtracts corresponding 16-bit unsigned integer values in the input
2571 ///    and returns the differences in the corresponding words in the
2572 ///    destination. Differences less than 0x0000 are saturated to 0x0000.
2573 ///
2574 /// \headerfile <x86intrin.h>
2575 ///
2576 /// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
2577 ///
2578 /// \param __a
2579 ///    A 128-bit integer vector containing the minuends.
2580 /// \param __b
2581 ///    A 128-bit integer vector containing the subtrahends.
2582 /// \returns A 128-bit integer vector containing the unsigned integer
2583 ///    differences of the values in the operands.
2584 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a,
2585                                                             __m128i __b) {
2586   return (__m128i)__builtin_elementwise_sub_sat((__v8hu)__a, (__v8hu)__b);
2587 }
2588 
2589 /// Performs a bitwise AND of two 128-bit integer vectors.
2590 ///
2591 /// \headerfile <x86intrin.h>
2592 ///
2593 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
2594 ///
2595 /// \param __a
2596 ///    A 128-bit integer vector containing one of the source operands.
2597 /// \param __b
2598 ///    A 128-bit integer vector containing one of the source operands.
2599 /// \returns A 128-bit integer vector containing the bitwise AND of the values
2600 ///    in both operands.
2601 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a,
2602                                                            __m128i __b) {
2603   return (__m128i)((__v2du)__a & (__v2du)__b);
2604 }
2605 
2606 /// Performs a bitwise AND of two 128-bit integer vectors, using the
2607 ///    one's complement of the values contained in the first source operand.
2608 ///
2609 /// \headerfile <x86intrin.h>
2610 ///
2611 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
2612 ///
2613 /// \param __a
2614 ///    A 128-bit vector containing the left source operand. The one's complement
2615 ///    of this value is used in the bitwise AND.
2616 /// \param __b
2617 ///    A 128-bit vector containing the right source operand.
2618 /// \returns A 128-bit integer vector containing the bitwise AND of the one's
2619 ///    complement of the first operand and the values in the second operand.
2620 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a,
2621                                                               __m128i __b) {
2622   return (__m128i)(~(__v2du)__a & (__v2du)__b);
2623 }
2624 /// Performs a bitwise OR of two 128-bit integer vectors.
2625 ///
2626 /// \headerfile <x86intrin.h>
2627 ///
2628 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
2629 ///
2630 /// \param __a
2631 ///    A 128-bit integer vector containing one of the source operands.
2632 /// \param __b
2633 ///    A 128-bit integer vector containing one of the source operands.
2634 /// \returns A 128-bit integer vector containing the bitwise OR of the values
2635 ///    in both operands.
2636 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a,
2637                                                           __m128i __b) {
2638   return (__m128i)((__v2du)__a | (__v2du)__b);
2639 }
2640 
2641 /// Performs a bitwise exclusive OR of two 128-bit integer vectors.
2642 ///
2643 /// \headerfile <x86intrin.h>
2644 ///
2645 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
2646 ///
2647 /// \param __a
2648 ///    A 128-bit integer vector containing one of the source operands.
2649 /// \param __b
2650 ///    A 128-bit integer vector containing one of the source operands.
2651 /// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
2652 ///    values in both operands.
2653 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a,
2654                                                            __m128i __b) {
2655   return (__m128i)((__v2du)__a ^ (__v2du)__b);
2656 }
2657 
2658 /// Left-shifts the 128-bit integer vector operand by the specified
2659 ///    number of bytes. Low-order bits are cleared.
2660 ///
2661 /// \headerfile <x86intrin.h>
2662 ///
2663 /// \code
2664 /// __m128i _mm_slli_si128(__m128i a, const int imm);
2665 /// \endcode
2666 ///
2667 /// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
2668 ///
2669 /// \param a
2670 ///    A 128-bit integer vector containing the source operand.
2671 /// \param imm
2672 ///    An immediate value specifying the number of bytes to left-shift operand
2673 ///    \a a.
2674 /// \returns A 128-bit integer vector containing the left-shifted value.
#define _mm_slli_si128(a, imm)                                                 \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift(                               \
      (__v2di)(__m128i)(a), (int)(imm)))
2678 
/// Left-shifts the 128-bit integer vector operand by the specified
///    number of bytes.
///
///    This macro expands to exactly the same builtin call as
///    \c _mm_slli_si128.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_bslli_si128(__m128i a, const int imm);
/// \endcode
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to left-shift operand
///    \a a.
/// \returns A 128-bit integer vector containing the left-shifted value.
#define _mm_bslli_si128(a, imm)                                                \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift(                               \
      (__v2di)(__m128i)(a), (int)(imm)))
2682 
2683 /// Left-shifts each 16-bit value in the 128-bit integer vector operand
2684 ///    by the specified number of bits. Low-order bits are cleared.
2685 ///
2686 /// \headerfile <x86intrin.h>
2687 ///
2688 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2689 ///
2690 /// \param __a
2691 ///    A 128-bit integer vector containing the source operand.
2692 /// \param __count
2693 ///    An integer value specifying the number of bits to left-shift each value
2694 ///    in operand \a __a.
2695 /// \returns A 128-bit integer vector containing the left-shifted values.
2696 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a,
2697                                                             int __count) {
2698   return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
2699 }
2700 
2701 /// Left-shifts each 16-bit value in the 128-bit integer vector operand
2702 ///    by the specified number of bits. Low-order bits are cleared.
2703 ///
2704 /// \headerfile <x86intrin.h>
2705 ///
2706 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2707 ///
2708 /// \param __a
2709 ///    A 128-bit integer vector containing the source operand.
2710 /// \param __count
2711 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2712 ///    to left-shift each value in operand \a __a.
2713 /// \returns A 128-bit integer vector containing the left-shifted values.
2714 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a,
2715                                                            __m128i __count) {
2716   return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
2717 }
2718 
2719 /// Left-shifts each 32-bit value in the 128-bit integer vector operand
2720 ///    by the specified number of bits. Low-order bits are cleared.
2721 ///
2722 /// \headerfile <x86intrin.h>
2723 ///
2724 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2725 ///
2726 /// \param __a
2727 ///    A 128-bit integer vector containing the source operand.
2728 /// \param __count
2729 ///    An integer value specifying the number of bits to left-shift each value
2730 ///    in operand \a __a.
2731 /// \returns A 128-bit integer vector containing the left-shifted values.
2732 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a,
2733                                                             int __count) {
2734   return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
2735 }
2736 
2737 /// Left-shifts each 32-bit value in the 128-bit integer vector operand
2738 ///    by the specified number of bits. Low-order bits are cleared.
2739 ///
2740 /// \headerfile <x86intrin.h>
2741 ///
2742 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2743 ///
2744 /// \param __a
2745 ///    A 128-bit integer vector containing the source operand.
2746 /// \param __count
2747 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2748 ///    to left-shift each value in operand \a __a.
2749 /// \returns A 128-bit integer vector containing the left-shifted values.
2750 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a,
2751                                                            __m128i __count) {
2752   return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
2753 }
2754 
2755 /// Left-shifts each 64-bit value in the 128-bit integer vector operand
2756 ///    by the specified number of bits. Low-order bits are cleared.
2757 ///
2758 /// \headerfile <x86intrin.h>
2759 ///
2760 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2761 ///
2762 /// \param __a
2763 ///    A 128-bit integer vector containing the source operand.
2764 /// \param __count
2765 ///    An integer value specifying the number of bits to left-shift each value
2766 ///    in operand \a __a.
2767 /// \returns A 128-bit integer vector containing the left-shifted values.
2768 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a,
2769                                                             int __count) {
2770   return __builtin_ia32_psllqi128((__v2di)__a, __count);
2771 }
2772 
2773 /// Left-shifts each 64-bit value in the 128-bit integer vector operand
2774 ///    by the specified number of bits. Low-order bits are cleared.
2775 ///
2776 /// \headerfile <x86intrin.h>
2777 ///
2778 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2779 ///
2780 /// \param __a
2781 ///    A 128-bit integer vector containing the source operand.
2782 /// \param __count
2783 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2784 ///    to left-shift each value in operand \a __a.
2785 /// \returns A 128-bit integer vector containing the left-shifted values.
2786 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a,
2787                                                            __m128i __count) {
2788   return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
2789 }
2790 
2791 /// Right-shifts each 16-bit value in the 128-bit integer vector operand
2792 ///    by the specified number of bits. High-order bits are filled with the sign
2793 ///    bit of the initial value.
2794 ///
2795 /// \headerfile <x86intrin.h>
2796 ///
2797 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2798 ///
2799 /// \param __a
2800 ///    A 128-bit integer vector containing the source operand.
2801 /// \param __count
2802 ///    An integer value specifying the number of bits to right-shift each value
2803 ///    in operand \a __a.
2804 /// \returns A 128-bit integer vector containing the right-shifted values.
2805 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a,
2806                                                             int __count) {
2807   return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
2808 }
2809 
2810 /// Right-shifts each 16-bit value in the 128-bit integer vector operand
2811 ///    by the specified number of bits. High-order bits are filled with the sign
2812 ///    bit of the initial value.
2813 ///
2814 /// \headerfile <x86intrin.h>
2815 ///
2816 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2817 ///
2818 /// \param __a
2819 ///    A 128-bit integer vector containing the source operand.
2820 /// \param __count
2821 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2822 ///    to right-shift each value in operand \a __a.
2823 /// \returns A 128-bit integer vector containing the right-shifted values.
2824 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a,
2825                                                            __m128i __count) {
2826   return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
2827 }
2828 
2829 /// Right-shifts each 32-bit value in the 128-bit integer vector operand
2830 ///    by the specified number of bits. High-order bits are filled with the sign
2831 ///    bit of the initial value.
2832 ///
2833 /// \headerfile <x86intrin.h>
2834 ///
2835 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2836 ///
2837 /// \param __a
2838 ///    A 128-bit integer vector containing the source operand.
2839 /// \param __count
2840 ///    An integer value specifying the number of bits to right-shift each value
2841 ///    in operand \a __a.
2842 /// \returns A 128-bit integer vector containing the right-shifted values.
2843 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a,
2844                                                             int __count) {
2845   return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
2846 }
2847 
2848 /// Right-shifts each 32-bit value in the 128-bit integer vector operand
2849 ///    by the specified number of bits. High-order bits are filled with the sign
2850 ///    bit of the initial value.
2851 ///
2852 /// \headerfile <x86intrin.h>
2853 ///
2854 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2855 ///
2856 /// \param __a
2857 ///    A 128-bit integer vector containing the source operand.
2858 /// \param __count
2859 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2860 ///    to right-shift each value in operand \a __a.
2861 /// \returns A 128-bit integer vector containing the right-shifted values.
2862 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a,
2863                                                            __m128i __count) {
2864   return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
2865 }
2866 
2867 /// Right-shifts the 128-bit integer vector operand by the specified
2868 ///    number of bytes. High-order bits are cleared.
2869 ///
2870 /// \headerfile <x86intrin.h>
2871 ///
2872 /// \code
2873 /// __m128i _mm_srli_si128(__m128i a, const int imm);
2874 /// \endcode
2875 ///
2876 /// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
2877 ///
2878 /// \param a
2879 ///    A 128-bit integer vector containing the source operand.
2880 /// \param imm
2881 ///    An immediate value specifying the number of bytes to right-shift operand
2882 ///    \a a.
2883 /// \returns A 128-bit integer vector containing the right-shifted value.
#define _mm_srli_si128(a, imm)                                                 \
  ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a),          \
                                                (int)(imm)))

/* _mm_bsrli_si128 is an alternate spelling of _mm_srli_si128: it takes the
 * same operands and expands to the same byte-wise right shift. */
#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)
2891 
/// Right-shifts each 16-bit value in the 128-bit integer vector operand
///    by the specified number of bits. High-order bits are cleared.
2894 ///
2895 /// \headerfile <x86intrin.h>
2896 ///
2897 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2898 ///
2899 /// \param __a
2900 ///    A 128-bit integer vector containing the source operand.
2901 /// \param __count
2902 ///    An integer value specifying the number of bits to right-shift each value
2903 ///    in operand \a __a.
2904 /// \returns A 128-bit integer vector containing the right-shifted values.
2905 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a,
2906                                                             int __count) {
2907   return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
2908 }
2909 
/// Right-shifts each 16-bit value in the 128-bit integer vector operand
///    by the specified number of bits. High-order bits are cleared.
2912 ///
2913 /// \headerfile <x86intrin.h>
2914 ///
2915 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2916 ///
2917 /// \param __a
2918 ///    A 128-bit integer vector containing the source operand.
2919 /// \param __count
2920 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2921 ///    to right-shift each value in operand \a __a.
2922 /// \returns A 128-bit integer vector containing the right-shifted values.
2923 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a,
2924                                                            __m128i __count) {
2925   return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
2926 }
2927 
/// Right-shifts each 32-bit value in the 128-bit integer vector operand
///    by the specified number of bits. High-order bits are cleared.
2930 ///
2931 /// \headerfile <x86intrin.h>
2932 ///
2933 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
2934 ///
2935 /// \param __a
2936 ///    A 128-bit integer vector containing the source operand.
2937 /// \param __count
2938 ///    An integer value specifying the number of bits to right-shift each value
2939 ///    in operand \a __a.
2940 /// \returns A 128-bit integer vector containing the right-shifted values.
2941 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a,
2942                                                             int __count) {
2943   return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
2944 }
2945 
/// Right-shifts each 32-bit value in the 128-bit integer vector operand
///    by the specified number of bits. High-order bits are cleared.
2948 ///
2949 /// \headerfile <x86intrin.h>
2950 ///
2951 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
2952 ///
2953 /// \param __a
2954 ///    A 128-bit integer vector containing the source operand.
2955 /// \param __count
2956 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2957 ///    to right-shift each value in operand \a __a.
2958 /// \returns A 128-bit integer vector containing the right-shifted values.
2959 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a,
2960                                                            __m128i __count) {
2961   return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
2962 }
2963 
/// Right-shifts each 64-bit value in the 128-bit integer vector operand
///    by the specified number of bits. High-order bits are cleared.
2966 ///
2967 /// \headerfile <x86intrin.h>
2968 ///
2969 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
2970 ///
2971 /// \param __a
2972 ///    A 128-bit integer vector containing the source operand.
2973 /// \param __count
2974 ///    An integer value specifying the number of bits to right-shift each value
2975 ///    in operand \a __a.
2976 /// \returns A 128-bit integer vector containing the right-shifted values.
2977 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a,
2978                                                             int __count) {
2979   return __builtin_ia32_psrlqi128((__v2di)__a, __count);
2980 }
2981 
/// Right-shifts each 64-bit value in the 128-bit integer vector operand
///    by the specified number of bits. High-order bits are cleared.
2984 ///
2985 /// \headerfile <x86intrin.h>
2986 ///
2987 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
2988 ///
2989 /// \param __a
2990 ///    A 128-bit integer vector containing the source operand.
2991 /// \param __count
2992 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2993 ///    to right-shift each value in operand \a __a.
2994 /// \returns A 128-bit integer vector containing the right-shifted values.
2995 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a,
2996                                                            __m128i __count) {
2997   return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
2998 }
2999 
3000 /// Compares each of the corresponding 8-bit values of the 128-bit
3001 ///    integer vectors for equality. Each comparison yields 0x0 for false, 0xFF
3002 ///    for true.
3003 ///
3004 /// \headerfile <x86intrin.h>
3005 ///
3006 /// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
3007 ///
3008 /// \param __a
3009 ///    A 128-bit integer vector.
3010 /// \param __b
3011 ///    A 128-bit integer vector.
3012 /// \returns A 128-bit integer vector containing the comparison results.
3013 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a,
3014                                                             __m128i __b) {
3015   return (__m128i)((__v16qi)__a == (__v16qi)__b);
3016 }
3017 
3018 /// Compares each of the corresponding 16-bit values of the 128-bit
3019 ///    integer vectors for equality. Each comparison yields 0x0 for false,
3020 ///    0xFFFF for true.
3021 ///
3022 /// \headerfile <x86intrin.h>
3023 ///
3024 /// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
3025 ///
3026 /// \param __a
3027 ///    A 128-bit integer vector.
3028 /// \param __b
3029 ///    A 128-bit integer vector.
3030 /// \returns A 128-bit integer vector containing the comparison results.
3031 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a,
3032                                                              __m128i __b) {
3033   return (__m128i)((__v8hi)__a == (__v8hi)__b);
3034 }
3035 
3036 /// Compares each of the corresponding 32-bit values of the 128-bit
3037 ///    integer vectors for equality. Each comparison yields 0x0 for false,
3038 ///    0xFFFFFFFF for true.
3039 ///
3040 /// \headerfile <x86intrin.h>
3041 ///
3042 /// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
3043 ///
3044 /// \param __a
3045 ///    A 128-bit integer vector.
3046 /// \param __b
3047 ///    A 128-bit integer vector.
3048 /// \returns A 128-bit integer vector containing the comparison results.
3049 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a,
3050                                                              __m128i __b) {
3051   return (__m128i)((__v4si)__a == (__v4si)__b);
3052 }
3053 
3054 /// Compares each of the corresponding signed 8-bit values of the 128-bit
3055 ///    integer vectors to determine if the values in the first operand are
3056 ///    greater than those in the second operand. Each comparison yields 0x0 for
3057 ///    false, 0xFF for true.
3058 ///
3059 /// \headerfile <x86intrin.h>
3060 ///
3061 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3062 ///
3063 /// \param __a
3064 ///    A 128-bit integer vector.
3065 /// \param __b
3066 ///    A 128-bit integer vector.
3067 /// \returns A 128-bit integer vector containing the comparison results.
3068 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a,
3069                                                             __m128i __b) {
3070   /* This function always performs a signed comparison, but __v16qi is a char
3071      which may be signed or unsigned, so use __v16qs. */
3072   return (__m128i)((__v16qs)__a > (__v16qs)__b);
3073 }
3074 
3075 /// Compares each of the corresponding signed 16-bit values of the
3076 ///    128-bit integer vectors to determine if the values in the first operand
3077 ///    are greater than those in the second operand.
3078 ///
3079 ///    Each comparison yields 0x0 for false, 0xFFFF for true.
3080 ///
3081 /// \headerfile <x86intrin.h>
3082 ///
3083 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3084 ///
3085 /// \param __a
3086 ///    A 128-bit integer vector.
3087 /// \param __b
3088 ///    A 128-bit integer vector.
3089 /// \returns A 128-bit integer vector containing the comparison results.
3090 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a,
3091                                                              __m128i __b) {
3092   return (__m128i)((__v8hi)__a > (__v8hi)__b);
3093 }
3094 
3095 /// Compares each of the corresponding signed 32-bit values of the
3096 ///    128-bit integer vectors to determine if the values in the first operand
3097 ///    are greater than those in the second operand.
3098 ///
3099 ///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3100 ///
3101 /// \headerfile <x86intrin.h>
3102 ///
3103 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3104 ///
3105 /// \param __a
3106 ///    A 128-bit integer vector.
3107 /// \param __b
3108 ///    A 128-bit integer vector.
3109 /// \returns A 128-bit integer vector containing the comparison results.
3110 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a,
3111                                                              __m128i __b) {
3112   return (__m128i)((__v4si)__a > (__v4si)__b);
3113 }
3114 
3115 /// Compares each of the corresponding signed 8-bit values of the 128-bit
3116 ///    integer vectors to determine if the values in the first operand are less
3117 ///    than those in the second operand.
3118 ///
3119 ///    Each comparison yields 0x0 for false, 0xFF for true.
3120 ///
3121 /// \headerfile <x86intrin.h>
3122 ///
3123 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3124 ///
3125 /// \param __a
3126 ///    A 128-bit integer vector.
3127 /// \param __b
3128 ///    A 128-bit integer vector.
3129 /// \returns A 128-bit integer vector containing the comparison results.
3130 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a,
3131                                                             __m128i __b) {
3132   return _mm_cmpgt_epi8(__b, __a);
3133 }
3134 
3135 /// Compares each of the corresponding signed 16-bit values of the
3136 ///    128-bit integer vectors to determine if the values in the first operand
3137 ///    are less than those in the second operand.
3138 ///
3139 ///    Each comparison yields 0x0 for false, 0xFFFF for true.
3140 ///
3141 /// \headerfile <x86intrin.h>
3142 ///
3143 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3144 ///
3145 /// \param __a
3146 ///    A 128-bit integer vector.
3147 /// \param __b
3148 ///    A 128-bit integer vector.
3149 /// \returns A 128-bit integer vector containing the comparison results.
3150 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a,
3151                                                              __m128i __b) {
3152   return _mm_cmpgt_epi16(__b, __a);
3153 }
3154 
3155 /// Compares each of the corresponding signed 32-bit values of the
3156 ///    128-bit integer vectors to determine if the values in the first operand
3157 ///    are less than those in the second operand.
3158 ///
3159 ///    Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
3160 ///
3161 /// \headerfile <x86intrin.h>
3162 ///
3163 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3164 ///
3165 /// \param __a
3166 ///    A 128-bit integer vector.
3167 /// \param __b
3168 ///    A 128-bit integer vector.
3169 /// \returns A 128-bit integer vector containing the comparison results.
3170 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a,
3171                                                              __m128i __b) {
3172   return _mm_cmpgt_epi32(__b, __a);
3173 }
3174 
3175 #ifdef __x86_64__
3176 /// Converts a 64-bit signed integer value from the second operand into a
3177 ///    double-precision value and returns it in the lower element of a [2 x
3178 ///    double] vector; the upper element of the returned vector is copied from
3179 ///    the upper element of the first operand.
3180 ///
3181 /// \headerfile <x86intrin.h>
3182 ///
3183 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
3184 ///
3185 /// \param __a
3186 ///    A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
3187 ///    copied to the upper 64 bits of the destination.
3188 /// \param __b
3189 ///    A 64-bit signed integer operand containing the value to be converted.
3190 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
3191 ///    converted value of the second operand. The upper 64 bits are copied from
3192 ///    the upper 64 bits of the first operand.
3193 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi64_sd(__m128d __a,
3194                                                             long long __b) {
3195   __a[0] = __b;
3196   return __a;
3197 }
3198 
3199 /// Converts the first (lower) element of a vector of [2 x double] into a
3200 ///    64-bit signed integer value, according to the current rounding mode.
3201 ///
3202 /// \headerfile <x86intrin.h>
3203 ///
3204 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
3205 ///
3206 /// \param __a
3207 ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3208 ///    conversion.
3209 /// \returns A 64-bit signed integer containing the converted value.
3210 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsd_si64(__m128d __a) {
3211   return __builtin_ia32_cvtsd2si64((__v2df)__a);
3212 }
3213 
3214 /// Converts the first (lower) element of a vector of [2 x double] into a
3215 ///    64-bit signed integer value, truncating the result when it is inexact.
3216 ///
3217 /// \headerfile <x86intrin.h>
3218 ///
3219 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
3220 ///   instruction.
3221 ///
3222 /// \param __a
3223 ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3224 ///    conversion.
3225 /// \returns A 64-bit signed integer containing the converted value.
3226 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvttsd_si64(__m128d __a) {
3227   return __builtin_ia32_cvttsd2si64((__v2df)__a);
3228 }
3229 #endif
3230 
3231 /// Converts a vector of [4 x i32] into a vector of [4 x float].
3232 ///
3233 /// \headerfile <x86intrin.h>
3234 ///
3235 /// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
3236 ///
3237 /// \param __a
3238 ///    A 128-bit integer vector.
3239 /// \returns A 128-bit vector of [4 x float] containing the converted values.
3240 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a) {
3241   return (__m128) __builtin_convertvector((__v4si)__a, __v4sf);
3242 }
3243 
3244 /// Converts a vector of [4 x float] into a vector of [4 x i32].
3245 ///
3246 /// \headerfile <x86intrin.h>
3247 ///
3248 /// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
3249 ///
3250 /// \param __a
3251 ///    A 128-bit vector of [4 x float].
3252 /// \returns A 128-bit integer vector of [4 x i32] containing the converted
3253 ///    values.
3254 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a) {
3255   return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
3256 }
3257 
3258 /// Converts a vector of [4 x float] into a vector of [4 x i32],
3259 ///    truncating the result when it is inexact.
3260 ///
3261 /// \headerfile <x86intrin.h>
3262 ///
3263 /// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
3264 ///   instruction.
3265 ///
3266 /// \param __a
3267 ///    A 128-bit vector of [4 x float].
3268 /// \returns A 128-bit vector of [4 x i32] containing the converted values.
3269 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a) {
3270   return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
3271 }
3272 
3273 /// Returns a vector of [4 x i32] where the lowest element is the input
3274 ///    operand and the remaining elements are zero.
3275 ///
3276 /// \headerfile <x86intrin.h>
3277 ///
3278 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3279 ///
3280 /// \param __a
3281 ///    A 32-bit signed integer operand.
3282 /// \returns A 128-bit vector of [4 x i32].
3283 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a) {
3284   return __extension__(__m128i)(__v4si){__a, 0, 0, 0};
3285 }
3286 
3287 /// Returns a vector of [2 x i64] where the lower element is the input
3288 ///    operand and the upper element is zero.
3289 ///
3290 /// \headerfile <x86intrin.h>
3291 ///
3292 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction
3293 /// in 64-bit mode.
3294 ///
3295 /// \param __a
3296 ///    A 64-bit signed integer operand containing the value to be converted.
3297 /// \returns A 128-bit vector of [2 x i64] containing the converted value.
3298 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a) {
3299   return __extension__(__m128i)(__v2di){__a, 0};
3300 }
3301 
3302 /// Moves the least significant 32 bits of a vector of [4 x i32] to a
3303 ///    32-bit signed integer value.
3304 ///
3305 /// \headerfile <x86intrin.h>
3306 ///
3307 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3308 ///
3309 /// \param __a
3310 ///    A vector of [4 x i32]. The least significant 32 bits are moved to the
3311 ///    destination.
3312 /// \returns A 32-bit signed integer containing the moved value.
3313 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a) {
3314   __v4si __b = (__v4si)__a;
3315   return __b[0];
3316 }
3317 
3318 /// Moves the least significant 64 bits of a vector of [2 x i64] to a
3319 ///    64-bit signed integer value.
3320 ///
3321 /// \headerfile <x86intrin.h>
3322 ///
3323 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3324 ///
3325 /// \param __a
3326 ///    A vector of [2 x i64]. The least significant 64 bits are moved to the
3327 ///    destination.
3328 /// \returns A 64-bit signed integer containing the moved value.
3329 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a) {
3330   return __a[0];
3331 }
3332 
3333 /// Moves packed integer values from an aligned 128-bit memory location
3334 ///    to elements in a 128-bit integer vector.
3335 ///
3336 /// \headerfile <x86intrin.h>
3337 ///
3338 /// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
3339 ///
3340 /// \param __p
3341 ///    An aligned pointer to a memory location containing integer values.
3342 /// \returns A 128-bit integer vector containing the moved values.
3343 static __inline__ __m128i __DEFAULT_FN_ATTRS
3344 _mm_load_si128(__m128i const *__p) {
3345   return *__p;
3346 }
3347 
3348 /// Moves packed integer values from an unaligned 128-bit memory location
3349 ///    to elements in a 128-bit integer vector.
3350 ///
3351 /// \headerfile <x86intrin.h>
3352 ///
3353 /// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
3354 ///
3355 /// \param __p
3356 ///    A pointer to a memory location containing integer values.
3357 /// \returns A 128-bit integer vector containing the moved values.
3358 static __inline__ __m128i __DEFAULT_FN_ATTRS
3359 _mm_loadu_si128(__m128i_u const *__p) {
3360   struct __loadu_si128 {
3361     __m128i_u __v;
3362   } __attribute__((__packed__, __may_alias__));
3363   return ((const struct __loadu_si128 *)__p)->__v;
3364 }
3365 
3366 /// Returns a vector of [2 x i64] where the lower element is taken from
3367 ///    the lower element of the operand, and the upper element is zero.
3368 ///
3369 /// \headerfile <x86intrin.h>
3370 ///
3371 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3372 ///
3373 /// \param __p
///    A pointer to a memory location containing a 64-bit integer value. The
///    value is loaded into bits [63:0] of the destination. The pointer does
///    not need to be aligned.
3376 /// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
3377 ///    moved value. The higher order bits are cleared.
3378 static __inline__ __m128i __DEFAULT_FN_ATTRS
3379 _mm_loadl_epi64(__m128i_u const *__p) {
3380   struct __mm_loadl_epi64_struct {
3381     long long __u;
3382   } __attribute__((__packed__, __may_alias__));
3383   return __extension__(__m128i){
3384       ((const struct __mm_loadl_epi64_struct *)__p)->__u, 0};
3385 }
3386 
3387 /// Generates a 128-bit vector of [4 x i32] with unspecified content.
3388 ///    This could be used as an argument to another intrinsic function where the
3389 ///    argument is required but the value is not actually used.
3390 ///
3391 /// \headerfile <x86intrin.h>
3392 ///
3393 /// This intrinsic has no corresponding instruction.
3394 ///
3395 /// \returns A 128-bit vector of [4 x i32] with unspecified content.
3396 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void) {
3397   return (__m128i)__builtin_ia32_undef128();
3398 }
3399 
3400 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3401 ///    the specified 64-bit integer values.
3402 ///
3403 /// \headerfile <x86intrin.h>
3404 ///
3405 /// This intrinsic is a utility function and does not correspond to a specific
3406 ///    instruction.
3407 ///
3408 /// \param __q1
3409 ///    A 64-bit integer value used to initialize the upper 64 bits of the
3410 ///    destination vector of [2 x i64].
3411 /// \param __q0
3412 ///    A 64-bit integer value used to initialize the lower 64 bits of the
3413 ///    destination vector of [2 x i64].
3414 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
3415 ///    provided in the operands.
3416 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1,
3417                                                             long long __q0) {
3418   return __extension__(__m128i)(__v2di){__q0, __q1};
3419 }
3420 
3421 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3422 ///    the specified 64-bit integer values.
3423 ///
3424 /// \headerfile <x86intrin.h>
3425 ///
3426 /// This intrinsic is a utility function and does not correspond to a specific
3427 ///    instruction.
3428 ///
3429 /// \param __q1
3430 ///    A 64-bit integer value used to initialize the upper 64 bits of the
3431 ///    destination vector of [2 x i64].
3432 /// \param __q0
3433 ///    A 64-bit integer value used to initialize the lower 64 bits of the
3434 ///    destination vector of [2 x i64].
3435 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
3436 ///    provided in the operands.
3437 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1,
3438                                                            __m64 __q0) {
3439   return _mm_set_epi64x((long long)__q1, (long long)__q0);
3440 }
3441 
3442 /// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
3443 ///    the specified 32-bit integer values.
3444 ///
3445 /// \headerfile <x86intrin.h>
3446 ///
3447 /// This intrinsic is a utility function and does not correspond to a specific
3448 ///    instruction.
3449 ///
3450 /// \param __i3
3451 ///    A 32-bit integer value used to initialize bits [127:96] of the
3452 ///    destination vector.
3453 /// \param __i2
3454 ///    A 32-bit integer value used to initialize bits [95:64] of the destination
3455 ///    vector.
3456 /// \param __i1
3457 ///    A 32-bit integer value used to initialize bits [63:32] of the destination
3458 ///    vector.
3459 /// \param __i0
3460 ///    A 32-bit integer value used to initialize bits [31:0] of the destination
3461 ///    vector.
3462 /// \returns An initialized 128-bit vector of [4 x i32] containing the values
3463 ///    provided in the operands.
3464 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2,
3465                                                            int __i1, int __i0) {
3466   return __extension__(__m128i)(__v4si){__i0, __i1, __i2, __i3};
3467 }
3468 
3469 /// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
3470 ///    the specified 16-bit integer values.
3471 ///
3472 /// \headerfile <x86intrin.h>
3473 ///
3474 /// This intrinsic is a utility function and does not correspond to a specific
3475 ///    instruction.
3476 ///
3477 /// \param __w7
3478 ///    A 16-bit integer value used to initialize bits [127:112] of the
3479 ///    destination vector.
3480 /// \param __w6
3481 ///    A 16-bit integer value used to initialize bits [111:96] of the
3482 ///    destination vector.
3483 /// \param __w5
3484 ///    A 16-bit integer value used to initialize bits [95:80] of the destination
3485 ///    vector.
3486 /// \param __w4
3487 ///    A 16-bit integer value used to initialize bits [79:64] of the destination
3488 ///    vector.
3489 /// \param __w3
3490 ///    A 16-bit integer value used to initialize bits [63:48] of the destination
3491 ///    vector.
3492 /// \param __w2
3493 ///    A 16-bit integer value used to initialize bits [47:32] of the destination
3494 ///    vector.
3495 /// \param __w1
3496 ///    A 16-bit integer value used to initialize bits [31:16] of the destination
3497 ///    vector.
3498 /// \param __w0
3499 ///    A 16-bit integer value used to initialize bits [15:0] of the destination
3500 ///    vector.
3501 /// \returns An initialized 128-bit vector of [8 x i16] containing the values
3502 ///    provided in the operands.
3503 static __inline__ __m128i __DEFAULT_FN_ATTRS
3504 _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3,
3505               short __w2, short __w1, short __w0) {
3506   return __extension__(__m128i)(__v8hi){__w0, __w1, __w2, __w3,
3507                                         __w4, __w5, __w6, __w7};
3508 }
3509 
3510 /// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
3511 ///    the specified 8-bit integer values.
3512 ///
3513 /// \headerfile <x86intrin.h>
3514 ///
3515 /// This intrinsic is a utility function and does not correspond to a specific
3516 ///    instruction.
3517 ///
3518 /// \param __b15
3519 ///    Initializes bits [127:120] of the destination vector.
3520 /// \param __b14
3521 ///    Initializes bits [119:112] of the destination vector.
3522 /// \param __b13
3523 ///    Initializes bits [111:104] of the destination vector.
3524 /// \param __b12
3525 ///    Initializes bits [103:96] of the destination vector.
3526 /// \param __b11
3527 ///    Initializes bits [95:88] of the destination vector.
3528 /// \param __b10
3529 ///    Initializes bits [87:80] of the destination vector.
3530 /// \param __b9
3531 ///    Initializes bits [79:72] of the destination vector.
3532 /// \param __b8
3533 ///    Initializes bits [71:64] of the destination vector.
3534 /// \param __b7
3535 ///    Initializes bits [63:56] of the destination vector.
3536 /// \param __b6
3537 ///    Initializes bits [55:48] of the destination vector.
3538 /// \param __b5
3539 ///    Initializes bits [47:40] of the destination vector.
3540 /// \param __b4
3541 ///    Initializes bits [39:32] of the destination vector.
3542 /// \param __b3
3543 ///    Initializes bits [31:24] of the destination vector.
3544 /// \param __b2
3545 ///    Initializes bits [23:16] of the destination vector.
3546 /// \param __b1
3547 ///    Initializes bits [15:8] of the destination vector.
3548 /// \param __b0
3549 ///    Initializes bits [7:0] of the destination vector.
3550 /// \returns An initialized 128-bit vector of [16 x i8] containing the values
3551 ///    provided in the operands.
3552 static __inline__ __m128i __DEFAULT_FN_ATTRS
3553 _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11,
3554              char __b10, char __b9, char __b8, char __b7, char __b6, char __b5,
3555              char __b4, char __b3, char __b2, char __b1, char __b0) {
3556   return __extension__(__m128i)(__v16qi){
3557       __b0, __b1, __b2,  __b3,  __b4,  __b5,  __b6,  __b7,
3558       __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15};
3559 }
3560 
3561 /// Initializes both values in a 128-bit integer vector with the
3562 ///    specified 64-bit integer value.
3563 ///
3564 /// \headerfile <x86intrin.h>
3565 ///
3566 /// This intrinsic is a utility function and does not correspond to a specific
3567 ///    instruction.
3568 ///
3569 /// \param __q
3570 ///    Integer value used to initialize the elements of the destination integer
3571 ///    vector.
3572 /// \returns An initialized 128-bit integer vector of [2 x i64] with both
3573 ///    elements containing the value provided in the operand.
3574 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q) {
3575   return _mm_set_epi64x(__q, __q);
3576 }
3577 
3578 /// Initializes both values in a 128-bit vector of [2 x i64] with the
3579 ///    specified 64-bit value.
3580 ///
3581 /// \headerfile <x86intrin.h>
3582 ///
3583 /// This intrinsic is a utility function and does not correspond to a specific
3584 ///    instruction.
3585 ///
3586 /// \param __q
3587 ///    A 64-bit value used to initialize the elements of the destination integer
3588 ///    vector.
3589 /// \returns An initialized 128-bit vector of [2 x i64] with all elements
3590 ///    containing the value provided in the operand.
3591 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q) {
3592   return _mm_set_epi64(__q, __q);
3593 }
3594 
3595 /// Initializes all values in a 128-bit vector of [4 x i32] with the
3596 ///    specified 32-bit value.
3597 ///
3598 /// \headerfile <x86intrin.h>
3599 ///
3600 /// This intrinsic is a utility function and does not correspond to a specific
3601 ///    instruction.
3602 ///
3603 /// \param __i
3604 ///    A 32-bit value used to initialize the elements of the destination integer
3605 ///    vector.
3606 /// \returns An initialized 128-bit vector of [4 x i32] with all elements
3607 ///    containing the value provided in the operand.
3608 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i) {
3609   return _mm_set_epi32(__i, __i, __i, __i);
3610 }
3611 
3612 /// Initializes all values in a 128-bit vector of [8 x i16] with the
3613 ///    specified 16-bit value.
3614 ///
3615 /// \headerfile <x86intrin.h>
3616 ///
3617 /// This intrinsic is a utility function and does not correspond to a specific
3618 ///    instruction.
3619 ///
3620 /// \param __w
3621 ///    A 16-bit value used to initialize the elements of the destination integer
3622 ///    vector.
3623 /// \returns An initialized 128-bit vector of [8 x i16] with all elements
3624 ///    containing the value provided in the operand.
3625 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w) {
3626   return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w);
3627 }
3628 
3629 /// Initializes all values in a 128-bit vector of [16 x i8] with the
3630 ///    specified 8-bit value.
3631 ///
3632 /// \headerfile <x86intrin.h>
3633 ///
3634 /// This intrinsic is a utility function and does not correspond to a specific
3635 ///    instruction.
3636 ///
3637 /// \param __b
3638 ///    An 8-bit value used to initialize the elements of the destination integer
3639 ///    vector.
3640 /// \returns An initialized 128-bit vector of [16 x i8] with all elements
3641 ///    containing the value provided in the operand.
3642 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b) {
3643   return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
3644                       __b, __b, __b, __b, __b);
3645 }
3646 
3647 /// Constructs a 128-bit integer vector, initialized in reverse order
3648 ///     with the specified 64-bit integral values.
3649 ///
3650 /// \headerfile <x86intrin.h>
3651 ///
3652 /// This intrinsic does not correspond to a specific instruction.
3653 ///
3654 /// \param __q0
3655 ///    A 64-bit integral value used to initialize the lower 64 bits of the
3656 ///    result.
3657 /// \param __q1
3658 ///    A 64-bit integral value used to initialize the upper 64 bits of the
3659 ///    result.
3660 /// \returns An initialized 128-bit integer vector.
3661 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0,
3662                                                             __m64 __q1) {
3663   return _mm_set_epi64(__q1, __q0);
3664 }
3665 
3666 /// Constructs a 128-bit integer vector, initialized in reverse order
3667 ///     with the specified 32-bit integral values.
3668 ///
3669 /// \headerfile <x86intrin.h>
3670 ///
3671 /// This intrinsic is a utility function and does not correspond to a specific
3672 ///    instruction.
3673 ///
3674 /// \param __i0
3675 ///    A 32-bit integral value used to initialize bits [31:0] of the result.
3676 /// \param __i1
3677 ///    A 32-bit integral value used to initialize bits [63:32] of the result.
3678 /// \param __i2
3679 ///    A 32-bit integral value used to initialize bits [95:64] of the result.
3680 /// \param __i3
3681 ///    A 32-bit integral value used to initialize bits [127:96] of the result.
3682 /// \returns An initialized 128-bit integer vector.
3683 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1,
3684                                                             int __i2,
3685                                                             int __i3) {
3686   return _mm_set_epi32(__i3, __i2, __i1, __i0);
3687 }
3688 
3689 /// Constructs a 128-bit integer vector, initialized in reverse order
3690 ///     with the specified 16-bit integral values.
3691 ///
3692 /// \headerfile <x86intrin.h>
3693 ///
3694 /// This intrinsic is a utility function and does not correspond to a specific
3695 ///    instruction.
3696 ///
3697 /// \param __w0
3698 ///    A 16-bit integral value used to initialize bits [15:0] of the result.
3699 /// \param __w1
3700 ///    A 16-bit integral value used to initialize bits [31:16] of the result.
3701 /// \param __w2
3702 ///    A 16-bit integral value used to initialize bits [47:32] of the result.
3703 /// \param __w3
3704 ///    A 16-bit integral value used to initialize bits [63:48] of the result.
3705 /// \param __w4
3706 ///    A 16-bit integral value used to initialize bits [79:64] of the result.
3707 /// \param __w5
3708 ///    A 16-bit integral value used to initialize bits [95:80] of the result.
3709 /// \param __w6
3710 ///    A 16-bit integral value used to initialize bits [111:96] of the result.
3711 /// \param __w7
3712 ///    A 16-bit integral value used to initialize bits [127:112] of the result.
3713 /// \returns An initialized 128-bit integer vector.
3714 static __inline__ __m128i __DEFAULT_FN_ATTRS
3715 _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4,
3716                short __w5, short __w6, short __w7) {
3717   return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0);
3718 }
3719 
3720 /// Constructs a 128-bit integer vector, initialized in reverse order
3721 ///     with the specified 8-bit integral values.
3722 ///
3723 /// \headerfile <x86intrin.h>
3724 ///
3725 /// This intrinsic is a utility function and does not correspond to a specific
3726 ///    instruction.
3727 ///
3728 /// \param __b0
3729 ///    An 8-bit integral value used to initialize bits [7:0] of the result.
3730 /// \param __b1
3731 ///    An 8-bit integral value used to initialize bits [15:8] of the result.
3732 /// \param __b2
3733 ///    An 8-bit integral value used to initialize bits [23:16] of the result.
3734 /// \param __b3
3735 ///    An 8-bit integral value used to initialize bits [31:24] of the result.
3736 /// \param __b4
3737 ///    An 8-bit integral value used to initialize bits [39:32] of the result.
3738 /// \param __b5
3739 ///    An 8-bit integral value used to initialize bits [47:40] of the result.
3740 /// \param __b6
3741 ///    An 8-bit integral value used to initialize bits [55:48] of the result.
3742 /// \param __b7
3743 ///    An 8-bit integral value used to initialize bits [63:56] of the result.
3744 /// \param __b8
3745 ///    An 8-bit integral value used to initialize bits [71:64] of the result.
3746 /// \param __b9
3747 ///    An 8-bit integral value used to initialize bits [79:72] of the result.
3748 /// \param __b10
3749 ///    An 8-bit integral value used to initialize bits [87:80] of the result.
3750 /// \param __b11
3751 ///    An 8-bit integral value used to initialize bits [95:88] of the result.
3752 /// \param __b12
3753 ///    An 8-bit integral value used to initialize bits [103:96] of the result.
3754 /// \param __b13
3755 ///    An 8-bit integral value used to initialize bits [111:104] of the result.
3756 /// \param __b14
3757 ///    An 8-bit integral value used to initialize bits [119:112] of the result.
3758 /// \param __b15
3759 ///    An 8-bit integral value used to initialize bits [127:120] of the result.
3760 /// \returns An initialized 128-bit integer vector.
3761 static __inline__ __m128i __DEFAULT_FN_ATTRS
3762 _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
3763               char __b6, char __b7, char __b8, char __b9, char __b10,
3764               char __b11, char __b12, char __b13, char __b14, char __b15) {
3765   return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8,
3766                       __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
3767 }
3768 
3769 /// Creates a 128-bit integer vector initialized to zero.
3770 ///
3771 /// \headerfile <x86intrin.h>
3772 ///
3773 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
3774 ///
3775 /// \returns An initialized 128-bit integer vector with all elements set to
3776 ///    zero.
3777 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void) {
3778   return __extension__(__m128i)(__v2di){0LL, 0LL};
3779 }
3780 
3781 /// Stores a 128-bit integer vector to a memory location aligned on a
3782 ///    128-bit boundary.
3783 ///
3784 /// \headerfile <x86intrin.h>
3785 ///
3786 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
3787 ///
3788 /// \param __p
3789 ///    A pointer to an aligned memory location that will receive the integer
3790 ///    values.
3791 /// \param __b
3792 ///    A 128-bit integer vector containing the values to be moved.
3793 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p,
3794                                                           __m128i __b) {
3795   *__p = __b;
3796 }
3797 
3798 /// Stores a 128-bit integer vector to an unaligned memory location.
3799 ///
3800 /// \headerfile <x86intrin.h>
3801 ///
3802 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
3803 ///
3804 /// \param __p
3805 ///    A pointer to a memory location that will receive the integer values.
3806 /// \param __b
3807 ///    A 128-bit integer vector containing the values to be moved.
3808 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p,
3809                                                            __m128i __b) {
3810   struct __storeu_si128 {
3811     __m128i_u __v;
3812   } __attribute__((__packed__, __may_alias__));
3813   ((struct __storeu_si128 *)__p)->__v = __b;
3814 }
3815 
3816 /// Stores a 64-bit integer value from the low element of a 128-bit integer
3817 ///    vector.
3818 ///
3819 /// \headerfile <x86intrin.h>
3820 ///
3821 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3822 ///
3823 /// \param __p
3824 ///    A pointer to a 64-bit memory location. The address of the memory
3825 ///    location does not have to be aligned.
3826 /// \param __b
3827 ///    A 128-bit integer vector containing the value to be stored.
3828 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si64(void *__p,
3829                                                           __m128i __b) {
3830   struct __storeu_si64 {
3831     long long __v;
3832   } __attribute__((__packed__, __may_alias__));
3833   ((struct __storeu_si64 *)__p)->__v = ((__v2di)__b)[0];
3834 }
3835 
3836 /// Stores a 32-bit integer value from the low element of a 128-bit integer
3837 ///    vector.
3838 ///
3839 /// \headerfile <x86intrin.h>
3840 ///
3841 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3842 ///
3843 /// \param __p
3844 ///    A pointer to a 32-bit memory location. The address of the memory
3845 ///    location does not have to be aligned.
3846 /// \param __b
3847 ///    A 128-bit integer vector containing the value to be stored.
3848 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si32(void *__p,
3849                                                           __m128i __b) {
3850   struct __storeu_si32 {
3851     int __v;
3852   } __attribute__((__packed__, __may_alias__));
3853   ((struct __storeu_si32 *)__p)->__v = ((__v4si)__b)[0];
3854 }
3855 
3856 /// Stores a 16-bit integer value from the low element of a 128-bit integer
3857 ///    vector.
3858 ///
3859 /// \headerfile <x86intrin.h>
3860 ///
3861 /// This intrinsic does not correspond to a specific instruction.
3862 ///
3863 /// \param __p
3864 ///    A pointer to a 16-bit memory location. The address of the memory
3865 ///    location does not have to be aligned.
3866 /// \param __b
3867 ///    A 128-bit integer vector containing the value to be stored.
3868 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si16(void *__p,
3869                                                           __m128i __b) {
3870   struct __storeu_si16 {
3871     short __v;
3872   } __attribute__((__packed__, __may_alias__));
3873   ((struct __storeu_si16 *)__p)->__v = ((__v8hi)__b)[0];
3874 }
3875 
3876 /// Moves bytes selected by the mask from the first operand to the
3877 ///    specified unaligned memory location. When a mask bit is 1, the
3878 ///    corresponding byte is written, otherwise it is not written.
3879 ///
3880 ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
3881 ///    used again soon). Exception and trap behavior for elements not selected
3882 ///    for storage to memory are implementation dependent.
3883 ///
3884 /// \headerfile <x86intrin.h>
3885 ///
3886 /// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
3887 ///   instruction.
3888 ///
3889 /// \param __d
3890 ///    A 128-bit integer vector containing the values to be moved.
3891 /// \param __n
3892 ///    A 128-bit integer vector containing the mask. The most significant bit of
3893 ///    each byte represents the mask bits.
3894 /// \param __p
3895 ///    A pointer to an unaligned 128-bit memory location where the specified
3896 ///    values are moved.
3897 static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d,
3898                                                               __m128i __n,
3899                                                               char *__p) {
3900   __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
3901 }
3902 
3903 /// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
3904 ///    a memory location.
3905 ///
3906 /// \headerfile <x86intrin.h>
3907 ///
3908 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
3909 ///
3910 /// \param __p
3911 ///    A pointer to a 64-bit memory location that will receive the lower 64 bits
3912 ///    of the integer vector parameter.
3913 /// \param __a
3914 ///    A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
3915 ///    value to be stored.
3916 static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p,
3917                                                            __m128i __a) {
3918   struct __mm_storel_epi64_struct {
3919     long long __u;
3920   } __attribute__((__packed__, __may_alias__));
3921   ((struct __mm_storel_epi64_struct *)__p)->__u = __a[0];
3922 }
3923 
3924 /// Stores a 128-bit floating point vector of [2 x double] to a 128-bit
3925 ///    aligned memory location.
3926 ///
3927 ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
3928 ///    used again soon).
3929 ///
3930 /// \headerfile <x86intrin.h>
3931 ///
3932 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
3933 ///
3934 /// \param __p
3935 ///    A pointer to the 128-bit aligned memory location used to store the value.
3936 /// \param __a
3937 ///    A vector of [2 x double] containing the 64-bit values to be stored.
3938 static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(double *__p,
3939                                                         __m128d __a) {
3940   __builtin_nontemporal_store((__v2df)__a, (__v2df *)__p);
3941 }
3942 
3943 /// Stores a 128-bit integer vector to a 128-bit aligned memory location.
3944 ///
3945 ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
3946 ///    used again soon).
3947 ///
3948 /// \headerfile <x86intrin.h>
3949 ///
3950 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
3951 ///
3952 /// \param __p
3953 ///    A pointer to the 128-bit aligned memory location used to store the value.
3954 /// \param __a
3955 ///    A 128-bit integer vector containing the values to be stored.
3956 static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(__m128i *__p,
3957                                                            __m128i __a) {
3958   __builtin_nontemporal_store((__v2di)__a, (__v2di *)__p);
3959 }
3960 
/// Writes a 32-bit integer value to the given memory location.
///
///    To minimize caching, the data is flagged as non-temporal (unlikely to be
///    used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
///
/// \param __p
///    A pointer to the 32-bit memory location that receives the value.
/// \param __a
///    The 32-bit integer to be written.
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
    _mm_stream_si32(int *__p, int __a) {
  int *__dst = __p;
  __builtin_ia32_movnti(__dst, __a);
}
3979 
#ifdef __x86_64__
/// Writes a 64-bit integer value to the given memory location.
///
///    To minimize caching, the data is flagged as non-temporal (unlikely to be
///    used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
///
/// \param __p
///    A pointer to the 64-bit memory location that receives the value.
/// \param __a
///    The 64-bit integer to be written.
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
    _mm_stream_si64(long long *__p, long long __a) {
  long long *__dst = __p;
  __builtin_ia32_movnti64(__dst, __a);
}
#endif
4000 
#if defined(__cplusplus)
extern "C" {
#endif

/// Flushes and invalidates the cache line containing \a __p from all caches
///    in the coherency domain.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
///
/// \param __p
///    A pointer to the memory location that identifies the cache line to be
///    flushed.
void _mm_clflush(const void *__p);

/// Forces strong memory ordering (serialization) between the load
///    instructions that precede this instruction and the load instructions
///    that follow it, so that the system completes all previous loads before
///    executing subsequent loads.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
///
void _mm_lfence(void);

/// Forces strong memory ordering (serialization) between the load and store
///    instructions that precede this instruction and the load and store
///    instructions that follow it, so that the system completes all previous
///    memory accesses before executing subsequent memory accesses.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
///
void _mm_mfence(void);

#if defined(__cplusplus)
} // extern "C"
#endif
4042 
4043 /// Converts 16-bit signed integers from both 128-bit integer vector
4044 ///    operands into 8-bit signed integers, and packs the results into the
4045 ///    destination. Positive values greater than 0x7F are saturated to 0x7F.
4046 ///    Negative values less than 0x80 are saturated to 0x80.
4047 ///
4048 /// \headerfile <x86intrin.h>
4049 ///
4050 /// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
4051 ///
4052 /// \param __a
4053 ///   A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4054 ///   a signed integer and is converted to a 8-bit signed integer with
4055 ///   saturation. Values greater than 0x7F are saturated to 0x7F. Values less
4056 ///   than 0x80 are saturated to 0x80. The converted [8 x i8] values are
4057 ///   written to the lower 64 bits of the result.
4058 /// \param __b
4059 ///   A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4060 ///   a signed integer and is converted to a 8-bit signed integer with
4061 ///   saturation. Values greater than 0x7F are saturated to 0x7F. Values less
4062 ///   than 0x80 are saturated to 0x80. The converted [8 x i8] values are
4063 ///   written to the higher 64 bits of the result.
4064 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
4065 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a,
4066                                                              __m128i __b) {
4067   return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
4068 }
4069 
4070 /// Converts 32-bit signed integers from both 128-bit integer vector
4071 ///    operands into 16-bit signed integers, and packs the results into the
4072 ///    destination. Positive values greater than 0x7FFF are saturated to 0x7FFF.
4073 ///    Negative values less than 0x8000 are saturated to 0x8000.
4074 ///
4075 /// \headerfile <x86intrin.h>
4076 ///
4077 /// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
4078 ///
4079 /// \param __a
4080 ///    A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
4081 ///    a signed integer and is converted to a 16-bit signed integer with
4082 ///    saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
4083 ///    less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
4084 ///    are written to the lower 64 bits of the result.
4085 /// \param __b
4086 ///    A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
4087 ///    a signed integer and is converted to a 16-bit signed integer with
4088 ///    saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
4089 ///    less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
4090 ///    are written to the higher 64 bits of the result.
4091 /// \returns A 128-bit vector of [8 x i16] containing the converted values.
4092 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a,
4093                                                              __m128i __b) {
4094   return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
4095 }
4096 
4097 /// Converts 16-bit signed integers from both 128-bit integer vector
4098 ///    operands into 8-bit unsigned integers, and packs the results into the
4099 ///    destination. Values greater than 0xFF are saturated to 0xFF. Values less
4100 ///    than 0x00 are saturated to 0x00.
4101 ///
4102 /// \headerfile <x86intrin.h>
4103 ///
4104 /// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
4105 ///
4106 /// \param __a
4107 ///    A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4108 ///    a signed integer and is converted to an 8-bit unsigned integer with
4109 ///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
4110 ///    than 0x00 are saturated to 0x00. The converted [8 x i8] values are
4111 ///    written to the lower 64 bits of the result.
4112 /// \param __b
4113 ///    A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
4114 ///    a signed integer and is converted to an 8-bit unsigned integer with
4115 ///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
4116 ///    than 0x00 are saturated to 0x00. The converted [8 x i8] values are
4117 ///    written to the higher 64 bits of the result.
4118 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
4119 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a,
4120                                                               __m128i __b) {
4121   return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
4122 }
4123 
/// Extracts one 16-bit element from a 128-bit integer vector of [8 x i16],
///    using the immediate-value parameter as a selector, and returns it
///    zero-extended to an int.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
///
/// \param a
///    A 128-bit integer vector.
/// \param imm
///    An immediate value. Bits [2:0] select the value from \a a to be
///    assigned to bits [15:0] of the result. \n
///    000: assign values from bits [15:0] of \a a. \n
///    001: assign values from bits [31:16] of \a a. \n
///    010: assign values from bits [47:32] of \a a. \n
///    011: assign values from bits [63:48] of \a a. \n
///    100: assign values from bits [79:64] of \a a. \n
///    101: assign values from bits [95:80] of \a a. \n
///    110: assign values from bits [111:96] of \a a. \n
///    111: assign values from bits [127:112] of \a a.
/// \returns An integer, whose lower 16 bits are selected from the 128-bit
///    integer vector parameter and the remaining bits are assigned zeros.
#define _mm_extract_epi16(a, imm)                                              \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a),      \
                                                    (int)(imm)))
4153 
/// Constructs a 128-bit integer vector by first making a copy of the
///    128-bit integer vector parameter, and then replacing one of its eight
///    16-bit elements, chosen by the immediate-value parameter, with the
///    lower 16 bits of an integer parameter.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi16(__m128i a, int b, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. This vector is copied to the
///    result and then one of the eight elements in the result is replaced by
///    the lower 16 bits of \a b.
/// \param b
///    An integer. The lower 16 bits of this parameter are written to the
///    element of the result selected by \a imm.
/// \param imm
///    An immediate value selecting which of the eight 16-bit elements of the
///    result is replaced (an element index, not a bit offset): element 0 is
///    bits [15:0] and element 7 is bits [127:112].
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi16(a, b, imm)                                            \
  ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b),        \
                                        (int)(imm)))
4181 
4182 /// Copies the values of the most significant bits from each 8-bit
4183 ///    element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
4184 ///    value, zero-extends the value, and writes it to the destination.
4185 ///
4186 /// \headerfile <x86intrin.h>
4187 ///
4188 /// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
4189 ///
4190 /// \param __a
4191 ///    A 128-bit integer vector containing the values with bits to be extracted.
4192 /// \returns The most significant bits from each 8-bit element in \a __a,
4193 ///    written to bits [15:0]. The other bits are assigned zeros.
4194 static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a) {
       /* The builtin is called with a [16 x i8] vector, so view __a as __v16qi. */
4195   return __builtin_ia32_pmovmskb128((__v16qi)__a);
4196 }
4197 
4198 /// Constructs a 128-bit integer vector by shuffling four 32-bit
4199 ///    elements of a 128-bit integer vector parameter, using the immediate-value
4200 ///    parameter as a specifier.
4201 ///
4202 /// \headerfile <x86intrin.h>
4203 ///
4204 /// \code
4205 /// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
4206 /// \endcode
4207 ///
4208 /// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
4209 ///
4210 /// \param a
4211 ///    A 128-bit integer vector containing the values to be copied.
4212 /// \param imm
4213 ///    An immediate value containing an 8-bit value specifying which elements to
4214 ///    copy from a. The destinations within the 128-bit destination are assigned
4215 ///    values as follows: \n
4216 ///    Bits [1:0] are used to assign values to bits [31:0] of the result. \n
4217 ///    Bits [3:2] are used to assign values to bits [63:32] of the result. \n
4218 ///    Bits [5:4] are used to assign values to bits [95:64] of the result. \n
4219 ///    Bits [7:6] are used to assign values to bits [127:96] of the result. \n
4220 ///    Bit value assignments: \n
4221 ///    00: assign values from bits [31:0] of \a a. \n
4222 ///    01: assign values from bits [63:32] of \a a. \n
4223 ///    10: assign values from bits [95:64] of \a a. \n
4224 ///    11: assign values from bits [127:96] of \a a. \n
4225 ///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4226 ///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4227 ///    <c>[b6, b4, b2, b0]</c>.
4228 /// \returns A 128-bit integer vector containing the shuffled values.
/* a is viewed as four 32-bit elements (__v4si) for the builtin; the shuffled
 * vector is cast back to the generic __m128i type. */
4229 #define _mm_shuffle_epi32(a, imm)                                              \
4230   ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))
4231 
4232 /// Constructs a 128-bit integer vector by shuffling four lower 16-bit
4233 ///    elements of a 128-bit integer vector of [8 x i16], using the immediate
4234 ///    value parameter as a specifier.
4235 ///
4236 /// \headerfile <x86intrin.h>
4237 ///
4238 /// \code
4239 /// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
4240 /// \endcode
4241 ///
4242 /// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
4243 ///
4244 /// \param a
4245 ///    A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
4246 ///    [127:64] of the result.
4247 /// \param imm
4248 ///    An 8-bit immediate value specifying which elements to copy from \a a. \n
4249 ///    Bits[1:0] are used to assign values to bits [15:0] of the result. \n
4250 ///    Bits[3:2] are used to assign values to bits [31:16] of the result. \n
4251 ///    Bits[5:4] are used to assign values to bits [47:32] of the result. \n
4252 ///    Bits[7:6] are used to assign values to bits [63:48] of the result. \n
4253 ///    Bit value assignments: \n
4254 ///    00: assign values from bits [15:0] of \a a. \n
4255 ///    01: assign values from bits [31:16] of \a a. \n
4256 ///    10: assign values from bits [47:32] of \a a. \n
4257 ///    11: assign values from bits [63:48] of \a a. \n
4258 ///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4259 ///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4260 ///    <c>[b6, b4, b2, b0]</c>.
4261 /// \returns A 128-bit integer vector containing the shuffled values.
/* a is viewed as [8 x i16] (__v8hi) for the builtin; the shuffled vector is
 * cast back to the generic __m128i type. */
4262 #define _mm_shufflelo_epi16(a, imm)                                            \
4263   ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))
4264 
4265 /// Constructs a 128-bit integer vector by shuffling four upper 16-bit
4266 ///    elements of a 128-bit integer vector of [8 x i16], using the immediate
4267 ///    value parameter as a specifier.
4268 ///
4269 /// \headerfile <x86intrin.h>
4270 ///
4271 /// \code
4272 /// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
4273 /// \endcode
4274 ///
4275 /// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
4276 ///
4277 /// \param a
4278 ///    A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
4279 ///    [63:0] of the result.
4280 /// \param imm
4281 ///    An 8-bit immediate value specifying which elements to copy from \a a. \n
4282 ///    Bits[1:0] are used to assign values to bits [79:64] of the result. \n
4283 ///    Bits[3:2] are used to assign values to bits [95:80] of the result. \n
4284 ///    Bits[5:4] are used to assign values to bits [111:96] of the result. \n
4285 ///    Bits[7:6] are used to assign values to bits [127:112] of the result. \n
4286 ///    Bit value assignments: \n
4287 ///    00: assign values from bits [79:64] of \a a. \n
4288 ///    01: assign values from bits [95:80] of \a a. \n
4289 ///    10: assign values from bits [111:96] of \a a. \n
4290 ///    11: assign values from bits [127:112] of \a a. \n
4291 ///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4292 ///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4293 ///    <c>[b6, b4, b2, b0]</c>.
4294 /// \returns A 128-bit integer vector containing the shuffled values.
/* a is viewed as [8 x i16] (__v8hi) for the builtin; the shuffled vector is
 * cast back to the generic __m128i type. */
4295 #define _mm_shufflehi_epi16(a, imm)                                            \
4296   ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))
4297 
4298 /// Unpacks the high-order (index 8-15) values from two 128-bit vectors
4299 ///    of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4300 ///
4301 /// \headerfile <x86intrin.h>
4302 ///
4303 /// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
4304 ///   instruction.
4305 ///
4306 /// \param __a
4307 ///    A 128-bit vector of [16 x i8].
4308 ///    Bits [71:64] are written to bits [7:0] of the result. \n
4309 ///    Bits [79:72] are written to bits [23:16] of the result. \n
4310 ///    Bits [87:80] are written to bits [39:32] of the result. \n
4311 ///    Bits [95:88] are written to bits [55:48] of the result. \n
4312 ///    Bits [103:96] are written to bits [71:64] of the result. \n
4313 ///    Bits [111:104] are written to bits [87:80] of the result. \n
4314 ///    Bits [119:112] are written to bits [103:96] of the result. \n
4315 ///    Bits [127:120] are written to bits [119:112] of the result.
4316 /// \param __b
4317 ///    A 128-bit vector of [16 x i8]. \n
4318 ///    Bits [71:64] are written to bits [15:8] of the result. \n
4319 ///    Bits [79:72] are written to bits [31:24] of the result. \n
4320 ///    Bits [87:80] are written to bits [47:40] of the result. \n
4321 ///    Bits [95:88] are written to bits [63:56] of the result. \n
4322 ///    Bits [103:96] are written to bits [79:72] of the result. \n
4323 ///    Bits [111:104] are written to bits [95:88] of the result. \n
4324 ///    Bits [119:112] are written to bits [111:104] of the result. \n
4325 ///    Bits [127:120] are written to bits [127:120] of the result.
4326 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4327 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a,
4328                                                                __m128i __b) {
       /* Indices 0-15 select bytes of __a and 16-31 select bytes of __b, so each
        * (k, 16 + k) pair, k = 8..15, interleaves byte k of both operands. */
4329   return (__m128i)__builtin_shufflevector(
4330       (__v16qi)__a, (__v16qi)__b, 8, 16 + 8, 9, 16 + 9, 10, 16 + 10, 11,
4331       16 + 11, 12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15);
4332 }
4333 
4334 /// Unpacks the high-order (index 4-7) values from two 128-bit vectors of
4335 ///    [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
4336 ///
4337 /// \headerfile <x86intrin.h>
4338 ///
4339 /// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
4340 ///   instruction.
4341 ///
4342 /// \param __a
4343 ///    A 128-bit vector of [8 x i16].
4344 ///    Bits [79:64] are written to bits [15:0] of the result. \n
4345 ///    Bits [95:80] are written to bits [47:32] of the result. \n
4346 ///    Bits [111:96] are written to bits [79:64] of the result. \n
4347 ///    Bits [127:112] are written to bits [111:96] of the result.
4348 /// \param __b
4349 ///    A 128-bit vector of [8 x i16].
4350 ///    Bits [79:64] are written to bits [31:16] of the result. \n
4351 ///    Bits [95:80] are written to bits [63:48] of the result. \n
4352 ///    Bits [111:96] are written to bits [95:80] of the result. \n
4353 ///    Bits [127:112] are written to bits [127:112] of the result.
4354 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4355 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a,
4356                                                                 __m128i __b) {
       /* Indices 0-7 select elements of __a and 8-15 select elements of __b, so
        * each (k, 8 + k) pair, k = 4..7, interleaves 16-bit element k of both
        * operands. */
4357   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8 + 4, 5,
4358                                           8 + 5, 6, 8 + 6, 7, 8 + 7);
4359 }
4360 
4361 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
4362 ///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4363 ///
4364 /// \headerfile <x86intrin.h>
4365 ///
4366 /// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
4367 ///   instruction.
4368 ///
4369 /// \param __a
4370 ///    A 128-bit vector of [4 x i32]. \n
4371 ///    Bits [95:64] are written to bits [31:0] of the destination. \n
4372 ///    Bits [127:96] are written to bits [95:64] of the destination.
4373 /// \param __b
4374 ///    A 128-bit vector of [4 x i32]. \n
4375 ///    Bits [95:64] are written to bits [63:32] of the destination. \n
4376 ///    Bits [127:96] are written to bits [127:96] of the destination.
4377 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4378 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a,
4379                                                                 __m128i __b) {
       /* Indices 0-3 select elements of __a and 4-7 select elements of __b, so
        * the pairs (2, 4 + 2) and (3, 4 + 3) interleave the two upper 32-bit
        * elements of both operands. */
4380   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4 + 2, 3,
4381                                           4 + 3);
4382 }
4383 
4384 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4385 ///    [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4386 ///
4387 /// \headerfile <x86intrin.h>
4388 ///
4389 /// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
4390 ///   instruction.
4391 ///
4392 /// \param __a
4393 ///    A 128-bit vector of [2 x i64]. \n
4394 ///    Bits [127:64] are written to bits [63:0] of the destination.
4395 /// \param __b
4396 ///    A 128-bit vector of [2 x i64]. \n
4397 ///    Bits [127:64] are written to bits [127:64] of the destination.
4398 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4399 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a,
4400                                                                 __m128i __b) {
       /* Index 1 is the upper element of __a; index 2 + 1 is the upper element
        * of __b. */
4401   return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2 + 1);
4402 }
4403 
4404 /// Unpacks the low-order (index 0-7) values from two 128-bit vectors of
4405 ///    [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4406 ///
4407 /// \headerfile <x86intrin.h>
4408 ///
4409 /// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
4410 ///   instruction.
4411 ///
4412 /// \param __a
4413 ///    A 128-bit vector of [16 x i8]. \n
4414 ///    Bits [7:0] are written to bits [7:0] of the result. \n
4415 ///    Bits [15:8] are written to bits [23:16] of the result. \n
4416 ///    Bits [23:16] are written to bits [39:32] of the result. \n
4417 ///    Bits [31:24] are written to bits [55:48] of the result. \n
4418 ///    Bits [39:32] are written to bits [71:64] of the result. \n
4419 ///    Bits [47:40] are written to bits [87:80] of the result. \n
4420 ///    Bits [55:48] are written to bits [103:96] of the result. \n
4421 ///    Bits [63:56] are written to bits [119:112] of the result.
4422 /// \param __b
4423 ///    A 128-bit vector of [16 x i8].
4424 ///    Bits [7:0] are written to bits [15:8] of the result. \n
4425 ///    Bits [15:8] are written to bits [31:24] of the result. \n
4426 ///    Bits [23:16] are written to bits [47:40] of the result. \n
4427 ///    Bits [31:24] are written to bits [63:56] of the result. \n
4428 ///    Bits [39:32] are written to bits [79:72] of the result. \n
4429 ///    Bits [47:40] are written to bits [95:88] of the result. \n
4430 ///    Bits [55:48] are written to bits [111:104] of the result. \n
4431 ///    Bits [63:56] are written to bits [127:120] of the result.
4432 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4433 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a,
4434                                                                __m128i __b) {
       /* Indices 0-15 select bytes of __a and 16-31 select bytes of __b, so each
        * (k, 16 + k) pair, k = 0..7, interleaves byte k of both operands. */
4435   return (__m128i)__builtin_shufflevector(
4436       (__v16qi)__a, (__v16qi)__b, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 4,
4437       16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7);
4438 }
4439 
4440 /// Unpacks the low-order (index 0-3) values from each of the two 128-bit
4441 ///    vectors of [8 x i16] and interleaves them into a 128-bit vector of
4442 ///    [8 x i16].
4443 ///
4444 /// \headerfile <x86intrin.h>
4445 ///
4446 /// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
4447 ///   instruction.
4448 ///
4449 /// \param __a
4450 ///    A 128-bit vector of [8 x i16].
4451 ///    Bits [15:0] are written to bits [15:0] of the result. \n
4452 ///    Bits [31:16] are written to bits [47:32] of the result. \n
4453 ///    Bits [47:32] are written to bits [79:64] of the result. \n
4454 ///    Bits [63:48] are written to bits [111:96] of the result.
4455 /// \param __b
4456 ///    A 128-bit vector of [8 x i16].
4457 ///    Bits [15:0] are written to bits [31:16] of the result. \n
4458 ///    Bits [31:16] are written to bits [63:48] of the result. \n
4459 ///    Bits [47:32] are written to bits [95:80] of the result. \n
4460 ///    Bits [63:48] are written to bits [127:112] of the result.
4461 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4462 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a,
4463                                                                 __m128i __b) {
       /* Indices 0-7 select elements of __a and 8-15 select elements of __b, so
        * each (k, 8 + k) pair, k = 0..3, interleaves 16-bit element k of both
        * operands. */
4464   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8 + 0, 1,
4465                                           8 + 1, 2, 8 + 2, 3, 8 + 3);
4466 }
4467 
4468 /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
4469 ///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4470 ///
4471 /// \headerfile <x86intrin.h>
4472 ///
4473 /// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
4474 ///   instruction.
4475 ///
4476 /// \param __a
4477 ///    A 128-bit vector of [4 x i32]. \n
4478 ///    Bits [31:0] are written to bits [31:0] of the destination. \n
4479 ///    Bits [63:32] are written to bits [95:64] of the destination.
4480 /// \param __b
4481 ///    A 128-bit vector of [4 x i32]. \n
4482 ///    Bits [31:0] are written to bits [63:32] of the destination. \n
4483 ///    Bits [63:32] are written to bits [127:96] of the destination.
4484 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4485 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a,
4486                                                                 __m128i __b) {
       /* Indices 0-3 select elements of __a and 4-7 select elements of __b, so
        * the pairs (0, 4 + 0) and (1, 4 + 1) interleave the two lower 32-bit
        * elements of both operands. */
4487   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4 + 0, 1,
4488                                           4 + 1);
4489 }
4490 
4491 /// Unpacks the low-order 64-bit elements from two 128-bit vectors of
4492 ///    [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4493 ///
4494 /// \headerfile <x86intrin.h>
4495 ///
4496 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
4497 ///   instruction.
4498 ///
4499 /// \param __a
4500 ///    A 128-bit vector of [2 x i64]. \n
4501 ///    Bits [63:0] are written to bits [63:0] of the destination.
4502 /// \param __b
4503 ///    A 128-bit vector of [2 x i64]. \n
4504 ///    Bits [63:0] are written to bits [127:64] of the destination.
4505 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4506 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a,
4507                                                                 __m128i __b) {
       /* Index 0 is the lower element of __a; index 2 + 0 is the lower element
        * of __b. */
4508   return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2 + 0);
4509 }
4510 
4511 /// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
4512 ///    integer.
4513 ///
4514 /// \headerfile <x86intrin.h>
4515 ///
4516 /// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
4517 ///
4518 /// \param __a
4519 ///    A 128-bit integer vector operand. The lower 64 bits are moved to the
4520 ///    destination.
4521 /// \returns A 64-bit integer containing the lower 64 bits of the parameter.
4522 static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a) {
       /* The subscript binds tighter than the cast: element 0 of __a (a long long)
        * is selected first and then converted to __m64. */
4523   return (__m64)__a[0];
4524 }
4525 
4526 /// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
4527 ///    upper bits.
4528 ///
4529 /// \headerfile <x86intrin.h>
4530 ///
4531 /// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
4532 ///
4533 /// \param __a
4534 ///    A 64-bit value.
4535 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4536 ///    the operand. The upper 64 bits are assigned zeros.
4537 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a) {
       /* Builds a __v2di compound literal: element 0 carries the operand converted
        * to long long, element 1 is zero.
        * NOTE(review): __extension__ presumably suppresses pedantic diagnostics
        * for this construct -- confirm. */
4538   return __extension__(__m128i)(__v2di){(long long)__a, 0};
4539 }
4540 
4541 /// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
4542 ///    integer vector, zeroing the upper bits.
4543 ///
4544 /// \headerfile <x86intrin.h>
4545 ///
4546 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
4547 ///
4548 /// \param __a
4549 ///    A 128-bit integer vector operand. The lower 64 bits are moved to the
4550 ///    destination.
4551 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4552 ///    the operand. The upper 64 bits are assigned zeros.
4553 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a) {
       /* Index 0 keeps the lower element of __a; index 2 takes element 0 of the
        * vector returned by _mm_setzero_si128(), which clears the upper half. */
4554   return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2);
4555 }
4556 
4557 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4558 ///    [2 x double] and interleaves them into a 128-bit vector of [2 x
4559 ///    double].
4560 ///
4561 /// \headerfile <x86intrin.h>
4562 ///
4563 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
4564 ///
4565 /// \param __a
4566 ///    A 128-bit vector of [2 x double]. \n
4567 ///    Bits [127:64] are written to bits [63:0] of the destination.
4568 /// \param __b
4569 ///    A 128-bit vector of [2 x double]. \n
4570 ///    Bits [127:64] are written to bits [127:64] of the destination.
4571 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4572 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a,
4573                                                              __m128d __b) {
       /* Index 1 is the upper element of __a; index 2 + 1 is the upper element
        * of __b. */
4574   return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2 + 1);
4575 }
4576 
4577 /// Unpacks the low-order 64-bit elements from two 128-bit vectors
4578 ///    of [2 x double] and interleaves them into a 128-bit vector of [2 x
4579 ///    double].
4580 ///
4581 /// \headerfile <x86intrin.h>
4582 ///
4583 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
4584 ///
4585 /// \param __a
4586 ///    A 128-bit vector of [2 x double]. \n
4587 ///    Bits [63:0] are written to bits [63:0] of the destination.
4588 /// \param __b
4589 ///    A 128-bit vector of [2 x double]. \n
4590 ///    Bits [63:0] are written to bits [127:64] of the destination.
4591 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4592 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a,
4593                                                              __m128d __b) {
       /* Index 0 is the lower element of __a; index 2 + 0 is the lower element
        * of __b. */
4594   return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2 + 0);
4595 }
4596 
4597 /// Extracts the sign bits of the double-precision values in the 128-bit
4598 ///    vector of [2 x double], zero-extends the value, and writes it to the
4599 ///    low-order bits of the destination.
4600 ///
4601 /// \headerfile <x86intrin.h>
4602 ///
4603 /// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
4604 ///
4605 /// \param __a
4606 ///    A 128-bit vector of [2 x double] containing the values with sign bits to
4607 ///    be extracted.
4608 /// \returns The sign bits from each of the double-precision elements in \a __a,
4609 ///    written to bits [1:0]. The remaining bits are assigned values of zero.
4610 static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a) {
       /* __m128d and __v2df are both 16-byte vectors of double, so the cast only
        * changes the static type handed to the builtin. */
4611   return __builtin_ia32_movmskpd((__v2df)__a);
4612 }
4613 
4614 /// Constructs a 128-bit floating-point vector of [2 x double] from two
4615 ///    128-bit vector parameters of [2 x double], using the immediate-value
4616 ///    parameter as a specifier.
4617 ///
4618 /// \headerfile <x86intrin.h>
4619 ///
4620 /// \code
4621 /// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
4622 /// \endcode
4623 ///
4624 /// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
4625 ///
4626 /// \param a
4627 ///    A 128-bit vector of [2 x double].
4628 /// \param b
4629 ///    A 128-bit vector of [2 x double].
4630 /// \param i
4631 ///    An 8-bit immediate value. The least significant two bits specify which
4632 ///    elements to copy from \a a and \a b: \n
4633 ///    Bit[0] = 0: lower element of \a a copied to lower element of result. \n
4634 ///    Bit[0] = 1: upper element of \a a copied to lower element of result. \n
4635 ///    Bit[1] = 0: lower element of \a b copied to upper element of result. \n
4636 ///    Bit[1] = 1: upper element of \a b copied to upper element of result. \n
4637 ///    Note: To generate a mask, you can use the \c _MM_SHUFFLE2 macro.
4638 ///    <c>_MM_SHUFFLE2(b1, b0)</c> can create a 2-bit mask of the form
4639 ///    <c>[b1, b0]</c>.
4640 /// \returns A 128-bit vector of [2 x double] containing the shuffled values.
/* Each vector operand is cast to __m128d and then to __v2df before reaching
 * the builtin; i is passed on as int.
 * NOTE(review): the intermediate __m128d cast presumably rejects arguments
 * that are not convertible to the public vector type -- confirm. */
4641 #define _mm_shuffle_pd(a, b, i)                                                \
4642   ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b),  \
4643                                   (int)(i)))
4644 
4645 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4646 ///    floating-point vector of [4 x float].
4647 ///
4648 /// \headerfile <x86intrin.h>
4649 ///
4650 /// This intrinsic has no corresponding instruction.
4651 ///
4652 /// \param __a
4653 ///    A 128-bit floating-point vector of [2 x double].
4654 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
4655 ///    bitwise pattern as the parameter.
4656 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a) {
       /* Same-size vector cast: the 128 bits are reinterpreted as __m128; no
        * double-to-float value conversion takes place. */
4657   return (__m128)__a;
4658 }
4659 
4660 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4661 ///    integer vector.
4662 ///
4663 /// \headerfile <x86intrin.h>
4664 ///
4665 /// This intrinsic has no corresponding instruction.
4666 ///
4667 /// \param __a
4668 ///    A 128-bit floating-point vector of [2 x double].
4669 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
4670 ///    parameter.
4671 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a) {
       /* Same-size vector cast: the 128 bits are reinterpreted as __m128i; no
        * floating-point-to-integer value conversion takes place. */
4672   return (__m128i)__a;
4673 }
4674 
4675 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4676 ///    floating-point vector of [2 x double].
4677 ///
4678 /// \headerfile <x86intrin.h>
4679 ///
4680 /// This intrinsic has no corresponding instruction.
4681 ///
4682 /// \param __a
4683 ///    A 128-bit floating-point vector of [4 x float].
4684 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
4685 ///    bitwise pattern as the parameter.
4686 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a) {
       /* Same-size vector cast: the 128 bits are reinterpreted as __m128d; no
        * float-to-double value conversion takes place. */
4687   return (__m128d)__a;
4688 }
4689 
4690 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4691 ///    integer vector.
4692 ///
4693 /// \headerfile <x86intrin.h>
4694 ///
4695 /// This intrinsic has no corresponding instruction.
4696 ///
4697 /// \param __a
4698 ///    A 128-bit floating-point vector of [4 x float].
4699 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
4700 ///    parameter.
4701 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a) {
       /* Same-size vector cast: the 128 bits are reinterpreted as __m128i; no
        * floating-point-to-integer value conversion takes place. */
4702   return (__m128i)__a;
4703 }
4704 
4705 /// Casts a 128-bit integer vector into a 128-bit floating-point vector
4706 ///    of [4 x float].
4707 ///
4708 /// \headerfile <x86intrin.h>
4709 ///
4710 /// This intrinsic has no corresponding instruction.
4711 ///
4712 /// \param __a
4713 ///    A 128-bit integer vector.
4714 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
4715 ///    bitwise pattern as the parameter.
4716 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a) {
       /* Same-size vector cast: the 128 bits are reinterpreted as __m128; no
        * integer-to-floating-point value conversion takes place. */
4717   return (__m128)__a;
4718 }
4719 
4720 /// Casts a 128-bit integer vector into a 128-bit floating-point vector
4721 ///    of [2 x double].
4722 ///
4723 /// \headerfile <x86intrin.h>
4724 ///
4725 /// This intrinsic has no corresponding instruction.
4726 ///
4727 /// \param __a
4728 ///    A 128-bit integer vector.
4729 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
4730 ///    bitwise pattern as the parameter.
4731 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a) {
       /* Same-size vector cast: the 128 bits are reinterpreted as __m128d; no
        * integer-to-floating-point value conversion takes place. */
4732   return (__m128d)__a;
4733 }
4734 
/* _mm_pause is only declared in this header; no inline body is given for it.
 * The extern "C" guard gives the declaration C linkage when the header is
 * compiled as C++.
 * NOTE(review): the definition is presumably supplied by the compiler --
 * confirm. */
4735 #if defined(__cplusplus)
4736 extern "C" {
4737 #endif
4738 
4739 /// Indicates that a spin loop is being executed for the purposes of
4740 ///    optimizing power consumption during the loop.
4741 ///
4742 /// \headerfile <x86intrin.h>
4743 ///
4744 /// This intrinsic corresponds to the <c> PAUSE </c> instruction.
4745 ///
4746 void _mm_pause(void);
4747 
4748 #if defined(__cplusplus)
4749 } // extern "C"
4750 #endif
/* The attribute helper macros are internal to this header; undefine them so
 * they are not visible to code that includes it. */
4751 #undef __DEFAULT_FN_ATTRS
4752 #undef __DEFAULT_FN_ATTRS_MMX
4753 
/* Packs two selectors into a 2-bit mask: x becomes bit 1 and y becomes bit 0.
 * Assumes each argument is 0 or 1; the values are not masked. */
4754 #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
4755 
/* Bit 0x0040 of the value read by _mm_getcsr() holds the denormals-are-zero
 * setting: _MM_DENORMALS_ZERO_MASK isolates it, and _MM_DENORMALS_ZERO_ON /
 * _MM_DENORMALS_ZERO_OFF are its two possible values. */
4756 #define _MM_DENORMALS_ZERO_ON (0x0040U)
4757 #define _MM_DENORMALS_ZERO_OFF (0x0000U)
4758 
4759 #define _MM_DENORMALS_ZERO_MASK (0x0040U)
4760 
/* GET returns the masked bit. SET clears that bit in the current value and ORs
 * in x before writing the result back with _mm_setcsr(); x is not masked, so
 * pass only _MM_DENORMALS_ZERO_ON or _MM_DENORMALS_ZERO_OFF. */
4761 #define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
4762 #define _MM_SET_DENORMALS_ZERO_MODE(x)                                         \
4763   (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
4764 
4765 #endif /* __EMMINTRIN_H */
4766