xref: /freebsd/contrib/llvm-project/clang/lib/Headers/emmintrin.h (revision b64c5a0ace59af62eff52bfe110a521dc73c937b)
1 /*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 #ifndef __EMMINTRIN_H
11 #define __EMMINTRIN_H
12 
13 #if !defined(__i386__) && !defined(__x86_64__)
14 #error "This header is only meant to be used on x86 and x64 architecture"
15 #endif
16 
17 #include <xmmintrin.h>
18 
/* 16-byte vector types declared with 16-byte alignment: __m128d is a vector
 * of double and __m128i is a vector of long long. */
19 typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
20 typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));
21 
/* Counterparts of the two types above with the same element types and
 * 16-byte size, but declared with an alignment of 1. */
22 typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
23 typedef long long __m128i_u
24     __attribute__((__vector_size__(16), __aligned__(1)));
25 
26 /* Type defines.  */
/* 16-byte vectors with an explicit element type. The functions in this file
 * cast their __m128d arguments to these types (for example __v2df or __v2du)
 * before applying an operator or calling a builtin. */
27 typedef double __v2df __attribute__((__vector_size__(16)));
28 typedef long long __v2di __attribute__((__vector_size__(16)));
29 typedef short __v8hi __attribute__((__vector_size__(16)));
30 typedef char __v16qi __attribute__((__vector_size__(16)));
31 
32 /* Unsigned types */
/* Same 16-byte size as above, with unsigned element types. */
33 typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
34 typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
35 typedef unsigned char __v16qu __attribute__((__vector_size__(16)));
36 
37 /* We need an explicitly signed variant for char. Note that this shouldn't
38  * appear in the interface though. */
39 typedef signed char __v16qs __attribute__((__vector_size__(16)));
40 
/* These typedefs are only declared when __SSE2__ is defined. */
41 #ifdef __SSE2__
42 /* Both _Float16 and __bf16 require SSE2 being enabled. */
/* 16-byte vectors of _Float16: __v8hf and __m128h are declared with 16-byte
 * alignment, __m128h_u with an alignment of 1. */
43 typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
44 typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
45 typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));
46 
/* 16-byte vectors of __bf16, both declared with 16-byte alignment. */
47 typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16)));
48 typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
49 #endif
50 
51 /* Define the default attributes for the functions in this file. */
/* __DEFAULT_FN_ATTRS: always_inline + nodebug, target features
 * "sse2,no-evex512", and a minimum vector width of 128. */
52 #define __DEFAULT_FN_ATTRS                                                     \
53   __attribute__((__always_inline__, __nodebug__,                               \
54                  __target__("sse2,no-evex512"), __min_vector_width__(128)))
/* __DEFAULT_FN_ATTRS_MMX: same as above, but the target string also lists
 * "mmx" and the minimum vector width is 64. */
55 #define __DEFAULT_FN_ATTRS_MMX                                                 \
56   __attribute__((__always_inline__, __nodebug__,                               \
57                  __target__("mmx,sse2,no-evex512"), __min_vector_width__(64)))
58 
59 /// Adds lower double-precision values in both operands and returns the
60 ///    sum in the lower 64 bits of the result. The upper 64 bits of the result
61 ///    are copied from the upper double-precision value of the first operand.
62 ///
63 /// \headerfile <x86intrin.h>
64 ///
65 /// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
66 ///
67 /// \param __a
68 ///    A 128-bit vector of [2 x double] containing one of the source operands.
69 /// \param __b
70 ///    A 128-bit vector of [2 x double] containing one of the source operands.
71 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
72 ///    sum of the lower 64 bits of both operands. The upper 64 bits are copied
73 ///    from the upper 64 bits of the first source operand.
74 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a,
75                                                         __m128d __b) {
76   __a[0] += __b[0];
77   return __a;
78 }
79 
80 /// Adds two 128-bit vectors of [2 x double].
81 ///
82 /// \headerfile <x86intrin.h>
83 ///
84 /// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
85 ///
86 /// \param __a
87 ///    A 128-bit vector of [2 x double] containing one of the source operands.
88 /// \param __b
89 ///    A 128-bit vector of [2 x double] containing one of the source operands.
90 /// \returns A 128-bit vector of [2 x double] containing the sums of both
91 ///    operands.
92 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a,
93                                                         __m128d __b) {
94   return (__m128d)((__v2df)__a + (__v2df)__b);
95 }
96 
97 /// Subtracts the lower double-precision value of the second operand
98 ///    from the lower double-precision value of the first operand and returns
99 ///    the difference in the lower 64 bits of the result. The upper 64 bits of
100 ///    the result are copied from the upper double-precision value of the first
101 ///    operand.
102 ///
103 /// \headerfile <x86intrin.h>
104 ///
105 /// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
106 ///
107 /// \param __a
108 ///    A 128-bit vector of [2 x double] containing the minuend.
109 /// \param __b
110 ///    A 128-bit vector of [2 x double] containing the subtrahend.
111 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
112 ///    difference of the lower 64 bits of both operands. The upper 64 bits are
113 ///    copied from the upper 64 bits of the first source operand.
114 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a,
115                                                         __m128d __b) {
116   __a[0] -= __b[0];
117   return __a;
118 }
119 
120 /// Subtracts two 128-bit vectors of [2 x double].
121 ///
122 /// \headerfile <x86intrin.h>
123 ///
124 /// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
125 ///
126 /// \param __a
127 ///    A 128-bit vector of [2 x double] containing the minuend.
128 /// \param __b
129 ///    A 128-bit vector of [2 x double] containing the subtrahend.
130 /// \returns A 128-bit vector of [2 x double] containing the differences between
131 ///    both operands.
132 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a,
133                                                         __m128d __b) {
134   return (__m128d)((__v2df)__a - (__v2df)__b);
135 }
136 
137 /// Multiplies lower double-precision values in both operands and returns
138 ///    the product in the lower 64 bits of the result. The upper 64 bits of the
139 ///    result are copied from the upper double-precision value of the first
140 ///    operand.
141 ///
142 /// \headerfile <x86intrin.h>
143 ///
144 /// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
145 ///
146 /// \param __a
147 ///    A 128-bit vector of [2 x double] containing one of the source operands.
148 /// \param __b
149 ///    A 128-bit vector of [2 x double] containing one of the source operands.
150 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
151 ///    product of the lower 64 bits of both operands. The upper 64 bits are
152 ///    copied from the upper 64 bits of the first source operand.
153 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a,
154                                                         __m128d __b) {
155   __a[0] *= __b[0];
156   return __a;
157 }
158 
159 /// Multiplies two 128-bit vectors of [2 x double].
160 ///
161 /// \headerfile <x86intrin.h>
162 ///
163 /// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
164 ///
165 /// \param __a
166 ///    A 128-bit vector of [2 x double] containing one of the operands.
167 /// \param __b
168 ///    A 128-bit vector of [2 x double] containing one of the operands.
169 /// \returns A 128-bit vector of [2 x double] containing the products of both
170 ///    operands.
171 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a,
172                                                         __m128d __b) {
173   return (__m128d)((__v2df)__a * (__v2df)__b);
174 }
175 
176 /// Divides the lower double-precision value of the first operand by the
177 ///    lower double-precision value of the second operand and returns the
178 ///    quotient in the lower 64 bits of the result. The upper 64 bits of the
179 ///    result are copied from the upper double-precision value of the first
180 ///    operand.
181 ///
182 /// \headerfile <x86intrin.h>
183 ///
184 /// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
185 ///
186 /// \param __a
187 ///    A 128-bit vector of [2 x double] containing the dividend.
188 /// \param __b
189 ///    A 128-bit vector of [2 x double] containing the divisor.
190 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
191 ///    quotient of the lower 64 bits of both operands. The upper 64 bits are
192 ///    copied from the upper 64 bits of the first source operand.
193 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a,
194                                                         __m128d __b) {
195   __a[0] /= __b[0];
196   return __a;
197 }
198 
199 /// Performs an element-by-element division of two 128-bit vectors of
200 ///    [2 x double].
201 ///
202 /// \headerfile <x86intrin.h>
203 ///
204 /// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
205 ///
206 /// \param __a
207 ///    A 128-bit vector of [2 x double] containing the dividend.
208 /// \param __b
209 ///    A 128-bit vector of [2 x double] containing the divisor.
210 /// \returns A 128-bit vector of [2 x double] containing the quotients of both
211 ///    operands.
212 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a,
213                                                         __m128d __b) {
214   return (__m128d)((__v2df)__a / (__v2df)__b);
215 }
216 
217 /// Calculates the square root of the lower double-precision value of
218 ///    the second operand and returns it in the lower 64 bits of the result.
219 ///    The upper 64 bits of the result are copied from the upper
220 ///    double-precision value of the first operand.
221 ///
222 /// \headerfile <x86intrin.h>
223 ///
224 /// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
225 ///
226 /// \param __a
227 ///    A 128-bit vector of [2 x double] containing one of the operands. The
228 ///    upper 64 bits of this operand are copied to the upper 64 bits of the
229 ///    result.
230 /// \param __b
231 ///    A 128-bit vector of [2 x double] containing one of the operands. The
232 ///    square root is calculated using the lower 64 bits of this operand.
233 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
234 ///    square root of the lower 64 bits of operand \a __b, and whose upper 64
235 ///    bits are copied from the upper 64 bits of operand \a __a.
236 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
237                                                          __m128d __b) {
238   __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
239   return __extension__(__m128d){__c[0], __a[1]};
240 }
241 
242 /// Calculates the square root of each of the two values stored in a
243 ///    128-bit vector of [2 x double].
244 ///
245 /// \headerfile <x86intrin.h>
246 ///
247 /// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
248 ///
249 /// \param __a
250 ///    A 128-bit vector of [2 x double].
251 /// \returns A 128-bit vector of [2 x double] containing the square roots of the
252 ///    values in the operand.
253 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) {
254   return __builtin_ia32_sqrtpd((__v2df)__a);
255 }
256 
257 /// Compares lower 64-bit double-precision values of both operands, and
258 ///    returns the lesser of the pair of values in the lower 64-bits of the
259 ///    result. The upper 64 bits of the result are copied from the upper
260 ///    double-precision value of the first operand.
261 ///
262 ///    If either value in a comparison is NaN, returns the value from \a __b.
263 ///
264 /// \headerfile <x86intrin.h>
265 ///
266 /// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
267 ///
268 /// \param __a
269 ///    A 128-bit vector of [2 x double] containing one of the operands. The
270 ///    lower 64 bits of this operand are used in the comparison.
271 /// \param __b
272 ///    A 128-bit vector of [2 x double] containing one of the operands. The
273 ///    lower 64 bits of this operand are used in the comparison.
274 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
275 ///    minimum value between both operands. The upper 64 bits are copied from
276 ///    the upper 64 bits of the first source operand.
277 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a,
278                                                         __m128d __b) {
279   return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
280 }
281 
282 /// Performs element-by-element comparison of the two 128-bit vectors of
283 ///    [2 x double] and returns a vector containing the lesser of each pair of
284 ///    values.
285 ///
286 ///    If either value in a comparison is NaN, returns the value from \a __b.
287 ///
288 /// \headerfile <x86intrin.h>
289 ///
290 /// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
291 ///
292 /// \param __a
293 ///    A 128-bit vector of [2 x double] containing one of the operands.
294 /// \param __b
295 ///    A 128-bit vector of [2 x double] containing one of the operands.
296 /// \returns A 128-bit vector of [2 x double] containing the minimum values
297 ///    between both operands.
298 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a,
299                                                         __m128d __b) {
300   return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
301 }
302 
303 /// Compares lower 64-bit double-precision values of both operands, and
304 ///    returns the greater of the pair of values in the lower 64-bits of the
305 ///    result. The upper 64 bits of the result are copied from the upper
306 ///    double-precision value of the first operand.
307 ///
308 ///    If either value in a comparison is NaN, returns the value from \a __b.
309 ///
310 /// \headerfile <x86intrin.h>
311 ///
312 /// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
313 ///
314 /// \param __a
315 ///    A 128-bit vector of [2 x double] containing one of the operands. The
316 ///    lower 64 bits of this operand are used in the comparison.
317 /// \param __b
318 ///    A 128-bit vector of [2 x double] containing one of the operands. The
319 ///    lower 64 bits of this operand are used in the comparison.
320 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
321 ///    maximum value between both operands. The upper 64 bits are copied from
322 ///    the upper 64 bits of the first source operand.
323 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a,
324                                                         __m128d __b) {
325   return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
326 }
327 
328 /// Performs element-by-element comparison of the two 128-bit vectors of
329 ///    [2 x double] and returns a vector containing the greater of each pair
330 ///    of values.
331 ///
332 ///    If either value in a comparison is NaN, returns the value from \a __b.
333 ///
334 /// \headerfile <x86intrin.h>
335 ///
336 /// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
337 ///
338 /// \param __a
339 ///    A 128-bit vector of [2 x double] containing one of the operands.
340 /// \param __b
341 ///    A 128-bit vector of [2 x double] containing one of the operands.
342 /// \returns A 128-bit vector of [2 x double] containing the maximum values
343 ///    between both operands.
344 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a,
345                                                         __m128d __b) {
346   return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
347 }
348 
349 /// Performs a bitwise AND of two 128-bit vectors of [2 x double].
350 ///
351 /// \headerfile <x86intrin.h>
352 ///
353 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
354 ///
355 /// \param __a
356 ///    A 128-bit vector of [2 x double] containing one of the source operands.
357 /// \param __b
358 ///    A 128-bit vector of [2 x double] containing one of the source operands.
359 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
360 ///    values between both operands.
361 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a,
362                                                         __m128d __b) {
363   return (__m128d)((__v2du)__a & (__v2du)__b);
364 }
365 
366 /// Performs a bitwise AND of two 128-bit vectors of [2 x double], using
367 ///    the one's complement of the values contained in the first source operand.
368 ///
369 /// \headerfile <x86intrin.h>
370 ///
371 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
372 ///
373 /// \param __a
374 ///    A 128-bit vector of [2 x double] containing the left source operand. The
375 ///    one's complement of this value is used in the bitwise AND.
376 /// \param __b
377 ///    A 128-bit vector of [2 x double] containing the right source operand.
378 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
379 ///    values in the second operand and the one's complement of the first
380 ///    operand.
381 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a,
382                                                            __m128d __b) {
383   return (__m128d)(~(__v2du)__a & (__v2du)__b);
384 }
385 
386 /// Performs a bitwise OR of two 128-bit vectors of [2 x double].
387 ///
388 /// \headerfile <x86intrin.h>
389 ///
390 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
391 ///
392 /// \param __a
393 ///    A 128-bit vector of [2 x double] containing one of the source operands.
394 /// \param __b
395 ///    A 128-bit vector of [2 x double] containing one of the source operands.
396 /// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
397 ///    values between both operands.
398 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a,
399                                                        __m128d __b) {
400   return (__m128d)((__v2du)__a | (__v2du)__b);
401 }
402 
403 /// Performs a bitwise XOR of two 128-bit vectors of [2 x double].
404 ///
405 /// \headerfile <x86intrin.h>
406 ///
407 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
408 ///
409 /// \param __a
410 ///    A 128-bit vector of [2 x double] containing one of the source operands.
411 /// \param __b
412 ///    A 128-bit vector of [2 x double] containing one of the source operands.
413 /// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
414 ///    values between both operands.
415 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a,
416                                                         __m128d __b) {
417   return (__m128d)((__v2du)__a ^ (__v2du)__b);
418 }
419 
420 /// Compares each of the corresponding double-precision values of the
421 ///    128-bit vectors of [2 x double] for equality.
422 ///
423 ///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
424 ///    If either value in a comparison is NaN, returns false.
425 ///
426 /// \headerfile <x86intrin.h>
427 ///
428 /// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
429 ///
430 /// \param __a
431 ///    A 128-bit vector of [2 x double].
432 /// \param __b
433 ///    A 128-bit vector of [2 x double].
434 /// \returns A 128-bit vector containing the comparison results.
435 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a,
436                                                           __m128d __b) {
437   return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
438 }
439 
440 /// Compares each of the corresponding double-precision values of the
441 ///    128-bit vectors of [2 x double] to determine if the values in the first
442 ///    operand are less than those in the second operand.
443 ///
444 ///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
445 ///    If either value in a comparison is NaN, returns false.
446 ///
447 /// \headerfile <x86intrin.h>
448 ///
449 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
450 ///
451 /// \param __a
452 ///    A 128-bit vector of [2 x double].
453 /// \param __b
454 ///    A 128-bit vector of [2 x double].
455 /// \returns A 128-bit vector containing the comparison results.
456 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a,
457                                                           __m128d __b) {
458   return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
459 }
460 
461 /// Compares each of the corresponding double-precision values of the
462 ///    128-bit vectors of [2 x double] to determine if the values in the first
463 ///    operand are less than or equal to those in the second operand.
464 ///
465 ///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
466 ///    If either value in a comparison is NaN, returns false.
467 ///
468 /// \headerfile <x86intrin.h>
469 ///
470 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
471 ///
472 /// \param __a
473 ///    A 128-bit vector of [2 x double].
474 /// \param __b
475 ///    A 128-bit vector of [2 x double].
476 /// \returns A 128-bit vector containing the comparison results.
477 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a,
478                                                           __m128d __b) {
479   return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
480 }
481 
482 /// Compares each of the corresponding double-precision values of the
483 ///    128-bit vectors of [2 x double] to determine if the values in the first
484 ///    operand are greater than those in the second operand.
485 ///
486 ///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
487 ///    If either value in a comparison is NaN, returns false.
488 ///
489 /// \headerfile <x86intrin.h>
490 ///
491 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
492 ///
493 /// \param __a
494 ///    A 128-bit vector of [2 x double].
495 /// \param __b
496 ///    A 128-bit vector of [2 x double].
497 /// \returns A 128-bit vector containing the comparison results.
498 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a,
499                                                           __m128d __b) {
500   return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
501 }
502 
503 /// Compares each of the corresponding double-precision values of the
504 ///    128-bit vectors of [2 x double] to determine if the values in the first
505 ///    operand are greater than or equal to those in the second operand.
506 ///
507 ///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
508 ///    If either value in a comparison is NaN, returns false.
509 ///
510 /// \headerfile <x86intrin.h>
511 ///
512 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
513 ///
514 /// \param __a
515 ///    A 128-bit vector of [2 x double].
516 /// \param __b
517 ///    A 128-bit vector of [2 x double].
518 /// \returns A 128-bit vector containing the comparison results.
519 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a,
520                                                           __m128d __b) {
521   return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
522 }
523 
524 /// Compares each of the corresponding double-precision values of the
525 ///    128-bit vectors of [2 x double] to determine if the values in the first
526 ///    operand are ordered with respect to those in the second operand.
527 ///
528 ///    A pair of double-precision values are ordered with respect to each
529 ///    other if neither value is a NaN. Each comparison returns 0x0 for false,
530 ///    0xFFFFFFFFFFFFFFFF for true.
531 ///
532 /// \headerfile <x86intrin.h>
533 ///
534 /// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
535 ///
536 /// \param __a
537 ///    A 128-bit vector of [2 x double].
538 /// \param __b
539 ///    A 128-bit vector of [2 x double].
540 /// \returns A 128-bit vector containing the comparison results.
541 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a,
542                                                            __m128d __b) {
543   return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
544 }
545 
546 /// Compares each of the corresponding double-precision values of the
547 ///    128-bit vectors of [2 x double] to determine if the values in the first
548 ///    operand are unordered with respect to those in the second operand.
549 ///
550 ///    A pair of double-precision values are unordered with respect to each
551 ///    other if one or both values are NaN. Each comparison returns 0x0 for
552 ///    false, 0xFFFFFFFFFFFFFFFF for true.
553 ///
554 /// \headerfile <x86intrin.h>
555 ///
556 /// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
557 ///   instruction.
558 ///
559 /// \param __a
560 ///    A 128-bit vector of [2 x double].
561 /// \param __b
562 ///    A 128-bit vector of [2 x double].
563 /// \returns A 128-bit vector containing the comparison results.
564 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a,
565                                                              __m128d __b) {
566   return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
567 }
568 
569 /// Compares each of the corresponding double-precision values of the
570 ///    128-bit vectors of [2 x double] to determine if the values in the first
571 ///    operand are unequal to those in the second operand.
572 ///
573 ///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
574 ///    If either value in a comparison is NaN, returns true.
575 ///
576 /// \headerfile <x86intrin.h>
577 ///
578 /// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
579 ///
580 /// \param __a
581 ///    A 128-bit vector of [2 x double].
582 /// \param __b
583 ///    A 128-bit vector of [2 x double].
584 /// \returns A 128-bit vector containing the comparison results.
585 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a,
586                                                            __m128d __b) {
587   return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
588 }
589 
590 /// Compares each of the corresponding double-precision values of the
591 ///    128-bit vectors of [2 x double] to determine if the values in the first
592 ///    operand are not less than those in the second operand.
593 ///
594 ///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
595 ///    If either value in a comparison is NaN, returns true.
596 ///
597 /// \headerfile <x86intrin.h>
598 ///
599 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
600 ///
601 /// \param __a
602 ///    A 128-bit vector of [2 x double].
603 /// \param __b
604 ///    A 128-bit vector of [2 x double].
605 /// \returns A 128-bit vector containing the comparison results.
606 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a,
607                                                            __m128d __b) {
608   return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
609 }
610 
611 /// Compares each of the corresponding double-precision values of the
612 ///    128-bit vectors of [2 x double] to determine if the values in the first
613 ///    operand are not less than or equal to those in the second operand.
614 ///
615 ///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
616 ///    If either value in a comparison is NaN, returns true.
617 ///
618 /// \headerfile <x86intrin.h>
619 ///
620 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
621 ///
622 /// \param __a
623 ///    A 128-bit vector of [2 x double].
624 /// \param __b
625 ///    A 128-bit vector of [2 x double].
626 /// \returns A 128-bit vector containing the comparison results.
627 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a,
628                                                            __m128d __b) {
629   return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
630 }
631 
632 /// Compares each of the corresponding double-precision values of the
633 ///    128-bit vectors of [2 x double] to determine if the values in the first
634 ///    operand are not greater than those in the second operand.
635 ///
636 ///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
637 ///    If either value in a comparison is NaN, returns true.
638 ///
639 /// \headerfile <x86intrin.h>
640 ///
641 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
642 ///
643 /// \param __a
644 ///    A 128-bit vector of [2 x double].
645 /// \param __b
646 ///    A 128-bit vector of [2 x double].
647 /// \returns A 128-bit vector containing the comparison results.
648 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a,
649                                                            __m128d __b) {
650   return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
651 }
652 
653 /// Compares each of the corresponding double-precision values of the
654 ///    128-bit vectors of [2 x double] to determine if the values in the first
655 ///    operand are not greater than or equal to those in the second operand.
656 ///
657 ///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
658 ///    If either value in a comparison is NaN, returns true.
659 ///
660 /// \headerfile <x86intrin.h>
661 ///
662 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
663 ///
664 /// \param __a
665 ///    A 128-bit vector of [2 x double].
666 /// \param __b
667 ///    A 128-bit vector of [2 x double].
668 /// \returns A 128-bit vector containing the comparison results.
669 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a,
670                                                            __m128d __b) {
671   return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
672 }
673 
674 /// Compares the lower double-precision floating-point values in each of
675 ///    the two 128-bit floating-point vectors of [2 x double] for equality.
676 ///
677 ///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
678 ///    If either value in a comparison is NaN, returns false.
679 ///
680 /// \headerfile <x86intrin.h>
681 ///
682 /// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
683 ///
684 /// \param __a
685 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
686 ///    compared to the lower double-precision value of \a __b.
687 /// \param __b
688 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
689 ///    compared to the lower double-precision value of \a __a.
690 /// \returns A 128-bit vector. The lower 64 bits contain the comparison
691 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
692 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a,
693                                                           __m128d __b) {
694   return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
695 }
696 
697 /// Compares the lower double-precision floating-point values in each of
698 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
699 ///    the value in the first parameter is less than the corresponding value in
700 ///    the second parameter.
701 ///
702 ///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
703 ///    If either value in a comparison is NaN, returns false.
704 ///
705 /// \headerfile <x86intrin.h>
706 ///
707 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
708 ///
709 /// \param __a
710 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
711 ///    compared to the lower double-precision value of \a __b.
712 /// \param __b
713 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
714 ///    compared to the lower double-precision value of \a __a.
715 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
716 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
717 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a,
718                                                           __m128d __b) {
719   return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
720 }
721 
722 /// Compares the lower double-precision floating-point values in each of
723 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
724 ///    the value in the first parameter is less than or equal to the
725 ///    corresponding value in the second parameter.
726 ///
727 ///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
728 ///    If either value in a comparison is NaN, returns false.
729 ///
730 /// \headerfile <x86intrin.h>
731 ///
732 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
733 ///
734 /// \param __a
735 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
736 ///    compared to the lower double-precision value of \a __b.
737 /// \param __b
738 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
739 ///    compared to the lower double-precision value of \a __a.
740 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
741 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
742 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a,
743                                                           __m128d __b) {
744   return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
745 }
746 
747 /// Compares the lower double-precision floating-point values in each of
748 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
749 ///    the value in the first parameter is greater than the corresponding value
750 ///    in the second parameter.
751 ///
752 ///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
753 ///    If either value in a comparison is NaN, returns false.
754 ///
755 /// \headerfile <x86intrin.h>
756 ///
757 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
758 ///
759 /// \param __a
760 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
761 ///     compared to the lower double-precision value of \a __b.
762 /// \param __b
763 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
764 ///     compared to the lower double-precision value of \a __a.
765 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
766 ///     results. The upper 64 bits are copied from the upper 64 bits of \a __a.
767 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a,
768                                                           __m128d __b) {
769   __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
770   return __extension__(__m128d){__c[0], __a[1]};
771 }
772 
773 /// Compares the lower double-precision floating-point values in each of
774 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
775 ///    the value in the first parameter is greater than or equal to the
776 ///    corresponding value in the second parameter.
777 ///
778 ///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
779 ///    If either value in a comparison is NaN, returns false.
780 ///
781 /// \headerfile <x86intrin.h>
782 ///
783 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
784 ///
785 /// \param __a
786 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
787 ///    compared to the lower double-precision value of \a __b.
788 /// \param __b
789 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
790 ///    compared to the lower double-precision value of \a __a.
791 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
792 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
793 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a,
794                                                           __m128d __b) {
795   __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
796   return __extension__(__m128d){__c[0], __a[1]};
797 }
798 
799 /// Compares the lower double-precision floating-point values in each of
800 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
801 ///    the value in the first parameter is ordered with respect to the
802 ///    corresponding value in the second parameter.
803 ///
804 ///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
805 ///    of double-precision values are ordered with respect to each other if
806 ///    neither value is a NaN.
807 ///
808 /// \headerfile <x86intrin.h>
809 ///
810 /// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
811 ///
812 /// \param __a
813 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
814 ///    compared to the lower double-precision value of \a __b.
815 /// \param __b
816 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
817 ///    compared to the lower double-precision value of \a __a.
818 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
819 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
820 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a,
821                                                            __m128d __b) {
822   return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
823 }
824 
825 /// Compares the lower double-precision floating-point values in each of
826 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
827 ///    the value in the first parameter is unordered with respect to the
828 ///    corresponding value in the second parameter.
829 ///
830 ///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
831 ///    of double-precision values are unordered with respect to each other if
832 ///    one or both values are NaN.
833 ///
834 /// \headerfile <x86intrin.h>
835 ///
836 /// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
837 ///   instruction.
838 ///
839 /// \param __a
840 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
841 ///    compared to the lower double-precision value of \a __b.
842 /// \param __b
843 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
844 ///    compared to the lower double-precision value of \a __a.
845 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
846 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
847 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a,
848                                                              __m128d __b) {
849   return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
850 }
851 
852 /// Compares the lower double-precision floating-point values in each of
853 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
854 ///    the value in the first parameter is unequal to the corresponding value in
855 ///    the second parameter.
856 ///
857 ///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
858 ///    If either value in a comparison is NaN, returns true.
859 ///
860 /// \headerfile <x86intrin.h>
861 ///
862 /// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
863 ///
864 /// \param __a
865 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
866 ///    compared to the lower double-precision value of \a __b.
867 /// \param __b
868 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
869 ///    compared to the lower double-precision value of \a __a.
870 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
871 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
872 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a,
873                                                            __m128d __b) {
874   return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
875 }
876 
877 /// Compares the lower double-precision floating-point values in each of
878 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
879 ///    the value in the first parameter is not less than the corresponding
880 ///    value in the second parameter.
881 ///
882 ///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
883 ///    If either value in a comparison is NaN, returns true.
884 ///
885 /// \headerfile <x86intrin.h>
886 ///
887 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
888 ///
889 /// \param __a
890 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
891 ///    compared to the lower double-precision value of \a __b.
892 /// \param __b
893 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
894 ///    compared to the lower double-precision value of \a __a.
895 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
896 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
897 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a,
898                                                            __m128d __b) {
899   return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
900 }
901 
902 /// Compares the lower double-precision floating-point values in each of
903 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
904 ///    the value in the first parameter is not less than or equal to the
905 ///    corresponding value in the second parameter.
906 ///
907 ///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
908 ///    If either value in a comparison is NaN, returns true.
909 ///
910 /// \headerfile <x86intrin.h>
911 ///
912 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
913 ///
914 /// \param __a
915 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
916 ///    compared to the lower double-precision value of \a __b.
917 /// \param __b
918 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
919 ///    compared to the lower double-precision value of \a __a.
920 /// \returns  A 128-bit vector. The lower 64 bits contains the comparison
921 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
922 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a,
923                                                            __m128d __b) {
924   return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
925 }
926 
927 /// Compares the lower double-precision floating-point values in each of
928 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
929 ///    the value in the first parameter is not greater than the corresponding
930 ///    value in the second parameter.
931 ///
932 ///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
933 ///    If either value in a comparison is NaN, returns true.
934 ///
935 /// \headerfile <x86intrin.h>
936 ///
937 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
938 ///
939 /// \param __a
940 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
941 ///    compared to the lower double-precision value of \a __b.
942 /// \param __b
943 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
944 ///    compared to the lower double-precision value of \a __a.
945 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
946 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
947 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a,
948                                                            __m128d __b) {
949   __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
950   return __extension__(__m128d){__c[0], __a[1]};
951 }
952 
953 /// Compares the lower double-precision floating-point values in each of
954 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
955 ///    the value in the first parameter is not greater than or equal to the
956 ///    corresponding value in the second parameter.
957 ///
958 ///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
959 ///    If either value in a comparison is NaN, returns true.
960 ///
961 /// \headerfile <x86intrin.h>
962 ///
963 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
964 ///
965 /// \param __a
966 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
967 ///    compared to the lower double-precision value of \a __b.
968 /// \param __b
969 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
970 ///    compared to the lower double-precision value of \a __a.
971 /// \returns A 128-bit vector. The lower 64 bits contains the comparison
972 ///    results. The upper 64 bits are copied from the upper 64 bits of \a __a.
973 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a,
974                                                            __m128d __b) {
975   __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
976   return __extension__(__m128d){__c[0], __a[1]};
977 }
978 
979 /// Compares the lower double-precision floating-point values in each of
980 ///    the two 128-bit floating-point vectors of [2 x double] for equality.
981 ///
982 ///    The comparison returns 0 for false, 1 for true. If either value in a
983 ///    comparison is NaN, returns 0.
984 ///
985 /// \headerfile <x86intrin.h>
986 ///
987 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
988 ///
989 /// \param __a
990 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
991 ///    compared to the lower double-precision value of \a __b.
992 /// \param __b
993 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
994 ///    compared to the lower double-precision value of \a __a.
995 /// \returns An integer containing the comparison results.
996 static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a,
997                                                        __m128d __b) {
998   return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
999 }
1000 
1001 /// Compares the lower double-precision floating-point values in each of
1002 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1003 ///    the value in the first parameter is less than the corresponding value in
1004 ///    the second parameter.
1005 ///
1006 ///    The comparison returns 0 for false, 1 for true. If either value in a
1007 ///    comparison is NaN, returns 0.
1008 ///
1009 /// \headerfile <x86intrin.h>
1010 ///
1011 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1012 ///
1013 /// \param __a
1014 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1015 ///    compared to the lower double-precision value of \a __b.
1016 /// \param __b
1017 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1018 ///    compared to the lower double-precision value of \a __a.
1019 /// \returns An integer containing the comparison results.
1020 static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a,
1021                                                        __m128d __b) {
1022   return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
1023 }
1024 
1025 /// Compares the lower double-precision floating-point values in each of
1026 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1027 ///    the value in the first parameter is less than or equal to the
1028 ///    corresponding value in the second parameter.
1029 ///
1030 ///    The comparison returns 0 for false, 1 for true. If either value in a
1031 ///    comparison is NaN, returns 0.
1032 ///
1033 /// \headerfile <x86intrin.h>
1034 ///
1035 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1036 ///
1037 /// \param __a
1038 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1039 ///    compared to the lower double-precision value of \a __b.
1040 /// \param __b
1041 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
1042 ///     compared to the lower double-precision value of \a __a.
1043 /// \returns An integer containing the comparison results.
1044 static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a,
1045                                                        __m128d __b) {
1046   return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
1047 }
1048 
1049 /// Compares the lower double-precision floating-point values in each of
1050 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1051 ///    the value in the first parameter is greater than the corresponding value
1052 ///    in the second parameter.
1053 ///
1054 ///    The comparison returns 0 for false, 1 for true. If either value in a
1055 ///    comparison is NaN, returns 0.
1056 ///
1057 /// \headerfile <x86intrin.h>
1058 ///
1059 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1060 ///
1061 /// \param __a
1062 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1063 ///    compared to the lower double-precision value of \a __b.
1064 /// \param __b
1065 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1066 ///    compared to the lower double-precision value of \a __a.
1067 /// \returns An integer containing the comparison results.
1068 static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a,
1069                                                        __m128d __b) {
1070   return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
1071 }
1072 
1073 /// Compares the lower double-precision floating-point values in each of
1074 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1075 ///    the value in the first parameter is greater than or equal to the
1076 ///    corresponding value in the second parameter.
1077 ///
1078 ///    The comparison returns 0 for false, 1 for true. If either value in a
1079 ///    comparison is NaN, returns 0.
1080 ///
1081 /// \headerfile <x86intrin.h>
1082 ///
1083 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1084 ///
1085 /// \param __a
1086 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1087 ///    compared to the lower double-precision value of \a __b.
1088 /// \param __b
1089 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1090 ///    compared to the lower double-precision value of \a __a.
1091 /// \returns An integer containing the comparison results.
1092 static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a,
1093                                                        __m128d __b) {
1094   return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
1095 }
1096 
1097 /// Compares the lower double-precision floating-point values in each of
1098 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1099 ///    the value in the first parameter is unequal to the corresponding value in
1100 ///    the second parameter.
1101 ///
1102 ///    The comparison returns 0 for false, 1 for true. If either value in a
1103 ///    comparison is NaN, returns 1.
1104 ///
1105 /// \headerfile <x86intrin.h>
1106 ///
1107 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
1108 ///
1109 /// \param __a
1110 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1111 ///    compared to the lower double-precision value of \a __b.
1112 /// \param __b
1113 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1114 ///    compared to the lower double-precision value of \a __a.
1115 /// \returns An integer containing the comparison results.
1116 static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a,
1117                                                         __m128d __b) {
1118   return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
1119 }
1120 
1121 /// Compares the lower double-precision floating-point values in each of
1122 ///    the two 128-bit floating-point vectors of [2 x double] for equality.
1123 ///
1124 ///    The comparison returns 0 for false, 1 for true. If either value in a
1125 ///    comparison is NaN, returns 0.
1126 ///
1127 /// \headerfile <x86intrin.h>
1128 ///
1129 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1130 ///
1131 /// \param __a
1132 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1133 ///    compared to the lower double-precision value of \a __b.
1134 /// \param __b
1135 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1136 ///    compared to the lower double-precision value of \a __a.
1137 /// \returns An integer containing the comparison results.
1138 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a,
1139                                                         __m128d __b) {
1140   return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
1141 }
1142 
1143 /// Compares the lower double-precision floating-point values in each of
1144 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1145 ///    the value in the first parameter is less than the corresponding value in
1146 ///    the second parameter.
1147 ///
1148 ///    The comparison returns 0 for false, 1 for true. If either value in a
1149 ///    comparison is NaN, returns 0.
1150 ///
1151 /// \headerfile <x86intrin.h>
1152 ///
1153 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1154 ///
1155 /// \param __a
1156 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1157 ///    compared to the lower double-precision value of \a __b.
1158 /// \param __b
1159 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1160 ///    compared to the lower double-precision value of \a __a.
1161 /// \returns An integer containing the comparison results.
1162 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a,
1163                                                         __m128d __b) {
1164   return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
1165 }
1166 
1167 /// Compares the lower double-precision floating-point values in each of
1168 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1169 ///    the value in the first parameter is less than or equal to the
1170 ///    corresponding value in the second parameter.
1171 ///
1172 ///    The comparison returns 0 for false, 1 for true. If either value in a
1173 ///    comparison is NaN, returns 0.
1174 ///
1175 /// \headerfile <x86intrin.h>
1176 ///
1177 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1178 ///
1179 /// \param __a
1180 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1181 ///    compared to the lower double-precision value of \a __b.
1182 /// \param __b
1183 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
1184 ///     compared to the lower double-precision value of \a __a.
1185 /// \returns An integer containing the comparison results.
1186 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a,
1187                                                         __m128d __b) {
1188   return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
1189 }
1190 
1191 /// Compares the lower double-precision floating-point values in each of
1192 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1193 ///    the value in the first parameter is greater than the corresponding value
1194 ///    in the second parameter.
1195 ///
1196 ///    The comparison returns 0 for false, 1 for true. If either value in a
1197 ///    comparison is NaN, returns 0.
1198 ///
1199 /// \headerfile <x86intrin.h>
1200 ///
1201 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1202 ///
1203 /// \param __a
1204 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1205 ///    compared to the lower double-precision value of \a __b.
1206 /// \param __b
1207 ///     A 128-bit vector of [2 x double]. The lower double-precision value is
1208 ///     compared to the lower double-precision value of \a __a.
1209 /// \returns An integer containing the comparison results.
1210 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a,
1211                                                         __m128d __b) {
1212   return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
1213 }
1214 
1215 /// Compares the lower double-precision floating-point values in each of
1216 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1217 ///    the value in the first parameter is greater than or equal to the
1218 ///    corresponding value in the second parameter.
1219 ///
1220 ///    The comparison returns 0 for false, 1 for true. If either value in a
1221 ///    comparison is NaN, returns 0.
1222 ///
1223 /// \headerfile <x86intrin.h>
1224 ///
1225 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1226 ///
1227 /// \param __a
1228 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1229 ///    compared to the lower double-precision value of \a __b.
1230 /// \param __b
1231 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1232 ///    compared to the lower double-precision value of \a __a.
1233 /// \returns An integer containing the comparison results.
1234 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a,
1235                                                         __m128d __b) {
1236   return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
1237 }
1238 
1239 /// Compares the lower double-precision floating-point values in each of
1240 ///    the two 128-bit floating-point vectors of [2 x double] to determine if
1241 ///    the value in the first parameter is unequal to the corresponding value in
1242 ///    the second parameter.
1243 ///
1244 ///    The comparison returns 0 for false, 1 for true. If either value in a
1245 ///    comparison is NaN, returns 1.
1246 ///
1247 /// \headerfile <x86intrin.h>
1248 ///
1249 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1250 ///
1251 /// \param __a
1252 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1253 ///    compared to the lower double-precision value of \a __b.
1254 /// \param __b
1255 ///    A 128-bit vector of [2 x double]. The lower double-precision value is
1256 ///    compared to the lower double-precision value of \a __a.
1257 /// \returns An integer containing the comparison result.
1258 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a,
1259                                                          __m128d __b) {
1260   return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
1261 }
1262 
1263 /// Converts the two double-precision floating-point elements of a
1264 ///    128-bit vector of [2 x double] into two single-precision floating-point
1265 ///    values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
1266 ///    The upper 64 bits of the result vector are set to zero.
1267 ///
1268 /// \headerfile <x86intrin.h>
1269 ///
1270 /// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
1271 ///
1272 /// \param __a
1273 ///    A 128-bit vector of [2 x double].
1274 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1275 ///    converted values. The upper 64 bits are set to zero.
1276 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a) {
1277   return __builtin_ia32_cvtpd2ps((__v2df)__a);
1278 }
1279 
1280 /// Converts the lower two single-precision floating-point elements of a
1281 ///    128-bit vector of [4 x float] into two double-precision floating-point
1282 ///    values, returned in a 128-bit vector of [2 x double]. The upper two
1283 ///    elements of the input vector are unused.
1284 ///
1285 /// \headerfile <x86intrin.h>
1286 ///
1287 /// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
1288 ///
1289 /// \param __a
1290 ///    A 128-bit vector of [4 x float]. The lower two single-precision
1291 ///    floating-point elements are converted to double-precision values. The
1292 ///    upper two elements are unused.
1293 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1294 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a) {
1295   return (__m128d) __builtin_convertvector(
1296       __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
1297 }
1298 
1299 /// Converts the lower two integer elements of a 128-bit vector of
1300 ///    [4 x i32] into two double-precision floating-point values, returned in a
1301 ///    128-bit vector of [2 x double].
1302 ///
1303 ///    The upper two elements of the input vector are unused.
1304 ///
1305 /// \headerfile <x86intrin.h>
1306 ///
1307 /// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
1308 ///
1309 /// \param __a
1310 ///    A 128-bit integer vector of [4 x i32]. The lower two integer elements are
1311 ///    converted to double-precision values.
1312 ///
1313 ///    The upper two elements are unused.
1314 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1315 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a) {
1316   return (__m128d) __builtin_convertvector(
1317       __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
1318 }
1319 
1320 /// Converts the two double-precision floating-point elements of a
1321 ///    128-bit vector of [2 x double] into two signed 32-bit integer values,
1322 ///    returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
1323 ///    64 bits of the result vector are set to zero.
1324 ///
1325 ///    If a converted value does not fit in a 32-bit integer, raises a
1326 ///    floating-point invalid exception. If the exception is masked, returns
1327 ///    the most negative integer.
1328 ///
1329 /// \headerfile <x86intrin.h>
1330 ///
1331 /// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
1332 ///
1333 /// \param __a
1334 ///    A 128-bit vector of [2 x double].
1335 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1336 ///    converted values. The upper 64 bits are set to zero.
1337 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a) {
1338   return __builtin_ia32_cvtpd2dq((__v2df)__a);
1339 }
1340 
1341 /// Converts the low-order element of a 128-bit vector of [2 x double]
1342 ///    into a 32-bit signed integer value.
1343 ///
1344 ///    If the converted value does not fit in a 32-bit integer, raises a
1345 ///    floating-point invalid exception. If the exception is masked, returns
1346 ///    the most negative integer.
1347 ///
1348 /// \headerfile <x86intrin.h>
1349 ///
1350 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
1351 ///
1352 /// \param __a
1353 ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1354 ///    conversion.
1355 /// \returns A 32-bit signed integer containing the converted value.
1356 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a) {
1357   return __builtin_ia32_cvtsd2si((__v2df)__a);
1358 }
1359 
1360 /// Converts the lower double-precision floating-point element of a
1361 ///    128-bit vector of [2 x double], in the second parameter, into a
1362 ///    single-precision floating-point value, returned in the lower 32 bits of a
1363 ///    128-bit vector of [4 x float]. The upper 96 bits of the result vector are
1364 ///    copied from the upper 96 bits of the first parameter.
1365 ///
1366 /// \headerfile <x86intrin.h>
1367 ///
1368 /// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
1369 ///
1370 /// \param __a
1371 ///    A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
1372 ///    copied to the upper 96 bits of the result.
1373 /// \param __b
1374 ///    A 128-bit vector of [2 x double]. The lower double-precision
1375 ///    floating-point element is used in the conversion.
1376 /// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
1377 ///    converted value from the second parameter. The upper 96 bits are copied
1378 ///    from the upper 96 bits of the first parameter.
1379 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a,
1380                                                          __m128d __b) {
1381   return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
1382 }
1383 
1384 /// Converts a 32-bit signed integer value, in the second parameter, into
1385 ///    a double-precision floating-point value, returned in the lower 64 bits of
1386 ///    a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1387 ///    are copied from the upper 64 bits of the first parameter.
1388 ///
1389 /// \headerfile <x86intrin.h>
1390 ///
1391 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
1392 ///
1393 /// \param __a
1394 ///    A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1395 ///    copied to the upper 64 bits of the result.
1396 /// \param __b
1397 ///    A 32-bit signed integer containing the value to be converted.
1398 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1399 ///    converted value from the second parameter. The upper 64 bits are copied
1400 ///    from the upper 64 bits of the first parameter.
1401 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a,
1402                                                             int __b) {
1403   __a[0] = __b;
1404   return __a;
1405 }
1406 
1407 /// Converts the lower single-precision floating-point element of a
1408 ///    128-bit vector of [4 x float], in the second parameter, into a
1409 ///    double-precision floating-point value, returned in the lower 64 bits of
1410 ///    a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1411 ///    are copied from the upper 64 bits of the first parameter.
1412 ///
1413 /// \headerfile <x86intrin.h>
1414 ///
1415 /// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
1416 ///
1417 /// \param __a
1418 ///    A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1419 ///    copied to the upper 64 bits of the result.
1420 /// \param __b
1421 ///    A 128-bit vector of [4 x float]. The lower single-precision
1422 ///    floating-point element is used in the conversion.
1423 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1424 ///    converted value from the second parameter. The upper 64 bits are copied
1425 ///    from the upper 64 bits of the first parameter.
1426 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a,
1427                                                           __m128 __b) {
1428   __a[0] = __b[0];
1429   return __a;
1430 }
1431 
1432 /// Converts the two double-precision floating-point elements of a
1433 ///    128-bit vector of [2 x double] into two signed truncated (rounded
1434 ///    toward zero) 32-bit integer values, returned in the lower 64 bits
1435 ///    of a 128-bit vector of [4 x i32].
1436 ///
1437 ///    If a converted value does not fit in a 32-bit integer, raises a
1438 ///    floating-point invalid exception. If the exception is masked, returns
1439 ///    the most negative integer.
1440 ///
1441 /// \headerfile <x86intrin.h>
1442 ///
1443 /// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
1444 ///   instruction.
1445 ///
1446 /// \param __a
1447 ///    A 128-bit vector of [2 x double].
1448 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1449 ///    converted values. The upper 64 bits are set to zero.
1450 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a) {
1451   return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
1452 }
1453 
1454 /// Converts the low-order element of a [2 x double] vector into a 32-bit
1455 ///    signed truncated (rounded toward zero) integer value.
1456 ///
1457 ///    If the converted value does not fit in a 32-bit integer, raises a
1458 ///    floating-point invalid exception. If the exception is masked, returns
1459 ///    the most negative integer.
1460 ///
1461 /// \headerfile <x86intrin.h>
1462 ///
1463 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
1464 ///   instruction.
1465 ///
1466 /// \param __a
1467 ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1468 ///    conversion.
1469 /// \returns A 32-bit signed integer containing the converted value.
1470 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) {
1471   return __builtin_ia32_cvttsd2si((__v2df)__a);
1472 }
1473 
1474 /// Converts the two double-precision floating-point elements of a
1475 ///    128-bit vector of [2 x double] into two signed 32-bit integer values,
1476 ///    returned in a 64-bit vector of [2 x i32].
1477 ///
1478 ///    If a converted value does not fit in a 32-bit integer, raises a
1479 ///    floating-point invalid exception. If the exception is masked, returns
1480 ///    the most negative integer.
1481 ///
1482 /// \headerfile <x86intrin.h>
1483 ///
1484 /// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
1485 ///
1486 /// \param __a
1487 ///    A 128-bit vector of [2 x double].
1488 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
1489 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) {
1490   return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
1491 }
1492 
1493 /// Converts the two double-precision floating-point elements of a
1494 ///    128-bit vector of [2 x double] into two signed truncated (rounded toward
1495 ///    zero) 32-bit integer values, returned in a 64-bit vector of [2 x i32].
1496 ///
1497 ///    If a converted value does not fit in a 32-bit integer, raises a
1498 ///    floating-point invalid exception. If the exception is masked, returns
1499 ///    the most negative integer.
1500 ///
1501 /// \headerfile <x86intrin.h>
1502 ///
1503 /// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
1504 ///
1505 /// \param __a
1506 ///    A 128-bit vector of [2 x double].
1507 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
1508 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a) {
1509   return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
1510 }
1511 
1512 /// Converts the two signed 32-bit integer elements of a 64-bit vector of
1513 ///    [2 x i32] into two double-precision floating-point values, returned in a
1514 ///    128-bit vector of [2 x double].
1515 ///
1516 /// \headerfile <x86intrin.h>
1517 ///
1518 /// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
1519 ///
1520 /// \param __a
1521 ///    A 64-bit vector of [2 x i32].
1522 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1523 static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a) {
1524   return __builtin_ia32_cvtpi2pd((__v2si)__a);
1525 }
1526 
1527 /// Returns the low-order element of a 128-bit vector of [2 x double] as
1528 ///    a double-precision floating-point value.
1529 ///
1530 /// \headerfile <x86intrin.h>
1531 ///
1532 /// This intrinsic has no corresponding instruction.
1533 ///
1534 /// \param __a
1535 ///    A 128-bit vector of [2 x double]. The lower 64 bits are returned.
1536 /// \returns A double-precision floating-point value copied from the lower 64
1537 ///    bits of \a __a.
1538 static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a) {
1539   return __a[0];
1540 }
1541 
1542 /// Loads a 128-bit floating-point vector of [2 x double] from an aligned
1543 ///    memory location.
1544 ///
1545 /// \headerfile <x86intrin.h>
1546 ///
1547 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
1548 ///
1549 /// \param __dp
1550 ///    A pointer to a 128-bit memory location. The address of the memory
1551 ///    location has to be 16-byte aligned.
1552 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
1553 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp) {
1554   return *(const __m128d *)__dp;
1555 }
1556 
1557 /// Loads a double-precision floating-point value from a specified memory
1558 ///    location and duplicates it to both vector elements of a 128-bit vector of
1559 ///    [2 x double].
1560 ///
1561 /// \headerfile <x86intrin.h>
1562 ///
1563 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
1564 ///
1565 /// \param __dp
1566 ///    A pointer to a memory location containing a double-precision value.
1567 /// \returns A 128-bit vector of [2 x double] containing the loaded and
1568 ///    duplicated values.
1569 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp) {
1570   struct __mm_load1_pd_struct {
1571     double __u;
1572   } __attribute__((__packed__, __may_alias__));
1573   double __u = ((const struct __mm_load1_pd_struct *)__dp)->__u;
1574   return __extension__(__m128d){__u, __u};
1575 }
1576 
/* Alternate spelling of _mm_load1_pd(); the argument is forwarded unchanged. */
#define _mm_load_pd1(__dp) _mm_load1_pd(__dp)
1578 
1579 /// Loads two double-precision values, in reverse order, from an aligned
1580 ///    memory location into a 128-bit vector of [2 x double].
1581 ///
1582 /// \headerfile <x86intrin.h>
1583 ///
1584 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
1585 /// needed shuffling instructions. In AVX mode, the shuffling may be combined
1586 /// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
1587 ///
1588 /// \param __dp
1589 ///    A 16-byte aligned pointer to an array of double-precision values to be
1590 ///    loaded in reverse order.
1591 /// \returns A 128-bit vector of [2 x double] containing the reversed loaded
1592 ///    values.
1593 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp) {
1594   __m128d __u = *(const __m128d *)__dp;
1595   return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
1596 }
1597 
1598 /// Loads a 128-bit floating-point vector of [2 x double] from an
1599 ///    unaligned memory location.
1600 ///
1601 /// \headerfile <x86intrin.h>
1602 ///
1603 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1604 ///
1605 /// \param __dp
1606 ///    A pointer to a 128-bit memory location. The address of the memory
1607 ///    location does not have to be aligned.
1608 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
1609 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp) {
1610   struct __loadu_pd {
1611     __m128d_u __v;
1612   } __attribute__((__packed__, __may_alias__));
1613   return ((const struct __loadu_pd *)__dp)->__v;
1614 }
1615 
1616 /// Loads a 64-bit integer value to the low element of a 128-bit integer
1617 ///    vector and clears the upper element.
1618 ///
1619 /// \headerfile <x86intrin.h>
1620 ///
1621 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1622 ///
1623 /// \param __a
1624 ///    A pointer to a 64-bit memory location. The address of the memory
1625 ///    location does not have to be aligned.
1626 /// \returns A 128-bit vector of [2 x i64] containing the loaded value.
1627 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a) {
1628   struct __loadu_si64 {
1629     long long __v;
1630   } __attribute__((__packed__, __may_alias__));
1631   long long __u = ((const struct __loadu_si64 *)__a)->__v;
1632   return __extension__(__m128i)(__v2di){__u, 0LL};
1633 }
1634 
/// Loads a 32-bit integer value to the low element of a 128-bit integer
///    vector and clears the upper three elements.
1637 ///
1638 /// \headerfile <x86intrin.h>
1639 ///
1640 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
1641 ///
1642 /// \param __a
1643 ///    A pointer to a 32-bit memory location. The address of the memory
1644 ///    location does not have to be aligned.
1645 /// \returns A 128-bit vector of [4 x i32] containing the loaded value.
1646 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si32(void const *__a) {
1647   struct __loadu_si32 {
1648     int __v;
1649   } __attribute__((__packed__, __may_alias__));
1650   int __u = ((const struct __loadu_si32 *)__a)->__v;
1651   return __extension__(__m128i)(__v4si){__u, 0, 0, 0};
1652 }
1653 
/// Loads a 16-bit integer value to the low element of a 128-bit integer
///    vector and clears the upper seven elements.
1656 ///
1657 /// \headerfile <x86intrin.h>
1658 ///
1659 /// This intrinsic does not correspond to a specific instruction.
1660 ///
1661 /// \param __a
1662 ///    A pointer to a 16-bit memory location. The address of the memory
1663 ///    location does not have to be aligned.
1664 /// \returns A 128-bit vector of [8 x i16] containing the loaded value.
1665 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si16(void const *__a) {
1666   struct __loadu_si16 {
1667     short __v;
1668   } __attribute__((__packed__, __may_alias__));
1669   short __u = ((const struct __loadu_si16 *)__a)->__v;
1670   return __extension__(__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
1671 }
1672 
/// Loads a 64-bit double-precision value to the low element of a
///    128-bit vector of [2 x double] and clears the upper element.
1675 ///
1676 /// \headerfile <x86intrin.h>
1677 ///
1678 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1679 ///
1680 /// \param __dp
1681 ///    A pointer to a memory location containing a double-precision value.
1682 ///    The address of the memory location does not have to be aligned.
1683 /// \returns A 128-bit vector of [2 x double] containing the loaded value.
1684 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp) {
1685   struct __mm_load_sd_struct {
1686     double __u;
1687   } __attribute__((__packed__, __may_alias__));
1688   double __u = ((const struct __mm_load_sd_struct *)__dp)->__u;
1689   return __extension__(__m128d){__u, 0};
1690 }
1691 
1692 /// Loads a double-precision value into the high-order bits of a 128-bit
1693 ///    vector of [2 x double]. The low-order bits are copied from the low-order
1694 ///    bits of the first operand.
1695 ///
1696 /// \headerfile <x86intrin.h>
1697 ///
1698 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1699 ///
1700 /// \param __a
1701 ///    A 128-bit vector of [2 x double]. \n
1702 ///    Bits [63:0] are written to bits [63:0] of the result.
1703 /// \param __dp
1704 ///    A pointer to a 64-bit memory location containing a double-precision
1705 ///    floating-point value that is loaded. The loaded value is written to bits
1706 ///    [127:64] of the result. The address of the memory location does not have
1707 ///    to be aligned.
1708 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1709 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a,
1710                                                           double const *__dp) {
1711   struct __mm_loadh_pd_struct {
1712     double __u;
1713   } __attribute__((__packed__, __may_alias__));
1714   double __u = ((const struct __mm_loadh_pd_struct *)__dp)->__u;
1715   return __extension__(__m128d){__a[0], __u};
1716 }
1717 
1718 /// Loads a double-precision value into the low-order bits of a 128-bit
1719 ///    vector of [2 x double]. The high-order bits are copied from the
1720 ///    high-order bits of the first operand.
1721 ///
1722 /// \headerfile <x86intrin.h>
1723 ///
1724 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1725 ///
1726 /// \param __a
1727 ///    A 128-bit vector of [2 x double]. \n
1728 ///    Bits [127:64] are written to bits [127:64] of the result.
1729 /// \param __dp
1730 ///    A pointer to a 64-bit memory location containing a double-precision
1731 ///    floating-point value that is loaded. The loaded value is written to bits
1732 ///    [63:0] of the result. The address of the memory location does not have to
1733 ///    be aligned.
1734 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1735 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a,
1736                                                           double const *__dp) {
1737   struct __mm_loadl_pd_struct {
1738     double __u;
1739   } __attribute__((__packed__, __may_alias__));
1740   double __u = ((const struct __mm_loadl_pd_struct *)__dp)->__u;
1741   return __extension__(__m128d){__u, __a[1]};
1742 }
1743 
1744 /// Constructs a 128-bit floating-point vector of [2 x double] with
1745 ///    unspecified content. This could be used as an argument to another
1746 ///    intrinsic function where the argument is required but the value is not
1747 ///    actually used.
1748 ///
1749 /// \headerfile <x86intrin.h>
1750 ///
1751 /// This intrinsic has no corresponding instruction.
1752 ///
1753 /// \returns A 128-bit floating-point vector of [2 x double] with unspecified
1754 ///    content.
1755 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void) {
1756   return (__m128d)__builtin_ia32_undef128();
1757 }
1758 
1759 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1760 ///    64 bits of the vector are initialized with the specified double-precision
1761 ///    floating-point value. The upper 64 bits are set to zero.
1762 ///
1763 /// \headerfile <x86intrin.h>
1764 ///
1765 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1766 ///
1767 /// \param __w
1768 ///    A double-precision floating-point value used to initialize the lower 64
1769 ///    bits of the result.
1770 /// \returns An initialized 128-bit floating-point vector of [2 x double]. The
1771 ///    lower 64 bits contain the value of the parameter. The upper 64 bits are
1772 ///    set to zero.
1773 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w) {
1774   return __extension__(__m128d){__w, 0.0};
1775 }
1776 
1777 /// Constructs a 128-bit floating-point vector of [2 x double], with each
1778 ///    of the two double-precision floating-point vector elements set to the
1779 ///    specified double-precision floating-point value.
1780 ///
1781 /// \headerfile <x86intrin.h>
1782 ///
1783 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1784 ///
1785 /// \param __w
1786 ///    A double-precision floating-point value used to initialize each vector
1787 ///    element of the result.
1788 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1789 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w) {
1790   return __extension__(__m128d){__w, __w};
1791 }
1792 
1793 /// Constructs a 128-bit floating-point vector of [2 x double], with each
1794 ///    of the two double-precision floating-point vector elements set to the
1795 ///    specified double-precision floating-point value.
1796 ///
1797 /// \headerfile <x86intrin.h>
1798 ///
1799 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1800 ///
1801 /// \param __w
1802 ///    A double-precision floating-point value used to initialize each vector
1803 ///    element of the result.
1804 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1805 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w) {
1806   return _mm_set1_pd(__w);
1807 }
1808 
1809 /// Constructs a 128-bit floating-point vector of [2 x double]
1810 ///    initialized with the specified double-precision floating-point values.
1811 ///
1812 /// \headerfile <x86intrin.h>
1813 ///
1814 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1815 ///
1816 /// \param __w
1817 ///    A double-precision floating-point value used to initialize the upper 64
1818 ///    bits of the result.
1819 /// \param __x
1820 ///    A double-precision floating-point value used to initialize the lower 64
1821 ///    bits of the result.
1822 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1823 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w,
1824                                                         double __x) {
1825   return __extension__(__m128d){__x, __w};
1826 }
1827 
1828 /// Constructs a 128-bit floating-point vector of [2 x double],
1829 ///    initialized in reverse order with the specified double-precision
1830 ///    floating-point values.
1831 ///
1832 /// \headerfile <x86intrin.h>
1833 ///
1834 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1835 ///
1836 /// \param __w
1837 ///    A double-precision floating-point value used to initialize the lower 64
1838 ///    bits of the result.
1839 /// \param __x
1840 ///    A double-precision floating-point value used to initialize the upper 64
1841 ///    bits of the result.
1842 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1843 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w,
1844                                                          double __x) {
1845   return __extension__(__m128d){__w, __x};
1846 }
1847 
1848 /// Constructs a 128-bit floating-point vector of [2 x double]
1849 ///    initialized to zero.
1850 ///
1851 /// \headerfile <x86intrin.h>
1852 ///
1853 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1854 ///
1855 /// \returns An initialized 128-bit floating-point vector of [2 x double] with
1856 ///    all elements set to zero.
1857 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void) {
1858   return __extension__(__m128d){0.0, 0.0};
1859 }
1860 
1861 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1862 ///    64 bits are set to the lower 64 bits of the second parameter. The upper
1863 ///    64 bits are set to the upper 64 bits of the first parameter.
1864 ///
1865 /// \headerfile <x86intrin.h>
1866 ///
1867 /// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
1868 ///
1869 /// \param __a
1870 ///    A 128-bit vector of [2 x double]. The upper 64 bits are written to the
1871 ///    upper 64 bits of the result.
1872 /// \param __b
1873 ///    A 128-bit vector of [2 x double]. The lower 64 bits are written to the
1874 ///    lower 64 bits of the result.
1875 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1876 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a,
1877                                                          __m128d __b) {
1878   __a[0] = __b[0];
1879   return __a;
1880 }
1881 
1882 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1883 ///    memory location.
1884 ///
1885 /// \headerfile <x86intrin.h>
1886 ///
1887 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1888 ///
1889 /// \param __dp
1890 ///    A pointer to a 64-bit memory location.
1891 /// \param __a
1892 ///    A 128-bit vector of [2 x double] containing the value to be stored.
1893 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp,
1894                                                        __m128d __a) {
1895   struct __mm_store_sd_struct {
1896     double __u;
1897   } __attribute__((__packed__, __may_alias__));
1898   ((struct __mm_store_sd_struct *)__dp)->__u = __a[0];
1899 }
1900 
1901 /// Moves packed double-precision values from a 128-bit vector of
1902 ///    [2 x double] to a memory location.
1903 ///
1904 /// \headerfile <x86intrin.h>
1905 ///
1906 /// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction.
1907 ///
1908 /// \param __dp
1909 ///    A pointer to an aligned memory location that can store two
1910 ///    double-precision values.
1911 /// \param __a
1912 ///    A packed 128-bit vector of [2 x double] containing the values to be
1913 ///    moved.
1914 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp,
1915                                                        __m128d __a) {
1916   *(__m128d *)__dp = __a;
1917 }
1918 
1919 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1920 ///    the upper and lower 64 bits of a memory location.
1921 ///
1922 /// \headerfile <x86intrin.h>
1923 ///
1924 /// This intrinsic corresponds to the
1925 ///   <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1926 ///
1927 /// \param __dp
///    A pointer to a 16-byte aligned memory location that can store two
///    double-precision values.
1930 /// \param __a
1931 ///    A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1932 ///    of the values in \a __dp.
1933 static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp,
1934                                                         __m128d __a) {
1935   __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
1936   _mm_store_pd(__dp, __a);
1937 }
1938 
1939 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1940 ///    the upper and lower 64 bits of a memory location.
1941 ///
1942 /// \headerfile <x86intrin.h>
1943 ///
1944 /// This intrinsic corresponds to the
1945 ///   <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1946 ///
1947 /// \param __dp
///    A pointer to a 16-byte aligned memory location that can store two
///    double-precision values.
1950 /// \param __a
1951 ///    A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1952 ///    of the values in \a __dp.
1953 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp,
1954                                                         __m128d __a) {
1955   _mm_store1_pd(__dp, __a);
1956 }
1957 
1958 /// Stores a 128-bit vector of [2 x double] into an unaligned memory
1959 ///    location.
1960 ///
1961 /// \headerfile <x86intrin.h>
1962 ///
1963 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1964 ///
1965 /// \param __dp
1966 ///    A pointer to a 128-bit memory location. The address of the memory
1967 ///    location does not have to be aligned.
1968 /// \param __a
1969 ///    A 128-bit vector of [2 x double] containing the values to be stored.
1970 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp,
1971                                                         __m128d __a) {
1972   struct __storeu_pd {
1973     __m128d_u __v;
1974   } __attribute__((__packed__, __may_alias__));
1975   ((struct __storeu_pd *)__dp)->__v = __a;
1976 }
1977 
1978 /// Stores two double-precision values, in reverse order, from a 128-bit
1979 ///    vector of [2 x double] to a 16-byte aligned memory location.
1980 ///
1981 /// \headerfile <x86intrin.h>
1982 ///
1983 /// This intrinsic corresponds to a shuffling instruction followed by a
1984 /// <c> VMOVAPD / MOVAPD </c> instruction.
1985 ///
1986 /// \param __dp
1987 ///    A pointer to a 16-byte aligned memory location that can store two
1988 ///    double-precision values.
1989 /// \param __a
1990 ///    A 128-bit vector of [2 x double] containing the values to be reversed and
1991 ///    stored.
1992 static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp,
1993                                                         __m128d __a) {
1994   __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
1995   *(__m128d *)__dp = __a;
1996 }
1997 
1998 /// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
1999 ///    memory location.
2000 ///
2001 /// \headerfile <x86intrin.h>
2002 ///
2003 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
2004 ///
2005 /// \param __dp
2006 ///    A pointer to a 64-bit memory location.
2007 /// \param __a
2008 ///    A 128-bit vector of [2 x double] containing the value to be stored.
2009 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp,
2010                                                         __m128d __a) {
2011   struct __mm_storeh_pd_struct {
2012     double __u;
2013   } __attribute__((__packed__, __may_alias__));
2014   ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[1];
2015 }
2016 
2017 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
2018 ///    memory location.
2019 ///
2020 /// \headerfile <x86intrin.h>
2021 ///
2022 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
2023 ///
2024 /// \param __dp
2025 ///    A pointer to a 64-bit memory location.
2026 /// \param __a
2027 ///    A 128-bit vector of [2 x double] containing the value to be stored.
2028 static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp,
2029                                                         __m128d __a) {
2030   struct __mm_storeh_pd_struct {
2031     double __u;
2032   } __attribute__((__packed__, __may_alias__));
2033   ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[0];
2034 }
2035 
2036 /// Adds the corresponding elements of two 128-bit vectors of [16 x i8],
2037 ///    saving the lower 8 bits of each sum in the corresponding element of a
2038 ///    128-bit result vector of [16 x i8].
2039 ///
2040 ///    The integer elements of both parameters can be either signed or unsigned.
2041 ///
2042 /// \headerfile <x86intrin.h>
2043 ///
2044 /// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
2045 ///
2046 /// \param __a
2047 ///    A 128-bit vector of [16 x i8].
2048 /// \param __b
2049 ///    A 128-bit vector of [16 x i8].
2050 /// \returns A 128-bit vector of [16 x i8] containing the sums of both
2051 ///    parameters.
2052 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a,
2053                                                           __m128i __b) {
2054   return (__m128i)((__v16qu)__a + (__v16qu)__b);
2055 }
2056 
2057 /// Adds the corresponding elements of two 128-bit vectors of [8 x i16],
2058 ///    saving the lower 16 bits of each sum in the corresponding element of a
2059 ///    128-bit result vector of [8 x i16].
2060 ///
2061 ///    The integer elements of both parameters can be either signed or unsigned.
2062 ///
2063 /// \headerfile <x86intrin.h>
2064 ///
2065 /// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
2066 ///
2067 /// \param __a
2068 ///    A 128-bit vector of [8 x i16].
2069 /// \param __b
2070 ///    A 128-bit vector of [8 x i16].
2071 /// \returns A 128-bit vector of [8 x i16] containing the sums of both
2072 ///    parameters.
2073 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a,
2074                                                            __m128i __b) {
2075   return (__m128i)((__v8hu)__a + (__v8hu)__b);
2076 }
2077 
2078 /// Adds the corresponding elements of two 128-bit vectors of [4 x i32],
2079 ///    saving the lower 32 bits of each sum in the corresponding element of a
2080 ///    128-bit result vector of [4 x i32].
2081 ///
2082 ///    The integer elements of both parameters can be either signed or unsigned.
2083 ///
2084 /// \headerfile <x86intrin.h>
2085 ///
2086 /// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
2087 ///
2088 /// \param __a
2089 ///    A 128-bit vector of [4 x i32].
2090 /// \param __b
2091 ///    A 128-bit vector of [4 x i32].
2092 /// \returns A 128-bit vector of [4 x i32] containing the sums of both
2093 ///    parameters.
2094 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a,
2095                                                            __m128i __b) {
2096   return (__m128i)((__v4su)__a + (__v4su)__b);
2097 }
2098 
2099 /// Adds two signed or unsigned 64-bit integer values, returning the
2100 ///    lower 64 bits of the sum.
2101 ///
2102 /// \headerfile <x86intrin.h>
2103 ///
2104 /// This intrinsic corresponds to the <c> PADDQ </c> instruction.
2105 ///
2106 /// \param __a
2107 ///    A 64-bit integer.
2108 /// \param __b
2109 ///    A 64-bit integer.
2110 /// \returns A 64-bit integer containing the sum of both parameters.
2111 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a,
2112                                                             __m64 __b) {
2113   return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
2114 }
2115 
2116 /// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
2117 ///    saving the lower 64 bits of each sum in the corresponding element of a
2118 ///    128-bit result vector of [2 x i64].
2119 ///
2120 ///    The integer elements of both parameters can be either signed or unsigned.
2121 ///
2122 /// \headerfile <x86intrin.h>
2123 ///
2124 /// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
2125 ///
2126 /// \param __a
2127 ///    A 128-bit vector of [2 x i64].
2128 /// \param __b
2129 ///    A 128-bit vector of [2 x i64].
2130 /// \returns A 128-bit vector of [2 x i64] containing the sums of both
2131 ///    parameters.
2132 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a,
2133                                                            __m128i __b) {
2134   return (__m128i)((__v2du)__a + (__v2du)__b);
2135 }
2136 
2137 /// Adds, with saturation, the corresponding elements of two 128-bit
2138 ///    signed [16 x i8] vectors, saving each sum in the corresponding element
2139 ///    of a 128-bit result vector of [16 x i8].
2140 ///
2141 ///    Positive sums greater than 0x7F are saturated to 0x7F. Negative sums
2142 ///    less than 0x80 are saturated to 0x80.
2143 ///
2144 /// \headerfile <x86intrin.h>
2145 ///
2146 /// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
2147 ///
2148 /// \param __a
2149 ///    A 128-bit signed [16 x i8] vector.
2150 /// \param __b
2151 ///    A 128-bit signed [16 x i8] vector.
2152 /// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
2153 ///    both parameters.
2154 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a,
2155                                                            __m128i __b) {
2156   return (__m128i)__builtin_elementwise_add_sat((__v16qs)__a, (__v16qs)__b);
2157 }
2158 
2159 /// Adds, with saturation, the corresponding elements of two 128-bit
2160 ///    signed [8 x i16] vectors, saving each sum in the corresponding element
2161 ///    of a 128-bit result vector of [8 x i16].
2162 ///
2163 ///    Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums
2164 ///    less than 0x8000 are saturated to 0x8000.
2165 ///
2166 /// \headerfile <x86intrin.h>
2167 ///
2168 /// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
2169 ///
2170 /// \param __a
2171 ///    A 128-bit signed [8 x i16] vector.
2172 /// \param __b
2173 ///    A 128-bit signed [8 x i16] vector.
2174 /// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
2175 ///    both parameters.
2176 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a,
2177                                                             __m128i __b) {
2178   return (__m128i)__builtin_elementwise_add_sat((__v8hi)__a, (__v8hi)__b);
2179 }
2180 
2181 /// Adds, with saturation, the corresponding elements of two 128-bit
2182 ///    unsigned [16 x i8] vectors, saving each sum in the corresponding element
2183 ///    of a 128-bit result vector of [16 x i8].
2184 ///
2185 ///    Positive sums greater than 0xFF are saturated to 0xFF. Negative sums are
2186 ///    saturated to 0x00.
2187 ///
2188 /// \headerfile <x86intrin.h>
2189 ///
2190 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2191 ///
2192 /// \param __a
2193 ///    A 128-bit unsigned [16 x i8] vector.
2194 /// \param __b
2195 ///    A 128-bit unsigned [16 x i8] vector.
2196 /// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
2197 ///    of both parameters.
2198 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a,
2199                                                            __m128i __b) {
2200   return (__m128i)__builtin_elementwise_add_sat((__v16qu)__a, (__v16qu)__b);
2201 }
2202 
2203 /// Adds, with saturation, the corresponding elements of two 128-bit
2204 ///    unsigned [8 x i16] vectors, saving each sum in the corresponding element
2205 ///    of a 128-bit result vector of [8 x i16].
2206 ///
2207 ///    Positive sums greater than 0xFFFF are saturated to 0xFFFF. Negative sums
2208 ///    are saturated to 0x0000.
2209 ///
2210 /// \headerfile <x86intrin.h>
2211 ///
2212 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2213 ///
2214 /// \param __a
2215 ///    A 128-bit unsigned [8 x i16] vector.
2216 /// \param __b
2217 ///    A 128-bit unsigned [8 x i16] vector.
2218 /// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
2219 ///    of both parameters.
2220 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a,
2221                                                             __m128i __b) {
2222   return (__m128i)__builtin_elementwise_add_sat((__v8hu)__a, (__v8hu)__b);
2223 }
2224 
2225 /// Computes the rounded averages of corresponding elements of two
2226 ///    128-bit unsigned [16 x i8] vectors, saving each result in the
2227 ///    corresponding element of a 128-bit result vector of [16 x i8].
2228 ///
2229 /// \headerfile <x86intrin.h>
2230 ///
2231 /// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
2232 ///
2233 /// \param __a
2234 ///    A 128-bit unsigned [16 x i8] vector.
2235 /// \param __b
2236 ///    A 128-bit unsigned [16 x i8] vector.
2237 /// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
2238 ///    averages of both parameters.
2239 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a,
2240                                                           __m128i __b) {
2241   return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
2242 }
2243 
2244 /// Computes the rounded averages of corresponding elements of two
2245 ///    128-bit unsigned [8 x i16] vectors, saving each result in the
2246 ///    corresponding element of a 128-bit result vector of [8 x i16].
2247 ///
2248 /// \headerfile <x86intrin.h>
2249 ///
2250 /// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
2251 ///
2252 /// \param __a
2253 ///    A 128-bit unsigned [8 x i16] vector.
2254 /// \param __b
2255 ///    A 128-bit unsigned [8 x i16] vector.
2256 /// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
2257 ///    averages of both parameters.
2258 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a,
2259                                                            __m128i __b) {
2260   return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
2261 }
2262 
2263 /// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
2264 ///    vectors, producing eight intermediate 32-bit signed integer products, and
2265 ///    adds the consecutive pairs of 32-bit products to form a 128-bit signed
2266 ///    [4 x i32] vector.
2267 ///
2268 ///    For example, bits [15:0] of both parameters are multiplied producing a
2269 ///    32-bit product, bits [31:16] of both parameters are multiplied producing
2270 ///    a 32-bit product, and the sum of those two products becomes bits [31:0]
2271 ///    of the result.
2272 ///
2273 /// \headerfile <x86intrin.h>
2274 ///
2275 /// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
2276 ///
2277 /// \param __a
2278 ///    A 128-bit signed [8 x i16] vector.
2279 /// \param __b
2280 ///    A 128-bit signed [8 x i16] vector.
2281 /// \returns A 128-bit signed [4 x i32] vector containing the sums of products
2282 ///    of both parameters.
2283 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a,
2284                                                             __m128i __b) {
2285   return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
2286 }
2287 
2288 /// Compares corresponding elements of two 128-bit signed [8 x i16]
2289 ///    vectors, saving the greater value from each comparison in the
2290 ///    corresponding element of a 128-bit result vector of [8 x i16].
2291 ///
2292 /// \headerfile <x86intrin.h>
2293 ///
2294 /// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
2295 ///
2296 /// \param __a
2297 ///    A 128-bit signed [8 x i16] vector.
2298 /// \param __b
2299 ///    A 128-bit signed [8 x i16] vector.
2300 /// \returns A 128-bit signed [8 x i16] vector containing the greater value of
2301 ///    each comparison.
2302 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a,
2303                                                            __m128i __b) {
2304   return (__m128i)__builtin_elementwise_max((__v8hi)__a, (__v8hi)__b);
2305 }
2306 
2307 /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2308 ///    vectors, saving the greater value from each comparison in the
2309 ///    corresponding element of a 128-bit result vector of [16 x i8].
2310 ///
2311 /// \headerfile <x86intrin.h>
2312 ///
2313 /// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
2314 ///
2315 /// \param __a
2316 ///    A 128-bit unsigned [16 x i8] vector.
2317 /// \param __b
2318 ///    A 128-bit unsigned [16 x i8] vector.
2319 /// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
2320 ///    each comparison.
2321 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a,
2322                                                           __m128i __b) {
2323   return (__m128i)__builtin_elementwise_max((__v16qu)__a, (__v16qu)__b);
2324 }
2325 
2326 /// Compares corresponding elements of two 128-bit signed [8 x i16]
2327 ///    vectors, saving the smaller value from each comparison in the
2328 ///    corresponding element of a 128-bit result vector of [8 x i16].
2329 ///
2330 /// \headerfile <x86intrin.h>
2331 ///
2332 /// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
2333 ///
2334 /// \param __a
2335 ///    A 128-bit signed [8 x i16] vector.
2336 /// \param __b
2337 ///    A 128-bit signed [8 x i16] vector.
2338 /// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
2339 ///    each comparison.
2340 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a,
2341                                                            __m128i __b) {
2342   return (__m128i)__builtin_elementwise_min((__v8hi)__a, (__v8hi)__b);
2343 }
2344 
2345 /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2346 ///    vectors, saving the smaller value from each comparison in the
2347 ///    corresponding element of a 128-bit result vector of [16 x i8].
2348 ///
2349 /// \headerfile <x86intrin.h>
2350 ///
2351 /// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
2352 ///
2353 /// \param __a
2354 ///    A 128-bit unsigned [16 x i8] vector.
2355 /// \param __b
2356 ///    A 128-bit unsigned [16 x i8] vector.
2357 /// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
2358 ///    each comparison.
2359 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a,
2360                                                           __m128i __b) {
2361   return (__m128i)__builtin_elementwise_min((__v16qu)__a, (__v16qu)__b);
2362 }
2363 
2364 /// Multiplies the corresponding elements of two signed [8 x i16]
2365 ///    vectors, saving the upper 16 bits of each 32-bit product in the
2366 ///    corresponding element of a 128-bit signed [8 x i16] result vector.
2367 ///
2368 /// \headerfile <x86intrin.h>
2369 ///
2370 /// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
2371 ///
2372 /// \param __a
2373 ///    A 128-bit signed [8 x i16] vector.
2374 /// \param __b
2375 ///    A 128-bit signed [8 x i16] vector.
2376 /// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
2377 ///    each of the eight 32-bit products.
2378 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a,
2379                                                              __m128i __b) {
2380   return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
2381 }
2382 
2383 /// Multiplies the corresponding elements of two unsigned [8 x i16]
2384 ///    vectors, saving the upper 16 bits of each 32-bit product in the
2385 ///    corresponding element of a 128-bit unsigned [8 x i16] result vector.
2386 ///
2387 /// \headerfile <x86intrin.h>
2388 ///
2389 /// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
2390 ///
2391 /// \param __a
2392 ///    A 128-bit unsigned [8 x i16] vector.
2393 /// \param __b
2394 ///    A 128-bit unsigned [8 x i16] vector.
2395 /// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
2396 ///    of each of the eight 32-bit products.
2397 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a,
2398                                                              __m128i __b) {
2399   return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
2400 }
2401 
2402 /// Multiplies the corresponding elements of two signed [8 x i16]
2403 ///    vectors, saving the lower 16 bits of each 32-bit product in the
2404 ///    corresponding element of a 128-bit signed [8 x i16] result vector.
2405 ///
2406 /// \headerfile <x86intrin.h>
2407 ///
2408 /// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
2409 ///
2410 /// \param __a
2411 ///    A 128-bit signed [8 x i16] vector.
2412 /// \param __b
2413 ///    A 128-bit signed [8 x i16] vector.
2414 /// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
2415 ///    each of the eight 32-bit products.
2416 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a,
2417                                                              __m128i __b) {
2418   return (__m128i)((__v8hu)__a * (__v8hu)__b);
2419 }
2420 
2421 /// Multiplies 32-bit unsigned integer values contained in the lower bits
2422 ///    of the two 64-bit integer vectors and returns the 64-bit unsigned
2423 ///    product.
2424 ///
2425 /// \headerfile <x86intrin.h>
2426 ///
2427 /// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
2428 ///
2429 /// \param __a
2430 ///    A 64-bit integer containing one of the source operands.
2431 /// \param __b
2432 ///    A 64-bit integer containing one of the source operands.
2433 /// \returns A 64-bit integer vector containing the product of both operands.
2434 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a,
2435                                                             __m64 __b) {
2436   return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
2437 }
2438 
2439 /// Multiplies 32-bit unsigned integer values contained in the lower
2440 ///    bits of the corresponding elements of two [2 x i64] vectors, and returns
2441 ///    the 64-bit products in the corresponding elements of a [2 x i64] vector.
2442 ///
2443 /// \headerfile <x86intrin.h>
2444 ///
2445 /// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
2446 ///
2447 /// \param __a
2448 ///    A [2 x i64] vector containing one of the source operands.
2449 /// \param __b
2450 ///    A [2 x i64] vector containing one of the source operands.
2451 /// \returns A [2 x i64] vector containing the product of both operands.
2452 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a,
2453                                                            __m128i __b) {
2454   return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
2455 }
2456 
2457 /// Computes the absolute differences of corresponding 8-bit integer
2458 ///    values in two 128-bit vectors. Sums the first 8 absolute differences, and
2459 ///    separately sums the second 8 absolute differences. Packs these two
2460 ///    unsigned 16-bit integer sums into the upper and lower elements of a
2461 ///    [2 x i64] vector.
2462 ///
2463 /// \headerfile <x86intrin.h>
2464 ///
2465 /// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
2466 ///
2467 /// \param __a
2468 ///    A 128-bit integer vector containing one of the source operands.
2469 /// \param __b
2470 ///    A 128-bit integer vector containing one of the source operands.
2471 /// \returns A [2 x i64] vector containing the sums of the sets of absolute
2472 ///    differences between both operands.
2473 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a,
2474                                                           __m128i __b) {
2475   return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
2476 }
2477 
2478 /// Subtracts the corresponding 8-bit integer values in the operands.
2479 ///
2480 /// \headerfile <x86intrin.h>
2481 ///
2482 /// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
2483 ///
2484 /// \param __a
2485 ///    A 128-bit integer vector containing the minuends.
2486 /// \param __b
2487 ///    A 128-bit integer vector containing the subtrahends.
2488 /// \returns A 128-bit integer vector containing the differences of the values
2489 ///    in the operands.
2490 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a,
2491                                                           __m128i __b) {
2492   return (__m128i)((__v16qu)__a - (__v16qu)__b);
2493 }
2494 
2495 /// Subtracts the corresponding 16-bit integer values in the operands.
2496 ///
2497 /// \headerfile <x86intrin.h>
2498 ///
2499 /// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
2500 ///
2501 /// \param __a
2502 ///    A 128-bit integer vector containing the minuends.
2503 /// \param __b
2504 ///    A 128-bit integer vector containing the subtrahends.
2505 /// \returns A 128-bit integer vector containing the differences of the values
2506 ///    in the operands.
2507 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a,
2508                                                            __m128i __b) {
2509   return (__m128i)((__v8hu)__a - (__v8hu)__b);
2510 }
2511 
2512 /// Subtracts the corresponding 32-bit integer values in the operands.
2513 ///
2514 /// \headerfile <x86intrin.h>
2515 ///
2516 /// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
2517 ///
2518 /// \param __a
2519 ///    A 128-bit integer vector containing the minuends.
2520 /// \param __b
2521 ///    A 128-bit integer vector containing the subtrahends.
2522 /// \returns A 128-bit integer vector containing the differences of the values
2523 ///    in the operands.
2524 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a,
2525                                                            __m128i __b) {
2526   return (__m128i)((__v4su)__a - (__v4su)__b);
2527 }
2528 
2529 /// Subtracts signed or unsigned 64-bit integer values and writes the
2530 ///    difference to the corresponding bits in the destination.
2531 ///
2532 /// \headerfile <x86intrin.h>
2533 ///
2534 /// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
2535 ///
2536 /// \param __a
2537 ///    A 64-bit integer vector containing the minuend.
2538 /// \param __b
2539 ///    A 64-bit integer vector containing the subtrahend.
2540 /// \returns A 64-bit integer vector containing the difference of the values in
2541 ///    the operands.
2542 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a,
2543                                                             __m64 __b) {
2544   return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
2545 }
2546 
2547 /// Subtracts the corresponding elements of two [2 x i64] vectors.
2548 ///
2549 /// \headerfile <x86intrin.h>
2550 ///
2551 /// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
2552 ///
2553 /// \param __a
2554 ///    A 128-bit integer vector containing the minuends.
2555 /// \param __b
2556 ///    A 128-bit integer vector containing the subtrahends.
2557 /// \returns A 128-bit integer vector containing the differences of the values
2558 ///    in the operands.
2559 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a,
2560                                                            __m128i __b) {
2561   return (__m128i)((__v2du)__a - (__v2du)__b);
2562 }
2563 
2564 /// Subtracts, with saturation, corresponding 8-bit signed integer values in
2565 ///    the input and returns the differences in the corresponding bytes in the
2566 ///    destination.
2567 ///
2568 ///    Differences greater than 0x7F are saturated to 0x7F, and differences
2569 ///    less than 0x80 are saturated to 0x80.
2570 ///
2571 /// \headerfile <x86intrin.h>
2572 ///
2573 /// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
2574 ///
2575 /// \param __a
2576 ///    A 128-bit integer vector containing the minuends.
2577 /// \param __b
2578 ///    A 128-bit integer vector containing the subtrahends.
2579 /// \returns A 128-bit integer vector containing the differences of the values
2580 ///    in the operands.
2581 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a,
2582                                                            __m128i __b) {
2583   return (__m128i)__builtin_elementwise_sub_sat((__v16qs)__a, (__v16qs)__b);
2584 }
2585 
2586 /// Subtracts, with saturation, corresponding 16-bit signed integer values in
2587 ///    the input and returns the differences in the corresponding bytes in the
2588 ///    destination.
2589 ///
2590 ///    Differences greater than 0x7FFF are saturated to 0x7FFF, and values less
2591 ///    than 0x8000 are saturated to 0x8000.
2592 ///
2593 /// \headerfile <x86intrin.h>
2594 ///
2595 /// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
2596 ///
2597 /// \param __a
2598 ///    A 128-bit integer vector containing the minuends.
2599 /// \param __b
2600 ///    A 128-bit integer vector containing the subtrahends.
2601 /// \returns A 128-bit integer vector containing the differences of the values
2602 ///    in the operands.
2603 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a,
2604                                                             __m128i __b) {
2605   return (__m128i)__builtin_elementwise_sub_sat((__v8hi)__a, (__v8hi)__b);
2606 }
2607 
2608 /// Subtracts, with saturation, corresponding 8-bit unsigned integer values in
2609 ///    the input and returns the differences in the corresponding bytes in the
2610 ///    destination.
2611 ///
2612 ///    Differences less than 0x00 are saturated to 0x00.
2613 ///
2614 /// \headerfile <x86intrin.h>
2615 ///
2616 /// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
2617 ///
2618 /// \param __a
2619 ///    A 128-bit integer vector containing the minuends.
2620 /// \param __b
2621 ///    A 128-bit integer vector containing the subtrahends.
2622 /// \returns A 128-bit integer vector containing the unsigned integer
2623 ///    differences of the values in the operands.
2624 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a,
2625                                                            __m128i __b) {
2626   return (__m128i)__builtin_elementwise_sub_sat((__v16qu)__a, (__v16qu)__b);
2627 }
2628 
2629 /// Subtracts, with saturation, corresponding 16-bit unsigned integer values in
2630 ///    the input and returns the differences in the corresponding bytes in the
2631 ///    destination.
2632 ///
2633 ///    Differences less than 0x0000 are saturated to 0x0000.
2634 ///
2635 /// \headerfile <x86intrin.h>
2636 ///
2637 /// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
2638 ///
2639 /// \param __a
2640 ///    A 128-bit integer vector containing the minuends.
2641 /// \param __b
2642 ///    A 128-bit integer vector containing the subtrahends.
2643 /// \returns A 128-bit integer vector containing the unsigned integer
2644 ///    differences of the values in the operands.
2645 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a,
2646                                                             __m128i __b) {
2647   return (__m128i)__builtin_elementwise_sub_sat((__v8hu)__a, (__v8hu)__b);
2648 }
2649 
2650 /// Performs a bitwise AND of two 128-bit integer vectors.
2651 ///
2652 /// \headerfile <x86intrin.h>
2653 ///
2654 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
2655 ///
2656 /// \param __a
2657 ///    A 128-bit integer vector containing one of the source operands.
2658 /// \param __b
2659 ///    A 128-bit integer vector containing one of the source operands.
2660 /// \returns A 128-bit integer vector containing the bitwise AND of the values
2661 ///    in both operands.
2662 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a,
2663                                                            __m128i __b) {
2664   return (__m128i)((__v2du)__a & (__v2du)__b);
2665 }
2666 
2667 /// Performs a bitwise AND of two 128-bit integer vectors, using the
2668 ///    one's complement of the values contained in the first source operand.
2669 ///
2670 /// \headerfile <x86intrin.h>
2671 ///
2672 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
2673 ///
2674 /// \param __a
2675 ///    A 128-bit vector containing the left source operand. The one's complement
2676 ///    of this value is used in the bitwise AND.
2677 /// \param __b
2678 ///    A 128-bit vector containing the right source operand.
2679 /// \returns A 128-bit integer vector containing the bitwise AND of the one's
2680 ///    complement of the first operand and the values in the second operand.
2681 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a,
2682                                                               __m128i __b) {
2683   return (__m128i)(~(__v2du)__a & (__v2du)__b);
2684 }
2685 /// Performs a bitwise OR of two 128-bit integer vectors.
2686 ///
2687 /// \headerfile <x86intrin.h>
2688 ///
2689 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
2690 ///
2691 /// \param __a
2692 ///    A 128-bit integer vector containing one of the source operands.
2693 /// \param __b
2694 ///    A 128-bit integer vector containing one of the source operands.
2695 /// \returns A 128-bit integer vector containing the bitwise OR of the values
2696 ///    in both operands.
2697 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a,
2698                                                           __m128i __b) {
2699   return (__m128i)((__v2du)__a | (__v2du)__b);
2700 }
2701 
2702 /// Performs a bitwise exclusive OR of two 128-bit integer vectors.
2703 ///
2704 /// \headerfile <x86intrin.h>
2705 ///
2706 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
2707 ///
2708 /// \param __a
2709 ///    A 128-bit integer vector containing one of the source operands.
2710 /// \param __b
2711 ///    A 128-bit integer vector containing one of the source operands.
2712 /// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
2713 ///    values in both operands.
2714 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a,
2715                                                            __m128i __b) {
2716   return (__m128i)((__v2du)__a ^ (__v2du)__b);
2717 }
2718 
/// Left-shifts the 128-bit integer vector operand by the specified
///    number of bytes. Low-order bits are cleared.
///
///    This is a macro: \a a is converted to a 128-bit integer vector and
///    \a imm is converted to \c int before being passed to the underlying
///    byte-shift builtin.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_slli_si128(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    An immediate value specifying the number of bytes to left-shift operand
///    \a a.
/// \returns A 128-bit integer vector containing the left-shifted value.
#define _mm_slli_si128(a, imm)                                                 \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a),          \
                                                (int)(imm)))
2739 
/// Left-shifts the 128-bit integer vector operand by the specified
///    number of bytes.
///
///    This macro expands to the same byte-shift builtin call, with the same
///    argument conversions, as \c _mm_slli_si128.
///
/// \code
/// __m128i _mm_bslli_si128(__m128i a, const int imm);
/// \endcode
///
/// \param a
///    A 128-bit integer vector containing the source operand.
/// \param imm
///    A value specifying the number of bytes to left-shift operand \a a.
/// \returns A 128-bit integer vector containing the left-shifted value.
#define _mm_bslli_si128(a, imm)                                                \
  ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a),          \
                                                (int)(imm)))
2743 
2744 /// Left-shifts each 16-bit value in the 128-bit integer vector operand
2745 ///    by the specified number of bits. Low-order bits are cleared.
2746 ///
2747 /// \headerfile <x86intrin.h>
2748 ///
2749 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2750 ///
2751 /// \param __a
2752 ///    A 128-bit integer vector of [8 x i16] containing the source operand.
2753 /// \param __count
2754 ///    An integer value specifying the number of bits to left-shift each value
2755 ///    in operand \a __a. It is passed to the builtin unchanged.
2756 /// \returns A 128-bit integer vector containing the left-shifted values.
2757 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a,
2758                                                             int __count) {
2759   return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
2760 }
2761 
2762 /// Left-shifts each 16-bit value in the 128-bit integer vector operand
2763 ///    by the specified number of bits. Low-order bits are cleared.
2764 ///
2765 /// \headerfile <x86intrin.h>
2766 ///
2767 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2768 ///
2769 /// \param __a
2770 ///    A 128-bit integer vector of [8 x i16] containing the source operand.
2771 /// \param __count
2772 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2773 ///    to left-shift each value in operand \a __a.
2774 /// \returns A 128-bit integer vector containing the left-shifted values.
2775 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a,
2776                                                            __m128i __count) {
  /* Both operands are reinterpreted as [8 x i16] to match the builtin. */
2777   return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
2778 }
2779 
2780 /// Left-shifts each 32-bit value in the 128-bit integer vector operand
2781 ///    by the specified number of bits. Low-order bits are cleared.
2782 ///
2783 /// \headerfile <x86intrin.h>
2784 ///
2785 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2786 ///
2787 /// \param __a
2788 ///    A 128-bit integer vector of [4 x i32] containing the source operand.
2789 /// \param __count
2790 ///    An integer value specifying the number of bits to left-shift each value
2791 ///    in operand \a __a. It is passed to the builtin unchanged.
2792 /// \returns A 128-bit integer vector containing the left-shifted values.
2793 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a,
2794                                                             int __count) {
2795   return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
2796 }
2797 
2798 /// Left-shifts each 32-bit value in the 128-bit integer vector operand
2799 ///    by the specified number of bits. Low-order bits are cleared.
2800 ///
2801 /// \headerfile <x86intrin.h>
2802 ///
2803 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2804 ///
2805 /// \param __a
2806 ///    A 128-bit integer vector of [4 x i32] containing the source operand.
2807 /// \param __count
2808 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2809 ///    to left-shift each value in operand \a __a.
2810 /// \returns A 128-bit integer vector containing the left-shifted values.
2811 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a,
2812                                                            __m128i __count) {
  /* Both operands are reinterpreted as [4 x i32] to match the builtin. */
2813   return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
2814 }
2815 
2816 /// Left-shifts each 64-bit value in the 128-bit integer vector operand
2817 ///    by the specified number of bits. Low-order bits are cleared.
2818 ///
2819 /// \headerfile <x86intrin.h>
2820 ///
2821 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2822 ///
2823 /// \param __a
2824 ///    A 128-bit integer vector of [2 x i64] containing the source operand.
2825 /// \param __count
2826 ///    An integer value specifying the number of bits to left-shift each value
2827 ///    in operand \a __a. It is passed to the builtin unchanged.
2828 /// \returns A 128-bit integer vector containing the left-shifted values.
2829 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a,
2830                                                             int __count) {
  /* NOTE(review): the builtin result is returned without the explicit
     (__m128i) cast that the 16- and 32-bit variants apply; presumably it
     already has a [2 x i64] type that converts implicitly. */
2831   return __builtin_ia32_psllqi128((__v2di)__a, __count);
2832 }
2833 
2834 /// Left-shifts each 64-bit value in the 128-bit integer vector operand
2835 ///    by the specified number of bits. Low-order bits are cleared.
2836 ///
2837 /// \headerfile <x86intrin.h>
2838 ///
2839 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2840 ///
2841 /// \param __a
2842 ///    A 128-bit integer vector of [2 x i64] containing the source operand.
2843 /// \param __count
2844 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2845 ///    to left-shift each value in operand \a __a.
2846 /// \returns A 128-bit integer vector containing the left-shifted values.
2847 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a,
2848                                                            __m128i __count) {
  /* NOTE(review): the builtin result is returned without the explicit
     (__m128i) cast that the 16- and 32-bit variants apply; presumably it
     already has a [2 x i64] type that converts implicitly. */
2849   return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
2850 }
2851 
2852 /// Right-shifts each 16-bit value in the 128-bit integer vector operand
2853 ///    by the specified number of bits. High-order bits are filled with the sign
2854 ///    bit of the initial value.
2855 ///
2856 /// \headerfile <x86intrin.h>
2857 ///
2858 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2859 ///
2860 /// \param __a
2861 ///    A 128-bit integer vector of [8 x i16] containing the source operand.
2862 /// \param __count
2863 ///    An integer value specifying the number of bits to right-shift each value
2864 ///    in operand \a __a. It is passed to the builtin unchanged.
2865 /// \returns A 128-bit integer vector containing the right-shifted values.
2866 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a,
2867                                                             int __count) {
2868   return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
2869 }
2870 
2871 /// Right-shifts each 16-bit value in the 128-bit integer vector operand
2872 ///    by the specified number of bits. High-order bits are filled with the sign
2873 ///    bit of the initial value.
2874 ///
2875 /// \headerfile <x86intrin.h>
2876 ///
2877 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2878 ///
2879 /// \param __a
2880 ///    A 128-bit integer vector of [8 x i16] containing the source operand.
2881 /// \param __count
2882 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2883 ///    to right-shift each value in operand \a __a.
2884 /// \returns A 128-bit integer vector containing the right-shifted values.
2885 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a,
2886                                                            __m128i __count) {
  /* Both operands are reinterpreted as [8 x i16] to match the builtin. */
2887   return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
2888 }
2889 
2890 /// Right-shifts each 32-bit value in the 128-bit integer vector operand
2891 ///    by the specified number of bits. High-order bits are filled with the sign
2892 ///    bit of the initial value.
2893 ///
2894 /// \headerfile <x86intrin.h>
2895 ///
2896 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2897 ///
2898 /// \param __a
2899 ///    A 128-bit integer vector of [4 x i32] containing the source operand.
2900 /// \param __count
2901 ///    An integer value specifying the number of bits to right-shift each value
2902 ///    in operand \a __a. It is passed to the builtin unchanged.
2903 /// \returns A 128-bit integer vector containing the right-shifted values.
2904 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a,
2905                                                             int __count) {
2906   return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
2907 }
2908 
2909 /// Right-shifts each 32-bit value in the 128-bit integer vector operand
2910 ///    by the specified number of bits. High-order bits are filled with the sign
2911 ///    bit of the initial value.
2912 ///
2913 /// \headerfile <x86intrin.h>
2914 ///
2915 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2916 ///
2917 /// \param __a
2918 ///    A 128-bit integer vector of [4 x i32] containing the source operand.
2919 /// \param __count
2920 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2921 ///    to right-shift each value in operand \a __a.
2922 /// \returns A 128-bit integer vector containing the right-shifted values.
2923 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a,
2924                                                            __m128i __count) {
  /* Both operands are reinterpreted as [4 x i32] to match the builtin. */
2925   return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
2926 }
2927 
2928 /// Right-shifts the 128-bit integer vector operand by the specified
2929 ///    number of bytes. High-order bits are cleared.
2930 ///
2931 /// \headerfile <x86intrin.h>
2932 ///
2933 /// \code
2934 /// __m128i _mm_srli_si128(__m128i a, const int imm);
2935 /// \endcode
2936 ///
2937 /// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
2938 ///
2939 /// \param a
2940 ///    The source operand; the macro casts it to __m128i and then [2 x i64].
2941 /// \param imm
2942 ///    An immediate value (cast to int) giving the number of bytes to
2943 ///    right-shift operand \a a.
2944 /// \returns A 128-bit integer vector containing the right-shifted value.
2945 #define _mm_srli_si128(a, imm)                                                 \
2946   ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a),          \
2947                                                 (int)(imm)))
2948 
/// Byte-wise right shift of a 128-bit integer vector. The expansion is the
///    same as that of _mm_srli_si128: \a a (cast to __m128i, then [2 x i64])
///    and \a imm (cast to int) are forwarded to the byte-shift builtin.
2949 #define _mm_bsrli_si128(a, imm)                                                \
2950   ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a),          \
2951                                                 (int)(imm)))
2952 
2953 /// Right-shifts each of the 16-bit values in the 128-bit integer vector
2954 ///    operand by the specified number of bits. High-order bits are cleared.
2955 ///
2956 /// \headerfile <x86intrin.h>
2957 ///
2958 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2959 ///
2960 /// \param __a
2961 ///    A 128-bit integer vector of [8 x i16] containing the source operand.
2962 /// \param __count
2963 ///    An integer value specifying the number of bits to right-shift each value
2964 ///    in operand \a __a. It is passed to the builtin unchanged.
2965 /// \returns A 128-bit integer vector containing the right-shifted values.
2966 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a,
2967                                                             int __count) {
2968   return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
2969 }
2970 
2971 /// Right-shifts each of the 16-bit values in the 128-bit integer vector
2972 ///    operand by the specified number of bits. High-order bits are cleared.
2973 ///
2974 /// \headerfile <x86intrin.h>
2975 ///
2976 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2977 ///
2978 /// \param __a
2979 ///    A 128-bit integer vector of [8 x i16] containing the source operand.
2980 /// \param __count
2981 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
2982 ///    to right-shift each value in operand \a __a.
2983 /// \returns A 128-bit integer vector containing the right-shifted values.
2984 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a,
2985                                                            __m128i __count) {
  /* Both operands are reinterpreted as [8 x i16] to match the builtin. */
2986   return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
2987 }
2988 
2989 /// Right-shifts each of the 32-bit values in the 128-bit integer vector
2990 ///    operand by the specified number of bits. High-order bits are cleared.
2991 ///
2992 /// \headerfile <x86intrin.h>
2993 ///
2994 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
2995 ///
2996 /// \param __a
2997 ///    A 128-bit integer vector of [4 x i32] containing the source operand.
2998 /// \param __count
2999 ///    An integer value specifying the number of bits to right-shift each value
3000 ///    in operand \a __a. It is passed to the builtin unchanged.
3001 /// \returns A 128-bit integer vector containing the right-shifted values.
3002 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a,
3003                                                             int __count) {
3004   return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
3005 }
3006 
3007 /// Right-shifts each of the 32-bit values in the 128-bit integer vector
3008 ///    operand by the specified number of bits. High-order bits are cleared.
3009 ///
3010 /// \headerfile <x86intrin.h>
3011 ///
3012 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
3013 ///
3014 /// \param __a
3015 ///    A 128-bit integer vector of [4 x i32] containing the source operand.
3016 /// \param __count
3017 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
3018 ///    to right-shift each value in operand \a __a.
3019 /// \returns A 128-bit integer vector containing the right-shifted values.
3020 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a,
3021                                                            __m128i __count) {
  /* Both operands are reinterpreted as [4 x i32] to match the builtin. */
3022   return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
3023 }
3024 
3025 /// Right-shifts each of the 64-bit values in the 128-bit integer vector
3026 ///    operand by the specified number of bits. High-order bits are cleared.
3027 ///
3028 /// \headerfile <x86intrin.h>
3029 ///
3030 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
3031 ///
3032 /// \param __a
3033 ///    A 128-bit integer vector of [2 x i64] containing the source operand.
3034 /// \param __count
3035 ///    An integer value specifying the number of bits to right-shift each value
3036 ///    in operand \a __a. It is passed to the builtin unchanged.
3037 /// \returns A 128-bit integer vector containing the right-shifted values.
3038 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a,
3039                                                             int __count) {
  /* NOTE(review): the builtin result is returned without the explicit
     (__m128i) cast that the 16- and 32-bit variants apply; presumably it
     already has a [2 x i64] type that converts implicitly. */
3040   return __builtin_ia32_psrlqi128((__v2di)__a, __count);
3041 }
3042 
3043 /// Right-shifts each of the 64-bit values in the 128-bit integer vector
3044 ///    operand by the specified number of bits. High-order bits are cleared.
3045 ///
3046 /// \headerfile <x86intrin.h>
3047 ///
3048 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
3049 ///
3050 /// \param __a
3051 ///    A 128-bit integer vector of [2 x i64] containing the source operand.
3052 /// \param __count
3053 ///    A 128-bit integer vector in which bits [63:0] specify the number of bits
3054 ///    to right-shift each value in operand \a __a.
3055 /// \returns A 128-bit integer vector containing the right-shifted values.
3056 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a,
3057                                                            __m128i __count) {
  /* NOTE(review): the builtin result is returned without the explicit
     (__m128i) cast that the 16- and 32-bit variants apply; presumably it
     already has a [2 x i64] type that converts implicitly. */
3058   return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
3059 }
3060 
3061 /// Compares each of the corresponding 8-bit values of the 128-bit
3062 ///    integer vectors for equality.
3063 ///
3064 ///    Each comparison returns 0x0 for false, 0xFF for true.
3065 ///
3066 /// \headerfile <x86intrin.h>
3067 ///
3068 /// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
3069 ///
3070 /// \param __a
3071 ///    A 128-bit integer vector, compared as [16 x i8].
3072 /// \param __b
3073 ///    A 128-bit integer vector, compared as [16 x i8].
3074 /// \returns A 128-bit integer vector containing the comparison results.
3075 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a,
3076                                                             __m128i __b) {
  /* Written with the element-wise vector == operator, not a target builtin. */
3077   return (__m128i)((__v16qi)__a == (__v16qi)__b);
3078 }
3079 
3080 /// Compares each of the corresponding 16-bit values of the 128-bit
3081 ///    integer vectors for equality.
3082 ///
3083 ///    Each comparison returns 0x0 for false, 0xFFFF for true.
3084 ///
3085 /// \headerfile <x86intrin.h>
3086 ///
3087 /// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
3088 ///
3089 /// \param __a
3090 ///    A 128-bit integer vector, compared as [8 x i16].
3091 /// \param __b
3092 ///    A 128-bit integer vector, compared as [8 x i16].
3093 /// \returns A 128-bit integer vector containing the comparison results.
3094 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a,
3095                                                              __m128i __b) {
  /* Written with the element-wise vector == operator, not a target builtin. */
3096   return (__m128i)((__v8hi)__a == (__v8hi)__b);
3097 }
3098 
3099 /// Compares each of the corresponding 32-bit values of the 128-bit
3100 ///    integer vectors for equality.
3101 ///
3102 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3103 ///
3104 /// \headerfile <x86intrin.h>
3105 ///
3106 /// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
3107 ///
3108 /// \param __a
3109 ///    A 128-bit integer vector, compared as [4 x i32].
3110 /// \param __b
3111 ///    A 128-bit integer vector, compared as [4 x i32].
3112 /// \returns A 128-bit integer vector containing the comparison results.
3113 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a,
3114                                                              __m128i __b) {
  /* Written with the element-wise vector == operator, not a target builtin. */
3115   return (__m128i)((__v4si)__a == (__v4si)__b);
3116 }
3117 
3118 /// Compares each of the corresponding signed 8-bit values of the 128-bit
3119 ///    integer vectors to determine if the values in the first operand are
3120 ///    greater than those in the second operand.
3121 ///
3122 ///    Each comparison returns 0x0 for false, 0xFF for true.
3123 ///
3124 /// \headerfile <x86intrin.h>
3125 ///
3126 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3127 ///
3128 /// \param __a
3129 ///    A 128-bit integer vector, compared as 16 signed 8-bit elements.
3130 /// \param __b
3131 ///    A 128-bit integer vector, compared as 16 signed 8-bit elements.
3132 /// \returns A 128-bit integer vector containing the comparison results.
3133 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a,
3134                                                             __m128i __b) {
3135   /* This function always performs a signed comparison, but __v16qi is a char
3136      which may be signed or unsigned, so use __v16qs. */
3137   return (__m128i)((__v16qs)__a > (__v16qs)__b);
3138 }
3139 
3140 /// Compares each of the corresponding signed 16-bit values of the
3141 ///    128-bit integer vectors to determine if the values in the first operand
3142 ///    are greater than those in the second operand.
3143 ///
3144 ///    Each comparison returns 0x0 for false, 0xFFFF for true.
3145 ///
3146 /// \headerfile <x86intrin.h>
3147 ///
3148 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3149 ///
3150 /// \param __a
3151 ///    A 128-bit integer vector, compared as [8 x i16].
3152 /// \param __b
3153 ///    A 128-bit integer vector, compared as [8 x i16].
3154 /// \returns A 128-bit integer vector containing the comparison results.
3155 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a,
3156                                                              __m128i __b) {
  /* __v8hi has short elements, so the vector > operator compares as signed. */
3157   return (__m128i)((__v8hi)__a > (__v8hi)__b);
3158 }
3159 
3160 /// Compares each of the corresponding signed 32-bit values of the
3161 ///    128-bit integer vectors to determine if the values in the first operand
3162 ///    are greater than those in the second operand.
3163 ///
3164 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3165 ///
3166 /// \headerfile <x86intrin.h>
3167 ///
3168 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3169 ///
3170 /// \param __a
3171 ///    A 128-bit integer vector, compared as [4 x i32].
3172 /// \param __b
3173 ///    A 128-bit integer vector, compared as [4 x i32].
3174 /// \returns A 128-bit integer vector containing the comparison results.
3175 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a,
3176                                                              __m128i __b) {
  /* Element-wise vector > on __v4si operands (presumably signed int elements;
     the typedef comes from an included header). */
3177   return (__m128i)((__v4si)__a > (__v4si)__b);
3178 }
3179 
3180 /// Compares each of the corresponding signed 8-bit values of the 128-bit
3181 ///    integer vectors to determine if the values in the first operand are less
3182 ///    than those in the second operand.
3183 ///
3184 ///    Each comparison returns 0x0 for false, 0xFF for true.
3185 ///
3186 /// \headerfile <x86intrin.h>
3187 ///
3188 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3189 ///
3190 /// \param __a
3191 ///    A 128-bit integer vector.
3192 /// \param __b
3193 ///    A 128-bit integer vector.
3194 /// \returns A 128-bit integer vector containing the comparison results.
3195 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a,
3196                                                             __m128i __b) {
  /* a < b is evaluated as b > a: the operands are swapped and the
     greater-than intrinsic is reused. */
3197   return _mm_cmpgt_epi8(__b, __a);
3198 }
3199 
3200 /// Compares each of the corresponding signed 16-bit values of the
3201 ///    128-bit integer vectors to determine if the values in the first operand
3202 ///    are less than those in the second operand.
3203 ///
3204 ///    Each comparison returns 0x0 for false, 0xFFFF for true.
3205 ///
3206 /// \headerfile <x86intrin.h>
3207 ///
3208 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3209 ///
3210 /// \param __a
3211 ///    A 128-bit integer vector.
3212 /// \param __b
3213 ///    A 128-bit integer vector.
3214 /// \returns A 128-bit integer vector containing the comparison results.
3215 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a,
3216                                                              __m128i __b) {
  /* a < b is evaluated as b > a: the operands are swapped and the
     greater-than intrinsic is reused. */
3217   return _mm_cmpgt_epi16(__b, __a);
3218 }
3219 
3220 /// Compares each of the corresponding signed 32-bit values of the
3221 ///    128-bit integer vectors to determine if the values in the first operand
3222 ///    are less than those in the second operand.
3223 ///
3224 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3225 ///
3226 /// \headerfile <x86intrin.h>
3227 ///
3228 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3229 ///
3230 /// \param __a
3231 ///    A 128-bit integer vector.
3232 /// \param __b
3233 ///    A 128-bit integer vector.
3234 /// \returns A 128-bit integer vector containing the comparison results.
3235 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a,
3236                                                              __m128i __b) {
  /* a < b is evaluated as b > a: the operands are swapped and the
     greater-than intrinsic is reused. */
3237   return _mm_cmpgt_epi32(__b, __a);
3238 }
3239 
3240 #ifdef __x86_64__
3241 /// Converts a 64-bit signed integer value from the second operand into a
3242 ///    double-precision value and returns it in the lower element of a [2 x
3243 ///    double] vector; the upper element of the returned vector is copied from
3244 ///    the upper element of the first operand.
3245 ///
3246 /// \headerfile <x86intrin.h>
3247 ///
3248 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
3249 ///
3250 /// \param __a
3251 ///    A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
3252 ///    copied to the upper 64 bits of the destination.
3253 /// \param __b
3254 ///    A 64-bit signed integer operand containing the value to be converted.
3255 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
3256 ///    converted value of the second operand. The upper 64 bits are copied from
3257 ///    the upper 64 bits of the first operand.
3258 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi64_sd(__m128d __a,
3259                                                             long long __b) {
  /* Assigning to element 0 converts __b to double implicitly; element 1 of the
     by-value copy of __a is left untouched. */
3260   __a[0] = __b;
3261   return __a;
3262 }
3263 
3264 /// Converts the first (lower) element of a vector of [2 x double] into a
3265 ///    64-bit signed integer value.
3266 ///
3267 ///    If the converted value does not fit in a 64-bit integer, raises a
3268 ///    floating-point invalid exception. If the exception is masked, returns
3269 ///    the most negative integer.
3270 ///
///    NOTE(review): unlike _mm_cvttsd_si64, no truncation is documented here;
///    the rounding presumably follows the current MXCSR rounding mode. Confirm
///    against the CVTSD2SI instruction reference.
///
3271 /// \headerfile <x86intrin.h>
3272 ///
3273 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
3274 ///
3275 /// \param __a
3276 ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3277 ///    conversion.
3278 /// \returns A 64-bit signed integer containing the converted value.
3279 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsd_si64(__m128d __a) {
3280   return __builtin_ia32_cvtsd2si64((__v2df)__a);
3281 }
3282 
3283 /// Converts the first (lower) element of a vector of [2 x double] into a
3284 ///    64-bit signed truncated (rounded toward zero) integer value.
3285 ///
3286 ///    If the converted value does not fit in a 64-bit integer, raises a
3287 ///    floating-point invalid exception. If the exception is masked, returns
3288 ///    the most negative integer.
3289 ///
3290 /// \headerfile <x86intrin.h>
3291 ///
3292 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
3293 ///   instruction.
3294 ///
3295 /// \param __a
3296 ///    A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3297 ///    conversion.
3298 /// \returns A 64-bit signed integer containing the truncated value.
3299 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvttsd_si64(__m128d __a) {
3300   return __builtin_ia32_cvttsd2si64((__v2df)__a);
3301 }
3302 #endif
3303 
3304 /// Converts a vector of [4 x i32] into a vector of [4 x float].
3305 ///
3306 /// \headerfile <x86intrin.h>
3307 ///
3308 /// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
3309 ///
3310 /// \param __a
3311 ///    A 128-bit integer vector, interpreted as [4 x i32].
3312 /// \returns A 128-bit vector of [4 x float] containing the converted values.
3313 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a) {
  /* Element-wise conversion via __builtin_convertvector, not an ia32 builtin. */
3314   return (__m128) __builtin_convertvector((__v4si)__a, __v4sf);
3315 }
3316 
3317 /// Converts a vector of [4 x float] into a vector of [4 x i32].
3318 ///
3319 ///    If a converted value does not fit in a 32-bit integer, raises a
3320 ///    floating-point invalid exception. If the exception is masked, returns
3321 ///    the most negative integer.
3322 ///
///    NOTE(review): unlike _mm_cvttps_epi32, no truncation is documented here;
///    the rounding presumably follows the current MXCSR rounding mode. Confirm
///    against the CVTPS2DQ instruction reference.
///
3323 /// \headerfile <x86intrin.h>
3324 ///
3325 /// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
3326 ///
3327 /// \param __a
3328 ///    A 128-bit vector of [4 x float].
3329 /// \returns A 128-bit integer vector of [4 x i32] containing the converted
3330 ///    values.
3331 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a) {
3332   return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
3333 }
3334 
3335 /// Converts a vector of [4 x float] into four signed truncated (rounded toward
3336 ///    zero) 32-bit integers, returned in a vector of [4 x i32].
3337 ///
3338 ///    If a converted value does not fit in a 32-bit integer, raises a
3339 ///    floating-point invalid exception. If the exception is masked, returns
3340 ///    the most negative integer.
3341 ///
3342 /// \headerfile <x86intrin.h>
3343 ///
3344 /// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
3345 ///   instruction.
3346 ///
3347 /// \param __a
3348 ///    A 128-bit vector of [4 x float].
3349 /// \returns A 128-bit vector of [4 x i32] containing the truncated values.
3350 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a) {
3351   return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
3352 }
3353 
3354 /// Returns a vector of [4 x i32] where the lowest element is the input
3355 ///    operand and the remaining elements are zero.
3356 ///
3357 /// \headerfile <x86intrin.h>
3358 ///
3359 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3360 ///
3361 /// \param __a
3362 ///    A 32-bit signed integer operand, placed in element 0 of the result.
3363 /// \returns A 128-bit vector of [4 x i32].
3364 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a) {
  /* Built as a [4 x i32] vector literal, then reinterpreted as __m128i. */
3365   return __extension__(__m128i)(__v4si){__a, 0, 0, 0};
3366 }
3367 
3368 /// Returns a vector of [2 x i64] where the lower element is the input
3369 ///    operand and the upper element is zero.
3370 ///
3371 /// \headerfile <x86intrin.h>
3372 ///
3373 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction
3374 /// in 64-bit mode.
3375 ///
3376 /// \param __a
3377 ///    A 64-bit signed integer operand, placed in element 0 of the result.
3378 /// \returns A 128-bit vector of [2 x i64] containing the converted value.
3379 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a) {
  /* Built as a [2 x i64] vector literal, then cast to __m128i. */
3380   return __extension__(__m128i)(__v2di){__a, 0};
3381 }
3382 
3383 /// Moves the least significant 32 bits of a vector of [4 x i32] to a
3384 ///    32-bit signed integer value.
3385 ///
3386 /// \headerfile <x86intrin.h>
3387 ///
3388 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3389 ///
3390 /// \param __a
3391 ///    A vector of [4 x i32]. The least significant 32 bits are moved to the
3392 ///    destination.
3393 /// \returns A 32-bit signed integer containing the moved value.
3394 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a) {
  /* __m128i has 64-bit elements, so reinterpret as [4 x i32] first; element 0
     is then the low 32 bits. */
3395   __v4si __b = (__v4si)__a;
3396   return __b[0];
3397 }
3398 
3399 /// Moves the least significant 64 bits of a vector of [2 x i64] to a
3400 ///    64-bit signed integer value.
3401 ///
3402 /// \headerfile <x86intrin.h>
3403 ///
3404 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3405 ///
3406 /// \param __a
3407 ///    A vector of [2 x i64]. The least significant 64 bits are moved to the
3408 ///    destination.
3409 /// \returns A 64-bit signed integer containing the moved value.
3410 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a) {
  /* __m128i is itself a vector of long long, so element 0 can be read
     directly without a cast. */
3411   return __a[0];
3412 }
3413 
3414 /// Moves packed integer values from an aligned 128-bit memory location
3415 ///    to elements in a 128-bit integer vector.
3416 ///
3417 /// \headerfile <x86intrin.h>
3418 ///
3419 /// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
3420 ///
3421 /// \param __p
3422 ///    An aligned pointer to a memory location containing integer values.
3423 /// \returns A 128-bit integer vector containing the moved values.
3424 static __inline__ __m128i __DEFAULT_FN_ATTRS
3425 _mm_load_si128(__m128i const *__p) {
  /* Plain dereference: the 16-byte alignment declared on the __m128i type is
     what makes this an aligned load. */
3426   return *__p;
3427 }
3428 
3429 /// Moves packed integer values from an unaligned 128-bit memory location
3430 ///    to elements in a 128-bit integer vector.
3431 ///
3432 /// \headerfile <x86intrin.h>
3433 ///
3434 /// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
3435 ///
3436 /// \param __p
3437 ///    A pointer to a memory location containing integer values. No particular
///    alignment is required.
3438 /// \returns A 128-bit integer vector containing the moved values.
3439 static __inline__ __m128i __DEFAULT_FN_ATTRS
3440 _mm_loadu_si128(__m128i_u const *__p) {
  /* Read through a packed, may_alias wrapper struct so the access carries no
     alignment requirement and no type-based aliasing assumption. */
3441   struct __loadu_si128 {
3442     __m128i_u __v;
3443   } __attribute__((__packed__, __may_alias__));
3444   return ((const struct __loadu_si128 *)__p)->__v;
3445 }
3446 
3447 /// Returns a vector of [2 x i64] where the lower element is loaded from
3448 ///    the memory location \a __p points to, and the upper element is zero.
3449 ///
3450 /// \headerfile <x86intrin.h>
3451 ///
3452 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3453 ///
3454 /// \param __p
3455 ///    A pointer to memory from which one 64-bit integer is read. That value
3456 ///    is written to bits [63:0] of the destination.
3457 /// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
3458 ///    moved value. The higher order bits are cleared.
3459 static __inline__ __m128i __DEFAULT_FN_ATTRS
3460 _mm_loadl_epi64(__m128i_u const *__p) {
  /* Only a single long long is read, through a packed, may_alias wrapper
     struct, so the access carries no alignment requirement and no type-based
     aliasing assumption. */
3461   struct __mm_loadl_epi64_struct {
3462     long long __u;
3463   } __attribute__((__packed__, __may_alias__));
3464   return __extension__(__m128i){
3465       ((const struct __mm_loadl_epi64_struct *)__p)->__u, 0};
3466 }
3467 
3468 /// Generates a 128-bit integer vector with unspecified content.
3469 ///    This could be used as an argument to another intrinsic function where the
3470 ///    argument is required but the value is not actually used.
3471 ///
3472 /// \headerfile <x86intrin.h>
3473 ///
3474 /// This intrinsic has no corresponding instruction.
3475 ///
3476 /// \returns A 128-bit integer vector with unspecified content.
3477 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void) {
3478   return (__m128i)__builtin_ia32_undef128();
3479 }
3480 
3481 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3482 ///    the specified 64-bit integer values.
3483 ///
3484 /// \headerfile <x86intrin.h>
3485 ///
3486 /// This intrinsic is a utility function and does not correspond to a specific
3487 ///    instruction.
3488 ///
3489 /// \param __q1
3490 ///    A 64-bit integer value used to initialize the upper 64 bits of the
3491 ///    destination vector of [2 x i64].
3492 /// \param __q0
3493 ///    A 64-bit integer value used to initialize the lower 64 bits of the
3494 ///    destination vector of [2 x i64].
3495 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
3496 ///    provided in the operands.
3497 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1,
3498                                                             long long __q0) {
  /* Parameters are given high-to-low, but the vector literal lists elements
     from index 0 upward, so __q0 comes first. */
3499   return __extension__(__m128i)(__v2di){__q0, __q1};
3500 }
3501 
3502 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3503 ///    the specified 64-bit integer values.
3504 ///
3505 /// \headerfile <x86intrin.h>
3506 ///
3507 /// This intrinsic is a utility function and does not correspond to a specific
3508 ///    instruction.
3509 ///
3510 /// \param __q1
3511 ///    A 64-bit integer value used to initialize the upper 64 bits of the
3512 ///    destination vector of [2 x i64].
3513 /// \param __q0
3514 ///    A 64-bit integer value used to initialize the lower 64 bits of the
3515 ///    destination vector of [2 x i64].
3516 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
3517 ///    provided in the operands.
3518 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1,
3519                                                            __m64 __q0) {
3520   return _mm_set_epi64x((long long)__q1, (long long)__q0);
3521 }
3522 
3523 /// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
3524 ///    the specified 32-bit integer values.
3525 ///
3526 /// \headerfile <x86intrin.h>
3527 ///
3528 /// This intrinsic is a utility function and does not correspond to a specific
3529 ///    instruction.
3530 ///
3531 /// \param __i3
3532 ///    A 32-bit integer value used to initialize bits [127:96] of the
3533 ///    destination vector.
3534 /// \param __i2
3535 ///    A 32-bit integer value used to initialize bits [95:64] of the destination
3536 ///    vector.
3537 /// \param __i1
3538 ///    A 32-bit integer value used to initialize bits [63:32] of the destination
3539 ///    vector.
3540 /// \param __i0
3541 ///    A 32-bit integer value used to initialize bits [31:0] of the destination
3542 ///    vector.
3543 /// \returns An initialized 128-bit vector of [4 x i32] containing the values
3544 ///    provided in the operands.
3545 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2,
3546                                                            int __i1, int __i0) {
3547   return __extension__(__m128i)(__v4si){__i0, __i1, __i2, __i3};
3548 }
3549 
3550 /// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
3551 ///    the specified 16-bit integer values.
3552 ///
3553 /// \headerfile <x86intrin.h>
3554 ///
3555 /// This intrinsic is a utility function and does not correspond to a specific
3556 ///    instruction.
3557 ///
3558 /// \param __w7
3559 ///    A 16-bit integer value used to initialize bits [127:112] of the
3560 ///    destination vector.
3561 /// \param __w6
3562 ///    A 16-bit integer value used to initialize bits [111:96] of the
3563 ///    destination vector.
3564 /// \param __w5
3565 ///    A 16-bit integer value used to initialize bits [95:80] of the destination
3566 ///    vector.
3567 /// \param __w4
3568 ///    A 16-bit integer value used to initialize bits [79:64] of the destination
3569 ///    vector.
3570 /// \param __w3
3571 ///    A 16-bit integer value used to initialize bits [63:48] of the destination
3572 ///    vector.
3573 /// \param __w2
3574 ///    A 16-bit integer value used to initialize bits [47:32] of the destination
3575 ///    vector.
3576 /// \param __w1
3577 ///    A 16-bit integer value used to initialize bits [31:16] of the destination
3578 ///    vector.
3579 /// \param __w0
3580 ///    A 16-bit integer value used to initialize bits [15:0] of the destination
3581 ///    vector.
3582 /// \returns An initialized 128-bit vector of [8 x i16] containing the values
3583 ///    provided in the operands.
3584 static __inline__ __m128i __DEFAULT_FN_ATTRS
3585 _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3,
3586               short __w2, short __w1, short __w0) {
3587   return __extension__(__m128i)(__v8hi){__w0, __w1, __w2, __w3,
3588                                         __w4, __w5, __w6, __w7};
3589 }
3590 
3591 /// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
3592 ///    the specified 8-bit integer values.
3593 ///
3594 /// \headerfile <x86intrin.h>
3595 ///
3596 /// This intrinsic is a utility function and does not correspond to a specific
3597 ///    instruction.
3598 ///
3599 /// \param __b15
3600 ///    Initializes bits [127:120] of the destination vector.
3601 /// \param __b14
3602 ///    Initializes bits [119:112] of the destination vector.
3603 /// \param __b13
3604 ///    Initializes bits [111:104] of the destination vector.
3605 /// \param __b12
3606 ///    Initializes bits [103:96] of the destination vector.
3607 /// \param __b11
3608 ///    Initializes bits [95:88] of the destination vector.
3609 /// \param __b10
3610 ///    Initializes bits [87:80] of the destination vector.
3611 /// \param __b9
3612 ///    Initializes bits [79:72] of the destination vector.
3613 /// \param __b8
3614 ///    Initializes bits [71:64] of the destination vector.
3615 /// \param __b7
3616 ///    Initializes bits [63:56] of the destination vector.
3617 /// \param __b6
3618 ///    Initializes bits [55:48] of the destination vector.
3619 /// \param __b5
3620 ///    Initializes bits [47:40] of the destination vector.
3621 /// \param __b4
3622 ///    Initializes bits [39:32] of the destination vector.
3623 /// \param __b3
3624 ///    Initializes bits [31:24] of the destination vector.
3625 /// \param __b2
3626 ///    Initializes bits [23:16] of the destination vector.
3627 /// \param __b1
3628 ///    Initializes bits [15:8] of the destination vector.
3629 /// \param __b0
3630 ///    Initializes bits [7:0] of the destination vector.
3631 /// \returns An initialized 128-bit vector of [16 x i8] containing the values
3632 ///    provided in the operands.
3633 static __inline__ __m128i __DEFAULT_FN_ATTRS
3634 _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11,
3635              char __b10, char __b9, char __b8, char __b7, char __b6, char __b5,
3636              char __b4, char __b3, char __b2, char __b1, char __b0) {
3637   return __extension__(__m128i)(__v16qi){
3638       __b0, __b1, __b2,  __b3,  __b4,  __b5,  __b6,  __b7,
3639       __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15};
3640 }
3641 
3642 /// Initializes both values in a 128-bit integer vector with the
3643 ///    specified 64-bit integer value.
3644 ///
3645 /// \headerfile <x86intrin.h>
3646 ///
3647 /// This intrinsic is a utility function and does not correspond to a specific
3648 ///    instruction.
3649 ///
3650 /// \param __q
3651 ///    Integer value used to initialize the elements of the destination integer
3652 ///    vector.
3653 /// \returns An initialized 128-bit integer vector of [2 x i64] with both
3654 ///    elements containing the value provided in the operand.
3655 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q) {
3656   return _mm_set_epi64x(__q, __q);
3657 }
3658 
3659 /// Initializes both values in a 128-bit vector of [2 x i64] with the
3660 ///    specified 64-bit value.
3661 ///
3662 /// \headerfile <x86intrin.h>
3663 ///
3664 /// This intrinsic is a utility function and does not correspond to a specific
3665 ///    instruction.
3666 ///
3667 /// \param __q
3668 ///    A 64-bit value used to initialize the elements of the destination integer
3669 ///    vector.
3670 /// \returns An initialized 128-bit vector of [2 x i64] with all elements
3671 ///    containing the value provided in the operand.
3672 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q) {
3673   return _mm_set_epi64(__q, __q);
3674 }
3675 
3676 /// Initializes all values in a 128-bit vector of [4 x i32] with the
3677 ///    specified 32-bit value.
3678 ///
3679 /// \headerfile <x86intrin.h>
3680 ///
3681 /// This intrinsic is a utility function and does not correspond to a specific
3682 ///    instruction.
3683 ///
3684 /// \param __i
3685 ///    A 32-bit value used to initialize the elements of the destination integer
3686 ///    vector.
3687 /// \returns An initialized 128-bit vector of [4 x i32] with all elements
3688 ///    containing the value provided in the operand.
3689 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i) {
3690   return _mm_set_epi32(__i, __i, __i, __i);
3691 }
3692 
3693 /// Initializes all values in a 128-bit vector of [8 x i16] with the
3694 ///    specified 16-bit value.
3695 ///
3696 /// \headerfile <x86intrin.h>
3697 ///
3698 /// This intrinsic is a utility function and does not correspond to a specific
3699 ///    instruction.
3700 ///
3701 /// \param __w
3702 ///    A 16-bit value used to initialize the elements of the destination integer
3703 ///    vector.
3704 /// \returns An initialized 128-bit vector of [8 x i16] with all elements
3705 ///    containing the value provided in the operand.
3706 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w) {
3707   return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w);
3708 }
3709 
3710 /// Initializes all values in a 128-bit vector of [16 x i8] with the
3711 ///    specified 8-bit value.
3712 ///
3713 /// \headerfile <x86intrin.h>
3714 ///
3715 /// This intrinsic is a utility function and does not correspond to a specific
3716 ///    instruction.
3717 ///
3718 /// \param __b
3719 ///    An 8-bit value used to initialize the elements of the destination integer
3720 ///    vector.
3721 /// \returns An initialized 128-bit vector of [16 x i8] with all elements
3722 ///    containing the value provided in the operand.
3723 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b) {
3724   return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
3725                       __b, __b, __b, __b, __b);
3726 }
3727 
3728 /// Constructs a 128-bit integer vector, initialized in reverse order
3729 ///     with the specified 64-bit integral values.
3730 ///
3731 /// \headerfile <x86intrin.h>
3732 ///
3733 /// This intrinsic does not correspond to a specific instruction.
3734 ///
3735 /// \param __q0
3736 ///    A 64-bit integral value used to initialize the lower 64 bits of the
3737 ///    result.
3738 /// \param __q1
3739 ///    A 64-bit integral value used to initialize the upper 64 bits of the
3740 ///    result.
3741 /// \returns An initialized 128-bit integer vector.
3742 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0,
3743                                                             __m64 __q1) {
3744   return _mm_set_epi64(__q1, __q0);
3745 }
3746 
3747 /// Constructs a 128-bit integer vector, initialized in reverse order
3748 ///     with the specified 32-bit integral values.
3749 ///
3750 /// \headerfile <x86intrin.h>
3751 ///
3752 /// This intrinsic is a utility function and does not correspond to a specific
3753 ///    instruction.
3754 ///
3755 /// \param __i0
3756 ///    A 32-bit integral value used to initialize bits [31:0] of the result.
3757 /// \param __i1
3758 ///    A 32-bit integral value used to initialize bits [63:32] of the result.
3759 /// \param __i2
3760 ///    A 32-bit integral value used to initialize bits [95:64] of the result.
3761 /// \param __i3
3762 ///    A 32-bit integral value used to initialize bits [127:96] of the result.
3763 /// \returns An initialized 128-bit integer vector.
3764 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1,
3765                                                             int __i2,
3766                                                             int __i3) {
3767   return _mm_set_epi32(__i3, __i2, __i1, __i0);
3768 }
3769 
3770 /// Constructs a 128-bit integer vector, initialized in reverse order
3771 ///     with the specified 16-bit integral values.
3772 ///
3773 /// \headerfile <x86intrin.h>
3774 ///
3775 /// This intrinsic is a utility function and does not correspond to a specific
3776 ///    instruction.
3777 ///
3778 /// \param __w0
3779 ///    A 16-bit integral value used to initialize bits [15:0] of the result.
3780 /// \param __w1
3781 ///    A 16-bit integral value used to initialize bits [31:16] of the result.
3782 /// \param __w2
3783 ///    A 16-bit integral value used to initialize bits [47:32] of the result.
3784 /// \param __w3
3785 ///    A 16-bit integral value used to initialize bits [63:48] of the result.
3786 /// \param __w4
3787 ///    A 16-bit integral value used to initialize bits [79:64] of the result.
3788 /// \param __w5
3789 ///    A 16-bit integral value used to initialize bits [95:80] of the result.
3790 /// \param __w6
3791 ///    A 16-bit integral value used to initialize bits [111:96] of the result.
3792 /// \param __w7
3793 ///    A 16-bit integral value used to initialize bits [127:112] of the result.
3794 /// \returns An initialized 128-bit integer vector.
3795 static __inline__ __m128i __DEFAULT_FN_ATTRS
3796 _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4,
3797                short __w5, short __w6, short __w7) {
3798   return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0);
3799 }
3800 
3801 /// Constructs a 128-bit integer vector, initialized in reverse order
3802 ///     with the specified 8-bit integral values.
3803 ///
3804 /// \headerfile <x86intrin.h>
3805 ///
3806 /// This intrinsic is a utility function and does not correspond to a specific
3807 ///    instruction.
3808 ///
3809 /// \param __b0
3810 ///    An 8-bit integral value used to initialize bits [7:0] of the result.
3811 /// \param __b1
3812 ///    An 8-bit integral value used to initialize bits [15:8] of the result.
3813 /// \param __b2
3814 ///    An 8-bit integral value used to initialize bits [23:16] of the result.
3815 /// \param __b3
3816 ///    An 8-bit integral value used to initialize bits [31:24] of the result.
3817 /// \param __b4
3818 ///    An 8-bit integral value used to initialize bits [39:32] of the result.
3819 /// \param __b5
3820 ///    An 8-bit integral value used to initialize bits [47:40] of the result.
3821 /// \param __b6
3822 ///    An 8-bit integral value used to initialize bits [55:48] of the result.
3823 /// \param __b7
3824 ///    An 8-bit integral value used to initialize bits [63:56] of the result.
3825 /// \param __b8
3826 ///    An 8-bit integral value used to initialize bits [71:64] of the result.
3827 /// \param __b9
3828 ///    An 8-bit integral value used to initialize bits [79:72] of the result.
3829 /// \param __b10
3830 ///    An 8-bit integral value used to initialize bits [87:80] of the result.
3831 /// \param __b11
3832 ///    An 8-bit integral value used to initialize bits [95:88] of the result.
3833 /// \param __b12
3834 ///    An 8-bit integral value used to initialize bits [103:96] of the result.
3835 /// \param __b13
3836 ///    An 8-bit integral value used to initialize bits [111:104] of the result.
3837 /// \param __b14
3838 ///    An 8-bit integral value used to initialize bits [119:112] of the result.
3839 /// \param __b15
3840 ///    An 8-bit integral value used to initialize bits [127:120] of the result.
3841 /// \returns An initialized 128-bit integer vector.
3842 static __inline__ __m128i __DEFAULT_FN_ATTRS
3843 _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
3844               char __b6, char __b7, char __b8, char __b9, char __b10,
3845               char __b11, char __b12, char __b13, char __b14, char __b15) {
3846   return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8,
3847                       __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
3848 }
3849 
3850 /// Creates a 128-bit integer vector initialized to zero.
3851 ///
3852 /// \headerfile <x86intrin.h>
3853 ///
3854 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
3855 ///
3856 /// \returns An initialized 128-bit integer vector with all elements set to
3857 ///    zero.
3858 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void) {
3859   return __extension__(__m128i)(__v2di){0LL, 0LL};
3860 }
3861 
3862 /// Stores a 128-bit integer vector to a memory location aligned on a
3863 ///    128-bit boundary.
3864 ///
3865 /// \headerfile <x86intrin.h>
3866 ///
3867 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
3868 ///
3869 /// \param __p
3870 ///    A pointer to an aligned memory location that will receive the integer
3871 ///    values.
3872 /// \param __b
3873 ///    A 128-bit integer vector containing the values to be moved.
3874 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p,
3875                                                           __m128i __b) {
3876   *__p = __b;
3877 }
3878 
3879 /// Stores a 128-bit integer vector to an unaligned memory location.
3880 ///
3881 /// \headerfile <x86intrin.h>
3882 ///
3883 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
3884 ///
3885 /// \param __p
3886 ///    A pointer to a memory location that will receive the integer values.
3887 /// \param __b
3888 ///    A 128-bit integer vector containing the values to be moved.
3889 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p,
3890                                                            __m128i __b) {
3891   struct __storeu_si128 {
3892     __m128i_u __v;
3893   } __attribute__((__packed__, __may_alias__));
3894   ((struct __storeu_si128 *)__p)->__v = __b;
3895 }
3896 
3897 /// Stores a 64-bit integer value from the low element of a 128-bit integer
3898 ///    vector.
3899 ///
3900 /// \headerfile <x86intrin.h>
3901 ///
3902 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3903 ///
3904 /// \param __p
3905 ///    A pointer to a 64-bit memory location. The address of the memory
3906 ///    location does not have to be aligned.
3907 /// \param __b
3908 ///    A 128-bit integer vector containing the value to be stored.
3909 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si64(void *__p,
3910                                                           __m128i __b) {
3911   struct __storeu_si64 {
3912     long long __v;
3913   } __attribute__((__packed__, __may_alias__));
3914   ((struct __storeu_si64 *)__p)->__v = ((__v2di)__b)[0];
3915 }
3916 
3917 /// Stores a 32-bit integer value from the low element of a 128-bit integer
3918 ///    vector.
3919 ///
3920 /// \headerfile <x86intrin.h>
3921 ///
3922 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3923 ///
3924 /// \param __p
3925 ///    A pointer to a 32-bit memory location. The address of the memory
3926 ///    location does not have to be aligned.
3927 /// \param __b
3928 ///    A 128-bit integer vector containing the value to be stored.
3929 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si32(void *__p,
3930                                                           __m128i __b) {
3931   struct __storeu_si32 {
3932     int __v;
3933   } __attribute__((__packed__, __may_alias__));
3934   ((struct __storeu_si32 *)__p)->__v = ((__v4si)__b)[0];
3935 }
3936 
3937 /// Stores a 16-bit integer value from the low element of a 128-bit integer
3938 ///    vector.
3939 ///
3940 /// \headerfile <x86intrin.h>
3941 ///
3942 /// This intrinsic does not correspond to a specific instruction.
3943 ///
3944 /// \param __p
3945 ///    A pointer to a 16-bit memory location. The address of the memory
3946 ///    location does not have to be aligned.
3947 /// \param __b
3948 ///    A 128-bit integer vector containing the value to be stored.
3949 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si16(void *__p,
3950                                                           __m128i __b) {
3951   struct __storeu_si16 {
3952     short __v;
3953   } __attribute__((__packed__, __may_alias__));
3954   ((struct __storeu_si16 *)__p)->__v = ((__v8hi)__b)[0];
3955 }
3956 
3957 /// Moves bytes selected by the mask from the first operand to the
3958 ///    specified unaligned memory location. When a mask bit is 1, the
3959 ///    corresponding byte is written, otherwise it is not written.
3960 ///
3961 ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
3962 ///    used again soon). Exception and trap behavior for elements not selected
3963 ///    for storage to memory are implementation dependent.
3964 ///
3965 /// \headerfile <x86intrin.h>
3966 ///
3967 /// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
3968 ///   instruction.
3969 ///
3970 /// \param __d
3971 ///    A 128-bit integer vector containing the values to be moved.
3972 /// \param __n
3973 ///    A 128-bit integer vector containing the mask. The most significant bit of
3974 ///    each byte represents the mask bits.
3975 /// \param __p
3976 ///    A pointer to an unaligned 128-bit memory location where the specified
3977 ///    values are moved.
3978 static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d,
3979                                                               __m128i __n,
3980                                                               char *__p) {
3981   __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
3982 }
3983 
3984 /// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
3985 ///    a memory location.
3986 ///
3987 /// \headerfile <x86intrin.h>
3988 ///
3989 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
3990 ///
3991 /// \param __p
3992 ///    A pointer to a 64-bit memory location that will receive the lower 64 bits
3993 ///    of the integer vector parameter.
3994 /// \param __a
3995 ///    A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
3996 ///    value to be stored.
3997 static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p,
3998                                                            __m128i __a) {
3999   struct __mm_storel_epi64_struct {
4000     long long __u;
4001   } __attribute__((__packed__, __may_alias__));
4002   ((struct __mm_storel_epi64_struct *)__p)->__u = __a[0];
4003 }
4004 
4005 /// Stores a 128-bit floating point vector of [2 x double] to a 128-bit
4006 ///    aligned memory location.
4007 ///
4008 ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
4009 ///    used again soon).
4010 ///
4011 /// \headerfile <x86intrin.h>
4012 ///
4013 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
4014 ///
4015 /// \param __p
4016 ///    A pointer to the 128-bit aligned memory location used to store the value.
4017 /// \param __a
4018 ///    A vector of [2 x double] containing the 64-bit values to be stored.
4019 static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(void *__p,
4020                                                         __m128d __a) {
4021   __builtin_nontemporal_store((__v2df)__a, (__v2df *)__p);
4022 }
4023 
4024 /// Stores a 128-bit integer vector to a 128-bit aligned memory location.
4025 ///
4026 ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
4027 ///    used again soon).
4028 ///
4029 /// \headerfile <x86intrin.h>
4030 ///
4031 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
4032 ///
4033 /// \param __p
4034 ///    A pointer to the 128-bit aligned memory location used to store the value.
4035 /// \param __a
4036 ///    A 128-bit integer vector containing the values to be stored.
4037 static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(void *__p,
4038                                                            __m128i __a) {
4039   __builtin_nontemporal_store((__v2di)__a, (__v2di *)__p);
4040 }
4041 
4042 /// Stores a 32-bit integer value in the specified memory location.
4043 ///
4044 ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
4045 ///    used again soon).
4046 ///
4047 /// \headerfile <x86intrin.h>
4048 ///
4049 /// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
4050 ///
4051 /// \param __p
4052 ///    A pointer to the 32-bit memory location used to store the value.
4053 /// \param __a
4054 ///    A 32-bit integer containing the value to be stored.
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
    _mm_stream_si32(void *__p, int __a) {
  /* Non-temporal 32-bit store; the untyped pointer is given its element
   * type before being passed to the builtin. */
  int *__dst = (int *)__p;
  __builtin_ia32_movnti(__dst, __a);
}
4060 
4061 #ifdef __x86_64__
4062 /// Stores a 64-bit integer value in the specified memory location.
4063 ///
4064 ///    To minimize caching, the data is flagged as non-temporal (unlikely to be
4065 ///    used again soon).
4066 ///
4067 /// \headerfile <x86intrin.h>
4068 ///
4069 /// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
4070 ///
4071 /// \param __p
4072 ///    A pointer to the 64-bit memory location used to store the value.
4073 /// \param __a
4074 ///    A 64-bit integer containing the value to be stored.
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
    _mm_stream_si64(void *__p, long long __a) {
  /* Non-temporal 64-bit store; the untyped pointer is given its element
   * type before being passed to the builtin. */
  long long *__dst = (long long *)__p;
  __builtin_ia32_movnti64(__dst, __a);
}
4080 #endif
4081 
#ifdef __cplusplus
extern "C" {
#endif

/// Flushes and invalidates, from every cache in the coherency domain, the
///    cache line that contains \a __p.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
///
/// \param __p
///    A pointer to the memory location that identifies the cache line to
///    flush.
void _mm_clflush(const void *__p);

/// Enforces strong memory ordering (serialization) among loads: every load
///    instruction that precedes this instruction is completed before any
///    load instruction that follows it is executed.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
///
void _mm_lfence(void);

/// Enforces strong memory ordering (serialization) among loads and stores:
///    every memory access that precedes this instruction is completed before
///    any memory access that follows it is executed.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
///
void _mm_mfence(void);

#ifdef __cplusplus
} /* extern "C" */
#endif
4123 
4124 /// Converts, with saturation, 16-bit signed integers from both 128-bit integer
4125 ///    vector operands into 8-bit signed integers, and packs the results into
4126 ///    the destination.
4127 ///
4128 ///    Positive values greater than 0x7F are saturated to 0x7F. Negative values
4129 ///    less than 0x80 are saturated to 0x80.
4130 ///
4131 /// \headerfile <x86intrin.h>
4132 ///
4133 /// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
4134 ///
4135 /// \param __a
4136 ///   A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4137 ///   written to the lower 64 bits of the result.
4138 /// \param __b
4139 ///   A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4140 ///   written to the higher 64 bits of the result.
4141 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
4142 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a,
4143                                                              __m128i __b) {
4144   return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
4145 }
4146 
4147 /// Converts, with saturation, 32-bit signed integers from both 128-bit integer
4148 ///    vector operands into 16-bit signed integers, and packs the results into
4149 ///    the destination.
4150 ///
4151 ///    Positive values greater than 0x7FFF are saturated to 0x7FFF. Negative
4152 ///    values less than 0x8000 are saturated to 0x8000.
4153 ///
4154 /// \headerfile <x86intrin.h>
4155 ///
4156 /// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
4157 ///
4158 /// \param __a
4159 ///    A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values
4160 ///    are written to the lower 64 bits of the result.
4161 /// \param __b
4162 ///    A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values
4163 ///    are written to the higher 64 bits of the result.
4164 /// \returns A 128-bit vector of [8 x i16] containing the converted values.
4165 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a,
4166                                                              __m128i __b) {
4167   return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
4168 }
4169 
4170 /// Converts, with saturation, 16-bit signed integers from both 128-bit integer
4171 ///    vector operands into 8-bit unsigned integers, and packs the results into
4172 ///    the destination.
4173 ///
4174 ///    Values greater than 0xFF are saturated to 0xFF. Values less than 0x00
4175 ///    are saturated to 0x00.
4176 ///
4177 /// \headerfile <x86intrin.h>
4178 ///
4179 /// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
4180 ///
4181 /// \param __a
4182 ///    A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4183 ///    written to the lower 64 bits of the result.
4184 /// \param __b
4185 ///    A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4186 ///    written to the higher 64 bits of the result.
4187 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
4188 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a,
4189                                                               __m128i __b) {
4190   return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
4191 }
4192 
4193 /// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
4194 ///    the immediate-value parameter as a selector.
4195 ///
4196 /// \headerfile <x86intrin.h>
4197 ///
4198 /// \code
/// int _mm_extract_epi16(__m128i a, const int imm);
4200 /// \endcode
4201 ///
4202 /// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
4203 ///
4204 /// \param a
4205 ///    A 128-bit integer vector.
4206 /// \param imm
4207 ///    An immediate value. Bits [2:0] selects values from \a a to be assigned
4208 ///    to bits[15:0] of the result. \n
4209 ///    000: assign values from bits [15:0] of \a a. \n
4210 ///    001: assign values from bits [31:16] of \a a. \n
4211 ///    010: assign values from bits [47:32] of \a a. \n
4212 ///    011: assign values from bits [63:48] of \a a. \n
4213 ///    100: assign values from bits [79:64] of \a a. \n
4214 ///    101: assign values from bits [95:80] of \a a. \n
4215 ///    110: assign values from bits [111:96] of \a a. \n
4216 ///    111: assign values from bits [127:112] of \a a.
4217 /// \returns An integer, whose lower 16 bits are selected from the 128-bit
4218 ///    integer vector parameter and the remaining bits are assigned zeros.
/* The intermediate cast to unsigned short makes the widening to int
 * zero-extend the selected 16-bit element rather than sign-extend it. */
#define _mm_extract_epi16(a, imm)                                              \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi(                           \
      (__v8hi)(__m128i)(a), (int)(imm)))
4222 
/// Builds a 128-bit integer vector of [8 x i16] by copying the vector
///    parameter and then overwriting one 16-bit element of the copy with the
///    lower 16 bits of an integer parameter. The element to overwrite is
///    selected by the immediate-value parameter.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi16(__m128i a, int b, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. This vector is copied to the
///    result, after which one of the eight elements of the result is replaced
///    by the lower 16 bits of \a b.
/// \param b
///    An integer. Its lower 16 bits are written to the element of the result
///    selected by \a imm.
/// \param imm
///    An immediate value holding the index (0 through 7) of the 16-bit element
///    of the result that receives the lower 16 bits of \a b. Index 0 denotes
///    bits [15:0] of the result and index 7 denotes bits [127:112]. Note that
///    this is an element index, not a bit offset.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi16(a, b, imm)                                            \
  ((__m128i)__builtin_ia32_vec_set_v8hi(                                       \
      (__v8hi)(__m128i)(a), (int)(b), (int)(imm)))
4250 
4251 /// Copies the values of the most significant bits from each 8-bit
4252 ///    element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
4253 ///    value, zero-extends the value, and writes it to the destination.
4254 ///
4255 /// \headerfile <x86intrin.h>
4256 ///
4257 /// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
4258 ///
4259 /// \param __a
4260 ///    A 128-bit integer vector containing the values with bits to be extracted.
4261 /// \returns The most significant bits from each 8-bit element in \a __a,
4262 ///    written to bits [15:0]. The other bits are assigned zeros.
4263 static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a) {
4264   return __builtin_ia32_pmovmskb128((__v16qi)__a);
4265 }
4266 
/// Builds a 128-bit integer vector by rearranging the four 32-bit elements
///    of a 128-bit integer vector parameter as directed by the
///    immediate-value parameter.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
///
/// \param a
///    A 128-bit integer vector supplying the values to be copied.
/// \param imm
///    An immediate value holding an 8-bit selector that picks the element of
///    \a a copied to each 32-bit position of the 128-bit result: \n
///    Bits [1:0] select the source for bits [31:0] of the result. \n
///    Bits [3:2] select the source for bits [63:32] of the result. \n
///    Bits [5:4] select the source for bits [95:64] of the result. \n
///    Bits [7:6] select the source for bits [127:96] of the result. \n
///    Each 2-bit field is interpreted as follows: \n
///    00: copy bits [31:0] of \a a. \n
///    01: copy bits [63:32] of \a a. \n
///    10: copy bits [95:64] of \a a. \n
///    11: copy bits [127:96] of \a a. \n
///    Note: The \c _MM_SHUFFLE macro can be used to build the selector.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> yields an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shuffle_epi32(a, imm)                                              \
  ((__m128i)__builtin_ia32_pshufd(                                             \
      (__v4si)(__m128i)(a), (int)(imm)))
4300 
/// Builds a 128-bit integer vector by rearranging the four lower 16-bit
///    elements of a 128-bit integer vector of [8 x i16] as directed by the
///    immediate-value parameter.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [127:64] pass through
///    unchanged to bits [127:64] of the result.
/// \param imm
///    An 8-bit immediate value that picks the element of \a a copied to each
///    of the four lower 16-bit positions of the result: \n
///    Bits[1:0] select the source for bits [15:0] of the result. \n
///    Bits[3:2] select the source for bits [31:16] of the result. \n
///    Bits[5:4] select the source for bits [47:32] of the result. \n
///    Bits[7:6] select the source for bits [63:48] of the result. \n
///    Each 2-bit field is interpreted as follows: \n
///    00: copy bits [15:0] of \a a. \n
///    01: copy bits [31:16] of \a a. \n
///    10: copy bits [47:32] of \a a. \n
///    11: copy bits [63:48] of \a a. \n
///    Note: The \c _MM_SHUFFLE macro can be used to build the selector.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> yields an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflelo_epi16(a, imm)                                            \
  ((__m128i)__builtin_ia32_pshuflw(                                            \
      (__v8hi)(__m128i)(a), (int)(imm)))
4333 
/// Builds a 128-bit integer vector by rearranging the four upper 16-bit
///    elements of a 128-bit integer vector of [8 x i16] as directed by the
///    immediate-value parameter.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [63:0] pass through
///    unchanged to bits [63:0] of the result.
/// \param imm
///    An 8-bit immediate value that picks the element of \a a copied to each
///    of the four upper 16-bit positions of the result: \n
///    Bits[1:0] select the source for bits [79:64] of the result. \n
///    Bits[3:2] select the source for bits [95:80] of the result. \n
///    Bits[5:4] select the source for bits [111:96] of the result. \n
///    Bits[7:6] select the source for bits [127:112] of the result. \n
///    Each 2-bit field is interpreted as follows: \n
///    00: copy bits [79:64] of \a a. \n
///    01: copy bits [95:80] of \a a. \n
///    10: copy bits [111:96] of \a a. \n
///    11: copy bits [127:112] of \a a. \n
///    Note: The \c _MM_SHUFFLE macro can be used to build the selector.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> yields an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflehi_epi16(a, imm)                                            \
  ((__m128i)__builtin_ia32_pshufhw(                                            \
      (__v8hi)(__m128i)(a), (int)(imm)))
4366 
4367 /// Unpacks the high-order (index 8-15) values from two 128-bit vectors
4368 ///    of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4369 ///
4370 /// \headerfile <x86intrin.h>
4371 ///
4372 /// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
4373 ///   instruction.
4374 ///
4375 /// \param __a
4376 ///    A 128-bit vector of [16 x i8].
4377 ///    Bits [71:64] are written to bits [7:0] of the result. \n
4378 ///    Bits [79:72] are written to bits [23:16] of the result. \n
4379 ///    Bits [87:80] are written to bits [39:32] of the result. \n
4380 ///    Bits [95:88] are written to bits [55:48] of the result. \n
4381 ///    Bits [103:96] are written to bits [71:64] of the result. \n
4382 ///    Bits [111:104] are written to bits [87:80] of the result. \n
4383 ///    Bits [119:112] are written to bits [103:96] of the result. \n
4384 ///    Bits [127:120] are written to bits [119:112] of the result.
4385 /// \param __b
4386 ///    A 128-bit vector of [16 x i8]. \n
4387 ///    Bits [71:64] are written to bits [15:8] of the result. \n
4388 ///    Bits [79:72] are written to bits [31:24] of the result. \n
4389 ///    Bits [87:80] are written to bits [47:40] of the result. \n
4390 ///    Bits [95:88] are written to bits [63:56] of the result. \n
4391 ///    Bits [103:96] are written to bits [79:72] of the result. \n
4392 ///    Bits [111:104] are written to bits [95:88] of the result. \n
4393 ///    Bits [119:112] are written to bits [111:104] of the result. \n
4394 ///    Bits [127:120] are written to bits [127:120] of the result.
4395 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4396 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a,
4397                                                                __m128i __b) {
4398   return (__m128i)__builtin_shufflevector(
4399       (__v16qi)__a, (__v16qi)__b, 8, 16 + 8, 9, 16 + 9, 10, 16 + 10, 11,
4400       16 + 11, 12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15);
4401 }
4402 
4403 /// Unpacks the high-order (index 4-7) values from two 128-bit vectors of
4404 ///    [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
4405 ///
4406 /// \headerfile <x86intrin.h>
4407 ///
4408 /// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
4409 ///   instruction.
4410 ///
4411 /// \param __a
4412 ///    A 128-bit vector of [8 x i16].
4413 ///    Bits [79:64] are written to bits [15:0] of the result. \n
4414 ///    Bits [95:80] are written to bits [47:32] of the result. \n
4415 ///    Bits [111:96] are written to bits [79:64] of the result. \n
4416 ///    Bits [127:112] are written to bits [111:96] of the result.
4417 /// \param __b
4418 ///    A 128-bit vector of [8 x i16].
4419 ///    Bits [79:64] are written to bits [31:16] of the result. \n
4420 ///    Bits [95:80] are written to bits [63:48] of the result. \n
4421 ///    Bits [111:96] are written to bits [95:80] of the result. \n
4422 ///    Bits [127:112] are written to bits [127:112] of the result.
4423 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4424 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a,
4425                                                                 __m128i __b) {
4426   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8 + 4, 5,
4427                                           8 + 5, 6, 8 + 6, 7, 8 + 7);
4428 }
4429 
4430 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
4431 ///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4432 ///
4433 /// \headerfile <x86intrin.h>
4434 ///
4435 /// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
4436 ///   instruction.
4437 ///
4438 /// \param __a
4439 ///    A 128-bit vector of [4 x i32]. \n
4440 ///    Bits [95:64] are written to bits [31:0] of the destination. \n
4441 ///    Bits [127:96] are written to bits [95:64] of the destination.
4442 /// \param __b
4443 ///    A 128-bit vector of [4 x i32]. \n
4444 ///    Bits [95:64] are written to bits [64:32] of the destination. \n
4445 ///    Bits [127:96] are written to bits [127:96] of the destination.
4446 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4447 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a,
4448                                                                 __m128i __b) {
4449   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4 + 2, 3,
4450                                           4 + 3);
4451 }
4452 
4453 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4454 ///    [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4455 ///
4456 /// \headerfile <x86intrin.h>
4457 ///
4458 /// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
4459 ///   instruction.
4460 ///
4461 /// \param __a
4462 ///    A 128-bit vector of [2 x i64]. \n
4463 ///    Bits [127:64] are written to bits [63:0] of the destination.
4464 /// \param __b
4465 ///    A 128-bit vector of [2 x i64]. \n
4466 ///    Bits [127:64] are written to bits [127:64] of the destination.
4467 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4468 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a,
4469                                                                 __m128i __b) {
4470   return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2 + 1);
4471 }
4472 
4473 /// Unpacks the low-order (index 0-7) values from two 128-bit vectors of
4474 ///    [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4475 ///
4476 /// \headerfile <x86intrin.h>
4477 ///
4478 /// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
4479 ///   instruction.
4480 ///
4481 /// \param __a
4482 ///    A 128-bit vector of [16 x i8]. \n
4483 ///    Bits [7:0] are written to bits [7:0] of the result. \n
4484 ///    Bits [15:8] are written to bits [23:16] of the result. \n
4485 ///    Bits [23:16] are written to bits [39:32] of the result. \n
4486 ///    Bits [31:24] are written to bits [55:48] of the result. \n
4487 ///    Bits [39:32] are written to bits [71:64] of the result. \n
4488 ///    Bits [47:40] are written to bits [87:80] of the result. \n
4489 ///    Bits [55:48] are written to bits [103:96] of the result. \n
4490 ///    Bits [63:56] are written to bits [119:112] of the result.
4491 /// \param __b
4492 ///    A 128-bit vector of [16 x i8].
4493 ///    Bits [7:0] are written to bits [15:8] of the result. \n
4494 ///    Bits [15:8] are written to bits [31:24] of the result. \n
4495 ///    Bits [23:16] are written to bits [47:40] of the result. \n
4496 ///    Bits [31:24] are written to bits [63:56] of the result. \n
4497 ///    Bits [39:32] are written to bits [79:72] of the result. \n
4498 ///    Bits [47:40] are written to bits [95:88] of the result. \n
4499 ///    Bits [55:48] are written to bits [111:104] of the result. \n
4500 ///    Bits [63:56] are written to bits [127:120] of the result.
4501 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4502 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a,
4503                                                                __m128i __b) {
4504   return (__m128i)__builtin_shufflevector(
4505       (__v16qi)__a, (__v16qi)__b, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 4,
4506       16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7);
4507 }
4508 
4509 /// Unpacks the low-order (index 0-3) values from each of the two 128-bit
4510 ///    vectors of [8 x i16] and interleaves them into a 128-bit vector of
4511 ///    [8 x i16].
4512 ///
4513 /// \headerfile <x86intrin.h>
4514 ///
4515 /// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
4516 ///   instruction.
4517 ///
4518 /// \param __a
4519 ///    A 128-bit vector of [8 x i16].
4520 ///    Bits [15:0] are written to bits [15:0] of the result. \n
4521 ///    Bits [31:16] are written to bits [47:32] of the result. \n
4522 ///    Bits [47:32] are written to bits [79:64] of the result. \n
4523 ///    Bits [63:48] are written to bits [111:96] of the result.
4524 /// \param __b
4525 ///    A 128-bit vector of [8 x i16].
4526 ///    Bits [15:0] are written to bits [31:16] of the result. \n
4527 ///    Bits [31:16] are written to bits [63:48] of the result. \n
4528 ///    Bits [47:32] are written to bits [95:80] of the result. \n
4529 ///    Bits [63:48] are written to bits [127:112] of the result.
4530 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4531 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a,
4532                                                                 __m128i __b) {
4533   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8 + 0, 1,
4534                                           8 + 1, 2, 8 + 2, 3, 8 + 3);
4535 }
4536 
4537 /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
4538 ///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4539 ///
4540 /// \headerfile <x86intrin.h>
4541 ///
4542 /// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
4543 ///   instruction.
4544 ///
4545 /// \param __a
4546 ///    A 128-bit vector of [4 x i32]. \n
4547 ///    Bits [31:0] are written to bits [31:0] of the destination. \n
4548 ///    Bits [63:32] are written to bits [95:64] of the destination.
4549 /// \param __b
4550 ///    A 128-bit vector of [4 x i32]. \n
4551 ///    Bits [31:0] are written to bits [64:32] of the destination. \n
4552 ///    Bits [63:32] are written to bits [127:96] of the destination.
4553 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4554 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a,
4555                                                                 __m128i __b) {
4556   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4 + 0, 1,
4557                                           4 + 1);
4558 }
4559 
4560 /// Unpacks the low-order 64-bit elements from two 128-bit vectors of
4561 ///    [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4562 ///
4563 /// \headerfile <x86intrin.h>
4564 ///
4565 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
4566 ///   instruction.
4567 ///
4568 /// \param __a
4569 ///    A 128-bit vector of [2 x i64]. \n
4570 ///    Bits [63:0] are written to bits [63:0] of the destination. \n
4571 /// \param __b
4572 ///    A 128-bit vector of [2 x i64]. \n
4573 ///    Bits [63:0] are written to bits [127:64] of the destination. \n
4574 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4575 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a,
4576                                                                 __m128i __b) {
4577   return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2 + 0);
4578 }
4579 
4580 /// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
4581 ///    integer.
4582 ///
4583 /// \headerfile <x86intrin.h>
4584 ///
4585 /// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
4586 ///
4587 /// \param __a
4588 ///    A 128-bit integer vector operand. The lower 64 bits are moved to the
4589 ///    destination.
4590 /// \returns A 64-bit integer containing the lower 64 bits of the parameter.
4591 static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a) {
4592   return (__m64)__a[0];
4593 }
4594 
4595 /// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
4596 ///    upper bits.
4597 ///
4598 /// \headerfile <x86intrin.h>
4599 ///
4600 /// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
4601 ///
4602 /// \param __a
4603 ///    A 64-bit value.
4604 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4605 ///    the operand. The upper 64 bits are assigned zeros.
4606 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a) {
4607   return __extension__(__m128i)(__v2di){(long long)__a, 0};
4608 }
4609 
4610 /// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
4611 ///    integer vector, zeroing the upper bits.
4612 ///
4613 /// \headerfile <x86intrin.h>
4614 ///
4615 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
4616 ///
4617 /// \param __a
4618 ///    A 128-bit integer vector operand. The lower 64 bits are moved to the
4619 ///    destination.
4620 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4621 ///    the operand. The upper 64 bits are assigned zeros.
4622 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a) {
4623   return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2);
4624 }
4625 
4626 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4627 ///    [2 x double] and interleaves them into a 128-bit vector of [2 x
4628 ///    double].
4629 ///
4630 /// \headerfile <x86intrin.h>
4631 ///
4632 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
4633 ///
4634 /// \param __a
4635 ///    A 128-bit vector of [2 x double]. \n
4636 ///    Bits [127:64] are written to bits [63:0] of the destination.
4637 /// \param __b
4638 ///    A 128-bit vector of [2 x double]. \n
4639 ///    Bits [127:64] are written to bits [127:64] of the destination.
4640 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4641 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a,
4642                                                              __m128d __b) {
4643   return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2 + 1);
4644 }
4645 
4646 /// Unpacks the low-order 64-bit elements from two 128-bit vectors
4647 ///    of [2 x double] and interleaves them into a 128-bit vector of [2 x
4648 ///    double].
4649 ///
4650 /// \headerfile <x86intrin.h>
4651 ///
4652 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
4653 ///
4654 /// \param __a
4655 ///    A 128-bit vector of [2 x double]. \n
4656 ///    Bits [63:0] are written to bits [63:0] of the destination.
4657 /// \param __b
4658 ///    A 128-bit vector of [2 x double]. \n
4659 ///    Bits [63:0] are written to bits [127:64] of the destination.
4660 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4661 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a,
4662                                                              __m128d __b) {
4663   return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2 + 0);
4664 }
4665 
4666 /// Extracts the sign bits of the double-precision values in the 128-bit
4667 ///    vector of [2 x double], zero-extends the value, and writes it to the
4668 ///    low-order bits of the destination.
4669 ///
4670 /// \headerfile <x86intrin.h>
4671 ///
4672 /// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
4673 ///
4674 /// \param __a
4675 ///    A 128-bit vector of [2 x double] containing the values with sign bits to
4676 ///    be extracted.
4677 /// \returns The sign bits from each of the double-precision elements in \a __a,
4678 ///    written to bits [1:0]. The remaining bits are assigned values of zero.
4679 static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a) {
4680   return __builtin_ia32_movmskpd((__v2df)__a);
4681 }
4682 
/// Builds a 128-bit floating-point vector of [2 x double] from elements of
///    two 128-bit vector parameters of [2 x double], selected by the
///    immediate-value parameter.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param i
///    An 8-bit immediate value. Its two least significant bits choose the
///    elements copied from \a a and \a b: \n
///    Bit[0] = 0: the lower element of \a a becomes the lower element of the
///    result. \n
///    Bit[0] = 1: the upper element of \a a becomes the lower element of the
///    result. \n
///    Bit[1] = 0: the lower element of \a b becomes the upper element of the
///    result. \n
///    Bit[1] = 1: the upper element of \a b becomes the upper element of the
///    result. \n
///    Note: The \c _MM_SHUFFLE2 macro can be used to build the selector.
///    <c>_MM_SHUFFLE2(b1, b0)</c> yields a 2-bit mask of the form
///    <c>[b1, b0]</c>.
/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
#define _mm_shuffle_pd(a, b, i)                                                \
  ((__m128d)__builtin_ia32_shufpd(                                             \
      (__v2df)(__m128d)(a), (__v2df)(__m128d)(b), (int)(i)))
4713 
4714 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4715 ///    floating-point vector of [4 x float].
4716 ///
4717 /// \headerfile <x86intrin.h>
4718 ///
4719 /// This intrinsic has no corresponding instruction.
4720 ///
4721 /// \param __a
4722 ///    A 128-bit floating-point vector of [2 x double].
4723 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
4724 ///    bitwise pattern as the parameter.
4725 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a) {
4726   return (__m128)__a;
4727 }
4728 
4729 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4730 ///    integer vector.
4731 ///
4732 /// \headerfile <x86intrin.h>
4733 ///
4734 /// This intrinsic has no corresponding instruction.
4735 ///
4736 /// \param __a
4737 ///    A 128-bit floating-point vector of [2 x double].
4738 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
4739 ///    parameter.
4740 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a) {
4741   return (__m128i)__a;
4742 }
4743 
4744 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4745 ///    floating-point vector of [2 x double].
4746 ///
4747 /// \headerfile <x86intrin.h>
4748 ///
4749 /// This intrinsic has no corresponding instruction.
4750 ///
4751 /// \param __a
4752 ///    A 128-bit floating-point vector of [4 x float].
4753 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
4754 ///    bitwise pattern as the parameter.
4755 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a) {
4756   return (__m128d)__a;
4757 }
4758 
4759 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4760 ///    integer vector.
4761 ///
4762 /// \headerfile <x86intrin.h>
4763 ///
4764 /// This intrinsic has no corresponding instruction.
4765 ///
4766 /// \param __a
4767 ///    A 128-bit floating-point vector of [4 x float].
4768 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
4769 ///    parameter.
4770 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a) {
4771   return (__m128i)__a;
4772 }
4773 
4774 /// Casts a 128-bit integer vector into a 128-bit floating-point vector
4775 ///    of [4 x float].
4776 ///
4777 /// \headerfile <x86intrin.h>
4778 ///
4779 /// This intrinsic has no corresponding instruction.
4780 ///
4781 /// \param __a
4782 ///    A 128-bit integer vector.
4783 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
4784 ///    bitwise pattern as the parameter.
4785 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a) {
4786   return (__m128)__a;
4787 }
4788 
4789 /// Casts a 128-bit integer vector into a 128-bit floating-point vector
4790 ///    of [2 x double].
4791 ///
4792 /// \headerfile <x86intrin.h>
4793 ///
4794 /// This intrinsic has no corresponding instruction.
4795 ///
4796 /// \param __a
4797 ///    A 128-bit integer vector.
4798 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
4799 ///    bitwise pattern as the parameter.
4800 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a) {
4801   return (__m128d)__a;
4802 }
4803 
/// Performs an element-wise comparison of the corresponding
///    double-precision values of two 128-bit vectors of [2 x double]. The
///    comparison operation is given by the immediate integer operand.
///
///    Every comparison yields 0x0 when false and 0xFFFFFFFFFFFFFFFF when true.
///    When either value in a comparison is NaN, ordered comparisons yield
///    false and unordered comparisons yield true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> (V)CMPPD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand whose bits [4:0] choose the comparison
///    operation: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_pd(a, b, c)                                                    \
  ((__m128d)__builtin_ia32_cmppd(                                              \
      (__v2df)(__m128d)(a), (__v2df)(__m128d)(b), (c)))
4839 
/// Compares the corresponding scalar double-precision values of two
///    128-bit vectors of [2 x double]. The comparison operation is given by
///    the immediate integer operand.
///
///    Every comparison yields 0x0 when false and 0xFFFFFFFFFFFFFFFF when true.
///    When either value in a comparison is NaN, ordered comparisons yield
///    false and unordered comparisons yield true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> (V)CMPSD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand whose bits [4:0] choose the comparison
///    operation: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_sd(a, b, c)                                                    \
  ((__m128d)__builtin_ia32_cmpsd(                                              \
      (__v2df)(__m128d)(a), (__v2df)(__m128d)(b), (c)))
4875 
#ifdef __cplusplus
extern "C" {
#endif

/// Signals that the calling code is executing a spin loop, so that power
///    consumption during the loop can be optimized.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PAUSE </c> instruction.
///
void _mm_pause(void);

#ifdef __cplusplus
} // extern "C"
#endif
/* The attribute helper macros are private to this header; drop them so they
 * do not leak into code that includes it. */
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_MMX

/* Builds a 2-bit selector with x in bit 1 and y in bit 0. */
#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))

/* Bit mask covering the denormals-zero field of the value handled by
 * _mm_getcsr() / _mm_setcsr(). */
#define _MM_DENORMALS_ZERO_MASK (0x0040U)

/* The two values the denormals-zero field can take. */
#define _MM_DENORMALS_ZERO_ON (0x0040U)
#define _MM_DENORMALS_ZERO_OFF (0x0000U)

/* Reads the current denormals-zero field. */
#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)

/* Clears the denormals-zero field and ORs in x, leaving every bit outside the
 * mask as it was. */
#define _MM_SET_DENORMALS_ZERO_MODE(x)                                         \
  (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
4905 
4906 #endif /* __EMMINTRIN_H */
4907