xref: /freebsd/contrib/llvm-project/clang/lib/Headers/xmmintrin.h (revision 13ec1e3155c7e9bf037b12af186351b7fa9b9450)
1 /*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 #ifndef __XMMINTRIN_H
11 #define __XMMINTRIN_H
12 
13 #include <mmintrin.h>
14 
/* Element-typed 16-byte vector types (signed int, unsigned int, float). */
typedef int __v4si __attribute__((__vector_size__(16)));
typedef unsigned int __v4su __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));

/* 16-byte vector of float with 16-byte alignment. */
typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));

/* Same vector layout as __m128, but declared with an alignment of 1. */
typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));
23 
24 /* This header should only be included in a hosted environment as it depends on
25  * a standard library to provide allocation routines. */
26 #if __STDC_HOSTED__
27 #include <mm_malloc.h>
28 #endif
29 
/* Attribute sets applied to the intrinsic definitions in this file:
 * always-inline, no debug info, and the required target feature(s) with the
 * matching minimum vector width. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("sse"),            \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_MMX                                                 \
  __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse"),        \
                 __min_vector_width__(64)))
33 
34 /// Adds the 32-bit float values in the low-order bits of the operands.
35 ///
36 /// \headerfile <x86intrin.h>
37 ///
38 /// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
39 ///
40 /// \param __a
41 ///    A 128-bit vector of [4 x float] containing one of the source operands.
42 ///    The lower 32 bits of this operand are used in the calculation.
43 /// \param __b
44 ///    A 128-bit vector of [4 x float] containing one of the source operands.
45 ///    The lower 32 bits of this operand are used in the calculation.
46 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
47 ///    of the lower 32 bits of both operands. The upper 96 bits are copied from
48 ///    the upper 96 bits of the first source operand.
49 static __inline__ __m128 __DEFAULT_FN_ATTRS
50 _mm_add_ss(__m128 __a, __m128 __b)
51 {
52   __a[0] += __b[0];
53   return __a;
54 }
55 
56 /// Adds two 128-bit vectors of [4 x float], and returns the results of
57 ///    the addition.
58 ///
59 /// \headerfile <x86intrin.h>
60 ///
61 /// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
62 ///
63 /// \param __a
64 ///    A 128-bit vector of [4 x float] containing one of the source operands.
65 /// \param __b
66 ///    A 128-bit vector of [4 x float] containing one of the source operands.
67 /// \returns A 128-bit vector of [4 x float] containing the sums of both
68 ///    operands.
69 static __inline__ __m128 __DEFAULT_FN_ATTRS
70 _mm_add_ps(__m128 __a, __m128 __b)
71 {
72   return (__m128)((__v4sf)__a + (__v4sf)__b);
73 }
74 
75 /// Subtracts the 32-bit float value in the low-order bits of the second
76 ///    operand from the corresponding value in the first operand.
77 ///
78 /// \headerfile <x86intrin.h>
79 ///
80 /// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
81 ///
82 /// \param __a
83 ///    A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
84 ///    of this operand are used in the calculation.
85 /// \param __b
86 ///    A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
87 ///    bits of this operand are used in the calculation.
88 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
89 ///    difference of the lower 32 bits of both operands. The upper 96 bits are
90 ///    copied from the upper 96 bits of the first source operand.
91 static __inline__ __m128 __DEFAULT_FN_ATTRS
92 _mm_sub_ss(__m128 __a, __m128 __b)
93 {
94   __a[0] -= __b[0];
95   return __a;
96 }
97 
98 /// Subtracts each of the values of the second operand from the first
99 ///    operand, both of which are 128-bit vectors of [4 x float] and returns
100 ///    the results of the subtraction.
101 ///
102 /// \headerfile <x86intrin.h>
103 ///
104 /// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
105 ///
106 /// \param __a
107 ///    A 128-bit vector of [4 x float] containing the minuend.
108 /// \param __b
109 ///    A 128-bit vector of [4 x float] containing the subtrahend.
110 /// \returns A 128-bit vector of [4 x float] containing the differences between
111 ///    both operands.
112 static __inline__ __m128 __DEFAULT_FN_ATTRS
113 _mm_sub_ps(__m128 __a, __m128 __b)
114 {
115   return (__m128)((__v4sf)__a - (__v4sf)__b);
116 }
117 
118 /// Multiplies two 32-bit float values in the low-order bits of the
119 ///    operands.
120 ///
121 /// \headerfile <x86intrin.h>
122 ///
123 /// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
124 ///
125 /// \param __a
126 ///    A 128-bit vector of [4 x float] containing one of the source operands.
127 ///    The lower 32 bits of this operand are used in the calculation.
128 /// \param __b
129 ///    A 128-bit vector of [4 x float] containing one of the source operands.
130 ///    The lower 32 bits of this operand are used in the calculation.
131 /// \returns A 128-bit vector of [4 x float] containing the product of the lower
132 ///    32 bits of both operands. The upper 96 bits are copied from the upper 96
133 ///    bits of the first source operand.
134 static __inline__ __m128 __DEFAULT_FN_ATTRS
135 _mm_mul_ss(__m128 __a, __m128 __b)
136 {
137   __a[0] *= __b[0];
138   return __a;
139 }
140 
141 /// Multiplies two 128-bit vectors of [4 x float] and returns the
142 ///    results of the multiplication.
143 ///
144 /// \headerfile <x86intrin.h>
145 ///
146 /// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
147 ///
148 /// \param __a
149 ///    A 128-bit vector of [4 x float] containing one of the source operands.
150 /// \param __b
151 ///    A 128-bit vector of [4 x float] containing one of the source operands.
152 /// \returns A 128-bit vector of [4 x float] containing the products of both
153 ///    operands.
154 static __inline__ __m128 __DEFAULT_FN_ATTRS
155 _mm_mul_ps(__m128 __a, __m128 __b)
156 {
157   return (__m128)((__v4sf)__a * (__v4sf)__b);
158 }
159 
160 /// Divides the value in the low-order 32 bits of the first operand by
161 ///    the corresponding value in the second operand.
162 ///
163 /// \headerfile <x86intrin.h>
164 ///
165 /// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
166 ///
167 /// \param __a
168 ///    A 128-bit vector of [4 x float] containing the dividend. The lower 32
169 ///    bits of this operand are used in the calculation.
170 /// \param __b
171 ///    A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
172 ///    of this operand are used in the calculation.
173 /// \returns A 128-bit vector of [4 x float] containing the quotients of the
174 ///    lower 32 bits of both operands. The upper 96 bits are copied from the
175 ///    upper 96 bits of the first source operand.
176 static __inline__ __m128 __DEFAULT_FN_ATTRS
177 _mm_div_ss(__m128 __a, __m128 __b)
178 {
179   __a[0] /= __b[0];
180   return __a;
181 }
182 
183 /// Divides two 128-bit vectors of [4 x float].
184 ///
185 /// \headerfile <x86intrin.h>
186 ///
187 /// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
188 ///
189 /// \param __a
190 ///    A 128-bit vector of [4 x float] containing the dividend.
191 /// \param __b
192 ///    A 128-bit vector of [4 x float] containing the divisor.
193 /// \returns A 128-bit vector of [4 x float] containing the quotients of both
194 ///    operands.
195 static __inline__ __m128 __DEFAULT_FN_ATTRS
196 _mm_div_ps(__m128 __a, __m128 __b)
197 {
198   return (__m128)((__v4sf)__a / (__v4sf)__b);
199 }
200 
201 /// Calculates the square root of the value stored in the low-order bits
202 ///    of a 128-bit vector of [4 x float].
203 ///
204 /// \headerfile <x86intrin.h>
205 ///
206 /// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
207 ///
208 /// \param __a
209 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
210 ///    used in the calculation.
211 /// \returns A 128-bit vector of [4 x float] containing the square root of the
212 ///    value in the low-order bits of the operand.
213 static __inline__ __m128 __DEFAULT_FN_ATTRS
214 _mm_sqrt_ss(__m128 __a)
215 {
216   return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
217 }
218 
219 /// Calculates the square roots of the values stored in a 128-bit vector
220 ///    of [4 x float].
221 ///
222 /// \headerfile <x86intrin.h>
223 ///
224 /// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
225 ///
226 /// \param __a
227 ///    A 128-bit vector of [4 x float].
228 /// \returns A 128-bit vector of [4 x float] containing the square roots of the
229 ///    values in the operand.
230 static __inline__ __m128 __DEFAULT_FN_ATTRS
231 _mm_sqrt_ps(__m128 __a)
232 {
233   return __builtin_ia32_sqrtps((__v4sf)__a);
234 }
235 
236 /// Calculates the approximate reciprocal of the value stored in the
237 ///    low-order bits of a 128-bit vector of [4 x float].
238 ///
239 /// \headerfile <x86intrin.h>
240 ///
241 /// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
242 ///
243 /// \param __a
244 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
245 ///    used in the calculation.
246 /// \returns A 128-bit vector of [4 x float] containing the approximate
247 ///    reciprocal of the value in the low-order bits of the operand.
248 static __inline__ __m128 __DEFAULT_FN_ATTRS
249 _mm_rcp_ss(__m128 __a)
250 {
251   return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
252 }
253 
254 /// Calculates the approximate reciprocals of the values stored in a
255 ///    128-bit vector of [4 x float].
256 ///
257 /// \headerfile <x86intrin.h>
258 ///
259 /// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
260 ///
261 /// \param __a
262 ///    A 128-bit vector of [4 x float].
263 /// \returns A 128-bit vector of [4 x float] containing the approximate
264 ///    reciprocals of the values in the operand.
265 static __inline__ __m128 __DEFAULT_FN_ATTRS
266 _mm_rcp_ps(__m128 __a)
267 {
268   return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
269 }
270 
271 /// Calculates the approximate reciprocal of the square root of the value
272 ///    stored in the low-order bits of a 128-bit vector of [4 x float].
273 ///
274 /// \headerfile <x86intrin.h>
275 ///
276 /// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
277 ///
278 /// \param __a
279 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
280 ///    used in the calculation.
281 /// \returns A 128-bit vector of [4 x float] containing the approximate
282 ///    reciprocal of the square root of the value in the low-order bits of the
283 ///    operand.
284 static __inline__ __m128 __DEFAULT_FN_ATTRS
285 _mm_rsqrt_ss(__m128 __a)
286 {
287   return __builtin_ia32_rsqrtss((__v4sf)__a);
288 }
289 
290 /// Calculates the approximate reciprocals of the square roots of the
291 ///    values stored in a 128-bit vector of [4 x float].
292 ///
293 /// \headerfile <x86intrin.h>
294 ///
295 /// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
296 ///
297 /// \param __a
298 ///    A 128-bit vector of [4 x float].
299 /// \returns A 128-bit vector of [4 x float] containing the approximate
300 ///    reciprocals of the square roots of the values in the operand.
301 static __inline__ __m128 __DEFAULT_FN_ATTRS
302 _mm_rsqrt_ps(__m128 __a)
303 {
304   return __builtin_ia32_rsqrtps((__v4sf)__a);
305 }
306 
307 /// Compares two 32-bit float values in the low-order bits of both
308 ///    operands and returns the lesser value in the low-order bits of the
309 ///    vector of [4 x float].
310 ///
311 /// \headerfile <x86intrin.h>
312 ///
313 /// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
314 ///
315 /// \param __a
316 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
317 ///    32 bits of this operand are used in the comparison.
318 /// \param __b
319 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
320 ///    32 bits of this operand are used in the comparison.
321 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
322 ///    minimum value between both operands. The upper 96 bits are copied from
323 ///    the upper 96 bits of the first source operand.
324 static __inline__ __m128 __DEFAULT_FN_ATTRS
325 _mm_min_ss(__m128 __a, __m128 __b)
326 {
327   return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
328 }
329 
330 /// Compares two 128-bit vectors of [4 x float] and returns the lesser
331 ///    of each pair of values.
332 ///
333 /// \headerfile <x86intrin.h>
334 ///
335 /// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
336 ///
337 /// \param __a
338 ///    A 128-bit vector of [4 x float] containing one of the operands.
339 /// \param __b
340 ///    A 128-bit vector of [4 x float] containing one of the operands.
341 /// \returns A 128-bit vector of [4 x float] containing the minimum values
342 ///    between both operands.
343 static __inline__ __m128 __DEFAULT_FN_ATTRS
344 _mm_min_ps(__m128 __a, __m128 __b)
345 {
346   return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
347 }
348 
349 /// Compares two 32-bit float values in the low-order bits of both
350 ///    operands and returns the greater value in the low-order bits of a 128-bit
351 ///    vector of [4 x float].
352 ///
353 /// \headerfile <x86intrin.h>
354 ///
355 /// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
356 ///
357 /// \param __a
358 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
359 ///    32 bits of this operand are used in the comparison.
360 /// \param __b
361 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
362 ///    32 bits of this operand are used in the comparison.
363 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
364 ///    maximum value between both operands. The upper 96 bits are copied from
365 ///    the upper 96 bits of the first source operand.
366 static __inline__ __m128 __DEFAULT_FN_ATTRS
367 _mm_max_ss(__m128 __a, __m128 __b)
368 {
369   return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
370 }
371 
372 /// Compares two 128-bit vectors of [4 x float] and returns the greater
373 ///    of each pair of values.
374 ///
375 /// \headerfile <x86intrin.h>
376 ///
377 /// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
378 ///
379 /// \param __a
380 ///    A 128-bit vector of [4 x float] containing one of the operands.
381 /// \param __b
382 ///    A 128-bit vector of [4 x float] containing one of the operands.
383 /// \returns A 128-bit vector of [4 x float] containing the maximum values
384 ///    between both operands.
385 static __inline__ __m128 __DEFAULT_FN_ATTRS
386 _mm_max_ps(__m128 __a, __m128 __b)
387 {
388   return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
389 }
390 
391 /// Performs a bitwise AND of two 128-bit vectors of [4 x float].
392 ///
393 /// \headerfile <x86intrin.h>
394 ///
395 /// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
396 ///
397 /// \param __a
398 ///    A 128-bit vector containing one of the source operands.
399 /// \param __b
400 ///    A 128-bit vector containing one of the source operands.
401 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
402 ///    values between both operands.
403 static __inline__ __m128 __DEFAULT_FN_ATTRS
404 _mm_and_ps(__m128 __a, __m128 __b)
405 {
406   return (__m128)((__v4su)__a & (__v4su)__b);
407 }
408 
409 /// Performs a bitwise AND of two 128-bit vectors of [4 x float], using
410 ///    the one's complement of the values contained in the first source
411 ///    operand.
412 ///
413 /// \headerfile <x86intrin.h>
414 ///
415 /// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
416 ///
417 /// \param __a
418 ///    A 128-bit vector of [4 x float] containing the first source operand. The
419 ///    one's complement of this value is used in the bitwise AND.
420 /// \param __b
421 ///    A 128-bit vector of [4 x float] containing the second source operand.
422 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
423 ///    one's complement of the first operand and the values in the second
424 ///    operand.
425 static __inline__ __m128 __DEFAULT_FN_ATTRS
426 _mm_andnot_ps(__m128 __a, __m128 __b)
427 {
428   return (__m128)(~(__v4su)__a & (__v4su)__b);
429 }
430 
431 /// Performs a bitwise OR of two 128-bit vectors of [4 x float].
432 ///
433 /// \headerfile <x86intrin.h>
434 ///
435 /// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
436 ///
437 /// \param __a
438 ///    A 128-bit vector of [4 x float] containing one of the source operands.
439 /// \param __b
440 ///    A 128-bit vector of [4 x float] containing one of the source operands.
441 /// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
442 ///    values between both operands.
443 static __inline__ __m128 __DEFAULT_FN_ATTRS
444 _mm_or_ps(__m128 __a, __m128 __b)
445 {
446   return (__m128)((__v4su)__a | (__v4su)__b);
447 }
448 
449 /// Performs a bitwise exclusive OR of two 128-bit vectors of
450 ///    [4 x float].
451 ///
452 /// \headerfile <x86intrin.h>
453 ///
454 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
455 ///
456 /// \param __a
457 ///    A 128-bit vector of [4 x float] containing one of the source operands.
458 /// \param __b
459 ///    A 128-bit vector of [4 x float] containing one of the source operands.
460 /// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
461 ///    of the values between both operands.
462 static __inline__ __m128 __DEFAULT_FN_ATTRS
463 _mm_xor_ps(__m128 __a, __m128 __b)
464 {
465   return (__m128)((__v4su)__a ^ (__v4su)__b);
466 }
467 
468 /// Compares two 32-bit float values in the low-order bits of both
469 ///    operands for equality and returns the result of the comparison in the
470 ///    low-order bits of a vector [4 x float].
471 ///
472 /// \headerfile <x86intrin.h>
473 ///
474 /// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
475 ///
476 /// \param __a
477 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
478 ///    32 bits of this operand are used in the comparison.
479 /// \param __b
480 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
481 ///    32 bits of this operand are used in the comparison.
482 /// \returns A 128-bit vector of [4 x float] containing the comparison results
483 ///    in the low-order bits.
484 static __inline__ __m128 __DEFAULT_FN_ATTRS
485 _mm_cmpeq_ss(__m128 __a, __m128 __b)
486 {
487   return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
488 }
489 
490 /// Compares each of the corresponding 32-bit float values of the
491 ///    128-bit vectors of [4 x float] for equality.
492 ///
493 /// \headerfile <x86intrin.h>
494 ///
495 /// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
496 ///
497 /// \param __a
498 ///    A 128-bit vector of [4 x float].
499 /// \param __b
500 ///    A 128-bit vector of [4 x float].
501 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
502 static __inline__ __m128 __DEFAULT_FN_ATTRS
503 _mm_cmpeq_ps(__m128 __a, __m128 __b)
504 {
505   return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
506 }
507 
508 /// Compares two 32-bit float values in the low-order bits of both
509 ///    operands to determine if the value in the first operand is less than the
510 ///    corresponding value in the second operand and returns the result of the
511 ///    comparison in the low-order bits of a vector of [4 x float].
512 ///
513 /// \headerfile <x86intrin.h>
514 ///
515 /// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
516 ///
517 /// \param __a
518 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
519 ///    32 bits of this operand are used in the comparison.
520 /// \param __b
521 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
522 ///    32 bits of this operand are used in the comparison.
523 /// \returns A 128-bit vector of [4 x float] containing the comparison results
524 ///    in the low-order bits.
525 static __inline__ __m128 __DEFAULT_FN_ATTRS
526 _mm_cmplt_ss(__m128 __a, __m128 __b)
527 {
528   return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
529 }
530 
531 /// Compares each of the corresponding 32-bit float values of the
532 ///    128-bit vectors of [4 x float] to determine if the values in the first
533 ///    operand are less than those in the second operand.
534 ///
535 /// \headerfile <x86intrin.h>
536 ///
537 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
538 ///
539 /// \param __a
540 ///    A 128-bit vector of [4 x float].
541 /// \param __b
542 ///    A 128-bit vector of [4 x float].
543 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
544 static __inline__ __m128 __DEFAULT_FN_ATTRS
545 _mm_cmplt_ps(__m128 __a, __m128 __b)
546 {
547   return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
548 }
549 
550 /// Compares two 32-bit float values in the low-order bits of both
551 ///    operands to determine if the value in the first operand is less than or
552 ///    equal to the corresponding value in the second operand and returns the
553 ///    result of the comparison in the low-order bits of a vector of
554 ///    [4 x float].
555 ///
556 /// \headerfile <x86intrin.h>
557 ///
558 /// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
559 ///
560 /// \param __a
561 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
562 ///    32 bits of this operand are used in the comparison.
563 /// \param __b
564 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
565 ///    32 bits of this operand are used in the comparison.
566 /// \returns A 128-bit vector of [4 x float] containing the comparison results
567 ///    in the low-order bits.
568 static __inline__ __m128 __DEFAULT_FN_ATTRS
569 _mm_cmple_ss(__m128 __a, __m128 __b)
570 {
571   return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
572 }
573 
574 /// Compares each of the corresponding 32-bit float values of the
575 ///    128-bit vectors of [4 x float] to determine if the values in the first
576 ///    operand are less than or equal to those in the second operand.
577 ///
578 /// \headerfile <x86intrin.h>
579 ///
580 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
581 ///
582 /// \param __a
583 ///    A 128-bit vector of [4 x float].
584 /// \param __b
585 ///    A 128-bit vector of [4 x float].
586 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
587 static __inline__ __m128 __DEFAULT_FN_ATTRS
588 _mm_cmple_ps(__m128 __a, __m128 __b)
589 {
590   return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
591 }
592 
593 /// Compares two 32-bit float values in the low-order bits of both
594 ///    operands to determine if the value in the first operand is greater than
595 ///    the corresponding value in the second operand and returns the result of
596 ///    the comparison in the low-order bits of a vector of [4 x float].
597 ///
598 /// \headerfile <x86intrin.h>
599 ///
600 /// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
601 ///
602 /// \param __a
603 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
604 ///    32 bits of this operand are used in the comparison.
605 /// \param __b
606 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
607 ///    32 bits of this operand are used in the comparison.
608 /// \returns A 128-bit vector of [4 x float] containing the comparison results
609 ///    in the low-order bits.
610 static __inline__ __m128 __DEFAULT_FN_ATTRS
611 _mm_cmpgt_ss(__m128 __a, __m128 __b)
612 {
613   return (__m128)__builtin_shufflevector((__v4sf)__a,
614                                          (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
615                                          4, 1, 2, 3);
616 }
617 
618 /// Compares each of the corresponding 32-bit float values of the
619 ///    128-bit vectors of [4 x float] to determine if the values in the first
620 ///    operand are greater than those in the second operand.
621 ///
622 /// \headerfile <x86intrin.h>
623 ///
624 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
625 ///
626 /// \param __a
627 ///    A 128-bit vector of [4 x float].
628 /// \param __b
629 ///    A 128-bit vector of [4 x float].
630 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
631 static __inline__ __m128 __DEFAULT_FN_ATTRS
632 _mm_cmpgt_ps(__m128 __a, __m128 __b)
633 {
634   return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
635 }
636 
637 /// Compares two 32-bit float values in the low-order bits of both
638 ///    operands to determine if the value in the first operand is greater than
639 ///    or equal to the corresponding value in the second operand and returns
640 ///    the result of the comparison in the low-order bits of a vector of
641 ///    [4 x float].
642 ///
643 /// \headerfile <x86intrin.h>
644 ///
645 /// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
646 ///
647 /// \param __a
648 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
649 ///    32 bits of this operand are used in the comparison.
650 /// \param __b
651 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
652 ///    32 bits of this operand are used in the comparison.
653 /// \returns A 128-bit vector of [4 x float] containing the comparison results
654 ///    in the low-order bits.
655 static __inline__ __m128 __DEFAULT_FN_ATTRS
656 _mm_cmpge_ss(__m128 __a, __m128 __b)
657 {
658   return (__m128)__builtin_shufflevector((__v4sf)__a,
659                                          (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
660                                          4, 1, 2, 3);
661 }
662 
663 /// Compares each of the corresponding 32-bit float values of the
664 ///    128-bit vectors of [4 x float] to determine if the values in the first
665 ///    operand are greater than or equal to those in the second operand.
666 ///
667 /// \headerfile <x86intrin.h>
668 ///
669 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
670 ///
671 /// \param __a
672 ///    A 128-bit vector of [4 x float].
673 /// \param __b
674 ///    A 128-bit vector of [4 x float].
675 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
676 static __inline__ __m128 __DEFAULT_FN_ATTRS
677 _mm_cmpge_ps(__m128 __a, __m128 __b)
678 {
679   return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
680 }
681 
682 /// Compares two 32-bit float values in the low-order bits of both
683 ///    operands for inequality and returns the result of the comparison in the
684 ///    low-order bits of a vector of [4 x float].
685 ///
686 /// \headerfile <x86intrin.h>
687 ///
688 /// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
689 ///   instructions.
690 ///
691 /// \param __a
692 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
693 ///    32 bits of this operand are used in the comparison.
694 /// \param __b
695 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
696 ///    32 bits of this operand are used in the comparison.
697 /// \returns A 128-bit vector of [4 x float] containing the comparison results
698 ///    in the low-order bits.
699 static __inline__ __m128 __DEFAULT_FN_ATTRS
700 _mm_cmpneq_ss(__m128 __a, __m128 __b)
701 {
702   return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
703 }
704 
705 /// Compares each of the corresponding 32-bit float values of the
706 ///    128-bit vectors of [4 x float] for inequality.
707 ///
708 /// \headerfile <x86intrin.h>
709 ///
710 /// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
711 ///   instructions.
712 ///
713 /// \param __a
714 ///    A 128-bit vector of [4 x float].
715 /// \param __b
716 ///    A 128-bit vector of [4 x float].
717 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
718 static __inline__ __m128 __DEFAULT_FN_ATTRS
719 _mm_cmpneq_ps(__m128 __a, __m128 __b)
720 {
721   return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
722 }
723 
724 /// Compares two 32-bit float values in the low-order bits of both
725 ///    operands to determine if the value in the first operand is not less than
726 ///    the corresponding value in the second operand and returns the result of
727 ///    the comparison in the low-order bits of a vector of [4 x float].
728 ///
729 /// \headerfile <x86intrin.h>
730 ///
731 /// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
732 ///   instructions.
733 ///
734 /// \param __a
735 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
736 ///    32 bits of this operand are used in the comparison.
737 /// \param __b
738 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
739 ///    32 bits of this operand are used in the comparison.
740 /// \returns A 128-bit vector of [4 x float] containing the comparison results
741 ///    in the low-order bits.
742 static __inline__ __m128 __DEFAULT_FN_ATTRS
743 _mm_cmpnlt_ss(__m128 __a, __m128 __b)
744 {
745   return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
746 }
747 
748 /// Compares each of the corresponding 32-bit float values of the
749 ///    128-bit vectors of [4 x float] to determine if the values in the first
750 ///    operand are not less than those in the second operand.
751 ///
752 /// \headerfile <x86intrin.h>
753 ///
754 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
755 ///   instructions.
756 ///
757 /// \param __a
758 ///    A 128-bit vector of [4 x float].
759 /// \param __b
760 ///    A 128-bit vector of [4 x float].
761 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
762 static __inline__ __m128 __DEFAULT_FN_ATTRS
763 _mm_cmpnlt_ps(__m128 __a, __m128 __b)
764 {
765   return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
766 }
767 
768 /// Compares two 32-bit float values in the low-order bits of both
769 ///    operands to determine if the value in the first operand is not less than
770 ///    or equal to the corresponding value in the second operand and returns
771 ///    the result of the comparison in the low-order bits of a vector of
772 ///    [4 x float].
773 ///
774 /// \headerfile <x86intrin.h>
775 ///
776 /// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
777 ///   instructions.
778 ///
779 /// \param __a
780 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
781 ///    32 bits of this operand are used in the comparison.
782 /// \param __b
783 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
784 ///    32 bits of this operand are used in the comparison.
785 /// \returns A 128-bit vector of [4 x float] containing the comparison results
786 ///    in the low-order bits.
787 static __inline__ __m128 __DEFAULT_FN_ATTRS
788 _mm_cmpnle_ss(__m128 __a, __m128 __b)
789 {
790   return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
791 }
792 
793 /// Compares each of the corresponding 32-bit float values of the
794 ///    128-bit vectors of [4 x float] to determine if the values in the first
795 ///    operand are not less than or equal to those in the second operand.
796 ///
797 /// \headerfile <x86intrin.h>
798 ///
799 /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
800 ///   instructions.
801 ///
802 /// \param __a
803 ///    A 128-bit vector of [4 x float].
804 /// \param __b
805 ///    A 128-bit vector of [4 x float].
806 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
807 static __inline__ __m128 __DEFAULT_FN_ATTRS
808 _mm_cmpnle_ps(__m128 __a, __m128 __b)
809 {
810   return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
811 }
812 
813 /// Compares two 32-bit float values in the low-order bits of both
814 ///    operands to determine if the value in the first operand is not greater
815 ///    than the corresponding value in the second operand and returns the
816 ///    result of the comparison in the low-order bits of a vector of
817 ///    [4 x float].
818 ///
819 /// \headerfile <x86intrin.h>
820 ///
821 /// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
822 ///   instructions.
823 ///
824 /// \param __a
825 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
826 ///    32 bits of this operand are used in the comparison.
827 /// \param __b
828 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
829 ///    32 bits of this operand are used in the comparison.
830 /// \returns A 128-bit vector of [4 x float] containing the comparison results
831 ///    in the low-order bits.
832 static __inline__ __m128 __DEFAULT_FN_ATTRS
833 _mm_cmpngt_ss(__m128 __a, __m128 __b)
834 {
835   return (__m128)__builtin_shufflevector((__v4sf)__a,
836                                          (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
837                                          4, 1, 2, 3);
838 }
839 
840 /// Compares each of the corresponding 32-bit float values of the
841 ///    128-bit vectors of [4 x float] to determine if the values in the first
842 ///    operand are not greater than those in the second operand.
843 ///
844 /// \headerfile <x86intrin.h>
845 ///
846 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
847 ///   instructions.
848 ///
849 /// \param __a
850 ///    A 128-bit vector of [4 x float].
851 /// \param __b
852 ///    A 128-bit vector of [4 x float].
853 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
854 static __inline__ __m128 __DEFAULT_FN_ATTRS
855 _mm_cmpngt_ps(__m128 __a, __m128 __b)
856 {
857   return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
858 }
859 
860 /// Compares two 32-bit float values in the low-order bits of both
861 ///    operands to determine if the value in the first operand is not greater
862 ///    than or equal to the corresponding value in the second operand and
863 ///    returns the result of the comparison in the low-order bits of a vector
864 ///    of [4 x float].
865 ///
866 /// \headerfile <x86intrin.h>
867 ///
868 /// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
869 ///   instructions.
870 ///
871 /// \param __a
872 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
873 ///    32 bits of this operand are used in the comparison.
874 /// \param __b
875 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
876 ///    32 bits of this operand are used in the comparison.
877 /// \returns A 128-bit vector of [4 x float] containing the comparison results
878 ///    in the low-order bits.
879 static __inline__ __m128 __DEFAULT_FN_ATTRS
880 _mm_cmpnge_ss(__m128 __a, __m128 __b)
881 {
882   return (__m128)__builtin_shufflevector((__v4sf)__a,
883                                          (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
884                                          4, 1, 2, 3);
885 }
886 
887 /// Compares each of the corresponding 32-bit float values of the
888 ///    128-bit vectors of [4 x float] to determine if the values in the first
889 ///    operand are not greater than or equal to those in the second operand.
890 ///
891 /// \headerfile <x86intrin.h>
892 ///
893 /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
894 ///   instructions.
895 ///
896 /// \param __a
897 ///    A 128-bit vector of [4 x float].
898 /// \param __b
899 ///    A 128-bit vector of [4 x float].
900 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
901 static __inline__ __m128 __DEFAULT_FN_ATTRS
902 _mm_cmpnge_ps(__m128 __a, __m128 __b)
903 {
904   return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
905 }
906 
907 /// Compares two 32-bit float values in the low-order bits of both
908 ///    operands to determine if the value in the first operand is ordered with
909 ///    respect to the corresponding value in the second operand and returns the
910 ///    result of the comparison in the low-order bits of a vector of
911 ///    [4 x float].
912 ///
913 /// \headerfile <x86intrin.h>
914 ///
915 /// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
916 ///   instructions.
917 ///
918 /// \param __a
919 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
920 ///    32 bits of this operand are used in the comparison.
921 /// \param __b
922 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
923 ///    32 bits of this operand are used in the comparison.
924 /// \returns A 128-bit vector of [4 x float] containing the comparison results
925 ///    in the low-order bits.
926 static __inline__ __m128 __DEFAULT_FN_ATTRS
927 _mm_cmpord_ss(__m128 __a, __m128 __b)
928 {
929   return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
930 }
931 
932 /// Compares each of the corresponding 32-bit float values of the
933 ///    128-bit vectors of [4 x float] to determine if the values in the first
934 ///    operand are ordered with respect to those in the second operand.
935 ///
936 /// \headerfile <x86intrin.h>
937 ///
938 /// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
939 ///   instructions.
940 ///
941 /// \param __a
942 ///    A 128-bit vector of [4 x float].
943 /// \param __b
944 ///    A 128-bit vector of [4 x float].
945 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
946 static __inline__ __m128 __DEFAULT_FN_ATTRS
947 _mm_cmpord_ps(__m128 __a, __m128 __b)
948 {
949   return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
950 }
951 
952 /// Compares two 32-bit float values in the low-order bits of both
953 ///    operands to determine if the value in the first operand is unordered
954 ///    with respect to the corresponding value in the second operand and
955 ///    returns the result of the comparison in the low-order bits of a vector
956 ///    of [4 x float].
957 ///
958 /// \headerfile <x86intrin.h>
959 ///
960 /// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
961 ///   instructions.
962 ///
963 /// \param __a
964 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
965 ///    32 bits of this operand are used in the comparison.
966 /// \param __b
967 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
968 ///    32 bits of this operand are used in the comparison.
969 /// \returns A 128-bit vector of [4 x float] containing the comparison results
970 ///    in the low-order bits.
971 static __inline__ __m128 __DEFAULT_FN_ATTRS
972 _mm_cmpunord_ss(__m128 __a, __m128 __b)
973 {
974   return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
975 }
976 
977 /// Compares each of the corresponding 32-bit float values of the
978 ///    128-bit vectors of [4 x float] to determine if the values in the first
979 ///    operand are unordered with respect to those in the second operand.
980 ///
981 /// \headerfile <x86intrin.h>
982 ///
983 /// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
984 ///   instructions.
985 ///
986 /// \param __a
987 ///    A 128-bit vector of [4 x float].
988 /// \param __b
989 ///    A 128-bit vector of [4 x float].
990 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
991 static __inline__ __m128 __DEFAULT_FN_ATTRS
992 _mm_cmpunord_ps(__m128 __a, __m128 __b)
993 {
994   return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
995 }
996 
997 /// Compares two 32-bit float values in the low-order bits of both
998 ///    operands for equality and returns the result of the comparison.
999 ///
1000 ///    If either of the two lower 32-bit values is NaN, 0 is returned.
1001 ///
1002 /// \headerfile <x86intrin.h>
1003 ///
1004 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1005 ///   instructions.
1006 ///
1007 /// \param __a
1008 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1009 ///    used in the comparison.
1010 /// \param __b
1011 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1012 ///    used in the comparison.
1013 /// \returns An integer containing the comparison results. If either of the
1014 ///    two lower 32-bit values is NaN, 0 is returned.
1015 static __inline__ int __DEFAULT_FN_ATTRS
1016 _mm_comieq_ss(__m128 __a, __m128 __b)
1017 {
1018   return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
1019 }
1020 
1021 /// Compares two 32-bit float values in the low-order bits of both
1022 ///    operands to determine if the first operand is less than the second
1023 ///    operand and returns the result of the comparison.
1024 ///
1025 ///    If either of the two lower 32-bit values is NaN, 0 is returned.
1026 ///
1027 /// \headerfile <x86intrin.h>
1028 ///
1029 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1030 ///   instructions.
1031 ///
1032 /// \param __a
1033 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1034 ///    used in the comparison.
1035 /// \param __b
1036 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1037 ///    used in the comparison.
1038 /// \returns An integer containing the comparison results. If either of the two
1039 ///     lower 32-bit values is NaN, 0 is returned.
1040 static __inline__ int __DEFAULT_FN_ATTRS
1041 _mm_comilt_ss(__m128 __a, __m128 __b)
1042 {
1043   return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
1044 }
1045 
1046 /// Compares two 32-bit float values in the low-order bits of both
1047 ///    operands to determine if the first operand is less than or equal to the
1048 ///    second operand and returns the result of the comparison.
1049 ///
1050 ///    If either of the two lower 32-bit values is NaN, 0 is returned.
1051 ///
1052 /// \headerfile <x86intrin.h>
1053 ///
1054 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1055 ///
1056 /// \param __a
1057 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1058 ///    used in the comparison.
1059 /// \param __b
1060 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1061 ///    used in the comparison.
1062 /// \returns An integer containing the comparison results. If either of the two
1063 ///     lower 32-bit values is NaN, 0 is returned.
1064 static __inline__ int __DEFAULT_FN_ATTRS
1065 _mm_comile_ss(__m128 __a, __m128 __b)
1066 {
1067   return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
1068 }
1069 
1070 /// Compares two 32-bit float values in the low-order bits of both
1071 ///    operands to determine if the first operand is greater than the second
1072 ///    operand and returns the result of the comparison.
1073 ///
1074 ///    If either of the two lower 32-bit values is NaN, 0 is returned.
1075 ///
1076 /// \headerfile <x86intrin.h>
1077 ///
1078 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1079 ///
1080 /// \param __a
1081 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1082 ///    used in the comparison.
1083 /// \param __b
1084 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1085 ///    used in the comparison.
1086 /// \returns An integer containing the comparison results. If either of the
1087 ///     two lower 32-bit values is NaN, 0 is returned.
1088 static __inline__ int __DEFAULT_FN_ATTRS
1089 _mm_comigt_ss(__m128 __a, __m128 __b)
1090 {
1091   return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
1092 }
1093 
1094 /// Compares two 32-bit float values in the low-order bits of both
1095 ///    operands to determine if the first operand is greater than or equal to
1096 ///    the second operand and returns the result of the comparison.
1097 ///
1098 ///    If either of the two lower 32-bit values is NaN, 0 is returned.
1099 ///
1100 /// \headerfile <x86intrin.h>
1101 ///
1102 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1103 ///
1104 /// \param __a
1105 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1106 ///    used in the comparison.
1107 /// \param __b
1108 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1109 ///    used in the comparison.
1110 /// \returns An integer containing the comparison results. If either of the two
1111 ///    lower 32-bit values is NaN, 0 is returned.
1112 static __inline__ int __DEFAULT_FN_ATTRS
1113 _mm_comige_ss(__m128 __a, __m128 __b)
1114 {
1115   return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
1116 }
1117 
1118 /// Compares two 32-bit float values in the low-order bits of both
1119 ///    operands to determine if the first operand is not equal to the second
1120 ///    operand and returns the result of the comparison.
1121 ///
1122 ///    If either of the two lower 32-bit values is NaN, 1 is returned.
1123 ///
1124 /// \headerfile <x86intrin.h>
1125 ///
1126 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1127 ///
1128 /// \param __a
1129 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1130 ///    used in the comparison.
1131 /// \param __b
1132 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1133 ///    used in the comparison.
1134 /// \returns An integer containing the comparison results. If either of the
1135 ///     two lower 32-bit values is NaN, 1 is returned.
1136 static __inline__ int __DEFAULT_FN_ATTRS
1137 _mm_comineq_ss(__m128 __a, __m128 __b)
1138 {
1139   return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
1140 }
1141 
1142 /// Performs an unordered comparison of two 32-bit float values using
1143 ///    the low-order bits of both operands to determine equality and returns
1144 ///    the result of the comparison.
1145 ///
1146 ///    If either of the two lower 32-bit values is NaN, 0 is returned.
1147 ///
1148 /// \headerfile <x86intrin.h>
1149 ///
1150 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1151 ///
1152 /// \param __a
1153 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1154 ///    used in the comparison.
1155 /// \param __b
1156 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1157 ///    used in the comparison.
1158 /// \returns An integer containing the comparison results. If either of the two
1159 ///     lower 32-bit values is NaN, 0 is returned.
1160 static __inline__ int __DEFAULT_FN_ATTRS
1161 _mm_ucomieq_ss(__m128 __a, __m128 __b)
1162 {
1163   return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
1164 }
1165 
1166 /// Performs an unordered comparison of two 32-bit float values using
1167 ///    the low-order bits of both operands to determine if the first operand is
1168 ///    less than the second operand and returns the result of the comparison.
1169 ///
1170 ///    If either of the two lower 32-bit values is NaN, 0 is returned.
1171 ///
1172 /// \headerfile <x86intrin.h>
1173 ///
1174 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1175 ///
1176 /// \param __a
1177 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1178 ///    used in the comparison.
1179 /// \param __b
1180 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1181 ///    used in the comparison.
1182 /// \returns An integer containing the comparison results. If either of the two
1183 ///    lower 32-bit values is NaN, 0 is returned.
1184 static __inline__ int __DEFAULT_FN_ATTRS
1185 _mm_ucomilt_ss(__m128 __a, __m128 __b)
1186 {
1187   return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
1188 }
1189 
1190 /// Performs an unordered comparison of two 32-bit float values using
1191 ///    the low-order bits of both operands to determine if the first operand is
1192 ///    less than or equal to the second operand and returns the result of the
1193 ///    comparison.
1194 ///
1195 ///    If either of the two lower 32-bit values is NaN, 0 is returned.
1196 ///
1197 /// \headerfile <x86intrin.h>
1198 ///
1199 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1200 ///
1201 /// \param __a
1202 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1203 ///    used in the comparison.
1204 /// \param __b
1205 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1206 ///    used in the comparison.
1207 /// \returns An integer containing the comparison results. If either of the two
1208 ///     lower 32-bit values is NaN, 0 is returned.
1209 static __inline__ int __DEFAULT_FN_ATTRS
1210 _mm_ucomile_ss(__m128 __a, __m128 __b)
1211 {
1212   return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
1213 }
1214 
1215 /// Performs an unordered comparison of two 32-bit float values using
1216 ///    the low-order bits of both operands to determine if the first operand is
1217 ///    greater than the second operand and returns the result of the
1218 ///    comparison.
1219 ///
1220 ///    If either of the two lower 32-bit values is NaN, 0 is returned.
1221 ///
1222 /// \headerfile <x86intrin.h>
1223 ///
1224 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1225 ///
1226 /// \param __a
1227 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1228 ///    used in the comparison.
1229 /// \param __b
1230 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1231 ///    used in the comparison.
1232 /// \returns An integer containing the comparison results. If either of the two
1233 ///     lower 32-bit values is NaN, 0 is returned.
1234 static __inline__ int __DEFAULT_FN_ATTRS
1235 _mm_ucomigt_ss(__m128 __a, __m128 __b)
1236 {
1237   return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
1238 }
1239 
1240 /// Performs an unordered comparison of two 32-bit float values using
1241 ///    the low-order bits of both operands to determine if the first operand is
1242 ///    greater than or equal to the second operand and returns the result of
1243 ///    the comparison.
1244 ///
1245 ///    If either of the two lower 32-bit values is NaN, 0 is returned.
1246 ///
1247 /// \headerfile <x86intrin.h>
1248 ///
1249 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1250 ///
1251 /// \param __a
1252 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1253 ///    used in the comparison.
1254 /// \param __b
1255 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1256 ///    used in the comparison.
1257 /// \returns An integer containing the comparison results. If either of the two
1258 ///     lower 32-bit values is NaN, 0 is returned.
1259 static __inline__ int __DEFAULT_FN_ATTRS
1260 _mm_ucomige_ss(__m128 __a, __m128 __b)
1261 {
1262   return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
1263 }
1264 
1265 /// Performs an unordered comparison of two 32-bit float values using
1266 ///    the low-order bits of both operands to determine inequality and returns
1267 ///    the result of the comparison.
1268 ///
1269 ///    If either of the two lower 32-bit values is NaN, 1 is returned.
1270 ///
1271 /// \headerfile <x86intrin.h>
1272 ///
1273 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1274 ///
1275 /// \param __a
1276 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1277 ///    used in the comparison.
1278 /// \param __b
1279 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1280 ///    used in the comparison.
1281 /// \returns An integer containing the comparison results. If either of the two
1282 ///    lower 32-bit values is NaN, 1 is returned.
1283 static __inline__ int __DEFAULT_FN_ATTRS
1284 _mm_ucomineq_ss(__m128 __a, __m128 __b)
1285 {
1286   return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
1287 }
1288 
1289 /// Converts a float value contained in the lower 32 bits of a vector of
1290 ///    [4 x float] into a 32-bit integer.
1291 ///
1292 /// \headerfile <x86intrin.h>
1293 ///
1294 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1295 ///   instructions.
1296 ///
1297 /// \param __a
1298 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1299 ///    used in the conversion.
1300 /// \returns A 32-bit integer containing the converted value.
1301 static __inline__ int __DEFAULT_FN_ATTRS
1302 _mm_cvtss_si32(__m128 __a)
1303 {
1304   return __builtin_ia32_cvtss2si((__v4sf)__a);
1305 }
1306 
1307 /// Converts a float value contained in the lower 32 bits of a vector of
1308 ///    [4 x float] into a 32-bit integer.
1309 ///
1310 /// \headerfile <x86intrin.h>
1311 ///
1312 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1313 ///   instructions.
1314 ///
1315 /// \param __a
1316 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1317 ///    used in the conversion.
1318 /// \returns A 32-bit integer containing the converted value.
1319 static __inline__ int __DEFAULT_FN_ATTRS
1320 _mm_cvt_ss2si(__m128 __a)
1321 {
1322   return _mm_cvtss_si32(__a);
1323 }
1324 
1325 #ifdef __x86_64__
1326 
1327 /// Converts a float value contained in the lower 32 bits of a vector of
1328 ///    [4 x float] into a 64-bit integer.
1329 ///
1330 /// \headerfile <x86intrin.h>
1331 ///
1332 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1333 ///   instructions.
1334 ///
1335 /// \param __a
1336 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1337 ///    used in the conversion.
1338 /// \returns A 64-bit integer containing the converted value.
1339 static __inline__ long long __DEFAULT_FN_ATTRS
1340 _mm_cvtss_si64(__m128 __a)
1341 {
1342   return __builtin_ia32_cvtss2si64((__v4sf)__a);
1343 }
1344 
1345 #endif
1346 
1347 /// Converts two low-order float values in a 128-bit vector of
1348 ///    [4 x float] into a 64-bit vector of [2 x i32].
1349 ///
1350 /// \headerfile <x86intrin.h>
1351 ///
1352 /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1353 ///
1354 /// \param __a
1355 ///    A 128-bit vector of [4 x float].
1356 /// \returns A 64-bit integer vector containing the converted values.
1357 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1358 _mm_cvtps_pi32(__m128 __a)
1359 {
1360   return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
1361 }
1362 
1363 /// Converts two low-order float values in a 128-bit vector of
1364 ///    [4 x float] into a 64-bit vector of [2 x i32].
1365 ///
1366 /// \headerfile <x86intrin.h>
1367 ///
1368 /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1369 ///
1370 /// \param __a
1371 ///    A 128-bit vector of [4 x float].
1372 /// \returns A 64-bit integer vector containing the converted values.
1373 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1374 _mm_cvt_ps2pi(__m128 __a)
1375 {
1376   return _mm_cvtps_pi32(__a);
1377 }
1378 
1379 /// Converts a float value contained in the lower 32 bits of a vector of
1380 ///    [4 x float] into a 32-bit integer, truncating the result when it is
1381 ///    inexact.
1382 ///
1383 /// \headerfile <x86intrin.h>
1384 ///
1385 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1386 ///   instructions.
1387 ///
1388 /// \param __a
1389 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1390 ///    used in the conversion.
1391 /// \returns A 32-bit integer containing the converted value.
1392 static __inline__ int __DEFAULT_FN_ATTRS
1393 _mm_cvttss_si32(__m128 __a)
1394 {
1395   return __builtin_ia32_cvttss2si((__v4sf)__a);
1396 }
1397 
1398 /// Converts a float value contained in the lower 32 bits of a vector of
1399 ///    [4 x float] into a 32-bit integer, truncating the result when it is
1400 ///    inexact.
1401 ///
1402 /// \headerfile <x86intrin.h>
1403 ///
1404 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1405 ///   instructions.
1406 ///
1407 /// \param __a
1408 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1409 ///    used in the conversion.
1410 /// \returns A 32-bit integer containing the converted value.
1411 static __inline__ int __DEFAULT_FN_ATTRS
1412 _mm_cvtt_ss2si(__m128 __a)
1413 {
1414   return _mm_cvttss_si32(__a);
1415 }
1416 
1417 #ifdef __x86_64__
1418 /// Converts a float value contained in the lower 32 bits of a vector of
1419 ///    [4 x float] into a 64-bit integer, truncating the result when it is
1420 ///    inexact.
1421 ///
1422 /// \headerfile <x86intrin.h>
1423 ///
1424 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1425 ///   instructions.
1426 ///
1427 /// \param __a
1428 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1429 ///    used in the conversion.
1430 /// \returns A 64-bit integer containing the converted value.
1431 static __inline__ long long __DEFAULT_FN_ATTRS
1432 _mm_cvttss_si64(__m128 __a)
1433 {
1434   return __builtin_ia32_cvttss2si64((__v4sf)__a);
1435 }
1436 #endif
1437 
1438 /// Converts two low-order float values in a 128-bit vector of
1439 ///    [4 x float] into a 64-bit vector of [2 x i32], truncating the result
1440 ///    when it is inexact.
1441 ///
1442 /// \headerfile <x86intrin.h>
1443 ///
/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
1446 ///
1447 /// \param __a
1448 ///    A 128-bit vector of [4 x float].
1449 /// \returns A 64-bit integer vector containing the converted values.
1450 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1451 _mm_cvttps_pi32(__m128 __a)
1452 {
1453   return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
1454 }
1455 
1456 /// Converts two low-order float values in a 128-bit vector of [4 x
1457 ///    float] into a 64-bit vector of [2 x i32], truncating the result when it
1458 ///    is inexact.
1459 ///
1460 /// \headerfile <x86intrin.h>
1461 ///
1462 /// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
1463 ///
1464 /// \param __a
1465 ///    A 128-bit vector of [4 x float].
1466 /// \returns A 64-bit integer vector containing the converted values.
1467 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1468 _mm_cvtt_ps2pi(__m128 __a)
1469 {
1470   return _mm_cvttps_pi32(__a);
1471 }
1472 
1473 /// Converts a 32-bit signed integer value into a floating point value
1474 ///    and writes it to the lower 32 bits of the destination. The remaining
1475 ///    higher order elements of the destination vector are copied from the
1476 ///    corresponding elements in the first operand.
1477 ///
1478 /// \headerfile <x86intrin.h>
1479 ///
/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c>
///   instructions.
1481 ///
1482 /// \param __a
1483 ///    A 128-bit vector of [4 x float].
1484 /// \param __b
1485 ///    A 32-bit signed integer operand containing the value to be converted.
1486 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1487 ///    converted value of the second operand. The upper 96 bits are copied from
1488 ///    the upper 96 bits of the first operand.
1489 static __inline__ __m128 __DEFAULT_FN_ATTRS
1490 _mm_cvtsi32_ss(__m128 __a, int __b)
1491 {
1492   __a[0] = __b;
1493   return __a;
1494 }
1495 
1496 /// Converts a 32-bit signed integer value into a floating point value
1497 ///    and writes it to the lower 32 bits of the destination. The remaining
1498 ///    higher order elements of the destination are copied from the
1499 ///    corresponding elements in the first operand.
1500 ///
1501 /// \headerfile <x86intrin.h>
1502 ///
/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c>
///   instructions.
1504 ///
1505 /// \param __a
1506 ///    A 128-bit vector of [4 x float].
1507 /// \param __b
1508 ///    A 32-bit signed integer operand containing the value to be converted.
1509 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1510 ///    converted value of the second operand. The upper 96 bits are copied from
1511 ///    the upper 96 bits of the first operand.
1512 static __inline__ __m128 __DEFAULT_FN_ATTRS
1513 _mm_cvt_si2ss(__m128 __a, int __b)
1514 {
1515   return _mm_cvtsi32_ss(__a, __b);
1516 }
1517 
1518 #ifdef __x86_64__
1519 
1520 /// Converts a 64-bit signed integer value into a floating point value
1521 ///    and writes it to the lower 32 bits of the destination. The remaining
1522 ///    higher order elements of the destination are copied from the
1523 ///    corresponding elements in the first operand.
1524 ///
1525 /// \headerfile <x86intrin.h>
1526 ///
/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c>
///   instructions.
1528 ///
1529 /// \param __a
1530 ///    A 128-bit vector of [4 x float].
1531 /// \param __b
1532 ///    A 64-bit signed integer operand containing the value to be converted.
1533 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1534 ///    converted value of the second operand. The upper 96 bits are copied from
1535 ///    the upper 96 bits of the first operand.
1536 static __inline__ __m128 __DEFAULT_FN_ATTRS
1537 _mm_cvtsi64_ss(__m128 __a, long long __b)
1538 {
1539   __a[0] = __b;
1540   return __a;
1541 }
1542 
1543 #endif
1544 
1545 /// Converts two elements of a 64-bit vector of [2 x i32] into two
///    floating point values and writes them to the lower 64 bits of the
1547 ///    destination. The remaining higher order elements of the destination are
1548 ///    copied from the corresponding elements in the first operand.
1549 ///
1550 /// \headerfile <x86intrin.h>
1551 ///
1552 /// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1553 ///
1554 /// \param __a
1555 ///    A 128-bit vector of [4 x float].
1556 /// \param __b
1557 ///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
1558 ///    and written to the corresponding low-order elements in the destination.
1559 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1560 ///    converted value of the second operand. The upper 64 bits are copied from
1561 ///    the upper 64 bits of the first operand.
1562 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
1563 _mm_cvtpi32_ps(__m128 __a, __m64 __b)
1564 {
1565   return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
1566 }
1567 
1568 /// Converts two elements of a 64-bit vector of [2 x i32] into two
1569 ///    floating point values and writes them to the lower 64-bits of the
1570 ///    destination. The remaining higher order elements of the destination are
1571 ///    copied from the corresponding elements in the first operand.
1572 ///
1573 /// \headerfile <x86intrin.h>
1574 ///
1575 /// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1576 ///
1577 /// \param __a
1578 ///    A 128-bit vector of [4 x float].
1579 /// \param __b
1580 ///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
1581 ///    and written to the corresponding low-order elements in the destination.
1582 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1583 ///    converted value from the second operand. The upper 64 bits are copied
1584 ///    from the upper 64 bits of the first operand.
1585 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
1586 _mm_cvt_pi2ps(__m128 __a, __m64 __b)
1587 {
1588   return _mm_cvtpi32_ps(__a, __b);
1589 }
1590 
1591 /// Extracts a float value contained in the lower 32 bits of a vector of
1592 ///    [4 x float].
1593 ///
1594 /// \headerfile <x86intrin.h>
1595 ///
1596 /// This intrinsic has no corresponding instruction.
1597 ///
1598 /// \param __a
1599 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1600 ///    used in the extraction.
1601 /// \returns A 32-bit float containing the extracted value.
1602 static __inline__ float __DEFAULT_FN_ATTRS
1603 _mm_cvtss_f32(__m128 __a)
1604 {
1605   return __a[0];
1606 }
1607 
1608 /// Loads two packed float values from the address \a __p into the
1609 ///     high-order bits of a 128-bit vector of [4 x float]. The low-order bits
1610 ///     are copied from the low-order bits of the first operand.
1611 ///
1612 /// \headerfile <x86intrin.h>
1613 ///
1614 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1615 ///
1616 /// \param __a
1617 ///    A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
1618 ///    of the destination.
1619 /// \param __p
1620 ///    A pointer to two packed float values. Bits [63:0] are written to bits
1621 ///    [127:64] of the destination.
1622 /// \returns A 128-bit vector of [4 x float] containing the moved values.
1623 static __inline__ __m128 __DEFAULT_FN_ATTRS
1624 _mm_loadh_pi(__m128 __a, const __m64 *__p)
1625 {
1626   typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
1627   struct __mm_loadh_pi_struct {
1628     __mm_loadh_pi_v2f32 __u;
1629   } __attribute__((__packed__, __may_alias__));
1630   __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
1631   __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1632   return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
1633 }
1634 
1635 /// Loads two packed float values from the address \a __p into the
1636 ///    low-order bits of a 128-bit vector of [4 x float]. The high-order bits
1637 ///    are copied from the high-order bits of the first operand.
1638 ///
1639 /// \headerfile <x86intrin.h>
1640 ///
1641 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1642 ///
1643 /// \param __a
1644 ///    A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
1645 ///    [127:64] of the destination.
1646 /// \param __p
1647 ///    A pointer to two packed float values. Bits [63:0] are written to bits
1648 ///    [63:0] of the destination.
1649 /// \returns A 128-bit vector of [4 x float] containing the moved values.
1650 static __inline__ __m128 __DEFAULT_FN_ATTRS
1651 _mm_loadl_pi(__m128 __a, const __m64 *__p)
1652 {
1653   typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
1654   struct __mm_loadl_pi_struct {
1655     __mm_loadl_pi_v2f32 __u;
1656   } __attribute__((__packed__, __may_alias__));
1657   __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
1658   __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1659   return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
1660 }
1661 
1662 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1663 ///    32 bits of the vector are initialized with the single-precision
1664 ///    floating-point value loaded from a specified memory location. The upper
1665 ///    96 bits are set to zero.
1666 ///
1667 /// \headerfile <x86intrin.h>
1668 ///
1669 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1670 ///
1671 /// \param __p
1672 ///    A pointer to a 32-bit memory location containing a single-precision
1673 ///    floating-point value.
1674 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1675 ///    lower 32 bits contain the value loaded from the memory location. The
1676 ///    upper 96 bits are set to zero.
1677 static __inline__ __m128 __DEFAULT_FN_ATTRS
1678 _mm_load_ss(const float *__p)
1679 {
1680   struct __mm_load_ss_struct {
1681     float __u;
1682   } __attribute__((__packed__, __may_alias__));
1683   float __u = ((const struct __mm_load_ss_struct*)__p)->__u;
1684   return __extension__ (__m128){ __u, 0, 0, 0 };
1685 }
1686 
1687 /// Loads a 32-bit float value and duplicates it to all four vector
1688 ///    elements of a 128-bit vector of [4 x float].
1689 ///
1690 /// \headerfile <x86intrin.h>
1691 ///
1692 /// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c>
1693 ///    instruction.
1694 ///
1695 /// \param __p
1696 ///    A pointer to a float value to be loaded and duplicated.
1697 /// \returns A 128-bit vector of [4 x float] containing the loaded and
1698 ///    duplicated values.
1699 static __inline__ __m128 __DEFAULT_FN_ATTRS
1700 _mm_load1_ps(const float *__p)
1701 {
1702   struct __mm_load1_ps_struct {
1703     float __u;
1704   } __attribute__((__packed__, __may_alias__));
1705   float __u = ((const struct __mm_load1_ps_struct*)__p)->__u;
1706   return __extension__ (__m128){ __u, __u, __u, __u };
1707 }
1708 
1709 #define        _mm_load_ps1(p) _mm_load1_ps(p)
1710 
1711 /// Loads a 128-bit floating-point vector of [4 x float] from an aligned
1712 ///    memory location.
1713 ///
1714 /// \headerfile <x86intrin.h>
1715 ///
1716 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
1717 ///
1718 /// \param __p
1719 ///    A pointer to a 128-bit memory location. The address of the memory
1720 ///    location has to be 128-bit aligned.
1721 /// \returns A 128-bit vector of [4 x float] containing the loaded values.
1722 static __inline__ __m128 __DEFAULT_FN_ATTRS
1723 _mm_load_ps(const float *__p)
1724 {
1725   return *(const __m128*)__p;
1726 }
1727 
1728 /// Loads a 128-bit floating-point vector of [4 x float] from an
1729 ///    unaligned memory location.
1730 ///
1731 /// \headerfile <x86intrin.h>
1732 ///
1733 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
1734 ///
1735 /// \param __p
1736 ///    A pointer to a 128-bit memory location. The address of the memory
1737 ///    location does not have to be aligned.
1738 /// \returns A 128-bit vector of [4 x float] containing the loaded values.
1739 static __inline__ __m128 __DEFAULT_FN_ATTRS
1740 _mm_loadu_ps(const float *__p)
1741 {
1742   struct __loadu_ps {
1743     __m128_u __v;
1744   } __attribute__((__packed__, __may_alias__));
1745   return ((const struct __loadu_ps*)__p)->__v;
1746 }
1747 
1748 /// Loads four packed float values, in reverse order, from an aligned
1749 ///    memory location to 32-bit elements in a 128-bit vector of [4 x float].
1750 ///
1751 /// \headerfile <x86intrin.h>
1752 ///
1753 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
1754 ///    instruction.
1755 ///
1756 /// \param __p
1757 ///    A pointer to a 128-bit memory location. The address of the memory
1758 ///    location has to be 128-bit aligned.
1759 /// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
1760 ///    in reverse order.
1761 static __inline__ __m128 __DEFAULT_FN_ATTRS
1762 _mm_loadr_ps(const float *__p)
1763 {
1764   __m128 __a = _mm_load_ps(__p);
1765   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
1766 }
1767 
1768 /// Create a 128-bit vector of [4 x float] with undefined values.
1769 ///
1770 /// \headerfile <x86intrin.h>
1771 ///
1772 /// This intrinsic has no corresponding instruction.
1773 ///
1774 /// \returns A 128-bit vector of [4 x float] containing undefined values.
1775 static __inline__ __m128 __DEFAULT_FN_ATTRS
1776 _mm_undefined_ps(void)
1777 {
1778   return (__m128)__builtin_ia32_undef128();
1779 }
1780 
1781 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1782 ///    32 bits of the vector are initialized with the specified single-precision
1783 ///    floating-point value. The upper 96 bits are set to zero.
1784 ///
1785 /// \headerfile <x86intrin.h>
1786 ///
1787 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1788 ///
1789 /// \param __w
1790 ///    A single-precision floating-point value used to initialize the lower 32
1791 ///    bits of the result.
1792 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1793 ///    lower 32 bits contain the value provided in the source operand. The
1794 ///    upper 96 bits are set to zero.
1795 static __inline__ __m128 __DEFAULT_FN_ATTRS
1796 _mm_set_ss(float __w)
1797 {
1798   return __extension__ (__m128){ __w, 0, 0, 0 };
1799 }
1800 
1801 /// Constructs a 128-bit floating-point vector of [4 x float], with each
1802 ///    of the four single-precision floating-point vector elements set to the
1803 ///    specified single-precision floating-point value.
1804 ///
1805 /// \headerfile <x86intrin.h>
1806 ///
1807 /// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1808 ///
1809 /// \param __w
1810 ///    A single-precision floating-point value used to initialize each vector
1811 ///    element of the result.
1812 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1813 static __inline__ __m128 __DEFAULT_FN_ATTRS
1814 _mm_set1_ps(float __w)
1815 {
1816   return __extension__ (__m128){ __w, __w, __w, __w };
1817 }
1818 
1819 /* Microsoft specific. */
1820 /// Constructs a 128-bit floating-point vector of [4 x float], with each
1821 ///    of the four single-precision floating-point vector elements set to the
1822 ///    specified single-precision floating-point value.
1823 ///
1824 /// \headerfile <x86intrin.h>
1825 ///
1826 /// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1827 ///
1828 /// \param __w
1829 ///    A single-precision floating-point value used to initialize each vector
1830 ///    element of the result.
1831 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1832 static __inline__ __m128 __DEFAULT_FN_ATTRS
1833 _mm_set_ps1(float __w)
1834 {
1835     return _mm_set1_ps(__w);
1836 }
1837 
1838 /// Constructs a 128-bit floating-point vector of [4 x float]
1839 ///    initialized with the specified single-precision floating-point values.
1840 ///
1841 /// \headerfile <x86intrin.h>
1842 ///
1843 /// This intrinsic is a utility function and does not correspond to a specific
1844 ///    instruction.
1845 ///
1846 /// \param __z
1847 ///    A single-precision floating-point value used to initialize bits [127:96]
1848 ///    of the result.
1849 /// \param __y
1850 ///    A single-precision floating-point value used to initialize bits [95:64]
1851 ///    of the result.
1852 /// \param __x
1853 ///    A single-precision floating-point value used to initialize bits [63:32]
1854 ///    of the result.
1855 /// \param __w
1856 ///    A single-precision floating-point value used to initialize bits [31:0]
1857 ///    of the result.
1858 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1859 static __inline__ __m128 __DEFAULT_FN_ATTRS
1860 _mm_set_ps(float __z, float __y, float __x, float __w)
1861 {
1862   return __extension__ (__m128){ __w, __x, __y, __z };
1863 }
1864 
1865 /// Constructs a 128-bit floating-point vector of [4 x float],
1866 ///    initialized in reverse order with the specified 32-bit single-precision
1867 ///    float-point values.
1868 ///
1869 /// \headerfile <x86intrin.h>
1870 ///
1871 /// This intrinsic is a utility function and does not correspond to a specific
1872 ///    instruction.
1873 ///
1874 /// \param __z
1875 ///    A single-precision floating-point value used to initialize bits [31:0]
1876 ///    of the result.
1877 /// \param __y
1878 ///    A single-precision floating-point value used to initialize bits [63:32]
1879 ///    of the result.
1880 /// \param __x
1881 ///    A single-precision floating-point value used to initialize bits [95:64]
1882 ///    of the result.
1883 /// \param __w
1884 ///    A single-precision floating-point value used to initialize bits [127:96]
1885 ///    of the result.
1886 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1887 static __inline__ __m128 __DEFAULT_FN_ATTRS
1888 _mm_setr_ps(float __z, float __y, float __x, float __w)
1889 {
1890   return __extension__ (__m128){ __z, __y, __x, __w };
1891 }
1892 
1893 /// Constructs a 128-bit floating-point vector of [4 x float] initialized
1894 ///    to zero.
1895 ///
1896 /// \headerfile <x86intrin.h>
1897 ///
1898 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1899 ///
1900 /// \returns An initialized 128-bit floating-point vector of [4 x float] with
1901 ///    all elements set to zero.
1902 static __inline__ __m128 __DEFAULT_FN_ATTRS
1903 _mm_setzero_ps(void)
1904 {
1905   return __extension__ (__m128){ 0, 0, 0, 0 };
1906 }
1907 
1908 /// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
1909 ///    memory location.
1910 ///
1911 /// \headerfile <x86intrin.h>
1912 ///
1913 /// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
1914 ///
1915 /// \param __p
1916 ///    A pointer to a 64-bit memory location.
1917 /// \param __a
1918 ///    A 128-bit vector of [4 x float] containing the values to be stored.
1919 static __inline__ void __DEFAULT_FN_ATTRS
1920 _mm_storeh_pi(__m64 *__p, __m128 __a)
1921 {
1922   typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
1923   struct __mm_storeh_pi_struct {
1924     __mm_storeh_pi_v2f32 __u;
1925   } __attribute__((__packed__, __may_alias__));
1926   ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);
1927 }
1928 
1929 /// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
1930 ///     memory location.
1931 ///
1932 /// \headerfile <x86intrin.h>
1933 ///
1934 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
1935 ///
1936 /// \param __p
1937 ///    A pointer to a memory location that will receive the float values.
1938 /// \param __a
1939 ///    A 128-bit vector of [4 x float] containing the values to be stored.
1940 static __inline__ void __DEFAULT_FN_ATTRS
1941 _mm_storel_pi(__m64 *__p, __m128 __a)
1942 {
1943   typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
1944   struct __mm_storeh_pi_struct {
1945     __mm_storeh_pi_v2f32 __u;
1946   } __attribute__((__packed__, __may_alias__));
1947   ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);
1948 }
1949 
1950 /// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
1951 ///     memory location.
1952 ///
1953 /// \headerfile <x86intrin.h>
1954 ///
1955 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1956 ///
1957 /// \param __p
1958 ///    A pointer to a 32-bit memory location.
1959 /// \param __a
1960 ///    A 128-bit vector of [4 x float] containing the value to be stored.
1961 static __inline__ void __DEFAULT_FN_ATTRS
1962 _mm_store_ss(float *__p, __m128 __a)
1963 {
1964   struct __mm_store_ss_struct {
1965     float __u;
1966   } __attribute__((__packed__, __may_alias__));
1967   ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
1968 }
1969 
1970 /// Stores a 128-bit vector of [4 x float] to an unaligned memory
1971 ///    location.
1972 ///
1973 /// \headerfile <x86intrin.h>
1974 ///
1975 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
1976 ///
1977 /// \param __p
1978 ///    A pointer to a 128-bit memory location. The address of the memory
1979 ///    location does not have to be aligned.
1980 /// \param __a
1981 ///    A 128-bit vector of [4 x float] containing the values to be stored.
1982 static __inline__ void __DEFAULT_FN_ATTRS
1983 _mm_storeu_ps(float *__p, __m128 __a)
1984 {
1985   struct __storeu_ps {
1986     __m128_u __v;
1987   } __attribute__((__packed__, __may_alias__));
1988   ((struct __storeu_ps*)__p)->__v = __a;
1989 }
1990 
1991 /// Stores a 128-bit vector of [4 x float] into an aligned memory
1992 ///    location.
1993 ///
1994 /// \headerfile <x86intrin.h>
1995 ///
1996 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
1997 ///
1998 /// \param __p
1999 ///    A pointer to a 128-bit memory location. The address of the memory
2000 ///    location has to be 16-byte aligned.
2001 /// \param __a
2002 ///    A 128-bit vector of [4 x float] containing the values to be stored.
2003 static __inline__ void __DEFAULT_FN_ATTRS
2004 _mm_store_ps(float *__p, __m128 __a)
2005 {
2006   *(__m128*)__p = __a;
2007 }
2008 
2009 /// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2010 ///    four contiguous elements in an aligned memory location.
2011 ///
2012 /// \headerfile <x86intrin.h>
2013 ///
2014 /// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2015 ///    instruction.
2016 ///
2017 /// \param __p
2018 ///    A pointer to a 128-bit memory location.
2019 /// \param __a
2020 ///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2021 ///    of the four contiguous elements pointed by \a __p.
2022 static __inline__ void __DEFAULT_FN_ATTRS
2023 _mm_store1_ps(float *__p, __m128 __a)
2024 {
2025   __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
2026   _mm_store_ps(__p, __a);
2027 }
2028 
2029 /// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2030 ///    four contiguous elements in an aligned memory location.
2031 ///
2032 /// \headerfile <x86intrin.h>
2033 ///
2034 /// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2035 ///    instruction.
2036 ///
2037 /// \param __p
2038 ///    A pointer to a 128-bit memory location.
2039 /// \param __a
2040 ///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2041 ///    of the four contiguous elements pointed by \a __p.
2042 static __inline__ void __DEFAULT_FN_ATTRS
2043 _mm_store_ps1(float *__p, __m128 __a)
2044 {
2045   _mm_store1_ps(__p, __a);
2046 }
2047 
2048 /// Stores float values from a 128-bit vector of [4 x float] to an
2049 ///    aligned memory location in reverse order.
2050 ///
2051 /// \headerfile <x86intrin.h>
2052 ///
2053 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
2054 ///    instruction.
2055 ///
2056 /// \param __p
2057 ///    A pointer to a 128-bit memory location. The address of the memory
2058 ///    location has to be 128-bit aligned.
2059 /// \param __a
2060 ///    A 128-bit vector of [4 x float] containing the values to be stored.
2061 static __inline__ void __DEFAULT_FN_ATTRS
2062 _mm_storer_ps(float *__p, __m128 __a)
2063 {
2064   __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
2065   _mm_store_ps(__p, __a);
2066 }
2067 
/* Hint selectors for _mm_prefetch(). The _mm_prefetch macro in this header
 * forwards bit 2 of the selector as the read/write argument of
 * __builtin_prefetch and bits [1:0] as its locality argument. */
#define _MM_HINT_NTA 0
#define _MM_HINT_T2  1
#define _MM_HINT_T1  2
#define _MM_HINT_T0  3
#define _MM_HINT_ET1 6
#define _MM_HINT_ET0 7
2074 
2075 #ifndef _MSC_VER
2076 /* FIXME: We have to #define this because "sel" must be a constant integer, and
2077    Sema doesn't do any form of constant propagation yet. */
2078 
/// Loads one cache line of data from the specified address to a location
///    closer to the processor.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _mm_prefetch(const void * a, const int sel);
/// \endcode
///
/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction, or to
///    one of the other PREFETCH instructions listed below, depending on
///    \a sel.
///
/// \param a
///    A pointer to a memory location containing a cache line of data.
/// \param sel
///    A predefined integer constant specifying the type of prefetch
///    operation: \n
///    _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
///    PREFETCHNTA instruction will be generated. \n
///    _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
///    be generated. \n
///    _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
///    be generated. \n
///    _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
///    be generated. \n
///    _MM_HINT_ET0, _MM_HINT_ET1: The T0 / T1 selectors with bit 2 set. Bit 2
///    of \a sel is passed to __builtin_prefetch as its read/write argument
///    and bits [1:0] as its locality argument.
#define _mm_prefetch(a, sel) \
  (__builtin_prefetch((const void *)(a), ((sel) >> 2) & 1, (sel) & 0x3))
2105 #endif
2106 
2107 /// Stores a 64-bit integer in the specified aligned memory location. To
2108 ///    minimize caching, the data is flagged as non-temporal (unlikely to be
2109 ///    used again soon).
2110 ///
2111 /// \headerfile <x86intrin.h>
2112 ///
2113 /// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
2114 ///
2115 /// \param __p
2116 ///    A pointer to an aligned memory location used to store the register value.
2117 /// \param __a
2118 ///    A 64-bit integer containing the value to be stored.
2119 static __inline__ void __DEFAULT_FN_ATTRS_MMX
2120 _mm_stream_pi(__m64 *__p, __m64 __a)
2121 {
2122   __builtin_ia32_movntq(__p, __a);
2123 }
2124 
2125 /// Moves packed float values from a 128-bit vector of [4 x float] to a
2126 ///    128-bit aligned memory location. To minimize caching, the data is flagged
2127 ///    as non-temporal (unlikely to be used again soon).
2128 ///
2129 /// \headerfile <x86intrin.h>
2130 ///
2131 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
2132 ///
2133 /// \param __p
2134 ///    A pointer to a 128-bit aligned memory location that will receive the
2135 ///    single-precision floating-point values.
2136 /// \param __a
2137 ///    A 128-bit vector of [4 x float] containing the values to be moved.
2138 static __inline__ void __DEFAULT_FN_ATTRS
2139 _mm_stream_ps(float *__p, __m128 __a)
2140 {
2141   __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
2142 }
2143 
#ifdef __cplusplus
extern "C" {
#endif

/// Enforces strong memory ordering (serialization) between the store
///    instructions that precede this instruction and those that follow it,
///    so that all earlier stores complete before any later store executes.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
///
void _mm_sfence(void);

#ifdef __cplusplus
} // extern "C"
#endif
2162 
/// Extracts a 16-bit element from a 64-bit vector of [4 x i16] and
///    returns it, as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_pi16(__m64 a, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param n
///    An immediate integer operand that determines which bits are extracted: \n
///    0: Bits [15:0] are copied to the destination. \n
///    1: Bits [31:16] are copied to the destination. \n
///    2: Bits [47:32] are copied to the destination. \n
///    3: Bits [63:48] are copied to the destination.
/// \returns The extracted 16 bits of packed data, as an int.
/* Every argument and the whole expansion are parenthesized so that operator
   precedence cannot regroup an expression argument or the surrounding use. */
#define _mm_extract_pi16(a, n) \
  ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)(a), (int)(n)))
2185 
/// Copies data from the 64-bit vector of [4 x i16] to the destination,
///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
///    specified by the immediate operand \a n.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PINSRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param d
///    An integer. The lower 16-bit value from this operand is written to the
///    destination at the offset specified by operand \a n.
/// \param n
///    An immediate integer operand that determines which bits of the
///    destination receive the value from \a d. \n
///    0: Bits [15:0] are copied to the destination. \n
///    1: Bits [31:16] are copied to the destination. \n
///    2: Bits [47:32] are copied to the destination. \n
///    3: Bits [63:48] are copied to the destination.  \n
///    The remaining bits in the destination are copied from the corresponding
///    bits in operand \a a.
/// \returns A 64-bit integer vector containing the copied packed data from the
///    operands.
/* Every argument and the whole expansion are parenthesized so that operator
   precedence cannot regroup an expression argument or the surrounding use. */
#define _mm_insert_pi16(a, d, n) \
  ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)(a), (int)(d), (int)(n)))
2216 
2217 /// Compares each of the corresponding packed 16-bit integer values of
2218 ///    the 64-bit integer vectors, and writes the greater value to the
2219 ///    corresponding bits in the destination.
2220 ///
2221 /// \headerfile <x86intrin.h>
2222 ///
2223 /// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
2224 ///
2225 /// \param __a
2226 ///    A 64-bit integer vector containing one of the source operands.
2227 /// \param __b
2228 ///    A 64-bit integer vector containing one of the source operands.
2229 /// \returns A 64-bit integer vector containing the comparison results.
2230 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2231 _mm_max_pi16(__m64 __a, __m64 __b)
2232 {
2233   return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
2234 }
2235 
2236 /// Compares each of the corresponding packed 8-bit unsigned integer
2237 ///    values of the 64-bit integer vectors, and writes the greater value to the
2238 ///    corresponding bits in the destination.
2239 ///
2240 /// \headerfile <x86intrin.h>
2241 ///
2242 /// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
2243 ///
2244 /// \param __a
2245 ///    A 64-bit integer vector containing one of the source operands.
2246 /// \param __b
2247 ///    A 64-bit integer vector containing one of the source operands.
2248 /// \returns A 64-bit integer vector containing the comparison results.
2249 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2250 _mm_max_pu8(__m64 __a, __m64 __b)
2251 {
2252   return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
2253 }
2254 
2255 /// Compares each of the corresponding packed 16-bit integer values of
2256 ///    the 64-bit integer vectors, and writes the lesser value to the
2257 ///    corresponding bits in the destination.
2258 ///
2259 /// \headerfile <x86intrin.h>
2260 ///
2261 /// This intrinsic corresponds to the <c> PMINSW </c> instruction.
2262 ///
2263 /// \param __a
2264 ///    A 64-bit integer vector containing one of the source operands.
2265 /// \param __b
2266 ///    A 64-bit integer vector containing one of the source operands.
2267 /// \returns A 64-bit integer vector containing the comparison results.
2268 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2269 _mm_min_pi16(__m64 __a, __m64 __b)
2270 {
2271   return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
2272 }
2273 
2274 /// Compares each of the corresponding packed 8-bit unsigned integer
2275 ///    values of the 64-bit integer vectors, and writes the lesser value to the
2276 ///    corresponding bits in the destination.
2277 ///
2278 /// \headerfile <x86intrin.h>
2279 ///
2280 /// This intrinsic corresponds to the <c> PMINUB </c> instruction.
2281 ///
2282 /// \param __a
2283 ///    A 64-bit integer vector containing one of the source operands.
2284 /// \param __b
2285 ///    A 64-bit integer vector containing one of the source operands.
2286 /// \returns A 64-bit integer vector containing the comparison results.
2287 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2288 _mm_min_pu8(__m64 __a, __m64 __b)
2289 {
2290   return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
2291 }
2292 
2293 /// Takes the most significant bit from each 8-bit element in a 64-bit
2294 ///    integer vector to create an 8-bit mask value. Zero-extends the value to
2295 ///    32-bit integer and writes it to the destination.
2296 ///
2297 /// \headerfile <x86intrin.h>
2298 ///
2299 /// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
2300 ///
2301 /// \param __a
2302 ///    A 64-bit integer vector containing the values with bits to be extracted.
2303 /// \returns The most significant bit from each 8-bit element in \a __a,
2304 ///    written to bits [7:0].
2305 static __inline__ int __DEFAULT_FN_ATTRS_MMX
2306 _mm_movemask_pi8(__m64 __a)
2307 {
2308   return __builtin_ia32_pmovmskb((__v8qi)__a);
2309 }
2310 
2311 /// Multiplies packed 16-bit unsigned integer values and writes the
2312 ///    high-order 16 bits of each 32-bit product to the corresponding bits in
2313 ///    the destination.
2314 ///
2315 /// \headerfile <x86intrin.h>
2316 ///
2317 /// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
2318 ///
2319 /// \param __a
2320 ///    A 64-bit integer vector containing one of the source operands.
2321 /// \param __b
2322 ///    A 64-bit integer vector containing one of the source operands.
2323 /// \returns A 64-bit integer vector containing the products of both operands.
2324 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2325 _mm_mulhi_pu16(__m64 __a, __m64 __b)
2326 {
2327   return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
2328 }
2329 
/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
///    destination, as specified by the immediate value operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
///
/// \param a
///    A 64-bit integer vector containing the values to be shuffled.
/// \param n
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a. The destinations within the 64-bit destination are
///    assigned values as follows: \n
///    Bits [1:0] are used to assign values to bits [15:0] in the
///    destination. \n
///    Bits [3:2] are used to assign values to bits [31:16] in the
///    destination. \n
///    Bits [5:4] are used to assign values to bits [47:32] in the
///    destination. \n
///    Bits [7:6] are used to assign values to bits [63:48] in the
///    destination. \n
///    Bit value assignments: \n
///    00: assigned from bits [15:0] of \a a. \n
///    01: assigned from bits [31:16] of \a a. \n
///    10: assigned from bits [47:32] of \a a. \n
///    11: assigned from bits [63:48] of \a a.
/// \returns A 64-bit integer vector containing the shuffled values.
/* The expansion is wrapped in parentheses so the leading cast cannot be
   regrouped with operators that follow the macro invocation. */
#define _mm_shuffle_pi16(a, n) \
  ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
2363 
2364 /// Conditionally copies the values from each 8-bit element in the first
2365 ///    64-bit integer vector operand to the specified memory location, as
2366 ///    specified by the most significant bit in the corresponding element in the
2367 ///    second 64-bit integer vector operand.
2368 ///
2369 ///    To minimize caching, the data is flagged as non-temporal
2370 ///    (unlikely to be used again soon).
2371 ///
2372 /// \headerfile <x86intrin.h>
2373 ///
2374 /// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
2375 ///
2376 /// \param __d
2377 ///    A 64-bit integer vector containing the values with elements to be copied.
2378 /// \param __n
2379 ///    A 64-bit integer vector operand. The most significant bit from each 8-bit
2380 ///    element determines whether the corresponding element in operand \a __d
2381 ///    is copied. If the most significant bit of a given element is 1, the
2382 ///    corresponding element in operand \a __d is copied.
2383 /// \param __p
2384 ///    A pointer to a 64-bit memory location that will receive the conditionally
2385 ///    copied integer values. The address of the memory location does not have
2386 ///    to be aligned.
2387 static __inline__ void __DEFAULT_FN_ATTRS_MMX
2388 _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
2389 {
2390   __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
2391 }
2392 
2393 /// Computes the rounded averages of the packed unsigned 8-bit integer
2394 ///    values and writes the averages to the corresponding bits in the
2395 ///    destination.
2396 ///
2397 /// \headerfile <x86intrin.h>
2398 ///
2399 /// This intrinsic corresponds to the <c> PAVGB </c> instruction.
2400 ///
2401 /// \param __a
2402 ///    A 64-bit integer vector containing one of the source operands.
2403 /// \param __b
2404 ///    A 64-bit integer vector containing one of the source operands.
2405 /// \returns A 64-bit integer vector containing the averages of both operands.
2406 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2407 _mm_avg_pu8(__m64 __a, __m64 __b)
2408 {
2409   return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
2410 }
2411 
2412 /// Computes the rounded averages of the packed unsigned 16-bit integer
2413 ///    values and writes the averages to the corresponding bits in the
2414 ///    destination.
2415 ///
2416 /// \headerfile <x86intrin.h>
2417 ///
2418 /// This intrinsic corresponds to the <c> PAVGW </c> instruction.
2419 ///
2420 /// \param __a
2421 ///    A 64-bit integer vector containing one of the source operands.
2422 /// \param __b
2423 ///    A 64-bit integer vector containing one of the source operands.
2424 /// \returns A 64-bit integer vector containing the averages of both operands.
2425 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2426 _mm_avg_pu16(__m64 __a, __m64 __b)
2427 {
2428   return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
2429 }
2430 
2431 /// Subtracts the corresponding 8-bit unsigned integer values of the two
2432 ///    64-bit vector operands and computes the absolute value for each of the
2433 ///    difference. Then sum of the 8 absolute differences is written to the
2434 ///    bits [15:0] of the destination; the remaining bits [63:16] are cleared.
2435 ///
2436 /// \headerfile <x86intrin.h>
2437 ///
2438 /// This intrinsic corresponds to the <c> PSADBW </c> instruction.
2439 ///
2440 /// \param __a
2441 ///    A 64-bit integer vector containing one of the source operands.
2442 /// \param __b
2443 ///    A 64-bit integer vector containing one of the source operands.
2444 /// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
2445 ///    sets of absolute differences between both operands. The upper bits are
2446 ///    cleared.
2447 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2448 _mm_sad_pu8(__m64 __a, __m64 __b)
2449 {
2450   return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
2451 }
2452 
2453 #if defined(__cplusplus)
2454 extern "C" {
2455 #endif
2456 
/// Reads the MXCSR register and returns its contents as a 32-bit unsigned
///    integer value.
///
///    The returned value is normally examined with one of the following
///    groups of macros:
///    <ul>
///    <li>
///      Exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
///      _MM_EXCEPT_INEXACT. The convenience wrapper
///      _MM_GET_EXCEPTION_STATE() extracts this group.
///    </li>
///    <li>
///      Exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO,
///      _MM_MASK_INEXACT. The convenience wrapper _MM_GET_EXCEPTION_MASK()
///      extracts this group.
///    </li>
///    <li>
///      Rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
///      _MM_ROUND_TOWARD_ZERO. The convenience wrapper
///      _MM_GET_ROUNDING_MODE() extracts this group.
///    </li>
///    <li>
///      Flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF. The
///      convenience wrapper _MM_GET_FLUSH_ZERO_MODE() extracts this group.
///    </li>
///    <li>
///      Denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
///      _MM_DENORMALS_ZERO_OFF. The convenience wrapper
///      _MM_GET_DENORMALS_ZERO_MODE() extracts this group.
///    </li>
///    </ul>
///
///    For example, the following expression is non-zero if an overflow
///    exception has occurred:
///    \code
///      ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
///    \endcode
///
///    The following expression yields the current rounding mode:
///    \code
///      _MM_GET_ROUNDING_MODE()
///    \endcode
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
///
/// \returns A 32-bit unsigned integer containing the contents of the MXCSR
///    register.
unsigned int _mm_getcsr(void);
2508 
/// Writes the 32-bit unsigned integer value to the MXCSR register.
///
///    The value is normally composed with one of the following groups of
///    macros:
///    <ul>
///    <li>
///      For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
///      _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
///    </li>
///    <li>
///      For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
///      There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
///      of these macros.
///    </li>
///    <li>
///      For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
///      _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
///    </li>
///    <li>
///      For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
///      There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
///      one of these macros.
///    </li>
///    <li>
///      For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
///      _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
///    </li>
///    </ul>
///
///    For example, the following expression causes subsequent floating-point
///    operations to round up, whatever rounding mode was selected before:
///    \code
///      _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | _MM_ROUND_UP)
///    \endcode
///    The rounding field has to be cleared before the new mode is OR-ed in:
///    combining _MM_ROUND_UP with an already selected _MM_ROUND_DOWN would
///    produce the bit pattern of _MM_ROUND_TOWARD_ZERO instead.
///
///    The following example sets the DAZ and FTZ flags:
///    \code
///    void setFlags() {
///      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
///      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
///    }
///    \endcode
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
///
/// \param __i
///    A 32-bit unsigned integer value to be written to the MXCSR register.
void _mm_setcsr(unsigned int __i);
2562 
2563 #if defined(__cplusplus)
2564 } // extern "C"
2565 #endif
2566 
/// Selects 4 float values from the two 128-bit operands of [4 x float], as
///    specified by the immediate value operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param mask
///    An immediate 8-bit value made of four 2-bit selector fields that choose
///    which elements are copied from \a a and \a b. \n
///    Bits [3:0] specify the values copied from operand \a a. \n
///    Bits [7:4] specify the values copied from operand \a b. \n
///    The selector fields map onto the 128-bit destination as follows: \n
///    Bits [1:0] select the source of bits [31:0] in the destination. \n
///    Bits [3:2] select the source of bits [63:32] in the destination. \n
///    Bits [5:4] select the source of bits [95:64] in the destination. \n
///    Bits [7:6] select the source of bits [127:96] in the destination. \n
///    Every selector field is decoded as follows: \n
///    00: Bits [31:0] copied from the specified operand. \n
///    01: Bits [63:32] copied from the specified operand. \n
///    10: Bits [95:64] copied from the specified operand. \n
///    11: Bits [127:96] copied from the specified operand.
/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
/* The whole expansion is parenthesized: a cast binds less tightly than a
 * postfix operator, so without the outer parentheses an expression such as
 * _mm_shuffle_ps(a, b, m)[0] would subscript the builtin call and only then
 * apply the (__m128) cast. */
#define _mm_shuffle_ps(a, b, mask) \
  ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
                                 (int)(mask)))
2606 
2607 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
2608 ///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2609 ///
2610 /// \headerfile <x86intrin.h>
2611 ///
2612 /// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
2613 ///
2614 /// \param __a
2615 ///    A 128-bit vector of [4 x float]. \n
2616 ///    Bits [95:64] are written to bits [31:0] of the destination. \n
2617 ///    Bits [127:96] are written to bits [95:64] of the destination.
2618 /// \param __b
2619 ///    A 128-bit vector of [4 x float].
2620 ///    Bits [95:64] are written to bits [63:32] of the destination. \n
2621 ///    Bits [127:96] are written to bits [127:96] of the destination.
2622 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2623 static __inline__ __m128 __DEFAULT_FN_ATTRS
2624 _mm_unpackhi_ps(__m128 __a, __m128 __b)
2625 {
2626   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
2627 }
2628 
2629 /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
2630 ///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2631 ///
2632 /// \headerfile <x86intrin.h>
2633 ///
2634 /// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
2635 ///
2636 /// \param __a
2637 ///    A 128-bit vector of [4 x float]. \n
2638 ///    Bits [31:0] are written to bits [31:0] of the destination.  \n
2639 ///    Bits [63:32] are written to bits [95:64] of the destination.
2640 /// \param __b
2641 ///    A 128-bit vector of [4 x float]. \n
2642 ///    Bits [31:0] are written to bits [63:32] of the destination. \n
2643 ///    Bits [63:32] are written to bits [127:96] of the destination.
2644 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2645 static __inline__ __m128 __DEFAULT_FN_ATTRS
2646 _mm_unpacklo_ps(__m128 __a, __m128 __b)
2647 {
2648   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
2649 }
2650 
2651 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2652 ///    32 bits are set to the lower 32 bits of the second parameter. The upper
2653 ///    96 bits are set to the upper 96 bits of the first parameter.
2654 ///
2655 /// \headerfile <x86intrin.h>
2656 ///
2657 /// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c>
2658 ///    instruction.
2659 ///
2660 /// \param __a
2661 ///    A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
2662 ///    written to the upper 96 bits of the result.
2663 /// \param __b
2664 ///    A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
2665 ///    written to the lower 32 bits of the result.
2666 /// \returns A 128-bit floating-point vector of [4 x float].
2667 static __inline__ __m128 __DEFAULT_FN_ATTRS
2668 _mm_move_ss(__m128 __a, __m128 __b)
2669 {
2670   __a[0] = __b[0];
2671   return __a;
2672 }
2673 
2674 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2675 ///    64 bits are set to the upper 64 bits of the second parameter. The upper
2676 ///    64 bits are set to the upper 64 bits of the first parameter.
2677 ///
2678 /// \headerfile <x86intrin.h>
2679 ///
2680 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
2681 ///
2682 /// \param __a
2683 ///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2684 ///    written to the upper 64 bits of the result.
2685 /// \param __b
2686 ///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2687 ///    written to the lower 64 bits of the result.
2688 /// \returns A 128-bit floating-point vector of [4 x float].
2689 static __inline__ __m128 __DEFAULT_FN_ATTRS
2690 _mm_movehl_ps(__m128 __a, __m128 __b)
2691 {
2692   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
2693 }
2694 
2695 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2696 ///    64 bits are set to the lower 64 bits of the first parameter. The upper
2697 ///    64 bits are set to the lower 64 bits of the second parameter.
2698 ///
2699 /// \headerfile <x86intrin.h>
2700 ///
2701 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
2702 ///
2703 /// \param __a
2704 ///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2705 ///    written to the lower 64 bits of the result.
2706 /// \param __b
2707 ///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2708 ///    written to the upper 64 bits of the result.
2709 /// \returns A 128-bit floating-point vector of [4 x float].
2710 static __inline__ __m128 __DEFAULT_FN_ATTRS
2711 _mm_movelh_ps(__m128 __a, __m128 __b)
2712 {
2713   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
2714 }
2715 
2716 /// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
2717 ///    float].
2718 ///
2719 /// \headerfile <x86intrin.h>
2720 ///
2721 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2722 ///
2723 /// \param __a
2724 ///    A 64-bit vector of [4 x i16]. The elements of the destination are copied
2725 ///    from the corresponding elements in this operand.
2726 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2727 ///    values from the operand.
2728 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2729 _mm_cvtpi16_ps(__m64 __a)
2730 {
2731   __m64 __b, __c;
2732   __m128 __r;
2733 
2734   __b = _mm_setzero_si64();
2735   __b = _mm_cmpgt_pi16(__b, __a);
2736   __c = _mm_unpackhi_pi16(__a, __b);
2737   __r = _mm_setzero_ps();
2738   __r = _mm_cvtpi32_ps(__r, __c);
2739   __r = _mm_movelh_ps(__r, __r);
2740   __c = _mm_unpacklo_pi16(__a, __b);
2741   __r = _mm_cvtpi32_ps(__r, __c);
2742 
2743   return __r;
2744 }
2745 
2746 /// Converts a 64-bit vector of 16-bit unsigned integer values into a
2747 ///    128-bit vector of [4 x float].
2748 ///
2749 /// \headerfile <x86intrin.h>
2750 ///
2751 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2752 ///
2753 /// \param __a
2754 ///    A 64-bit vector of 16-bit unsigned integer values. The elements of the
2755 ///    destination are copied from the corresponding elements in this operand.
2756 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2757 ///    values from the operand.
2758 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2759 _mm_cvtpu16_ps(__m64 __a)
2760 {
2761   __m64 __b, __c;
2762   __m128 __r;
2763 
2764   __b = _mm_setzero_si64();
2765   __c = _mm_unpackhi_pi16(__a, __b);
2766   __r = _mm_setzero_ps();
2767   __r = _mm_cvtpi32_ps(__r, __c);
2768   __r = _mm_movelh_ps(__r, __r);
2769   __c = _mm_unpacklo_pi16(__a, __b);
2770   __r = _mm_cvtpi32_ps(__r, __c);
2771 
2772   return __r;
2773 }
2774 
2775 /// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
2776 ///    into a 128-bit vector of [4 x float].
2777 ///
2778 /// \headerfile <x86intrin.h>
2779 ///
2780 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2781 ///
2782 /// \param __a
2783 ///    A 64-bit vector of [8 x i8]. The elements of the destination are copied
2784 ///    from the corresponding lower 4 elements in this operand.
2785 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2786 ///    values from the operand.
2787 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2788 _mm_cvtpi8_ps(__m64 __a)
2789 {
2790   __m64 __b;
2791 
2792   __b = _mm_setzero_si64();
2793   __b = _mm_cmpgt_pi8(__b, __a);
2794   __b = _mm_unpacklo_pi8(__a, __b);
2795 
2796   return _mm_cvtpi16_ps(__b);
2797 }
2798 
2799 /// Converts the lower four unsigned 8-bit integer values from a 64-bit
2800 ///    vector of [8 x u8] into a 128-bit vector of [4 x float].
2801 ///
2802 /// \headerfile <x86intrin.h>
2803 ///
2804 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2805 ///
2806 /// \param __a
2807 ///    A 64-bit vector of unsigned 8-bit integer values. The elements of the
2808 ///    destination are copied from the corresponding lower 4 elements in this
2809 ///    operand.
2810 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2811 ///    values from the source operand.
2812 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2813 _mm_cvtpu8_ps(__m64 __a)
2814 {
2815   __m64 __b;
2816 
2817   __b = _mm_setzero_si64();
2818   __b = _mm_unpacklo_pi8(__a, __b);
2819 
2820   return _mm_cvtpi16_ps(__b);
2821 }
2822 
2823 /// Converts the two 32-bit signed integer values from each 64-bit vector
2824 ///    operand of [2 x i32] into a 128-bit vector of [4 x float].
2825 ///
2826 /// \headerfile <x86intrin.h>
2827 ///
2828 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2829 ///
2830 /// \param __a
2831 ///    A 64-bit vector of [2 x i32]. The lower elements of the destination are
2832 ///    copied from the elements in this operand.
2833 /// \param __b
2834 ///    A 64-bit vector of [2 x i32]. The upper elements of the destination are
2835 ///    copied from the elements in this operand.
2836 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
2837 ///    copied and converted values from the first operand. The upper 64 bits
2838 ///    contain the copied and converted values from the second operand.
2839 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2840 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
2841 {
2842   __m128 __c;
2843 
2844   __c = _mm_setzero_ps();
2845   __c = _mm_cvtpi32_ps(__c, __b);
2846   __c = _mm_movelh_ps(__c, __c);
2847 
2848   return _mm_cvtpi32_ps(__c, __a);
2849 }
2850 
2851 /// Converts each single-precision floating-point element of a 128-bit
2852 ///    floating-point vector of [4 x float] into a 16-bit signed integer, and
2853 ///    packs the results into a 64-bit integer vector of [4 x i16].
2854 ///
2855 ///    If the floating-point element is NaN or infinity, or if the
2856 ///    floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
2857 ///    it is converted to 0x8000. Otherwise if the floating-point element is
2858 ///    greater than 0x7FFF, it is converted to 0x7FFF.
2859 ///
2860 /// \headerfile <x86intrin.h>
2861 ///
2862 /// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2863 ///
2864 /// \param __a
2865 ///    A 128-bit floating-point vector of [4 x float].
2866 /// \returns A 64-bit integer vector of [4 x i16] containing the converted
2867 ///    values.
2868 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2869 _mm_cvtps_pi16(__m128 __a)
2870 {
2871   __m64 __b, __c;
2872 
2873   __b = _mm_cvtps_pi32(__a);
2874   __a = _mm_movehl_ps(__a, __a);
2875   __c = _mm_cvtps_pi32(__a);
2876 
2877   return _mm_packs_pi32(__b, __c);
2878 }
2879 
2880 /// Converts each single-precision floating-point element of a 128-bit
2881 ///    floating-point vector of [4 x float] into an 8-bit signed integer, and
2882 ///    packs the results into the lower 32 bits of a 64-bit integer vector of
2883 ///    [8 x i8]. The upper 32 bits of the vector are set to 0.
2884 ///
2885 ///    If the floating-point element is NaN or infinity, or if the
2886 ///    floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
2887 ///    is converted to 0x80. Otherwise if the floating-point element is greater
2888 ///    than 0x7F, it is converted to 0x7F.
2889 ///
2890 /// \headerfile <x86intrin.h>
2891 ///
2892 /// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2893 ///
2894 /// \param __a
2895 ///    128-bit floating-point vector of [4 x float].
2896 /// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
2897 ///    converted values and the uppper 32 bits are set to zero.
2898 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2899 _mm_cvtps_pi8(__m128 __a)
2900 {
2901   __m64 __b, __c;
2902 
2903   __b = _mm_cvtps_pi16(__a);
2904   __c = _mm_setzero_si64();
2905 
2906   return _mm_packs_pi16(__b, __c);
2907 }
2908 
2909 /// Extracts the sign bits from each single-precision floating-point
2910 ///    element of a 128-bit floating-point vector of [4 x float] and returns the
2911 ///    sign bits in bits [0:3] of the result. Bits [31:4] of the result are set
2912 ///    to zero.
2913 ///
2914 /// \headerfile <x86intrin.h>
2915 ///
2916 /// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
2917 ///
2918 /// \param __a
2919 ///    A 128-bit floating-point vector of [4 x float].
2920 /// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
2921 ///    single-precision floating-point element of the parameter. Bits [31:4] are
2922 ///    set to zero.
2923 static __inline__ int __DEFAULT_FN_ATTRS
2924 _mm_movemask_ps(__m128 __a)
2925 {
2926   return __builtin_ia32_movmskps((__v4sf)__a);
2927 }
2928 
2929 
/* Attribute requesting 16-byte alignment for the declaration it is applied
 * to. The reserved __aligned__ spelling is used, as in this header's own
 * typedefs, so that a user macro named `aligned` cannot alter it. */
#define _MM_ALIGN16 __attribute__((__aligned__(16)))

/* Packs four 2-bit selectors into one 8-bit immediate: z lands in bits
 * [7:6], y in bits [5:4], x in bits [3:2] and w in bits [1:0]. */
#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))

/* MXCSR exception state flags; _MM_EXCEPT_MASK covers all of them. */
#define _MM_EXCEPT_INVALID    (0x0001U)
#define _MM_EXCEPT_DENORM     (0x0002U)
#define _MM_EXCEPT_DIV_ZERO   (0x0004U)
#define _MM_EXCEPT_OVERFLOW   (0x0008U)
#define _MM_EXCEPT_UNDERFLOW  (0x0010U)
#define _MM_EXCEPT_INEXACT    (0x0020U)
#define _MM_EXCEPT_MASK       (0x003fU)

/* MXCSR exception mask bits; _MM_MASK_MASK covers all of them. */
#define _MM_MASK_INVALID      (0x0080U)
#define _MM_MASK_DENORM       (0x0100U)
#define _MM_MASK_DIV_ZERO     (0x0200U)
#define _MM_MASK_OVERFLOW     (0x0400U)
#define _MM_MASK_UNDERFLOW    (0x0800U)
#define _MM_MASK_INEXACT      (0x1000U)
#define _MM_MASK_MASK         (0x1f80U)

/* MXCSR rounding modes; _MM_ROUND_MASK covers the whole rounding field. */
#define _MM_ROUND_NEAREST     (0x0000U)
#define _MM_ROUND_DOWN        (0x2000U)
#define _MM_ROUND_UP          (0x4000U)
#define _MM_ROUND_TOWARD_ZERO (0x6000U)
#define _MM_ROUND_MASK        (0x6000U)

/* MXCSR flush-to-zero mode. */
#define _MM_FLUSH_ZERO_MASK   (0x8000U)
#define _MM_FLUSH_ZERO_ON     (0x8000U)
#define _MM_FLUSH_ZERO_OFF    (0x0000U)

/* Read one field of MXCSR: the register value masked to that field. */
#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)

/* Replace one field of MXCSR: the field is cleared in the current register
 * value, x is OR-ed in, and the result is written back. */
#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
2969 
/* Transposes in place the 4x4 matrix of floats whose rows are the four
 * __m128 lvalues row0..row3: afterwards, element j of row i holds what was
 * element i of row j.
 *
 * Every argument is expanded more than once (read and then assigned), so
 * each must be a side-effect-free lvalue. The temporaries use reserved
 * double-underscore names so that caller variables called tmp0..tmp3 passed
 * as rows are not captured by the macro's own locals. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 __tmp3, __tmp2, __tmp1, __tmp0; \
  __tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  __tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  __tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  __tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(__tmp0, __tmp2); \
  (row1) = _mm_movehl_ps(__tmp2, __tmp0); \
  (row2) = _mm_movelh_ps(__tmp1, __tmp3); \
  (row3) = _mm_movehl_ps(__tmp3, __tmp1); \
} while (0)
2982 
/* Aliases for compatibility: each _m_-prefixed name expands to the
 * corresponding _mm_ intrinsic declared in this header. */
#define _m_pextrw _mm_extract_pi16
#define _m_pinsrw _mm_insert_pi16
#define _m_pmaxsw _mm_max_pi16
#define _m_pmaxub _mm_max_pu8
#define _m_pminsw _mm_min_pi16
#define _m_pminub _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw _mm_mulhi_pu16
#define _m_pshufw _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb _mm_avg_pu8
#define _m_pavgw _mm_avg_pu16
#define _m_psadbw _mm_sad_pu8
/* Defined once; the identical second definition was redundant. */
#define _m_ _mm_
2999 
3000 #undef __DEFAULT_FN_ATTRS
3001 #undef __DEFAULT_FN_ATTRS_MMX
3002 
3003 /* Ugly hack for backwards-compatibility (compatible with gcc) */
3004 #if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
3005 #include <emmintrin.h>
3006 #endif
3007 
3008 #endif /* __XMMINTRIN_H */
3009