xref: /freebsd/contrib/llvm-project/clang/lib/Headers/xmmintrin.h (revision b64c5a0ace59af62eff52bfe110a521dc73c937b)
1 /*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 #ifndef __XMMINTRIN_H
11 #define __XMMINTRIN_H
12 
13 #if !defined(__i386__) && !defined(__x86_64__)
14 #error "This header is only meant to be used on x86 and x64 architecture"
15 #endif
16 
17 #include <mmintrin.h>
18 
/* 16-byte vector types used by the SSE intrinsics in this header. */
/* __v4si: vector of int elements, 16 bytes wide. */
19 typedef int __v4si __attribute__((__vector_size__(16)));
/* __v4sf: vector of float elements, 16 bytes wide; the working type passed to
 * the builtins and vector operators below. */
20 typedef float __v4sf __attribute__((__vector_size__(16)));
/* __m128: the public 16-byte float vector type, explicitly 16-byte aligned. */
21 typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
22 
/* __m128_u: same layout as __m128 but declared with 1-byte alignment. */
23 typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));
24 
25 /* Unsigned types */
/* __v4su: vector of unsigned int elements, 16 bytes wide; used for the
 * bitwise operations on float vectors. */
26 typedef unsigned int __v4su __attribute__((__vector_size__(16)));
27 
28 /* This header should only be included in a hosted environment as it depends on
29  * a standard library to provide allocation routines. */
30 #if __STDC_HOSTED__
31 #include <mm_malloc.h>
32 #endif
33 
34 /* Define the default attributes for the functions in this file. */
/* __DEFAULT_FN_ATTRS: applied to every intrinsic defined in this header.
 * Requests always-inline, no debug info, the "sse,no-evex512" target feature
 * string, and a minimum vector width of 128 bits. */
35 #define __DEFAULT_FN_ATTRS                                                     \
36   __attribute__((__always_inline__, __nodebug__, __target__("sse,no-evex512"), \
37                  __min_vector_width__(128)))
/* __DEFAULT_FN_ATTRS_MMX: same as above, but the target string also names
 * "mmx" and the minimum vector width is 64 bits. */
38 #define __DEFAULT_FN_ATTRS_MMX                                                 \
39   __attribute__((__always_inline__, __nodebug__,                               \
40                  __target__("mmx,sse,no-evex512"), __min_vector_width__(64)))
41 
42 /// Adds the 32-bit float values in the low-order bits of the operands.
43 ///
44 /// \headerfile <x86intrin.h>
45 ///
46 /// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
47 ///
48 /// \param __a
49 ///    A 128-bit vector of [4 x float] containing one of the source operands.
50 ///    The lower 32 bits of this operand are used in the calculation.
51 /// \param __b
52 ///    A 128-bit vector of [4 x float] containing one of the source operands.
53 ///    The lower 32 bits of this operand are used in the calculation.
54 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
55 ///    of the lower 32 bits of both operands. The upper 96 bits are copied from
56 ///    the upper 96 bits of the first source operand.
57 static __inline__ __m128 __DEFAULT_FN_ATTRS
58 _mm_add_ss(__m128 __a, __m128 __b)
59 {
60   __a[0] += __b[0];
61   return __a;
62 }
63 
64 /// Adds two 128-bit vectors of [4 x float], and returns the results of
65 ///    the addition.
66 ///
67 /// \headerfile <x86intrin.h>
68 ///
69 /// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
70 ///
71 /// \param __a
72 ///    A 128-bit vector of [4 x float] containing one of the source operands.
73 /// \param __b
74 ///    A 128-bit vector of [4 x float] containing one of the source operands.
75 /// \returns A 128-bit vector of [4 x float] containing the sums of both
76 ///    operands.
77 static __inline__ __m128 __DEFAULT_FN_ATTRS
78 _mm_add_ps(__m128 __a, __m128 __b)
79 {
80   return (__m128)((__v4sf)__a + (__v4sf)__b);
81 }
82 
83 /// Subtracts the 32-bit float value in the low-order bits of the second
84 ///    operand from the corresponding value in the first operand.
85 ///
86 /// \headerfile <x86intrin.h>
87 ///
88 /// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
89 ///
90 /// \param __a
91 ///    A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
92 ///    of this operand are used in the calculation.
93 /// \param __b
94 ///    A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
95 ///    bits of this operand are used in the calculation.
96 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
97 ///    difference of the lower 32 bits of both operands. The upper 96 bits are
98 ///    copied from the upper 96 bits of the first source operand.
99 static __inline__ __m128 __DEFAULT_FN_ATTRS
100 _mm_sub_ss(__m128 __a, __m128 __b)
101 {
102   __a[0] -= __b[0];
103   return __a;
104 }
105 
106 /// Subtracts each of the values of the second operand from the first
107 ///    operand, both of which are 128-bit vectors of [4 x float] and returns
108 ///    the results of the subtraction.
109 ///
110 /// \headerfile <x86intrin.h>
111 ///
112 /// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
113 ///
114 /// \param __a
115 ///    A 128-bit vector of [4 x float] containing the minuend.
116 /// \param __b
117 ///    A 128-bit vector of [4 x float] containing the subtrahend.
118 /// \returns A 128-bit vector of [4 x float] containing the differences between
119 ///    both operands.
120 static __inline__ __m128 __DEFAULT_FN_ATTRS
121 _mm_sub_ps(__m128 __a, __m128 __b)
122 {
123   return (__m128)((__v4sf)__a - (__v4sf)__b);
124 }
125 
126 /// Multiplies two 32-bit float values in the low-order bits of the
127 ///    operands.
128 ///
129 /// \headerfile <x86intrin.h>
130 ///
131 /// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
132 ///
133 /// \param __a
134 ///    A 128-bit vector of [4 x float] containing one of the source operands.
135 ///    The lower 32 bits of this operand are used in the calculation.
136 /// \param __b
137 ///    A 128-bit vector of [4 x float] containing one of the source operands.
138 ///    The lower 32 bits of this operand are used in the calculation.
139 /// \returns A 128-bit vector of [4 x float] containing the product of the lower
140 ///    32 bits of both operands. The upper 96 bits are copied from the upper 96
141 ///    bits of the first source operand.
142 static __inline__ __m128 __DEFAULT_FN_ATTRS
143 _mm_mul_ss(__m128 __a, __m128 __b)
144 {
145   __a[0] *= __b[0];
146   return __a;
147 }
148 
149 /// Multiplies two 128-bit vectors of [4 x float] and returns the
150 ///    results of the multiplication.
151 ///
152 /// \headerfile <x86intrin.h>
153 ///
154 /// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
155 ///
156 /// \param __a
157 ///    A 128-bit vector of [4 x float] containing one of the source operands.
158 /// \param __b
159 ///    A 128-bit vector of [4 x float] containing one of the source operands.
160 /// \returns A 128-bit vector of [4 x float] containing the products of both
161 ///    operands.
162 static __inline__ __m128 __DEFAULT_FN_ATTRS
163 _mm_mul_ps(__m128 __a, __m128 __b)
164 {
165   return (__m128)((__v4sf)__a * (__v4sf)__b);
166 }
167 
168 /// Divides the value in the low-order 32 bits of the first operand by
169 ///    the corresponding value in the second operand.
170 ///
171 /// \headerfile <x86intrin.h>
172 ///
173 /// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
174 ///
175 /// \param __a
176 ///    A 128-bit vector of [4 x float] containing the dividend. The lower 32
177 ///    bits of this operand are used in the calculation.
178 /// \param __b
179 ///    A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
180 ///    of this operand are used in the calculation.
181 /// \returns A 128-bit vector of [4 x float] containing the quotient of the
182 ///    lower 32 bits of both operands. The upper 96 bits are copied from the
183 ///    upper 96 bits of the first source operand.
184 static __inline__ __m128 __DEFAULT_FN_ATTRS
185 _mm_div_ss(__m128 __a, __m128 __b)
186 {
187   __a[0] /= __b[0];
188   return __a;
189 }
190 
191 /// Divides two 128-bit vectors of [4 x float].
192 ///
193 /// \headerfile <x86intrin.h>
194 ///
195 /// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
196 ///
197 /// \param __a
198 ///    A 128-bit vector of [4 x float] containing the dividend.
199 /// \param __b
200 ///    A 128-bit vector of [4 x float] containing the divisor.
201 /// \returns A 128-bit vector of [4 x float] containing the quotients of both
202 ///    operands.
203 static __inline__ __m128 __DEFAULT_FN_ATTRS
204 _mm_div_ps(__m128 __a, __m128 __b)
205 {
206   return (__m128)((__v4sf)__a / (__v4sf)__b);
207 }
208 
209 /// Calculates the square root of the value stored in the low-order bits
210 ///    of a 128-bit vector of [4 x float].
211 ///
212 /// \headerfile <x86intrin.h>
213 ///
214 /// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
215 ///
216 /// \param __a
217 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
218 ///    used in the calculation.
219 /// \returns A 128-bit vector of [4 x float] containing the square root of the
220 ///    value in the low-order bits of the operand.
221 static __inline__ __m128 __DEFAULT_FN_ATTRS
222 _mm_sqrt_ss(__m128 __a)
223 {
224   return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
225 }
226 
227 /// Calculates the square roots of the values stored in a 128-bit vector
228 ///    of [4 x float].
229 ///
230 /// \headerfile <x86intrin.h>
231 ///
232 /// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
233 ///
234 /// \param __a
235 ///    A 128-bit vector of [4 x float].
236 /// \returns A 128-bit vector of [4 x float] containing the square roots of the
237 ///    values in the operand.
238 static __inline__ __m128 __DEFAULT_FN_ATTRS
239 _mm_sqrt_ps(__m128 __a)
240 {
241   return __builtin_ia32_sqrtps((__v4sf)__a);
242 }
243 
244 /// Calculates the approximate reciprocal of the value stored in the
245 ///    low-order bits of a 128-bit vector of [4 x float].
246 ///
247 /// \headerfile <x86intrin.h>
248 ///
249 /// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
250 ///
251 /// \param __a
252 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
253 ///    used in the calculation.
254 /// \returns A 128-bit vector of [4 x float] containing the approximate
255 ///    reciprocal of the value in the low-order bits of the operand.
256 static __inline__ __m128 __DEFAULT_FN_ATTRS
257 _mm_rcp_ss(__m128 __a)
258 {
259   return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
260 }
261 
262 /// Calculates the approximate reciprocals of the values stored in a
263 ///    128-bit vector of [4 x float].
264 ///
265 /// \headerfile <x86intrin.h>
266 ///
267 /// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
268 ///
269 /// \param __a
270 ///    A 128-bit vector of [4 x float].
271 /// \returns A 128-bit vector of [4 x float] containing the approximate
272 ///    reciprocals of the values in the operand.
273 static __inline__ __m128 __DEFAULT_FN_ATTRS
274 _mm_rcp_ps(__m128 __a)
275 {
276   return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
277 }
278 
279 /// Calculates the approximate reciprocal of the square root of the value
280 ///    stored in the low-order bits of a 128-bit vector of [4 x float].
281 ///
282 /// \headerfile <x86intrin.h>
283 ///
284 /// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
285 ///
286 /// \param __a
287 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
288 ///    used in the calculation.
289 /// \returns A 128-bit vector of [4 x float] containing the approximate
290 ///    reciprocal of the square root of the value in the low-order bits of the
291 ///    operand.
292 static __inline__ __m128 __DEFAULT_FN_ATTRS
293 _mm_rsqrt_ss(__m128 __a)
294 {
295   return __builtin_ia32_rsqrtss((__v4sf)__a);
296 }
297 
298 /// Calculates the approximate reciprocals of the square roots of the
299 ///    values stored in a 128-bit vector of [4 x float].
300 ///
301 /// \headerfile <x86intrin.h>
302 ///
303 /// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
304 ///
305 /// \param __a
306 ///    A 128-bit vector of [4 x float].
307 /// \returns A 128-bit vector of [4 x float] containing the approximate
308 ///    reciprocals of the square roots of the values in the operand.
309 static __inline__ __m128 __DEFAULT_FN_ATTRS
310 _mm_rsqrt_ps(__m128 __a)
311 {
312   return __builtin_ia32_rsqrtps((__v4sf)__a);
313 }
314 
315 /// Compares two 32-bit float values in the low-order bits of both
316 ///    operands and returns the lesser value in the low-order bits of the
317 ///    vector of [4 x float].
318 ///
319 ///    If either value in a comparison is NaN, returns the value from \a __b.
320 ///
321 /// \headerfile <x86intrin.h>
322 ///
323 /// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
324 ///
325 /// \param __a
326 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
327 ///    32 bits of this operand are used in the comparison.
328 /// \param __b
329 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
330 ///    32 bits of this operand are used in the comparison.
331 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
332 ///    minimum value between both operands. The upper 96 bits are copied from
333 ///    the upper 96 bits of the first source operand.
334 static __inline__ __m128 __DEFAULT_FN_ATTRS
335 _mm_min_ss(__m128 __a, __m128 __b)
336 {
337   return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
338 }
339 
340 /// Compares two 128-bit vectors of [4 x float] and returns the lesser
341 ///    of each pair of values.
342 ///
343 ///    If either value in a comparison is NaN, returns the value from \a __b.
344 ///
345 /// \headerfile <x86intrin.h>
346 ///
347 /// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
348 ///
349 /// \param __a
350 ///    A 128-bit vector of [4 x float] containing one of the operands.
351 /// \param __b
352 ///    A 128-bit vector of [4 x float] containing one of the operands.
353 /// \returns A 128-bit vector of [4 x float] containing the minimum values
354 ///    between both operands.
355 static __inline__ __m128 __DEFAULT_FN_ATTRS
356 _mm_min_ps(__m128 __a, __m128 __b)
357 {
358   return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
359 }
360 
361 /// Compares two 32-bit float values in the low-order bits of both
362 ///    operands and returns the greater value in the low-order bits of a 128-bit
363 ///    vector of [4 x float].
364 ///
365 ///    If either value in a comparison is NaN, returns the value from \a __b.
366 ///
367 /// \headerfile <x86intrin.h>
368 ///
369 /// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
370 ///
371 /// \param __a
372 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
373 ///    32 bits of this operand are used in the comparison.
374 /// \param __b
375 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
376 ///    32 bits of this operand are used in the comparison.
377 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
378 ///    maximum value between both operands. The upper 96 bits are copied from
379 ///    the upper 96 bits of the first source operand.
380 static __inline__ __m128 __DEFAULT_FN_ATTRS
381 _mm_max_ss(__m128 __a, __m128 __b)
382 {
383   return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
384 }
385 
386 /// Compares two 128-bit vectors of [4 x float] and returns the greater
387 ///    of each pair of values.
388 ///
389 ///    If either value in a comparison is NaN, returns the value from \a __b.
390 ///
391 /// \headerfile <x86intrin.h>
392 ///
393 /// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
394 ///
395 /// \param __a
396 ///    A 128-bit vector of [4 x float] containing one of the operands.
397 /// \param __b
398 ///    A 128-bit vector of [4 x float] containing one of the operands.
399 /// \returns A 128-bit vector of [4 x float] containing the maximum values
400 ///    between both operands.
401 static __inline__ __m128 __DEFAULT_FN_ATTRS
402 _mm_max_ps(__m128 __a, __m128 __b)
403 {
404   return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
405 }
406 
407 /// Performs a bitwise AND of two 128-bit vectors of [4 x float].
408 ///
409 /// \headerfile <x86intrin.h>
410 ///
411 /// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
412 ///
413 /// \param __a
414 ///    A 128-bit vector containing one of the source operands.
415 /// \param __b
416 ///    A 128-bit vector containing one of the source operands.
417 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
418 ///    values between both operands.
419 static __inline__ __m128 __DEFAULT_FN_ATTRS
420 _mm_and_ps(__m128 __a, __m128 __b)
421 {
422   return (__m128)((__v4su)__a & (__v4su)__b);
423 }
424 
425 /// Performs a bitwise AND of two 128-bit vectors of [4 x float], using
426 ///    the one's complement of the values contained in the first source
427 ///    operand.
428 ///
429 /// \headerfile <x86intrin.h>
430 ///
431 /// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
432 ///
433 /// \param __a
434 ///    A 128-bit vector of [4 x float] containing the first source operand. The
435 ///    one's complement of this value is used in the bitwise AND.
436 /// \param __b
437 ///    A 128-bit vector of [4 x float] containing the second source operand.
438 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
439 ///    one's complement of the first operand and the values in the second
440 ///    operand.
441 static __inline__ __m128 __DEFAULT_FN_ATTRS
442 _mm_andnot_ps(__m128 __a, __m128 __b)
443 {
444   return (__m128)(~(__v4su)__a & (__v4su)__b);
445 }
446 
447 /// Performs a bitwise OR of two 128-bit vectors of [4 x float].
448 ///
449 /// \headerfile <x86intrin.h>
450 ///
451 /// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
452 ///
453 /// \param __a
454 ///    A 128-bit vector of [4 x float] containing one of the source operands.
455 /// \param __b
456 ///    A 128-bit vector of [4 x float] containing one of the source operands.
457 /// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
458 ///    values between both operands.
459 static __inline__ __m128 __DEFAULT_FN_ATTRS
460 _mm_or_ps(__m128 __a, __m128 __b)
461 {
462   return (__m128)((__v4su)__a | (__v4su)__b);
463 }
464 
465 /// Performs a bitwise exclusive OR of two 128-bit vectors of
466 ///    [4 x float].
467 ///
468 /// \headerfile <x86intrin.h>
469 ///
470 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
471 ///
472 /// \param __a
473 ///    A 128-bit vector of [4 x float] containing one of the source operands.
474 /// \param __b
475 ///    A 128-bit vector of [4 x float] containing one of the source operands.
476 /// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
477 ///    of the values between both operands.
478 static __inline__ __m128 __DEFAULT_FN_ATTRS
479 _mm_xor_ps(__m128 __a, __m128 __b)
480 {
481   return (__m128)((__v4su)__a ^ (__v4su)__b);
482 }
483 
484 /// Compares two 32-bit float values in the low-order bits of both
485 ///    operands for equality.
486 ///
487 ///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
488 ///    low-order bits of a vector of [4 x float].
489 ///    If either value in a comparison is NaN, returns false.
490 ///
491 /// \headerfile <x86intrin.h>
492 ///
493 /// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
494 ///
495 /// \param __a
496 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
497 ///    32 bits of this operand are used in the comparison.
498 /// \param __b
499 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
500 ///    32 bits of this operand are used in the comparison.
501 /// \returns A 128-bit vector of [4 x float] containing the comparison results
502 ///    in the low-order bits.
503 static __inline__ __m128 __DEFAULT_FN_ATTRS
504 _mm_cmpeq_ss(__m128 __a, __m128 __b)
505 {
506   return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
507 }
508 
509 /// Compares each of the corresponding 32-bit float values of the
510 ///    128-bit vectors of [4 x float] for equality.
511 ///
512 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
513 ///    If either value in a comparison is NaN, returns false.
514 ///
515 /// \headerfile <x86intrin.h>
516 ///
517 /// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
518 ///
519 /// \param __a
520 ///    A 128-bit vector of [4 x float].
521 /// \param __b
522 ///    A 128-bit vector of [4 x float].
523 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
524 static __inline__ __m128 __DEFAULT_FN_ATTRS
525 _mm_cmpeq_ps(__m128 __a, __m128 __b)
526 {
527   return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
528 }
529 
530 /// Compares two 32-bit float values in the low-order bits of both
531 ///    operands to determine if the value in the first operand is less than the
532 ///    corresponding value in the second operand.
533 ///
534 ///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
535 ///    low-order bits of a vector of [4 x float].
536 ///    If either value in a comparison is NaN, returns false.
537 ///
538 /// \headerfile <x86intrin.h>
539 ///
540 /// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
541 ///
542 /// \param __a
543 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
544 ///    32 bits of this operand are used in the comparison.
545 /// \param __b
546 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
547 ///    32 bits of this operand are used in the comparison.
548 /// \returns A 128-bit vector of [4 x float] containing the comparison results
549 ///    in the low-order bits.
550 static __inline__ __m128 __DEFAULT_FN_ATTRS
551 _mm_cmplt_ss(__m128 __a, __m128 __b)
552 {
553   return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
554 }
555 
556 /// Compares each of the corresponding 32-bit float values of the
557 ///    128-bit vectors of [4 x float] to determine if the values in the first
558 ///    operand are less than those in the second operand.
559 ///
560 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
561 ///    If either value in a comparison is NaN, returns false.
562 ///
563 /// \headerfile <x86intrin.h>
564 ///
565 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
566 ///
567 /// \param __a
568 ///    A 128-bit vector of [4 x float].
569 /// \param __b
570 ///    A 128-bit vector of [4 x float].
571 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
572 static __inline__ __m128 __DEFAULT_FN_ATTRS
573 _mm_cmplt_ps(__m128 __a, __m128 __b)
574 {
575   return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
576 }
577 
578 /// Compares two 32-bit float values in the low-order bits of both
579 ///    operands to determine if the value in the first operand is less than or
580 ///    equal to the corresponding value in the second operand.
581 ///
582 ///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in
583 ///    the low-order bits of a vector of [4 x float].
584 ///    If either value in a comparison is NaN, returns false.
585 ///
586 /// \headerfile <x86intrin.h>
587 ///
588 /// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
589 ///
590 /// \param __a
591 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
592 ///    32 bits of this operand are used in the comparison.
593 /// \param __b
594 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
595 ///    32 bits of this operand are used in the comparison.
596 /// \returns A 128-bit vector of [4 x float] containing the comparison results
597 ///    in the low-order bits.
598 static __inline__ __m128 __DEFAULT_FN_ATTRS
599 _mm_cmple_ss(__m128 __a, __m128 __b)
600 {
601   return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
602 }
603 
604 /// Compares each of the corresponding 32-bit float values of the
605 ///    128-bit vectors of [4 x float] to determine if the values in the first
606 ///    operand are less than or equal to those in the second operand.
607 ///
608 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
609 ///    If either value in a comparison is NaN, returns false.
610 ///
611 /// \headerfile <x86intrin.h>
612 ///
613 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
614 ///
615 /// \param __a
616 ///    A 128-bit vector of [4 x float].
617 /// \param __b
618 ///    A 128-bit vector of [4 x float].
619 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
620 static __inline__ __m128 __DEFAULT_FN_ATTRS
621 _mm_cmple_ps(__m128 __a, __m128 __b)
622 {
623   return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
624 }
625 
626 /// Compares two 32-bit float values in the low-order bits of both
627 ///    operands to determine if the value in the first operand is greater than
628 ///    the corresponding value in the second operand.
629 ///
630 ///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
631 ///    low-order bits of a vector of [4 x float].
632 ///    If either value in a comparison is NaN, returns false.
633 ///
634 /// \headerfile <x86intrin.h>
635 ///
636 /// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
637 ///
638 /// \param __a
639 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
640 ///    32 bits of this operand are used in the comparison.
641 /// \param __b
642 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
643 ///    32 bits of this operand are used in the comparison.
644 /// \returns A 128-bit vector of [4 x float] containing the comparison results
645 ///    in the low-order bits.
646 static __inline__ __m128 __DEFAULT_FN_ATTRS
647 _mm_cmpgt_ss(__m128 __a, __m128 __b)
648 {
649   return (__m128)__builtin_shufflevector((__v4sf)__a,
650                                          (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
651                                          4, 1, 2, 3);
652 }
653 
654 /// Compares each of the corresponding 32-bit float values of the
655 ///    128-bit vectors of [4 x float] to determine if the values in the first
656 ///    operand are greater than those in the second operand.
657 ///
658 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
659 ///    If either value in a comparison is NaN, returns false.
660 ///
661 /// \headerfile <x86intrin.h>
662 ///
663 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
664 ///
665 /// \param __a
666 ///    A 128-bit vector of [4 x float].
667 /// \param __b
668 ///    A 128-bit vector of [4 x float].
669 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
670 static __inline__ __m128 __DEFAULT_FN_ATTRS
671 _mm_cmpgt_ps(__m128 __a, __m128 __b)
672 {
673   return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
674 }
675 
676 /// Compares two 32-bit float values in the low-order bits of both
677 ///    operands to determine if the value in the first operand is greater than
678 ///    or equal to the corresponding value in the second operand.
679 ///
680 ///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
681 ///    low-order bits of a vector of [4 x float].
682 ///    If either value in a comparison is NaN, returns false.
683 ///
684 /// \headerfile <x86intrin.h>
685 ///
686 /// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
687 ///
688 /// \param __a
689 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
690 ///    32 bits of this operand are used in the comparison.
691 /// \param __b
692 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
693 ///    32 bits of this operand are used in the comparison.
694 /// \returns A 128-bit vector of [4 x float] containing the comparison results
695 ///    in the low-order bits.
696 static __inline__ __m128 __DEFAULT_FN_ATTRS
697 _mm_cmpge_ss(__m128 __a, __m128 __b)
698 {
699   return (__m128)__builtin_shufflevector((__v4sf)__a,
700                                          (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
701                                          4, 1, 2, 3);
702 }
703 
704 /// Compares each of the corresponding 32-bit float values of the
705 ///    128-bit vectors of [4 x float] to determine if the values in the first
706 ///    operand are greater than or equal to those in the second operand.
707 ///
708 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
709 ///    If either value in a comparison is NaN, returns false.
710 ///
711 /// \headerfile <x86intrin.h>
712 ///
713 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
714 ///
715 /// \param __a
716 ///    A 128-bit vector of [4 x float].
717 /// \param __b
718 ///    A 128-bit vector of [4 x float].
719 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
720 static __inline__ __m128 __DEFAULT_FN_ATTRS
721 _mm_cmpge_ps(__m128 __a, __m128 __b)
722 {
723   return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
724 }
725 
726 /// Compares two 32-bit float values in the low-order bits of both operands
727 ///    for inequality.
728 ///
729 ///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
730 ///    low-order bits of a vector of [4 x float].
731 ///    If either value in a comparison is NaN, returns true.
732 ///
733 /// \headerfile <x86intrin.h>
734 ///
735 /// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
736 ///   instructions.
737 ///
738 /// \param __a
739 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
740 ///    32 bits of this operand are used in the comparison.
741 /// \param __b
742 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
743 ///    32 bits of this operand are used in the comparison.
744 /// \returns A 128-bit vector of [4 x float] containing the comparison results
745 ///    in the low-order bits.
746 static __inline__ __m128 __DEFAULT_FN_ATTRS
747 _mm_cmpneq_ss(__m128 __a, __m128 __b)
748 {
749   return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
750 }
751 
752 /// Compares each of the corresponding 32-bit float values of the
753 ///    128-bit vectors of [4 x float] for inequality.
754 ///
755 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
756 ///    If either value in a comparison is NaN, returns true.
757 ///
758 /// \headerfile <x86intrin.h>
759 ///
760 /// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
761 ///   instructions.
762 ///
763 /// \param __a
764 ///    A 128-bit vector of [4 x float].
765 /// \param __b
766 ///    A 128-bit vector of [4 x float].
767 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
768 static __inline__ __m128 __DEFAULT_FN_ATTRS
769 _mm_cmpneq_ps(__m128 __a, __m128 __b)
770 {
771   return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
772 }
773 
774 /// Compares two 32-bit float values in the low-order bits of both
775 ///    operands to determine if the value in the first operand is not less than
776 ///    the corresponding value in the second operand.
777 ///
778 ///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
779 ///    low-order bits of a vector of [4 x float].
780 ///    If either value in a comparison is NaN, returns true.
781 ///
782 /// \headerfile <x86intrin.h>
783 ///
784 /// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
785 ///   instructions.
786 ///
787 /// \param __a
788 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
789 ///    32 bits of this operand are used in the comparison.
790 /// \param __b
791 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
792 ///    32 bits of this operand are used in the comparison.
793 /// \returns A 128-bit vector of [4 x float] containing the comparison results
794 ///    in the low-order bits.
795 static __inline__ __m128 __DEFAULT_FN_ATTRS
796 _mm_cmpnlt_ss(__m128 __a, __m128 __b)
797 {
798   return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
799 }
800 
801 /// Compares each of the corresponding 32-bit float values of the
802 ///    128-bit vectors of [4 x float] to determine if the values in the first
803 ///    operand are not less than those in the second operand.
804 ///
805 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
806 ///    If either value in a comparison is NaN, returns true.
807 ///
808 /// \headerfile <x86intrin.h>
809 ///
810 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
811 ///   instructions.
812 ///
813 /// \param __a
814 ///    A 128-bit vector of [4 x float].
815 /// \param __b
816 ///    A 128-bit vector of [4 x float].
817 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
818 static __inline__ __m128 __DEFAULT_FN_ATTRS
819 _mm_cmpnlt_ps(__m128 __a, __m128 __b)
820 {
821   return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
822 }
823 
824 /// Compares two 32-bit float values in the low-order bits of both
825 ///    operands to determine if the value in the first operand is not less than
826 ///    or equal to the corresponding value in the second operand.
827 ///
828 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
829 ///    low-order bits of a vector of [4 x float].
830 ///    If either value in a comparison is NaN, returns true.
831 ///
832 /// \headerfile <x86intrin.h>
833 ///
834 /// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
835 ///   instructions.
836 ///
837 /// \param __a
838 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
839 ///    32 bits of this operand are used in the comparison.
840 /// \param __b
841 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
842 ///    32 bits of this operand are used in the comparison.
843 /// \returns A 128-bit vector of [4 x float] containing the comparison results
844 ///    in the low-order bits.
845 static __inline__ __m128 __DEFAULT_FN_ATTRS
846 _mm_cmpnle_ss(__m128 __a, __m128 __b)
847 {
848   return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
849 }
850 
851 /// Compares each of the corresponding 32-bit float values of the
852 ///    128-bit vectors of [4 x float] to determine if the values in the first
853 ///    operand are not less than or equal to those in the second operand.
854 ///
855 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
856 ///    If either value in a comparison is NaN, returns true.
857 ///
858 /// \headerfile <x86intrin.h>
859 ///
860 /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
861 ///   instructions.
862 ///
863 /// \param __a
864 ///    A 128-bit vector of [4 x float].
865 /// \param __b
866 ///    A 128-bit vector of [4 x float].
867 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
868 static __inline__ __m128 __DEFAULT_FN_ATTRS
869 _mm_cmpnle_ps(__m128 __a, __m128 __b)
870 {
871   return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
872 }
873 
874 /// Compares two 32-bit float values in the low-order bits of both
875 ///    operands to determine if the value in the first operand is not greater
876 ///    than the corresponding value in the second operand.
877 ///
878 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
879 ///    low-order bits of a vector of [4 x float].
880 ///    If either value in a comparison is NaN, returns true.
881 ///
882 /// \headerfile <x86intrin.h>
883 ///
884 /// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
885 ///   instructions.
886 ///
887 /// \param __a
888 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
889 ///    32 bits of this operand are used in the comparison.
890 /// \param __b
891 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
892 ///    32 bits of this operand are used in the comparison.
893 /// \returns A 128-bit vector of [4 x float] containing the comparison results
894 ///    in the low-order bits.
895 static __inline__ __m128 __DEFAULT_FN_ATTRS
896 _mm_cmpngt_ss(__m128 __a, __m128 __b)
897 {
898   return (__m128)__builtin_shufflevector((__v4sf)__a,
899                                          (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
900                                          4, 1, 2, 3);
901 }
902 
903 /// Compares each of the corresponding 32-bit float values of the
904 ///    128-bit vectors of [4 x float] to determine if the values in the first
905 ///    operand are not greater than those in the second operand.
906 ///
907 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
908 ///    If either value in a comparison is NaN, returns true.
909 ///
910 /// \headerfile <x86intrin.h>
911 ///
912 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
913 ///   instructions.
914 ///
915 /// \param __a
916 ///    A 128-bit vector of [4 x float].
917 /// \param __b
918 ///    A 128-bit vector of [4 x float].
919 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
920 static __inline__ __m128 __DEFAULT_FN_ATTRS
921 _mm_cmpngt_ps(__m128 __a, __m128 __b)
922 {
923   return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
924 }
925 
926 /// Compares two 32-bit float values in the low-order bits of both
927 ///    operands to determine if the value in the first operand is not greater
928 ///    than or equal to the corresponding value in the second operand.
929 ///
930 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
931 ///    low-order bits of a vector of [4 x float].
932 ///    If either value in a comparison is NaN, returns true.
933 ///
934 /// \headerfile <x86intrin.h>
935 ///
936 /// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
937 ///   instructions.
938 ///
939 /// \param __a
940 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
941 ///    32 bits of this operand are used in the comparison.
942 /// \param __b
943 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
944 ///    32 bits of this operand are used in the comparison.
945 /// \returns A 128-bit vector of [4 x float] containing the comparison results
946 ///    in the low-order bits.
947 static __inline__ __m128 __DEFAULT_FN_ATTRS
948 _mm_cmpnge_ss(__m128 __a, __m128 __b)
949 {
950   return (__m128)__builtin_shufflevector((__v4sf)__a,
951                                          (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
952                                          4, 1, 2, 3);
953 }
954 
955 /// Compares each of the corresponding 32-bit float values of the
956 ///    128-bit vectors of [4 x float] to determine if the values in the first
957 ///    operand are not greater than or equal to those in the second operand.
958 ///
959 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
960 ///    If either value in a comparison is NaN, returns true.
961 ///
962 /// \headerfile <x86intrin.h>
963 ///
964 /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
965 ///   instructions.
966 ///
967 /// \param __a
968 ///    A 128-bit vector of [4 x float].
969 /// \param __b
970 ///    A 128-bit vector of [4 x float].
971 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
972 static __inline__ __m128 __DEFAULT_FN_ATTRS
973 _mm_cmpnge_ps(__m128 __a, __m128 __b)
974 {
975   return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
976 }
977 
978 /// Compares two 32-bit float values in the low-order bits of both
979 ///    operands to determine if the value in the first operand is ordered with
980 ///    respect to the corresponding value in the second operand.
981 ///
982 ///    A pair of floating-point values are ordered with respect to each
983 ///    other if neither value is a NaN. Each comparison returns 0x0 for false,
984 ///    0xFFFFFFFF for true.
985 ///
986 /// \headerfile <x86intrin.h>
987 ///
988 /// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
989 ///   instructions.
990 ///
991 /// \param __a
992 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
993 ///    32 bits of this operand are used in the comparison.
994 /// \param __b
995 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
996 ///    32 bits of this operand are used in the comparison.
997 /// \returns A 128-bit vector of [4 x float] containing the comparison results
998 ///    in the low-order bits.
999 static __inline__ __m128 __DEFAULT_FN_ATTRS
1000 _mm_cmpord_ss(__m128 __a, __m128 __b)
1001 {
1002   return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
1003 }
1004 
1005 /// Compares each of the corresponding 32-bit float values of the
1006 ///    128-bit vectors of [4 x float] to determine if the values in the first
1007 ///    operand are ordered with respect to those in the second operand.
1008 ///
1009 ///    A pair of floating-point values are ordered with respect to each
1010 ///    other if neither value is a NaN. Each comparison returns 0x0 for false,
1011 ///    0xFFFFFFFF for true.
1012 ///
1013 /// \headerfile <x86intrin.h>
1014 ///
1015 /// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
1016 ///   instructions.
1017 ///
1018 /// \param __a
1019 ///    A 128-bit vector of [4 x float].
1020 /// \param __b
1021 ///    A 128-bit vector of [4 x float].
1022 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
1023 static __inline__ __m128 __DEFAULT_FN_ATTRS
1024 _mm_cmpord_ps(__m128 __a, __m128 __b)
1025 {
1026   return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
1027 }
1028 
1029 /// Compares two 32-bit float values in the low-order bits of both
1030 ///    operands to determine if the value in the first operand is unordered
1031 ///    with respect to the corresponding value in the second operand.
1032 ///
1033 ///    A pair of double-precision values are unordered with respect to each
1034 ///    other if one or both values are NaN. Each comparison returns 0x0 for
1035 ///    false, 0xFFFFFFFF for true.
1036 ///
1037 /// \headerfile <x86intrin.h>
1038 ///
1039 /// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
1040 ///   instructions.
1041 ///
1042 /// \param __a
1043 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
1044 ///    32 bits of this operand are used in the comparison.
1045 /// \param __b
1046 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
1047 ///    32 bits of this operand are used in the comparison.
1048 /// \returns A 128-bit vector of [4 x float] containing the comparison results
1049 ///    in the low-order bits.
1050 static __inline__ __m128 __DEFAULT_FN_ATTRS
1051 _mm_cmpunord_ss(__m128 __a, __m128 __b)
1052 {
1053   return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
1054 }
1055 
1056 /// Compares each of the corresponding 32-bit float values of the
1057 ///    128-bit vectors of [4 x float] to determine if the values in the first
1058 ///    operand are unordered with respect to those in the second operand.
1059 ///
1060 ///    A pair of double-precision values are unordered with respect to each
1061 ///    other if one or both values are NaN. Each comparison returns 0x0 for
1062 ///    false, 0xFFFFFFFFFFFFFFFF for true.
1063 ///
1064 /// \headerfile <x86intrin.h>
1065 ///
1066 /// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
1067 ///   instructions.
1068 ///
1069 /// \param __a
1070 ///    A 128-bit vector of [4 x float].
1071 /// \param __b
1072 ///    A 128-bit vector of [4 x float].
1073 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
1074 static __inline__ __m128 __DEFAULT_FN_ATTRS
1075 _mm_cmpunord_ps(__m128 __a, __m128 __b)
1076 {
1077   return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
1078 }
1079 
1080 /// Compares two 32-bit float values in the low-order bits of both
1081 ///    operands for equality.
1082 ///
1083 ///    The comparison returns 0 for false, 1 for true. If either value in a
1084 ///    comparison is NaN, returns 0.
1085 ///
1086 /// \headerfile <x86intrin.h>
1087 ///
1088 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1089 ///   instructions.
1090 ///
1091 /// \param __a
1092 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1093 ///    used in the comparison.
1094 /// \param __b
1095 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1096 ///    used in the comparison.
1097 /// \returns An integer containing the comparison results.
1098 static __inline__ int __DEFAULT_FN_ATTRS
1099 _mm_comieq_ss(__m128 __a, __m128 __b)
1100 {
1101   return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
1102 }
1103 
1104 /// Compares two 32-bit float values in the low-order bits of both
1105 ///    operands to determine if the first operand is less than the second
1106 ///    operand.
1107 ///
1108 ///    The comparison returns 0 for false, 1 for true. If either value in a
1109 ///    comparison is NaN, returns 0.
1110 ///
1111 /// \headerfile <x86intrin.h>
1112 ///
1113 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1114 ///   instructions.
1115 ///
1116 /// \param __a
1117 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1118 ///    used in the comparison.
1119 /// \param __b
1120 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1121 ///    used in the comparison.
1122 /// \returns An integer containing the comparison results.
1123 static __inline__ int __DEFAULT_FN_ATTRS
1124 _mm_comilt_ss(__m128 __a, __m128 __b)
1125 {
1126   return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
1127 }
1128 
1129 /// Compares two 32-bit float values in the low-order bits of both
1130 ///    operands to determine if the first operand is less than or equal to the
1131 ///    second operand.
1132 ///
1133 ///    The comparison returns 0 for false, 1 for true. If either value in a
1134 ///    comparison is NaN, returns 0.
1135 ///
1136 /// \headerfile <x86intrin.h>
1137 ///
1138 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1139 ///
1140 /// \param __a
1141 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1142 ///    used in the comparison.
1143 /// \param __b
1144 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1145 ///    used in the comparison.
1146 /// \returns An integer containing the comparison results.
1147 static __inline__ int __DEFAULT_FN_ATTRS
1148 _mm_comile_ss(__m128 __a, __m128 __b)
1149 {
1150   return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
1151 }
1152 
1153 /// Compares two 32-bit float values in the low-order bits of both
1154 ///    operands to determine if the first operand is greater than the second
1155 ///    operand.
1156 ///
1157 ///    The comparison returns 0 for false, 1 for true. If either value in a
1158 ///    comparison is NaN, returns 0.
1159 ///
1160 /// \headerfile <x86intrin.h>
1161 ///
1162 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1163 ///
1164 /// \param __a
1165 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1166 ///    used in the comparison.
1167 /// \param __b
1168 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1169 ///    used in the comparison.
1170 /// \returns An integer containing the comparison results.
1171 static __inline__ int __DEFAULT_FN_ATTRS
1172 _mm_comigt_ss(__m128 __a, __m128 __b)
1173 {
1174   return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
1175 }
1176 
1177 /// Compares two 32-bit float values in the low-order bits of both
1178 ///    operands to determine if the first operand is greater than or equal to
1179 ///    the second operand.
1180 ///
1181 ///    The comparison returns 0 for false, 1 for true. If either value in a
1182 ///    comparison is NaN, returns 0.
1183 ///
1184 /// \headerfile <x86intrin.h>
1185 ///
1186 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1187 ///
1188 /// \param __a
1189 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1190 ///    used in the comparison.
1191 /// \param __b
1192 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1193 ///    used in the comparison.
1194 /// \returns An integer containing the comparison results.
1195 static __inline__ int __DEFAULT_FN_ATTRS
1196 _mm_comige_ss(__m128 __a, __m128 __b)
1197 {
1198   return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
1199 }
1200 
1201 /// Compares two 32-bit float values in the low-order bits of both
1202 ///    operands to determine if the first operand is not equal to the second
1203 ///    operand.
1204 ///
1205 ///    The comparison returns 0 for false, 1 for true. If either value in a
1206 ///    comparison is NaN, returns 1.
1207 ///
1208 /// \headerfile <x86intrin.h>
1209 ///
1210 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1211 ///
1212 /// \param __a
1213 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1214 ///    used in the comparison.
1215 /// \param __b
1216 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1217 ///    used in the comparison.
1218 /// \returns An integer containing the comparison results.
1219 static __inline__ int __DEFAULT_FN_ATTRS
1220 _mm_comineq_ss(__m128 __a, __m128 __b)
1221 {
1222   return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
1223 }
1224 
1225 /// Performs an unordered comparison of two 32-bit float values using
1226 ///    the low-order bits of both operands to determine equality.
1227 ///
1228 ///    The comparison returns 0 for false, 1 for true. If either value in a
1229 ///    comparison is NaN, returns 0.
1230 ///
1231 /// \headerfile <x86intrin.h>
1232 ///
1233 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1234 ///
1235 /// \param __a
1236 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1237 ///    used in the comparison.
1238 /// \param __b
1239 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1240 ///    used in the comparison.
1241 /// \returns An integer containing the comparison results.
1242 static __inline__ int __DEFAULT_FN_ATTRS
1243 _mm_ucomieq_ss(__m128 __a, __m128 __b)
1244 {
1245   return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
1246 }
1247 
1248 /// Performs an unordered comparison of two 32-bit float values using
1249 ///    the low-order bits of both operands to determine if the first operand is
1250 ///    less than the second operand.
1251 ///
1252 ///    The comparison returns 0 for false, 1 for true. If either value in a
1253 ///    comparison is NaN, returns 0.
1254 ///
1255 /// \headerfile <x86intrin.h>
1256 ///
1257 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1258 ///
1259 /// \param __a
1260 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1261 ///    used in the comparison.
1262 /// \param __b
1263 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1264 ///    used in the comparison.
1265 /// \returns An integer containing the comparison results.
1266 static __inline__ int __DEFAULT_FN_ATTRS
1267 _mm_ucomilt_ss(__m128 __a, __m128 __b)
1268 {
1269   return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
1270 }
1271 
1272 /// Performs an unordered comparison of two 32-bit float values using
1273 ///    the low-order bits of both operands to determine if the first operand is
1274 ///    less than or equal to the second operand.
1275 ///
1276 ///    The comparison returns 0 for false, 1 for true. If either value in a
1277 ///    comparison is NaN, returns 0.
1278 ///
1279 /// \headerfile <x86intrin.h>
1280 ///
1281 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1282 ///
1283 /// \param __a
1284 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1285 ///    used in the comparison.
1286 /// \param __b
1287 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1288 ///    used in the comparison.
1289 /// \returns An integer containing the comparison results.
1290 static __inline__ int __DEFAULT_FN_ATTRS
1291 _mm_ucomile_ss(__m128 __a, __m128 __b)
1292 {
1293   return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
1294 }
1295 
1296 /// Performs an unordered comparison of two 32-bit float values using
1297 ///    the low-order bits of both operands to determine if the first operand is
1298 ///    greater than the second operand.
1299 ///
1300 ///    The comparison returns 0 for false, 1 for true. If either value in a
1301 ///    comparison is NaN, returns 0.
1302 ///
1303 /// \headerfile <x86intrin.h>
1304 ///
1305 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1306 ///
1307 /// \param __a
1308 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1309 ///    used in the comparison.
1310 /// \param __b
1311 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1312 ///    used in the comparison.
1313 /// \returns An integer containing the comparison results.
1314 static __inline__ int __DEFAULT_FN_ATTRS
1315 _mm_ucomigt_ss(__m128 __a, __m128 __b)
1316 {
1317   return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
1318 }
1319 
1320 /// Performs an unordered comparison of two 32-bit float values using
1321 ///    the low-order bits of both operands to determine if the first operand is
1322 ///    greater than or equal to the second operand.
1323 ///
1324 ///    The comparison returns 0 for false, 1 for true. If either value in a
1325 ///    comparison is NaN, returns 0.
1326 ///
1327 /// \headerfile <x86intrin.h>
1328 ///
1329 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1330 ///
1331 /// \param __a
1332 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1333 ///    used in the comparison.
1334 /// \param __b
1335 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1336 ///    used in the comparison.
1337 /// \returns An integer containing the comparison results.
1338 static __inline__ int __DEFAULT_FN_ATTRS
1339 _mm_ucomige_ss(__m128 __a, __m128 __b)
1340 {
1341   return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
1342 }
1343 
1344 /// Performs an unordered comparison of two 32-bit float values using
1345 ///    the low-order bits of both operands to determine inequality.
1346 ///
1347 ///    The comparison returns 0 for false, 1 for true. If either value in a
1348 ///    comparison is NaN, returns 0.
1349 ///
1350 /// \headerfile <x86intrin.h>
1351 ///
1352 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1353 ///
1354 /// \param __a
1355 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1356 ///    used in the comparison.
1357 /// \param __b
1358 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1359 ///    used in the comparison.
1360 /// \returns An integer containing the comparison results.
1361 static __inline__ int __DEFAULT_FN_ATTRS
1362 _mm_ucomineq_ss(__m128 __a, __m128 __b)
1363 {
1364   return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
1365 }
1366 
1367 /// Converts a float value contained in the lower 32 bits of a vector of
1368 ///    [4 x float] into a 32-bit integer.
1369 ///
1370 ///    If the converted value does not fit in a 32-bit integer, raises a
1371 ///    floating-point invalid exception. If the exception is masked, returns
1372 ///    the most negative integer.
1373 ///
1374 /// \headerfile <x86intrin.h>
1375 ///
1376 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1377 ///   instructions.
1378 ///
1379 /// \param __a
1380 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1381 ///    used in the conversion.
1382 /// \returns A 32-bit integer containing the converted value.
1383 static __inline__ int __DEFAULT_FN_ATTRS
1384 _mm_cvtss_si32(__m128 __a)
1385 {
1386   return __builtin_ia32_cvtss2si((__v4sf)__a);
1387 }
1388 
1389 /// Converts a float value contained in the lower 32 bits of a vector of
1390 ///    [4 x float] into a 32-bit integer.
1391 ///
1392 ///    If the converted value does not fit in a 32-bit integer, raises a
1393 ///    floating-point invalid exception. If the exception is masked, returns
1394 ///    the most negative integer.
1395 ///
1396 /// \headerfile <x86intrin.h>
1397 ///
1398 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1399 ///   instructions.
1400 ///
1401 /// \param __a
1402 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1403 ///    used in the conversion.
1404 /// \returns A 32-bit integer containing the converted value.
1405 static __inline__ int __DEFAULT_FN_ATTRS
1406 _mm_cvt_ss2si(__m128 __a)
1407 {
1408   return _mm_cvtss_si32(__a);
1409 }
1410 
1411 #ifdef __x86_64__
1412 
1413 /// Converts a float value contained in the lower 32 bits of a vector of
1414 ///    [4 x float] into a 64-bit integer.
1415 ///
1416 ///    If the converted value does not fit in a 32-bit integer, raises a
1417 ///    floating-point invalid exception. If the exception is masked, returns
1418 ///    the most negative integer.
1419 ///
1420 /// \headerfile <x86intrin.h>
1421 ///
1422 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1423 ///   instructions.
1424 ///
1425 /// \param __a
1426 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1427 ///    used in the conversion.
1428 /// \returns A 64-bit integer containing the converted value.
1429 static __inline__ long long __DEFAULT_FN_ATTRS
1430 _mm_cvtss_si64(__m128 __a)
1431 {
1432   return __builtin_ia32_cvtss2si64((__v4sf)__a);
1433 }
1434 
1435 #endif
1436 
1437 /// Converts two low-order float values in a 128-bit vector of
1438 ///    [4 x float] into a 64-bit vector of [2 x i32].
1439 ///
1440 ///    If a converted value does not fit in a 32-bit integer, raises a
1441 ///    floating-point invalid exception. If the exception is masked, returns
1442 ///    the most negative integer.
1443 ///
1444 /// \headerfile <x86intrin.h>
1445 ///
1446 /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1447 ///
1448 /// \param __a
1449 ///    A 128-bit vector of [4 x float].
1450 /// \returns A 64-bit integer vector containing the converted values.
1451 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1452 _mm_cvtps_pi32(__m128 __a)
1453 {
1454   return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
1455 }
1456 
1457 /// Converts two low-order float values in a 128-bit vector of
1458 ///    [4 x float] into a 64-bit vector of [2 x i32].
1459 ///
1460 ///    If a converted value does not fit in a 32-bit integer, raises a
1461 ///    floating-point invalid exception. If the exception is masked, returns
1462 ///    the most negative integer.
1463 ///
1464 /// \headerfile <x86intrin.h>
1465 ///
1466 /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1467 ///
1468 /// \param __a
1469 ///    A 128-bit vector of [4 x float].
1470 /// \returns A 64-bit integer vector containing the converted values.
1471 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1472 _mm_cvt_ps2pi(__m128 __a)
1473 {
1474   return _mm_cvtps_pi32(__a);
1475 }
1476 
1477 /// Converts the lower (first) element of a vector of [4 x float] into a signed
1478 ///    truncated (rounded toward zero) 32-bit integer.
1479 ///
1480 ///    If the converted value does not fit in a 32-bit integer, raises a
1481 ///    floating-point invalid exception. If the exception is masked, returns
1482 ///    the most negative integer.
1483 ///
1484 /// \headerfile <x86intrin.h>
1485 ///
1486 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1487 ///   instructions.
1488 ///
1489 /// \param __a
1490 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1491 ///    used in the conversion.
1492 /// \returns A 32-bit integer containing the converted value.
1493 static __inline__ int __DEFAULT_FN_ATTRS
1494 _mm_cvttss_si32(__m128 __a)
1495 {
1496   return __builtin_ia32_cvttss2si((__v4sf)__a);
1497 }
1498 
1499 /// Converts the lower (first) element of a vector of [4 x float] into a signed
1500 ///    truncated (rounded toward zero) 32-bit integer.
1501 ///
1502 ///    If the converted value does not fit in a 32-bit integer, raises a
1503 ///    floating-point invalid exception. If the exception is masked, returns
1504 ///    the most negative integer.
1505 ///
1506 /// \headerfile <x86intrin.h>
1507 ///
1508 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1509 ///   instructions.
1510 ///
1511 /// \param __a
1512 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1513 ///    used in the conversion.
1514 /// \returns A 32-bit integer containing the converted value.
1515 static __inline__ int __DEFAULT_FN_ATTRS
1516 _mm_cvtt_ss2si(__m128 __a)
1517 {
1518   return _mm_cvttss_si32(__a);
1519 }
1520 
1521 #ifdef __x86_64__
1522 /// Converts the lower (first) element of a vector of [4 x float] into a signed
1523 ///    truncated (rounded toward zero) 64-bit integer.
1524 ///
1525 ///    If the converted value does not fit in a 64-bit integer, raises a
1526 ///    floating-point invalid exception. If the exception is masked, returns
1527 ///    the most negative integer.
1528 ///
1529 /// \headerfile <x86intrin.h>
1530 ///
1531 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1532 ///   instructions.
1533 ///
1534 /// \param __a
1535 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1536 ///    used in the conversion.
1537 /// \returns A 64-bit integer containing the converted value.
1538 static __inline__ long long __DEFAULT_FN_ATTRS
1539 _mm_cvttss_si64(__m128 __a)
1540 {
1541   return __builtin_ia32_cvttss2si64((__v4sf)__a);
1542 }
1543 #endif
1544 
1545 /// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
1546 ///    into two signed truncated (rounded toward zero) 32-bit integers,
1547 ///    returned in a 64-bit vector of [2 x i32].
1548 ///
1549 ///    If a converted value does not fit in a 32-bit integer, raises a
1550 ///    floating-point invalid exception. If the exception is masked, returns
1551 ///    the most negative integer.
1552 ///
1553 /// \headerfile <x86intrin.h>
1554 ///
1555 /// This intrinsic corresponds to the <c> CVTTPS2PI / VTTPS2PI </c>
1556 ///   instructions.
1557 ///
1558 /// \param __a
1559 ///    A 128-bit vector of [4 x float].
1560 /// \returns A 64-bit integer vector containing the converted values.
1561 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1562 _mm_cvttps_pi32(__m128 __a)
1563 {
1564   return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
1565 }
1566 
1567 /// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
1568 ///    into two signed truncated (rounded toward zero) 64-bit integers,
1569 ///    returned in a 64-bit vector of [2 x i32].
1570 ///
1571 ///    If a converted value does not fit in a 32-bit integer, raises a
1572 ///    floating-point invalid exception. If the exception is masked, returns
1573 ///    the most negative integer.
1574 ///
1575 /// \headerfile <x86intrin.h>
1576 ///
1577 /// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
1578 ///
1579 /// \param __a
1580 ///    A 128-bit vector of [4 x float].
1581 /// \returns A 64-bit integer vector containing the converted values.
1582 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
1583 _mm_cvtt_ps2pi(__m128 __a)
1584 {
1585   return _mm_cvttps_pi32(__a);
1586 }
1587 
1588 /// Converts a 32-bit signed integer value into a floating point value
1589 ///    and writes it to the lower 32 bits of the destination. The remaining
1590 ///    higher order elements of the destination vector are copied from the
1591 ///    corresponding elements in the first operand.
1592 ///
1593 /// \headerfile <x86intrin.h>
1594 ///
1595 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1596 ///
1597 /// \param __a
1598 ///    A 128-bit vector of [4 x float].
1599 /// \param __b
1600 ///    A 32-bit signed integer operand containing the value to be converted.
1601 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1602 ///    converted value of the second operand. The upper 96 bits are copied from
1603 ///    the upper 96 bits of the first operand.
1604 static __inline__ __m128 __DEFAULT_FN_ATTRS
1605 _mm_cvtsi32_ss(__m128 __a, int __b)
1606 {
1607   __a[0] = __b;
1608   return __a;
1609 }
1610 
1611 /// Converts a 32-bit signed integer value into a floating point value
1612 ///    and writes it to the lower 32 bits of the destination. The remaining
1613 ///    higher order elements of the destination are copied from the
1614 ///    corresponding elements in the first operand.
1615 ///
1616 /// \headerfile <x86intrin.h>
1617 ///
1618 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1619 ///
1620 /// \param __a
1621 ///    A 128-bit vector of [4 x float].
1622 /// \param __b
1623 ///    A 32-bit signed integer operand containing the value to be converted.
1624 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1625 ///    converted value of the second operand. The upper 96 bits are copied from
1626 ///    the upper 96 bits of the first operand.
1627 static __inline__ __m128 __DEFAULT_FN_ATTRS
1628 _mm_cvt_si2ss(__m128 __a, int __b)
1629 {
1630   return _mm_cvtsi32_ss(__a, __b);
1631 }
1632 
1633 #ifdef __x86_64__
1634 
1635 /// Converts a 64-bit signed integer value into a floating point value
1636 ///    and writes it to the lower 32 bits of the destination. The remaining
1637 ///    higher order elements of the destination are copied from the
1638 ///    corresponding elements in the first operand.
1639 ///
1640 /// \headerfile <x86intrin.h>
1641 ///
1642 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1643 ///
1644 /// \param __a
1645 ///    A 128-bit vector of [4 x float].
1646 /// \param __b
1647 ///    A 64-bit signed integer operand containing the value to be converted.
1648 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1649 ///    converted value of the second operand. The upper 96 bits are copied from
1650 ///    the upper 96 bits of the first operand.
1651 static __inline__ __m128 __DEFAULT_FN_ATTRS
1652 _mm_cvtsi64_ss(__m128 __a, long long __b)
1653 {
1654   __a[0] = __b;
1655   return __a;
1656 }
1657 
1658 #endif
1659 
1660 /// Converts two elements of a 64-bit vector of [2 x i32] into two
1661 ///    floating point values and writes them to the lower 64-bits of the
1662 ///    destination. The remaining higher order elements of the destination are
1663 ///    copied from the corresponding elements in the first operand.
1664 ///
1665 /// \headerfile <x86intrin.h>
1666 ///
1667 /// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1668 ///
1669 /// \param __a
1670 ///    A 128-bit vector of [4 x float].
1671 /// \param __b
1672 ///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
1673 ///    and written to the corresponding low-order elements in the destination.
1674 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1675 ///    converted value of the second operand. The upper 64 bits are copied from
1676 ///    the upper 64 bits of the first operand.
1677 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
1678 _mm_cvtpi32_ps(__m128 __a, __m64 __b)
1679 {
1680   return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
1681 }
1682 
1683 /// Converts two elements of a 64-bit vector of [2 x i32] into two
1684 ///    floating point values and writes them to the lower 64-bits of the
1685 ///    destination. The remaining higher order elements of the destination are
1686 ///    copied from the corresponding elements in the first operand.
1687 ///
1688 /// \headerfile <x86intrin.h>
1689 ///
1690 /// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1691 ///
1692 /// \param __a
1693 ///    A 128-bit vector of [4 x float].
1694 /// \param __b
1695 ///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
1696 ///    and written to the corresponding low-order elements in the destination.
1697 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1698 ///    converted value from the second operand. The upper 64 bits are copied
1699 ///    from the upper 64 bits of the first operand.
1700 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
1701 _mm_cvt_pi2ps(__m128 __a, __m64 __b)
1702 {
1703   return _mm_cvtpi32_ps(__a, __b);
1704 }
1705 
1706 /// Extracts a float value contained in the lower 32 bits of a vector of
1707 ///    [4 x float].
1708 ///
1709 /// \headerfile <x86intrin.h>
1710 ///
1711 /// This intrinsic has no corresponding instruction.
1712 ///
1713 /// \param __a
1714 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1715 ///    used in the extraction.
1716 /// \returns A 32-bit float containing the extracted value.
1717 static __inline__ float __DEFAULT_FN_ATTRS
1718 _mm_cvtss_f32(__m128 __a)
1719 {
1720   return __a[0];
1721 }
1722 
1723 /// Loads two packed float values from the address \a __p into the
1724 ///     high-order bits of a 128-bit vector of [4 x float]. The low-order bits
1725 ///     are copied from the low-order bits of the first operand.
1726 ///
1727 /// \headerfile <x86intrin.h>
1728 ///
1729 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1730 ///
1731 /// \param __a
1732 ///    A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
1733 ///    of the destination.
1734 /// \param __p
1735 ///    A pointer to two packed float values. Bits [63:0] are written to bits
1736 ///    [127:64] of the destination.
1737 /// \returns A 128-bit vector of [4 x float] containing the moved values.
1738 static __inline__ __m128 __DEFAULT_FN_ATTRS
1739 _mm_loadh_pi(__m128 __a, const __m64 *__p)
1740 {
1741   typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
1742   struct __mm_loadh_pi_struct {
1743     __mm_loadh_pi_v2f32 __u;
1744   } __attribute__((__packed__, __may_alias__));
1745   __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
1746   __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1747   return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
1748 }
1749 
1750 /// Loads two packed float values from the address \a __p into the
1751 ///    low-order bits of a 128-bit vector of [4 x float]. The high-order bits
1752 ///    are copied from the high-order bits of the first operand.
1753 ///
1754 /// \headerfile <x86intrin.h>
1755 ///
1756 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1757 ///
1758 /// \param __a
1759 ///    A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
1760 ///    [127:64] of the destination.
1761 /// \param __p
1762 ///    A pointer to two packed float values. Bits [63:0] are written to bits
1763 ///    [63:0] of the destination.
1764 /// \returns A 128-bit vector of [4 x float] containing the moved values.
1765 static __inline__ __m128 __DEFAULT_FN_ATTRS
1766 _mm_loadl_pi(__m128 __a, const __m64 *__p)
1767 {
1768   typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
1769   struct __mm_loadl_pi_struct {
1770     __mm_loadl_pi_v2f32 __u;
1771   } __attribute__((__packed__, __may_alias__));
1772   __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
1773   __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1774   return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
1775 }
1776 
1777 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1778 ///    32 bits of the vector are initialized with the single-precision
1779 ///    floating-point value loaded from a specified memory location. The upper
1780 ///    96 bits are set to zero.
1781 ///
1782 /// \headerfile <x86intrin.h>
1783 ///
1784 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1785 ///
1786 /// \param __p
1787 ///    A pointer to a 32-bit memory location containing a single-precision
1788 ///    floating-point value.
1789 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1790 ///    lower 32 bits contain the value loaded from the memory location. The
1791 ///    upper 96 bits are set to zero.
1792 static __inline__ __m128 __DEFAULT_FN_ATTRS
1793 _mm_load_ss(const float *__p)
1794 {
1795   struct __mm_load_ss_struct {
1796     float __u;
1797   } __attribute__((__packed__, __may_alias__));
1798   float __u = ((const struct __mm_load_ss_struct*)__p)->__u;
1799   return __extension__ (__m128){ __u, 0, 0, 0 };
1800 }
1801 
1802 /// Loads a 32-bit float value and duplicates it to all four vector
1803 ///    elements of a 128-bit vector of [4 x float].
1804 ///
1805 /// \headerfile <x86intrin.h>
1806 ///
1807 /// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c>
1808 ///    instruction.
1809 ///
1810 /// \param __p
1811 ///    A pointer to a float value to be loaded and duplicated.
1812 /// \returns A 128-bit vector of [4 x float] containing the loaded and
1813 ///    duplicated values.
1814 static __inline__ __m128 __DEFAULT_FN_ATTRS
1815 _mm_load1_ps(const float *__p)
1816 {
1817   struct __mm_load1_ps_struct {
1818     float __u;
1819   } __attribute__((__packed__, __may_alias__));
1820   float __u = ((const struct __mm_load1_ps_struct*)__p)->__u;
1821   return __extension__ (__m128){ __u, __u, __u, __u };
1822 }
1823 
/* Alternate spelling of _mm_load1_ps. */
#define _mm_load_ps1(p) _mm_load1_ps(p)
1825 
1826 /// Loads a 128-bit floating-point vector of [4 x float] from an aligned
1827 ///    memory location.
1828 ///
1829 /// \headerfile <x86intrin.h>
1830 ///
1831 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
1832 ///
1833 /// \param __p
1834 ///    A pointer to a 128-bit memory location. The address of the memory
1835 ///    location has to be 128-bit aligned.
1836 /// \returns A 128-bit vector of [4 x float] containing the loaded values.
1837 static __inline__ __m128 __DEFAULT_FN_ATTRS
1838 _mm_load_ps(const float *__p)
1839 {
1840   return *(const __m128*)__p;
1841 }
1842 
1843 /// Loads a 128-bit floating-point vector of [4 x float] from an
1844 ///    unaligned memory location.
1845 ///
1846 /// \headerfile <x86intrin.h>
1847 ///
1848 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
1849 ///
1850 /// \param __p
1851 ///    A pointer to a 128-bit memory location. The address of the memory
1852 ///    location does not have to be aligned.
1853 /// \returns A 128-bit vector of [4 x float] containing the loaded values.
1854 static __inline__ __m128 __DEFAULT_FN_ATTRS
1855 _mm_loadu_ps(const float *__p)
1856 {
1857   struct __loadu_ps {
1858     __m128_u __v;
1859   } __attribute__((__packed__, __may_alias__));
1860   return ((const struct __loadu_ps*)__p)->__v;
1861 }
1862 
1863 /// Loads four packed float values, in reverse order, from an aligned
1864 ///    memory location to 32-bit elements in a 128-bit vector of [4 x float].
1865 ///
1866 /// \headerfile <x86intrin.h>
1867 ///
1868 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
1869 ///    instruction.
1870 ///
1871 /// \param __p
1872 ///    A pointer to a 128-bit memory location. The address of the memory
1873 ///    location has to be 128-bit aligned.
1874 /// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
1875 ///    in reverse order.
1876 static __inline__ __m128 __DEFAULT_FN_ATTRS
1877 _mm_loadr_ps(const float *__p)
1878 {
1879   __m128 __a = _mm_load_ps(__p);
1880   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
1881 }
1882 
1883 /// Create a 128-bit vector of [4 x float] with undefined values.
1884 ///
1885 /// \headerfile <x86intrin.h>
1886 ///
1887 /// This intrinsic has no corresponding instruction.
1888 ///
1889 /// \returns A 128-bit vector of [4 x float] containing undefined values.
1890 static __inline__ __m128 __DEFAULT_FN_ATTRS
1891 _mm_undefined_ps(void)
1892 {
1893   return (__m128)__builtin_ia32_undef128();
1894 }
1895 
1896 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1897 ///    32 bits of the vector are initialized with the specified single-precision
1898 ///    floating-point value. The upper 96 bits are set to zero.
1899 ///
1900 /// \headerfile <x86intrin.h>
1901 ///
1902 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1903 ///
1904 /// \param __w
1905 ///    A single-precision floating-point value used to initialize the lower 32
1906 ///    bits of the result.
1907 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1908 ///    lower 32 bits contain the value provided in the source operand. The
1909 ///    upper 96 bits are set to zero.
1910 static __inline__ __m128 __DEFAULT_FN_ATTRS
1911 _mm_set_ss(float __w)
1912 {
1913   return __extension__ (__m128){ __w, 0.0f, 0.0f, 0.0f };
1914 }
1915 
1916 /// Constructs a 128-bit floating-point vector of [4 x float], with each
1917 ///    of the four single-precision floating-point vector elements set to the
1918 ///    specified single-precision floating-point value.
1919 ///
1920 /// \headerfile <x86intrin.h>
1921 ///
1922 /// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1923 ///
1924 /// \param __w
1925 ///    A single-precision floating-point value used to initialize each vector
1926 ///    element of the result.
1927 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1928 static __inline__ __m128 __DEFAULT_FN_ATTRS
1929 _mm_set1_ps(float __w)
1930 {
1931   return __extension__ (__m128){ __w, __w, __w, __w };
1932 }
1933 
1934 /* Microsoft specific. */
1935 /// Constructs a 128-bit floating-point vector of [4 x float], with each
1936 ///    of the four single-precision floating-point vector elements set to the
1937 ///    specified single-precision floating-point value.
1938 ///
1939 /// \headerfile <x86intrin.h>
1940 ///
1941 /// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1942 ///
1943 /// \param __w
1944 ///    A single-precision floating-point value used to initialize each vector
1945 ///    element of the result.
1946 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1947 static __inline__ __m128 __DEFAULT_FN_ATTRS
1948 _mm_set_ps1(float __w)
1949 {
1950     return _mm_set1_ps(__w);
1951 }
1952 
1953 /// Constructs a 128-bit floating-point vector of [4 x float]
1954 ///    initialized with the specified single-precision floating-point values.
1955 ///
1956 /// \headerfile <x86intrin.h>
1957 ///
1958 /// This intrinsic is a utility function and does not correspond to a specific
1959 ///    instruction.
1960 ///
1961 /// \param __z
1962 ///    A single-precision floating-point value used to initialize bits [127:96]
1963 ///    of the result.
1964 /// \param __y
1965 ///    A single-precision floating-point value used to initialize bits [95:64]
1966 ///    of the result.
1967 /// \param __x
1968 ///    A single-precision floating-point value used to initialize bits [63:32]
1969 ///    of the result.
1970 /// \param __w
1971 ///    A single-precision floating-point value used to initialize bits [31:0]
1972 ///    of the result.
1973 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1974 static __inline__ __m128 __DEFAULT_FN_ATTRS
1975 _mm_set_ps(float __z, float __y, float __x, float __w)
1976 {
1977   return __extension__ (__m128){ __w, __x, __y, __z };
1978 }
1979 
1980 /// Constructs a 128-bit floating-point vector of [4 x float],
1981 ///    initialized in reverse order with the specified 32-bit single-precision
1982 ///    float-point values.
1983 ///
1984 /// \headerfile <x86intrin.h>
1985 ///
1986 /// This intrinsic is a utility function and does not correspond to a specific
1987 ///    instruction.
1988 ///
1989 /// \param __z
1990 ///    A single-precision floating-point value used to initialize bits [31:0]
1991 ///    of the result.
1992 /// \param __y
1993 ///    A single-precision floating-point value used to initialize bits [63:32]
1994 ///    of the result.
1995 /// \param __x
1996 ///    A single-precision floating-point value used to initialize bits [95:64]
1997 ///    of the result.
1998 /// \param __w
1999 ///    A single-precision floating-point value used to initialize bits [127:96]
2000 ///    of the result.
2001 /// \returns An initialized 128-bit floating-point vector of [4 x float].
2002 static __inline__ __m128 __DEFAULT_FN_ATTRS
2003 _mm_setr_ps(float __z, float __y, float __x, float __w)
2004 {
2005   return __extension__ (__m128){ __z, __y, __x, __w };
2006 }
2007 
2008 /// Constructs a 128-bit floating-point vector of [4 x float] initialized
2009 ///    to zero.
2010 ///
2011 /// \headerfile <x86intrin.h>
2012 ///
2013 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
2014 ///
2015 /// \returns An initialized 128-bit floating-point vector of [4 x float] with
2016 ///    all elements set to zero.
2017 static __inline__ __m128 __DEFAULT_FN_ATTRS
2018 _mm_setzero_ps(void)
2019 {
2020   return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
2021 }
2022 
2023 /// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
2024 ///    memory location.
2025 ///
2026 /// \headerfile <x86intrin.h>
2027 ///
2028 /// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
2029 ///
2030 /// \param __p
2031 ///    A pointer to a 64-bit memory location.
2032 /// \param __a
2033 ///    A 128-bit vector of [4 x float] containing the values to be stored.
2034 static __inline__ void __DEFAULT_FN_ATTRS
2035 _mm_storeh_pi(__m64 *__p, __m128 __a)
2036 {
2037   typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
2038   struct __mm_storeh_pi_struct {
2039     __mm_storeh_pi_v2f32 __u;
2040   } __attribute__((__packed__, __may_alias__));
2041   ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);
2042 }
2043 
2044 /// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
2045 ///     memory location.
2046 ///
2047 /// \headerfile <x86intrin.h>
2048 ///
2049 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
2050 ///
2051 /// \param __p
2052 ///    A pointer to a memory location that will receive the float values.
2053 /// \param __a
2054 ///    A 128-bit vector of [4 x float] containing the values to be stored.
2055 static __inline__ void __DEFAULT_FN_ATTRS
2056 _mm_storel_pi(__m64 *__p, __m128 __a)
2057 {
2058   typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
2059   struct __mm_storeh_pi_struct {
2060     __mm_storeh_pi_v2f32 __u;
2061   } __attribute__((__packed__, __may_alias__));
2062   ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);
2063 }
2064 
2065 /// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
2066 ///     memory location.
2067 ///
2068 /// \headerfile <x86intrin.h>
2069 ///
2070 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
2071 ///
2072 /// \param __p
2073 ///    A pointer to a 32-bit memory location.
2074 /// \param __a
2075 ///    A 128-bit vector of [4 x float] containing the value to be stored.
2076 static __inline__ void __DEFAULT_FN_ATTRS
2077 _mm_store_ss(float *__p, __m128 __a)
2078 {
2079   struct __mm_store_ss_struct {
2080     float __u;
2081   } __attribute__((__packed__, __may_alias__));
2082   ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
2083 }
2084 
2085 /// Stores a 128-bit vector of [4 x float] to an unaligned memory
2086 ///    location.
2087 ///
2088 /// \headerfile <x86intrin.h>
2089 ///
2090 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
2091 ///
2092 /// \param __p
2093 ///    A pointer to a 128-bit memory location. The address of the memory
2094 ///    location does not have to be aligned.
2095 /// \param __a
2096 ///    A 128-bit vector of [4 x float] containing the values to be stored.
2097 static __inline__ void __DEFAULT_FN_ATTRS
2098 _mm_storeu_ps(float *__p, __m128 __a)
2099 {
2100   struct __storeu_ps {
2101     __m128_u __v;
2102   } __attribute__((__packed__, __may_alias__));
2103   ((struct __storeu_ps*)__p)->__v = __a;
2104 }
2105 
2106 /// Stores a 128-bit vector of [4 x float] into an aligned memory
2107 ///    location.
2108 ///
2109 /// \headerfile <x86intrin.h>
2110 ///
2111 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
2112 ///
2113 /// \param __p
2114 ///    A pointer to a 128-bit memory location. The address of the memory
2115 ///    location has to be 16-byte aligned.
2116 /// \param __a
2117 ///    A 128-bit vector of [4 x float] containing the values to be stored.
2118 static __inline__ void __DEFAULT_FN_ATTRS
2119 _mm_store_ps(float *__p, __m128 __a)
2120 {
2121   *(__m128*)__p = __a;
2122 }
2123 
2124 /// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2125 ///    four contiguous elements in an aligned memory location.
2126 ///
2127 /// \headerfile <x86intrin.h>
2128 ///
2129 /// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2130 ///    instruction.
2131 ///
2132 /// \param __p
2133 ///    A pointer to a 128-bit memory location.
2134 /// \param __a
2135 ///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2136 ///    of the four contiguous elements pointed by \a __p.
2137 static __inline__ void __DEFAULT_FN_ATTRS
2138 _mm_store1_ps(float *__p, __m128 __a)
2139 {
2140   __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
2141   _mm_store_ps(__p, __a);
2142 }
2143 
2144 /// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2145 ///    four contiguous elements in an aligned memory location.
2146 ///
2147 /// \headerfile <x86intrin.h>
2148 ///
2149 /// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2150 ///    instruction.
2151 ///
2152 /// \param __p
2153 ///    A pointer to a 128-bit memory location.
2154 /// \param __a
2155 ///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2156 ///    of the four contiguous elements pointed by \a __p.
2157 static __inline__ void __DEFAULT_FN_ATTRS
2158 _mm_store_ps1(float *__p, __m128 __a)
2159 {
2160   _mm_store1_ps(__p, __a);
2161 }
2162 
2163 /// Stores float values from a 128-bit vector of [4 x float] to an
2164 ///    aligned memory location in reverse order.
2165 ///
2166 /// \headerfile <x86intrin.h>
2167 ///
2168 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
2169 ///    instruction.
2170 ///
2171 /// \param __p
2172 ///    A pointer to a 128-bit memory location. The address of the memory
2173 ///    location has to be 128-bit aligned.
2174 /// \param __a
2175 ///    A 128-bit vector of [4 x float] containing the values to be stored.
2176 static __inline__ void __DEFAULT_FN_ATTRS
2177 _mm_storer_ps(float *__p, __m128 __a)
2178 {
2179   __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
2180   _mm_store_ps(__p, __a);
2181 }
2182 
/* Hint selectors accepted by _mm_prefetch, listed in ascending numeric order.
   The ET variants are the matching T values with bit 2 set. */
#define _MM_HINT_NTA 0
#define _MM_HINT_T2  1
#define _MM_HINT_T1  2
#define _MM_HINT_T0  3
#define _MM_HINT_ET1 6
#define _MM_HINT_ET0 7
2189 
#ifndef _MSC_VER
/* FIXME: This has to be a macro because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

/// Loads one cache line of data from the specified address to a location
///    closer to the processor.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _mm_prefetch(const void *a, const int sel);
/// \endcode
///
/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
///
/// \param a
///    A pointer to a memory location containing a cache line of data.
/// \param sel
///    A predefined integer constant specifying the type of prefetch
///    operation: \n
///    _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
///    PREFETCHNTA instruction will be generated. \n
///    _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
///    be generated. \n
///    _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
///    be generated. \n
///    _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
///    be generated. \n
///    _MM_HINT_ET0, _MM_HINT_ET1: As _MM_HINT_T0 / _MM_HINT_T1, with bit 2 of
///    \a sel set. The macro forwards bit 2 as the read/write argument of
///    __builtin_prefetch and the low two bits as its locality argument.
#define _mm_prefetch(a, sel)                                                   \
  (__builtin_prefetch((const void *)(a), ((sel) >> 2) & 1, (sel) & 0x3))
#endif
2221 
2222 /// Stores a 64-bit integer in the specified aligned memory location. To
2223 ///    minimize caching, the data is flagged as non-temporal (unlikely to be
2224 ///    used again soon).
2225 ///
2226 /// \headerfile <x86intrin.h>
2227 ///
2228 /// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
2229 ///
2230 /// \param __p
2231 ///    A pointer to an aligned memory location used to store the register value.
2232 /// \param __a
2233 ///    A 64-bit integer containing the value to be stored.
2234 static __inline__ void __DEFAULT_FN_ATTRS_MMX
2235 _mm_stream_pi(void *__p, __m64 __a)
2236 {
2237   __builtin_ia32_movntq((__m64 *)__p, __a);
2238 }
2239 
2240 /// Moves packed float values from a 128-bit vector of [4 x float] to a
2241 ///    128-bit aligned memory location. To minimize caching, the data is flagged
2242 ///    as non-temporal (unlikely to be used again soon).
2243 ///
2244 /// \headerfile <x86intrin.h>
2245 ///
2246 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
2247 ///
2248 /// \param __p
2249 ///    A pointer to a 128-bit aligned memory location that will receive the
2250 ///    single-precision floating-point values.
2251 /// \param __a
2252 ///    A 128-bit vector of [4 x float] containing the values to be moved.
2253 static __inline__ void __DEFAULT_FN_ATTRS
2254 _mm_stream_ps(void *__p, __m128 __a)
2255 {
2256   __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
2257 }
2258 
#ifdef __cplusplus
extern "C" {
#endif

/// Forces strong memory ordering (serialization) between the store
///    instructions that precede this instruction and those that follow it,
///    ensuring the system completes all previous stores before executing
///    subsequent stores.
///
///    Only the declaration appears here, wrapped so that it has C linkage
///    when the header is compiled as C++.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
///
void _mm_sfence(void);

#ifdef __cplusplus
} // extern "C"
#endif
2277 
/// Extracts 16-bit element from a 64-bit vector of [4 x i16] and
///    returns it, as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_pi16(__m64 a, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PEXTRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param n
///    An immediate integer operand that determines which bits are extracted: \n
///    0: Bits [15:0] are copied to the destination. \n
///    1: Bits [31:16] are copied to the destination. \n
///    2: Bits [47:32] are copied to the destination. \n
///    3: Bits [63:48] are copied to the destination.
/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
/* Both macro arguments are parenthesized so that the casts apply to the whole
   argument expression, not just to its first operand. */
#define _mm_extract_pi16(a, n) \
  ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)(a), (int)(n)))
2300 
/// Copies data from the 64-bit vector of [4 x i16] to the destination,
///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
///    specified by the immediate operand \a n.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PINSRW </c> instruction.
///
/// \param a
///    A 64-bit vector of [4 x i16].
/// \param d
///    An integer. The lower 16-bit value from this operand is written to the
///    destination at the offset specified by operand \a n.
/// \param n
///    An immediate integer operand that determines which bits of the
///    destination receive the lower 16 bits of \a d: \n
///    0: Bits [15:0] of the destination are written. \n
///    1: Bits [31:16] of the destination are written. \n
///    2: Bits [47:32] of the destination are written. \n
///    3: Bits [63:48] of the destination are written. \n
///    The remaining bits in the destination are copied from the corresponding
///    bits in operand \a a.
/// \returns A 64-bit integer vector containing the copied packed data from the
///    operands.
/* All macro arguments are parenthesized so that the casts apply to the whole
   argument expression, not just to its first operand. */
#define _mm_insert_pi16(a, d, n) \
  ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)(a), (int)(d), (int)(n)))
2331 
2332 /// Compares each of the corresponding packed 16-bit integer values of
2333 ///    the 64-bit integer vectors, and writes the greater value to the
2334 ///    corresponding bits in the destination.
2335 ///
2336 /// \headerfile <x86intrin.h>
2337 ///
2338 /// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
2339 ///
2340 /// \param __a
2341 ///    A 64-bit integer vector containing one of the source operands.
2342 /// \param __b
2343 ///    A 64-bit integer vector containing one of the source operands.
2344 /// \returns A 64-bit integer vector containing the comparison results.
2345 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2346 _mm_max_pi16(__m64 __a, __m64 __b)
2347 {
2348   return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
2349 }
2350 
2351 /// Compares each of the corresponding packed 8-bit unsigned integer
2352 ///    values of the 64-bit integer vectors, and writes the greater value to the
2353 ///    corresponding bits in the destination.
2354 ///
2355 /// \headerfile <x86intrin.h>
2356 ///
2357 /// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
2358 ///
2359 /// \param __a
2360 ///    A 64-bit integer vector containing one of the source operands.
2361 /// \param __b
2362 ///    A 64-bit integer vector containing one of the source operands.
2363 /// \returns A 64-bit integer vector containing the comparison results.
2364 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2365 _mm_max_pu8(__m64 __a, __m64 __b)
2366 {
2367   return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
2368 }
2369 
2370 /// Compares each of the corresponding packed 16-bit integer values of
2371 ///    the 64-bit integer vectors, and writes the lesser value to the
2372 ///    corresponding bits in the destination.
2373 ///
2374 /// \headerfile <x86intrin.h>
2375 ///
2376 /// This intrinsic corresponds to the <c> PMINSW </c> instruction.
2377 ///
2378 /// \param __a
2379 ///    A 64-bit integer vector containing one of the source operands.
2380 /// \param __b
2381 ///    A 64-bit integer vector containing one of the source operands.
2382 /// \returns A 64-bit integer vector containing the comparison results.
2383 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2384 _mm_min_pi16(__m64 __a, __m64 __b)
2385 {
2386   return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
2387 }
2388 
2389 /// Compares each of the corresponding packed 8-bit unsigned integer
2390 ///    values of the 64-bit integer vectors, and writes the lesser value to the
2391 ///    corresponding bits in the destination.
2392 ///
2393 /// \headerfile <x86intrin.h>
2394 ///
2395 /// This intrinsic corresponds to the <c> PMINUB </c> instruction.
2396 ///
2397 /// \param __a
2398 ///    A 64-bit integer vector containing one of the source operands.
2399 /// \param __b
2400 ///    A 64-bit integer vector containing one of the source operands.
2401 /// \returns A 64-bit integer vector containing the comparison results.
2402 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2403 _mm_min_pu8(__m64 __a, __m64 __b)
2404 {
2405   return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
2406 }
2407 
2408 /// Takes the most significant bit from each 8-bit element in a 64-bit
2409 ///    integer vector to create an 8-bit mask value. Zero-extends the value to
2410 ///    32-bit integer and writes it to the destination.
2411 ///
2412 /// \headerfile <x86intrin.h>
2413 ///
2414 /// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
2415 ///
2416 /// \param __a
2417 ///    A 64-bit integer vector containing the values with bits to be extracted.
2418 /// \returns The most significant bit from each 8-bit element in \a __a,
2419 ///    written to bits [7:0].
2420 static __inline__ int __DEFAULT_FN_ATTRS_MMX
2421 _mm_movemask_pi8(__m64 __a)
2422 {
2423   return __builtin_ia32_pmovmskb((__v8qi)__a);
2424 }
2425 
2426 /// Multiplies packed 16-bit unsigned integer values and writes the
2427 ///    high-order 16 bits of each 32-bit product to the corresponding bits in
2428 ///    the destination.
2429 ///
2430 /// \headerfile <x86intrin.h>
2431 ///
2432 /// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
2433 ///
2434 /// \param __a
2435 ///    A 64-bit integer vector containing one of the source operands.
2436 /// \param __b
2437 ///    A 64-bit integer vector containing one of the source operands.
2438 /// \returns A 64-bit integer vector containing the products of both operands.
2439 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2440 _mm_mulhi_pu16(__m64 __a, __m64 __b)
2441 {
2442   return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
2443 }
2444 
/// Shuffles the four 16-bit integers of a 64-bit integer vector into the
///    destination, as selected by the immediate value operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
///
/// \param a
///    A 64-bit integer vector containing the values to be shuffled.
/// \param n
///    An immediate 8-bit value made of four 2-bit selectors, one per
///    destination element, specifying which element of \a a to copy: \n
///    Bits [1:0] select the source for bits [15:0] of the destination. \n
///    Bits [3:2] select the source for bits [31:16] of the destination. \n
///    Bits [5:4] select the source for bits [47:32] of the destination. \n
///    Bits [7:6] select the source for bits [63:48] of the destination. \n
///    Selector values: \n
///    00: assigned from bits [15:0] of \a a. \n
///    01: assigned from bits [31:16] of \a a. \n
///    10: assigned from bits [47:32] of \a a. \n
///    11: assigned from bits [63:48] of \a a. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 64-bit integer vector containing the shuffled values.
#define _mm_shuffle_pi16(a, n) \
  ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
2481 
2482 /// Conditionally copies the values from each 8-bit element in the first
2483 ///    64-bit integer vector operand to the specified memory location, as
2484 ///    specified by the most significant bit in the corresponding element in the
2485 ///    second 64-bit integer vector operand.
2486 ///
2487 ///    To minimize caching, the data is flagged as non-temporal
2488 ///    (unlikely to be used again soon).
2489 ///
2490 /// \headerfile <x86intrin.h>
2491 ///
2492 /// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
2493 ///
2494 /// \param __d
2495 ///    A 64-bit integer vector containing the values with elements to be copied.
2496 /// \param __n
2497 ///    A 64-bit integer vector operand. The most significant bit from each 8-bit
2498 ///    element determines whether the corresponding element in operand \a __d
2499 ///    is copied. If the most significant bit of a given element is 1, the
2500 ///    corresponding element in operand \a __d is copied.
2501 /// \param __p
2502 ///    A pointer to a 64-bit memory location that will receive the conditionally
2503 ///    copied integer values. The address of the memory location does not have
2504 ///    to be aligned.
2505 static __inline__ void __DEFAULT_FN_ATTRS_MMX
2506 _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
2507 {
2508   __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
2509 }
2510 
2511 /// Computes the rounded averages of the packed unsigned 8-bit integer
2512 ///    values and writes the averages to the corresponding bits in the
2513 ///    destination.
2514 ///
2515 /// \headerfile <x86intrin.h>
2516 ///
2517 /// This intrinsic corresponds to the <c> PAVGB </c> instruction.
2518 ///
2519 /// \param __a
2520 ///    A 64-bit integer vector containing one of the source operands.
2521 /// \param __b
2522 ///    A 64-bit integer vector containing one of the source operands.
2523 /// \returns A 64-bit integer vector containing the averages of both operands.
2524 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2525 _mm_avg_pu8(__m64 __a, __m64 __b)
2526 {
2527   return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
2528 }
2529 
2530 /// Computes the rounded averages of the packed unsigned 16-bit integer
2531 ///    values and writes the averages to the corresponding bits in the
2532 ///    destination.
2533 ///
2534 /// \headerfile <x86intrin.h>
2535 ///
2536 /// This intrinsic corresponds to the <c> PAVGW </c> instruction.
2537 ///
2538 /// \param __a
2539 ///    A 64-bit integer vector containing one of the source operands.
2540 /// \param __b
2541 ///    A 64-bit integer vector containing one of the source operands.
2542 /// \returns A 64-bit integer vector containing the averages of both operands.
2543 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2544 _mm_avg_pu16(__m64 __a, __m64 __b)
2545 {
2546   return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
2547 }
2548 
2549 /// Subtracts the corresponding 8-bit unsigned integer values of the two
2550 ///    64-bit vector operands and computes the absolute value for each of the
2551 ///    difference. Then sum of the 8 absolute differences is written to the
2552 ///    bits [15:0] of the destination; the remaining bits [63:16] are cleared.
2553 ///
2554 /// \headerfile <x86intrin.h>
2555 ///
2556 /// This intrinsic corresponds to the <c> PSADBW </c> instruction.
2557 ///
2558 /// \param __a
2559 ///    A 64-bit integer vector containing one of the source operands.
2560 /// \param __b
2561 ///    A 64-bit integer vector containing one of the source operands.
2562 /// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
2563 ///    sets of absolute differences between both operands. The upper bits are
2564 ///    cleared.
2565 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2566 _mm_sad_pu8(__m64 __a, __m64 __b)
2567 {
2568   return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
2569 }
2570 
2571 #if defined(__cplusplus)
2572 extern "C" {
2573 #endif
2574 
/// Returns the contents of the MXCSR register as a 32-bit unsigned
///    integer value.
///
///    The returned value is a set of bit fields. Several groups of macros are
///    provided to test those fields, including:
///    <ul>
///    <li>
///      For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
///      _MM_GET_EXCEPTION_STATE().
///    </li>
///    <li>
///      For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
///      There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
///    </li>
///    <li>
///      For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
///      _MM_GET_ROUNDING_MODE().
///    </li>
///    <li>
///      For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
///      There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
///    </li>
///    <li>
///      For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
///      _MM_GET_DENORMALS_ZERO_MODE().
///    </li>
///    </ul>
///
///    For example, the following expression checks if an overflow exception has
///    occurred:
///    \code
///      ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
///    \endcode
///
///    The following expression gets the current rounding mode:
///    \code
///      _MM_GET_ROUNDING_MODE()
///    \endcode
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
///
/// \returns A 32-bit unsigned integer containing the contents of the MXCSR
///    register.
unsigned int _mm_getcsr(void);
2626 
/// Sets the MXCSR register with the 32-bit unsigned integer value.
///
///    There are several groups of macros associated with this intrinsic,
///    including:
///    <ul>
///    <li>
///      For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
///      _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
///    </li>
///    <li>
///      For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
///      There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
///      of these macros.
///    </li>
///    <li>
///      For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
///      _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
///    </li>
///    <li>
///      For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
///      There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
///      one of these macros.
///    </li>
///    <li>
///      For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
///      _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
///    </li>
///    </ul>
///
///    For example, the following expression causes subsequent floating-point
///    operations to round up. The rounding field is cleared first, because
///    OR-ing \c _MM_ROUND_UP onto a field that already holds
///    \c _MM_ROUND_DOWN would produce \c _MM_ROUND_TOWARD_ZERO:
///    \code
///      _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | _MM_ROUND_UP)
///    \endcode
///
///    The following example sets the DAZ and FTZ flags:
///    \code
///    void setFlags() {
///      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
///      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
///    }
///    \endcode
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
///
/// \param __i
///    A 32-bit unsigned integer value to be written to the MXCSR register.
void _mm_setcsr(unsigned int __i);
2680 
2681 #if defined(__cplusplus)
2682 } // extern "C"
2683 #endif
2684 
/// Selects 4 float values from the 128-bit operands of [4 x float], as
///    specified by the immediate value operand. The two lower destination
///    elements are taken from the first operand and the two upper destination
///    elements from the second operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
/// \endcode
///
/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param mask
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a and \a b. \n
///    Bits [3:0] specify the values copied from operand \a a. \n
///    Bits [7:4] specify the values copied from operand \a b. \n
///    The destinations within the 128-bit destination are assigned values as
///    follows: \n
///    Bits [1:0] are used to assign values to bits [31:0] in the
///    destination. \n
///    Bits [3:2] are used to assign values to bits [63:32] in the
///    destination. \n
///    Bits [5:4] are used to assign values to bits [95:64] in the
///    destination. \n
///    Bits [7:6] are used to assign values to bits [127:96] in the
///    destination. \n
///    Bit value assignments: \n
///    00: Bits [31:0] copied from the specified operand. \n
///    01: Bits [63:32] copied from the specified operand. \n
///    10: Bits [95:64] copied from the specified operand. \n
///    11: Bits [127:96] copied from the specified operand. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
#define _mm_shuffle_ps(a, b, mask) \
  ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
                                 (int)(mask)))
2727 
2728 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
2729 ///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2730 ///
2731 /// \headerfile <x86intrin.h>
2732 ///
2733 /// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
2734 ///
2735 /// \param __a
2736 ///    A 128-bit vector of [4 x float]. \n
2737 ///    Bits [95:64] are written to bits [31:0] of the destination. \n
2738 ///    Bits [127:96] are written to bits [95:64] of the destination.
2739 /// \param __b
2740 ///    A 128-bit vector of [4 x float].
2741 ///    Bits [95:64] are written to bits [63:32] of the destination. \n
2742 ///    Bits [127:96] are written to bits [127:96] of the destination.
2743 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2744 static __inline__ __m128 __DEFAULT_FN_ATTRS
2745 _mm_unpackhi_ps(__m128 __a, __m128 __b)
2746 {
2747   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
2748 }
2749 
2750 /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
2751 ///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2752 ///
2753 /// \headerfile <x86intrin.h>
2754 ///
2755 /// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
2756 ///
2757 /// \param __a
2758 ///    A 128-bit vector of [4 x float]. \n
2759 ///    Bits [31:0] are written to bits [31:0] of the destination.  \n
2760 ///    Bits [63:32] are written to bits [95:64] of the destination.
2761 /// \param __b
2762 ///    A 128-bit vector of [4 x float]. \n
2763 ///    Bits [31:0] are written to bits [63:32] of the destination. \n
2764 ///    Bits [63:32] are written to bits [127:96] of the destination.
2765 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2766 static __inline__ __m128 __DEFAULT_FN_ATTRS
2767 _mm_unpacklo_ps(__m128 __a, __m128 __b)
2768 {
2769   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
2770 }
2771 
2772 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2773 ///    32 bits are set to the lower 32 bits of the second parameter. The upper
2774 ///    96 bits are set to the upper 96 bits of the first parameter.
2775 ///
2776 /// \headerfile <x86intrin.h>
2777 ///
2778 /// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c>
2779 ///    instruction.
2780 ///
2781 /// \param __a
2782 ///    A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
2783 ///    written to the upper 96 bits of the result.
2784 /// \param __b
2785 ///    A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
2786 ///    written to the lower 32 bits of the result.
2787 /// \returns A 128-bit floating-point vector of [4 x float].
2788 static __inline__ __m128 __DEFAULT_FN_ATTRS
2789 _mm_move_ss(__m128 __a, __m128 __b)
2790 {
2791   __a[0] = __b[0];
2792   return __a;
2793 }
2794 
2795 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2796 ///    64 bits are set to the upper 64 bits of the second parameter. The upper
2797 ///    64 bits are set to the upper 64 bits of the first parameter.
2798 ///
2799 /// \headerfile <x86intrin.h>
2800 ///
2801 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
2802 ///
2803 /// \param __a
2804 ///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2805 ///    written to the upper 64 bits of the result.
2806 /// \param __b
2807 ///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2808 ///    written to the lower 64 bits of the result.
2809 /// \returns A 128-bit floating-point vector of [4 x float].
2810 static __inline__ __m128 __DEFAULT_FN_ATTRS
2811 _mm_movehl_ps(__m128 __a, __m128 __b)
2812 {
2813   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
2814 }
2815 
2816 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2817 ///    64 bits are set to the lower 64 bits of the first parameter. The upper
2818 ///    64 bits are set to the lower 64 bits of the second parameter.
2819 ///
2820 /// \headerfile <x86intrin.h>
2821 ///
2822 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
2823 ///
2824 /// \param __a
2825 ///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2826 ///    written to the lower 64 bits of the result.
2827 /// \param __b
2828 ///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2829 ///    written to the upper 64 bits of the result.
2830 /// \returns A 128-bit floating-point vector of [4 x float].
2831 static __inline__ __m128 __DEFAULT_FN_ATTRS
2832 _mm_movelh_ps(__m128 __a, __m128 __b)
2833 {
2834   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
2835 }
2836 
2837 /// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
2838 ///    float].
2839 ///
2840 /// \headerfile <x86intrin.h>
2841 ///
2842 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2843 ///
2844 /// \param __a
2845 ///    A 64-bit vector of [4 x i16]. The elements of the destination are copied
2846 ///    from the corresponding elements in this operand.
2847 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2848 ///    values from the operand.
2849 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2850 _mm_cvtpi16_ps(__m64 __a)
2851 {
2852   __m64 __b, __c;
2853   __m128 __r;
2854 
2855   __b = _mm_setzero_si64();
2856   __b = _mm_cmpgt_pi16(__b, __a);
2857   __c = _mm_unpackhi_pi16(__a, __b);
2858   __r = _mm_setzero_ps();
2859   __r = _mm_cvtpi32_ps(__r, __c);
2860   __r = _mm_movelh_ps(__r, __r);
2861   __c = _mm_unpacklo_pi16(__a, __b);
2862   __r = _mm_cvtpi32_ps(__r, __c);
2863 
2864   return __r;
2865 }
2866 
2867 /// Converts a 64-bit vector of 16-bit unsigned integer values into a
2868 ///    128-bit vector of [4 x float].
2869 ///
2870 /// \headerfile <x86intrin.h>
2871 ///
2872 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2873 ///
2874 /// \param __a
2875 ///    A 64-bit vector of 16-bit unsigned integer values. The elements of the
2876 ///    destination are copied from the corresponding elements in this operand.
2877 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2878 ///    values from the operand.
2879 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2880 _mm_cvtpu16_ps(__m64 __a)
2881 {
2882   __m64 __b, __c;
2883   __m128 __r;
2884 
2885   __b = _mm_setzero_si64();
2886   __c = _mm_unpackhi_pi16(__a, __b);
2887   __r = _mm_setzero_ps();
2888   __r = _mm_cvtpi32_ps(__r, __c);
2889   __r = _mm_movelh_ps(__r, __r);
2890   __c = _mm_unpacklo_pi16(__a, __b);
2891   __r = _mm_cvtpi32_ps(__r, __c);
2892 
2893   return __r;
2894 }
2895 
2896 /// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
2897 ///    into a 128-bit vector of [4 x float].
2898 ///
2899 /// \headerfile <x86intrin.h>
2900 ///
2901 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2902 ///
2903 /// \param __a
2904 ///    A 64-bit vector of [8 x i8]. The elements of the destination are copied
2905 ///    from the corresponding lower 4 elements in this operand.
2906 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2907 ///    values from the operand.
2908 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2909 _mm_cvtpi8_ps(__m64 __a)
2910 {
2911   __m64 __b;
2912 
2913   __b = _mm_setzero_si64();
2914   __b = _mm_cmpgt_pi8(__b, __a);
2915   __b = _mm_unpacklo_pi8(__a, __b);
2916 
2917   return _mm_cvtpi16_ps(__b);
2918 }
2919 
2920 /// Converts the lower four unsigned 8-bit integer values from a 64-bit
2921 ///    vector of [8 x u8] into a 128-bit vector of [4 x float].
2922 ///
2923 /// \headerfile <x86intrin.h>
2924 ///
2925 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2926 ///
2927 /// \param __a
2928 ///    A 64-bit vector of unsigned 8-bit integer values. The elements of the
2929 ///    destination are copied from the corresponding lower 4 elements in this
2930 ///    operand.
2931 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2932 ///    values from the source operand.
2933 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2934 _mm_cvtpu8_ps(__m64 __a)
2935 {
2936   __m64 __b;
2937 
2938   __b = _mm_setzero_si64();
2939   __b = _mm_unpacklo_pi8(__a, __b);
2940 
2941   return _mm_cvtpi16_ps(__b);
2942 }
2943 
2944 /// Converts the two 32-bit signed integer values from each 64-bit vector
2945 ///    operand of [2 x i32] into a 128-bit vector of [4 x float].
2946 ///
2947 /// \headerfile <x86intrin.h>
2948 ///
2949 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2950 ///
2951 /// \param __a
2952 ///    A 64-bit vector of [2 x i32]. The lower elements of the destination are
2953 ///    copied from the elements in this operand.
2954 /// \param __b
2955 ///    A 64-bit vector of [2 x i32]. The upper elements of the destination are
2956 ///    copied from the elements in this operand.
2957 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
2958 ///    copied and converted values from the first operand. The upper 64 bits
2959 ///    contain the copied and converted values from the second operand.
2960 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
2961 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
2962 {
2963   __m128 __c;
2964 
2965   __c = _mm_setzero_ps();
2966   __c = _mm_cvtpi32_ps(__c, __b);
2967   __c = _mm_movelh_ps(__c, __c);
2968 
2969   return _mm_cvtpi32_ps(__c, __a);
2970 }
2971 
2972 /// Converts each single-precision floating-point element of a 128-bit
2973 ///    floating-point vector of [4 x float] into a 16-bit signed integer, and
2974 ///    packs the results into a 64-bit integer vector of [4 x i16].
2975 ///
2976 ///    If the floating-point element is NaN or infinity, or if the
2977 ///    floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
2978 ///    it is converted to 0x8000. Otherwise if the floating-point element is
2979 ///    greater than 0x7FFF, it is converted to 0x7FFF.
2980 ///
2981 /// \headerfile <x86intrin.h>
2982 ///
2983 /// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2984 ///
2985 /// \param __a
2986 ///    A 128-bit floating-point vector of [4 x float].
2987 /// \returns A 64-bit integer vector of [4 x i16] containing the converted
2988 ///    values.
2989 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
2990 _mm_cvtps_pi16(__m128 __a)
2991 {
2992   __m64 __b, __c;
2993 
2994   __b = _mm_cvtps_pi32(__a);
2995   __a = _mm_movehl_ps(__a, __a);
2996   __c = _mm_cvtps_pi32(__a);
2997 
2998   return _mm_packs_pi32(__b, __c);
2999 }
3000 
3001 /// Converts each single-precision floating-point element of a 128-bit
3002 ///    floating-point vector of [4 x float] into an 8-bit signed integer, and
3003 ///    packs the results into the lower 32 bits of a 64-bit integer vector of
3004 ///    [8 x i8]. The upper 32 bits of the vector are set to 0.
3005 ///
3006 ///    If the floating-point element is NaN or infinity, or if the
3007 ///    floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
3008 ///    is converted to 0x80. Otherwise if the floating-point element is greater
3009 ///    than 0x7F, it is converted to 0x7F.
3010 ///
3011 /// \headerfile <x86intrin.h>
3012 ///
3013 /// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
3014 ///
3015 /// \param __a
3016 ///    128-bit floating-point vector of [4 x float].
3017 /// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
3018 ///    converted values and the uppper 32 bits are set to zero.
3019 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
3020 _mm_cvtps_pi8(__m128 __a)
3021 {
3022   __m64 __b, __c;
3023 
3024   __b = _mm_cvtps_pi16(__a);
3025   __c = _mm_setzero_si64();
3026 
3027   return _mm_packs_pi16(__b, __c);
3028 }
3029 
3030 /// Extracts the sign bits from each single-precision floating-point
3031 ///    element of a 128-bit floating-point vector of [4 x float] and returns the
3032 ///    sign bits in bits [0:3] of the result. Bits [31:4] of the result are set
3033 ///    to zero.
3034 ///
3035 /// \headerfile <x86intrin.h>
3036 ///
3037 /// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
3038 ///
3039 /// \param __a
3040 ///    A 128-bit floating-point vector of [4 x float].
3041 /// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
3042 ///    single-precision floating-point element of the parameter. Bits [31:4] are
3043 ///    set to zero.
3044 static __inline__ int __DEFAULT_FN_ATTRS
3045 _mm_movemask_ps(__m128 __a)
3046 {
3047   return __builtin_ia32_movmskps((__v4sf)__a);
3048 }
3049 
/* Comparison predicate codes, usable as the immediate operand of
 * _mm_cmp_ps() and _mm_cmp_ss(). */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)   */
3059 
/// Compares each of the corresponding values of two 128-bit vectors of
///    [4 x float], using the operation specified by the immediate integer
///    operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> (V)CMPPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00 (\c _CMP_EQ_OQ): Equal (ordered, non-signaling) \n
///    0x01 (\c _CMP_LT_OS): Less-than (ordered, signaling) \n
///    0x02 (\c _CMP_LE_OS): Less-than-or-equal (ordered, signaling) \n
///    0x03 (\c _CMP_UNORD_Q): Unordered (non-signaling) \n
///    0x04 (\c _CMP_NEQ_UQ): Not-equal (unordered, non-signaling) \n
///    0x05 (\c _CMP_NLT_US): Not-less-than (unordered, signaling) \n
///    0x06 (\c _CMP_NLE_US): Not-less-than-or-equal (unordered, signaling) \n
///    0x07 (\c _CMP_ORD_Q): Ordered (non-signaling) \n
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c)                                                    \
  ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (c)))
3094 
/// Compares the corresponding scalar values of two 128-bit vectors of
///    [4 x float], using the operation specified by the immediate integer
///    operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> (V)CMPSS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00 (\c _CMP_EQ_OQ): Equal (ordered, non-signaling) \n
///    0x01 (\c _CMP_LT_OS): Less-than (ordered, signaling) \n
///    0x02 (\c _CMP_LE_OS): Less-than-or-equal (ordered, signaling) \n
///    0x03 (\c _CMP_UNORD_Q): Unordered (non-signaling) \n
///    0x04 (\c _CMP_NEQ_UQ): Not-equal (unordered, non-signaling) \n
///    0x05 (\c _CMP_NLT_US): Not-less-than (unordered, signaling) \n
///    0x06 (\c _CMP_NLE_US): Not-less-than-or-equal (unordered, signaling) \n
///    0x07 (\c _CMP_ORD_Q): Ordered (non-signaling) \n
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ss(a, b, c)                                                    \
  ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (c)))
3129 
/* Declaration attribute requesting 16-byte alignment. The attribute name uses
 * the reserved double-underscore spelling (as the vector typedefs in this
 * header do) so that a user macro named `aligned` cannot alter the expansion. */
#define _MM_ALIGN16 __attribute__((__aligned__(16)))
3131 
/* Packs four selectors into a single immediate value:
 *   fp3 -> bits [7:6], fp2 -> bits [5:4], fp1 -> bits [3:2], fp0 -> bits [1:0].
 * No masking is applied, so each argument is expected to be in the range 0..3;
 * larger values spill into the neighbouring fields. */
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0)                                        \
  (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
3133 
/* Bit-field constants for the control/status word that the _MM_GET_* and
 * _MM_SET_* macros read with _mm_getcsr() and write with _mm_setcsr().
 * Every constant has type unsigned int. */

/* Exception state flags: bits [5:0]. */
#define _MM_EXCEPT_INVALID    (1U << 0)
#define _MM_EXCEPT_DENORM     (1U << 1)
#define _MM_EXCEPT_DIV_ZERO   (1U << 2)
#define _MM_EXCEPT_OVERFLOW   (1U << 3)
#define _MM_EXCEPT_UNDERFLOW  (1U << 4)
#define _MM_EXCEPT_INEXACT    (1U << 5)
#define _MM_EXCEPT_MASK       (0x3fU << 0)

/* Exception mask bits: bits [12:7], in the same order as the state flags. */
#define _MM_MASK_INVALID      (1U << 7)
#define _MM_MASK_DENORM       (1U << 8)
#define _MM_MASK_DIV_ZERO     (1U << 9)
#define _MM_MASK_OVERFLOW     (1U << 10)
#define _MM_MASK_UNDERFLOW    (1U << 11)
#define _MM_MASK_INEXACT      (1U << 12)
#define _MM_MASK_MASK         (0x3fU << 7)

/* Rounding mode: two-bit field at bits [14:13]. */
#define _MM_ROUND_NEAREST     (0U << 13)
#define _MM_ROUND_DOWN        (1U << 13)
#define _MM_ROUND_UP          (2U << 13)
#define _MM_ROUND_TOWARD_ZERO (3U << 13)
#define _MM_ROUND_MASK        (3U << 13)

/* Flush-to-zero mode: single bit 15. */
#define _MM_FLUSH_ZERO_MASK   (1U << 15)
#define _MM_FLUSH_ZERO_ON     (1U << 15)
#define _MM_FLUSH_ZERO_OFF    (0U << 15)
3159 
/* Field readers: each one calls _mm_getcsr() once and keeps only the bits
 * selected by the corresponding *_MASK constant. The result is left in place
 * (not shifted down), so it compares directly against the matching _MM_*
 * constants. */
#define _MM_GET_EXCEPTION_MASK()  (_MM_MASK_MASK & _mm_getcsr())
#define _MM_GET_EXCEPTION_STATE() (_MM_EXCEPT_MASK & _mm_getcsr())
#define _MM_GET_FLUSH_ZERO_MODE() (_MM_FLUSH_ZERO_MASK & _mm_getcsr())
#define _MM_GET_ROUNDING_MODE()   (_MM_ROUND_MASK & _mm_getcsr())
3164 
/* Field writers: each one reads the current value with _mm_getcsr(), clears
 * the bits of one field, ORs in x, and writes the result with _mm_setcsr().
 * x is not masked, so any bits of x outside the targeted field are written
 * as well; pass only the matching _MM_* constants. */
#define _MM_SET_EXCEPTION_MASK(x)                                              \
  (_mm_setcsr((x) | (_mm_getcsr() & ~_MM_MASK_MASK)))
#define _MM_SET_EXCEPTION_STATE(x)                                             \
  (_mm_setcsr((x) | (_mm_getcsr() & ~_MM_EXCEPT_MASK)))
#define _MM_SET_FLUSH_ZERO_MODE(x)                                             \
  (_mm_setcsr((x) | (_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK)))
#define _MM_SET_ROUNDING_MODE(x)                                               \
  (_mm_setcsr((x) | (_mm_getcsr() & ~_MM_ROUND_MASK)))
3169 
/* Transposes, in place, the 4x4 matrix whose rows are the four __m128
 * arguments: afterwards element j of row i holds what was element i of row j.
 *
 * Each argument must be a modifiable lvalue of type __m128. Every argument is
 * evaluated more than once (twice as an input, once as an assignment target),
 * so arguments must not have side effects.
 *
 * The temporaries use reserved (double-underscore) names so they can neither
 * shadow a caller variable passed as a row (e.g. one called tmp0) nor be
 * rewritten by a user-defined macro. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 __mm_tr_lo01 = _mm_unpacklo_ps((row0), (row1)); \
  __m128 __mm_tr_lo23 = _mm_unpacklo_ps((row2), (row3)); \
  __m128 __mm_tr_hi01 = _mm_unpackhi_ps((row0), (row1)); \
  __m128 __mm_tr_hi23 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(__mm_tr_lo01, __mm_tr_lo23); \
  (row1) = _mm_movehl_ps(__mm_tr_lo23, __mm_tr_lo01); \
  (row2) = _mm_movelh_ps(__mm_tr_hi01, __mm_tr_hi23); \
  (row3) = _mm_movehl_ps(__mm_tr_hi23, __mm_tr_hi01); \
} while (0)
3182 
/* Compatibility aliases: alternative `_m_*` spellings that expand to the
 * corresponding `_mm_*` names. Listed alphabetically by alias. */
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb    _mm_avg_pu8
#define _m_pavgw    _mm_avg_pu16
#define _m_pextrw   _mm_extract_pi16
#define _m_pinsrw   _mm_insert_pi16
#define _m_pmaxsw   _mm_max_pi16
#define _m_pmaxub   _mm_max_pu8
#define _m_pminsw   _mm_min_pi16
#define _m_pminub   _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw  _mm_mulhi_pu16
#define _m_psadbw   _mm_sad_pu8
#define _m_pshufw   _mm_shuffle_pi16
/* NOTE(review): this maps only the bare identifier `_m_` to `_mm_`; it does
 * not rewrite the prefix of longer identifiers. Its purpose is not evident
 * from this file; kept for compatibility. */
#define _m_         _mm_
3198 
/* The function-attribute helper macros are private to this header; remove
 * them so they do not leak into code that includes it. */
#undef __DEFAULT_FN_ATTRS_MMX
#undef __DEFAULT_FN_ATTRS
3201 
3202 /* Ugly hack for backwards-compatibility (compatible with gcc) */
3203 #if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
3204 #include <emmintrin.h>
3205 #endif
3206 
3207 #endif /* __XMMINTRIN_H */
3208