xref: /freebsd/contrib/llvm-project/clang/lib/Headers/xmmintrin.h (revision 20450c2e792084f06974cff9d2338e2d0406883f)
1 /*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 #ifndef __XMMINTRIN_H
11 #define __XMMINTRIN_H
12 
13 #if !defined(__i386__) && !defined(__x86_64__)
14 #error "This header is only meant to be used on x86 and x64 architecture"
15 #endif
16 
17 #include <mmintrin.h>
18 
/* Internal 4 x float vector type; the intrinsics below cast to it for
 * arithmetic and builtin calls. */
typedef float __v4sf __attribute__((__vector_size__(16)));
/* Public 128-bit vector of four floats, 16-byte aligned. */
typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));

/* Same 16-byte vector as __m128 but declared with 1-byte alignment. */
typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));

/* Unsigned types */
typedef unsigned int __v4su __attribute__((__vector_size__(16)));
26 
27 /* This header should only be included in a hosted environment as it depends on
28  * a standard library to provide allocation routines. */
29 #if __STDC_HOSTED__
30 #include <mm_malloc.h>
31 #endif
32 
/* Define the default attributes for the functions in this file. When
 * __EVEX512__ is defined without __AVX10_1_512__, "no-evex512" is appended to
 * the target feature string; otherwise only "sse" / "sse2" is requested. */
#if defined(__EVEX512__) && !defined(__AVX10_1_512__)
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("sse,no-evex512"), \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_SSE2                                                \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("sse2,no-evex512"), __min_vector_width__(128)))
#else
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("sse"),            \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_SSE2                                                \
  __attribute__((__always_inline__, __nodebug__, __target__("sse2"),           \
                 __min_vector_width__(128)))
#endif

/* The *_CONSTEXPR variants add `constexpr` only when compiling as C++11 or
 * later; in every other mode they are plain aliases of the macros above. */
#if defined(__cplusplus) && (__cplusplus >= 201103L)
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2 constexpr
#else
#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
#define __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR __DEFAULT_FN_ATTRS_SSE2
#endif
57 
/* Helpers for converting between 64-bit and 128-bit vector values. Every
 * expansion is fully parenthesized so that it composes safely with postfix
 * operators (e.g. subscripting) at the use site. */

/* Keeps element 0 of x viewed as [2 x i64], i.e. the low 64 bits. */
#define __trunc64(x)                                                           \
  ((__m64)__builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0))
/* Widens a [2 x i32] value to 128 bits; elements 2 and 3 come from the
 * zero-initialized second operand. */
#define __zext128(x)                                                           \
  ((__m128i)__builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   \
                                    1, 2, 3))
/* Widens a [2 x i32] value to 128 bits; index -1 leaves elements 2 and 3
 * unspecified. */
#define __anyext128(x)                                                         \
  ((__m128i)__builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0,   \
                                    1, -1, -1))
/* Keeps elements 0 and 1 of a [4 x i32] value and takes elements 2 and 3 from
 * the zero-initialized second operand, clearing the upper 64 bits. */
#define __zeroupper64(x)                                                       \
  ((__m128i)__builtin_shufflevector((__v4si)(x), __extension__(__v4si){}, 0,   \
                                    1, 4, 5))
69 
70 /// Adds the 32-bit float values in the low-order bits of the operands.
71 ///
72 /// \headerfile <x86intrin.h>
73 ///
74 /// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
75 ///
76 /// \param __a
77 ///    A 128-bit vector of [4 x float] containing one of the source operands.
78 ///    The lower 32 bits of this operand are used in the calculation.
79 /// \param __b
80 ///    A 128-bit vector of [4 x float] containing one of the source operands.
81 ///    The lower 32 bits of this operand are used in the calculation.
82 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
83 ///    of the lower 32 bits of both operands. The upper 96 bits are copied from
84 ///    the upper 96 bits of the first source operand.
85 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_add_ss(__m128 __a,__m128 __b)86 _mm_add_ss(__m128 __a, __m128 __b) {
87   __a[0] += __b[0];
88   return __a;
89 }
90 
91 /// Adds two 128-bit vectors of [4 x float], and returns the results of
92 ///    the addition.
93 ///
94 /// \headerfile <x86intrin.h>
95 ///
96 /// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
97 ///
98 /// \param __a
99 ///    A 128-bit vector of [4 x float] containing one of the source operands.
100 /// \param __b
101 ///    A 128-bit vector of [4 x float] containing one of the source operands.
102 /// \returns A 128-bit vector of [4 x float] containing the sums of both
103 ///    operands.
104 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_add_ps(__m128 __a,__m128 __b)105 _mm_add_ps(__m128 __a, __m128 __b) {
106   return (__m128)((__v4sf)__a + (__v4sf)__b);
107 }
108 
109 /// Subtracts the 32-bit float value in the low-order bits of the second
110 ///    operand from the corresponding value in the first operand.
111 ///
112 /// \headerfile <x86intrin.h>
113 ///
114 /// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
115 ///
116 /// \param __a
117 ///    A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
118 ///    of this operand are used in the calculation.
119 /// \param __b
120 ///    A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
121 ///    bits of this operand are used in the calculation.
122 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
123 ///    difference of the lower 32 bits of both operands. The upper 96 bits are
124 ///    copied from the upper 96 bits of the first source operand.
125 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_sub_ss(__m128 __a,__m128 __b)126 _mm_sub_ss(__m128 __a, __m128 __b) {
127   __a[0] -= __b[0];
128   return __a;
129 }
130 
131 /// Subtracts each of the values of the second operand from the first
132 ///    operand, both of which are 128-bit vectors of [4 x float] and returns
133 ///    the results of the subtraction.
134 ///
135 /// \headerfile <x86intrin.h>
136 ///
137 /// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
138 ///
139 /// \param __a
140 ///    A 128-bit vector of [4 x float] containing the minuend.
141 /// \param __b
142 ///    A 128-bit vector of [4 x float] containing the subtrahend.
143 /// \returns A 128-bit vector of [4 x float] containing the differences between
144 ///    both operands.
145 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_sub_ps(__m128 __a,__m128 __b)146 _mm_sub_ps(__m128 __a, __m128 __b) {
147   return (__m128)((__v4sf)__a - (__v4sf)__b);
148 }
149 
150 /// Multiplies two 32-bit float values in the low-order bits of the
151 ///    operands.
152 ///
153 /// \headerfile <x86intrin.h>
154 ///
155 /// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
156 ///
157 /// \param __a
158 ///    A 128-bit vector of [4 x float] containing one of the source operands.
159 ///    The lower 32 bits of this operand are used in the calculation.
160 /// \param __b
161 ///    A 128-bit vector of [4 x float] containing one of the source operands.
162 ///    The lower 32 bits of this operand are used in the calculation.
163 /// \returns A 128-bit vector of [4 x float] containing the product of the lower
164 ///    32 bits of both operands. The upper 96 bits are copied from the upper 96
165 ///    bits of the first source operand.
166 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_mul_ss(__m128 __a,__m128 __b)167 _mm_mul_ss(__m128 __a, __m128 __b) {
168   __a[0] *= __b[0];
169   return __a;
170 }
171 
172 /// Multiplies two 128-bit vectors of [4 x float] and returns the
173 ///    results of the multiplication.
174 ///
175 /// \headerfile <x86intrin.h>
176 ///
177 /// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
178 ///
179 /// \param __a
180 ///    A 128-bit vector of [4 x float] containing one of the source operands.
181 /// \param __b
182 ///    A 128-bit vector of [4 x float] containing one of the source operands.
183 /// \returns A 128-bit vector of [4 x float] containing the products of both
184 ///    operands.
185 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_mul_ps(__m128 __a,__m128 __b)186 _mm_mul_ps(__m128 __a, __m128 __b) {
187   return (__m128)((__v4sf)__a * (__v4sf)__b);
188 }
189 
190 /// Divides the value in the low-order 32 bits of the first operand by
191 ///    the corresponding value in the second operand.
192 ///
193 /// \headerfile <x86intrin.h>
194 ///
195 /// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
196 ///
197 /// \param __a
198 ///    A 128-bit vector of [4 x float] containing the dividend. The lower 32
199 ///    bits of this operand are used in the calculation.
200 /// \param __b
201 ///    A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
202 ///    of this operand are used in the calculation.
203 /// \returns A 128-bit vector of [4 x float] containing the quotients of the
204 ///    lower 32 bits of both operands. The upper 96 bits are copied from the
205 ///    upper 96 bits of the first source operand.
206 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_div_ss(__m128 __a,__m128 __b)207 _mm_div_ss(__m128 __a, __m128 __b) {
208   __a[0] /= __b[0];
209   return __a;
210 }
211 
212 /// Divides two 128-bit vectors of [4 x float].
213 ///
214 /// \headerfile <x86intrin.h>
215 ///
216 /// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
217 ///
218 /// \param __a
219 ///    A 128-bit vector of [4 x float] containing the dividend.
220 /// \param __b
221 ///    A 128-bit vector of [4 x float] containing the divisor.
222 /// \returns A 128-bit vector of [4 x float] containing the quotients of both
223 ///    operands.
224 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_div_ps(__m128 __a,__m128 __b)225 _mm_div_ps(__m128 __a, __m128 __b) {
226   return (__m128)((__v4sf)__a / (__v4sf)__b);
227 }
228 
229 /// Calculates the square root of the value stored in the low-order bits
230 ///    of a 128-bit vector of [4 x float].
231 ///
232 /// \headerfile <x86intrin.h>
233 ///
234 /// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
235 ///
236 /// \param __a
237 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
238 ///    used in the calculation.
239 /// \returns A 128-bit vector of [4 x float] containing the square root of the
240 ///    value in the low-order bits of the operand.
241 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_sqrt_ss(__m128 __a)242 _mm_sqrt_ss(__m128 __a)
243 {
244   return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
245 }
246 
247 /// Calculates the square roots of the values stored in a 128-bit vector
248 ///    of [4 x float].
249 ///
250 /// \headerfile <x86intrin.h>
251 ///
252 /// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
253 ///
254 /// \param __a
255 ///    A 128-bit vector of [4 x float].
256 /// \returns A 128-bit vector of [4 x float] containing the square roots of the
257 ///    values in the operand.
258 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_sqrt_ps(__m128 __a)259 _mm_sqrt_ps(__m128 __a)
260 {
261   return __builtin_ia32_sqrtps((__v4sf)__a);
262 }
263 
264 /// Calculates the approximate reciprocal of the value stored in the
265 ///    low-order bits of a 128-bit vector of [4 x float].
266 ///
267 /// \headerfile <x86intrin.h>
268 ///
269 /// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
270 ///
271 /// \param __a
272 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
273 ///    used in the calculation.
274 /// \returns A 128-bit vector of [4 x float] containing the approximate
275 ///    reciprocal of the value in the low-order bits of the operand.
276 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_rcp_ss(__m128 __a)277 _mm_rcp_ss(__m128 __a)
278 {
279   return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
280 }
281 
282 /// Calculates the approximate reciprocals of the values stored in a
283 ///    128-bit vector of [4 x float].
284 ///
285 /// \headerfile <x86intrin.h>
286 ///
287 /// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
288 ///
289 /// \param __a
290 ///    A 128-bit vector of [4 x float].
291 /// \returns A 128-bit vector of [4 x float] containing the approximate
292 ///    reciprocals of the values in the operand.
293 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_rcp_ps(__m128 __a)294 _mm_rcp_ps(__m128 __a)
295 {
296   return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
297 }
298 
299 /// Calculates the approximate reciprocal of the square root of the value
300 ///    stored in the low-order bits of a 128-bit vector of [4 x float].
301 ///
302 /// \headerfile <x86intrin.h>
303 ///
304 /// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
305 ///
306 /// \param __a
307 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
308 ///    used in the calculation.
309 /// \returns A 128-bit vector of [4 x float] containing the approximate
310 ///    reciprocal of the square root of the value in the low-order bits of the
311 ///    operand.
312 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_rsqrt_ss(__m128 __a)313 _mm_rsqrt_ss(__m128 __a)
314 {
315   return __builtin_ia32_rsqrtss((__v4sf)__a);
316 }
317 
318 /// Calculates the approximate reciprocals of the square roots of the
319 ///    values stored in a 128-bit vector of [4 x float].
320 ///
321 /// \headerfile <x86intrin.h>
322 ///
323 /// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
324 ///
325 /// \param __a
326 ///    A 128-bit vector of [4 x float].
327 /// \returns A 128-bit vector of [4 x float] containing the approximate
328 ///    reciprocals of the square roots of the values in the operand.
329 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_rsqrt_ps(__m128 __a)330 _mm_rsqrt_ps(__m128 __a)
331 {
332   return __builtin_ia32_rsqrtps((__v4sf)__a);
333 }
334 
335 /// Compares two 32-bit float values in the low-order bits of both
336 ///    operands and returns the lesser value in the low-order bits of the
337 ///    vector of [4 x float].
338 ///
339 ///    If either value in a comparison is NaN, returns the value from \a __b.
340 ///
341 /// \headerfile <x86intrin.h>
342 ///
343 /// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
344 ///
345 /// \param __a
346 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
347 ///    32 bits of this operand are used in the comparison.
348 /// \param __b
349 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
350 ///    32 bits of this operand are used in the comparison.
351 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
352 ///    minimum value between both operands. The upper 96 bits are copied from
353 ///    the upper 96 bits of the first source operand.
354 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_min_ss(__m128 __a,__m128 __b)355 _mm_min_ss(__m128 __a, __m128 __b)
356 {
357   return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
358 }
359 
360 /// Compares two 128-bit vectors of [4 x float] and returns the lesser
361 ///    of each pair of values.
362 ///
363 ///    If either value in a comparison is NaN, returns the value from \a __b.
364 ///
365 /// \headerfile <x86intrin.h>
366 ///
367 /// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
368 ///
369 /// \param __a
370 ///    A 128-bit vector of [4 x float] containing one of the operands.
371 /// \param __b
372 ///    A 128-bit vector of [4 x float] containing one of the operands.
373 /// \returns A 128-bit vector of [4 x float] containing the minimum values
374 ///    between both operands.
375 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_min_ps(__m128 __a,__m128 __b)376 _mm_min_ps(__m128 __a, __m128 __b)
377 {
378   return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
379 }
380 
381 /// Compares two 32-bit float values in the low-order bits of both
382 ///    operands and returns the greater value in the low-order bits of a 128-bit
383 ///    vector of [4 x float].
384 ///
385 ///    If either value in a comparison is NaN, returns the value from \a __b.
386 ///
387 /// \headerfile <x86intrin.h>
388 ///
389 /// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
390 ///
391 /// \param __a
392 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
393 ///    32 bits of this operand are used in the comparison.
394 /// \param __b
395 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
396 ///    32 bits of this operand are used in the comparison.
397 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
398 ///    maximum value between both operands. The upper 96 bits are copied from
399 ///    the upper 96 bits of the first source operand.
400 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_max_ss(__m128 __a,__m128 __b)401 _mm_max_ss(__m128 __a, __m128 __b)
402 {
403   return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
404 }
405 
406 /// Compares two 128-bit vectors of [4 x float] and returns the greater
407 ///    of each pair of values.
408 ///
409 ///    If either value in a comparison is NaN, returns the value from \a __b.
410 ///
411 /// \headerfile <x86intrin.h>
412 ///
413 /// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
414 ///
415 /// \param __a
416 ///    A 128-bit vector of [4 x float] containing one of the operands.
417 /// \param __b
418 ///    A 128-bit vector of [4 x float] containing one of the operands.
419 /// \returns A 128-bit vector of [4 x float] containing the maximum values
420 ///    between both operands.
421 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_max_ps(__m128 __a,__m128 __b)422 _mm_max_ps(__m128 __a, __m128 __b)
423 {
424   return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
425 }
426 
427 /// Performs a bitwise AND of two 128-bit vectors of [4 x float].
428 ///
429 /// \headerfile <x86intrin.h>
430 ///
431 /// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
432 ///
433 /// \param __a
434 ///    A 128-bit vector containing one of the source operands.
435 /// \param __b
436 ///    A 128-bit vector containing one of the source operands.
437 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
438 ///    values between both operands.
439 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_and_ps(__m128 __a,__m128 __b)440 _mm_and_ps(__m128 __a, __m128 __b) {
441   return (__m128)((__v4su)__a & (__v4su)__b);
442 }
443 
444 /// Performs a bitwise AND of two 128-bit vectors of [4 x float], using
445 ///    the one's complement of the values contained in the first source
446 ///    operand.
447 ///
448 /// \headerfile <x86intrin.h>
449 ///
450 /// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
451 ///
452 /// \param __a
453 ///    A 128-bit vector of [4 x float] containing the first source operand. The
454 ///    one's complement of this value is used in the bitwise AND.
455 /// \param __b
456 ///    A 128-bit vector of [4 x float] containing the second source operand.
457 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
458 ///    one's complement of the first operand and the values in the second
459 ///    operand.
460 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_andnot_ps(__m128 __a,__m128 __b)461 _mm_andnot_ps(__m128 __a, __m128 __b) {
462   return (__m128)(~(__v4su)__a & (__v4su)__b);
463 }
464 
465 /// Performs a bitwise OR of two 128-bit vectors of [4 x float].
466 ///
467 /// \headerfile <x86intrin.h>
468 ///
469 /// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
470 ///
471 /// \param __a
472 ///    A 128-bit vector of [4 x float] containing one of the source operands.
473 /// \param __b
474 ///    A 128-bit vector of [4 x float] containing one of the source operands.
475 /// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
476 ///    values between both operands.
477 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_or_ps(__m128 __a,__m128 __b)478 _mm_or_ps(__m128 __a, __m128 __b) {
479   return (__m128)((__v4su)__a | (__v4su)__b);
480 }
481 
482 /// Performs a bitwise exclusive OR of two 128-bit vectors of
483 ///    [4 x float].
484 ///
485 /// \headerfile <x86intrin.h>
486 ///
487 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
488 ///
489 /// \param __a
490 ///    A 128-bit vector of [4 x float] containing one of the source operands.
491 /// \param __b
492 ///    A 128-bit vector of [4 x float] containing one of the source operands.
493 /// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
494 ///    of the values between both operands.
495 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_xor_ps(__m128 __a,__m128 __b)496 _mm_xor_ps(__m128 __a, __m128 __b) {
497   return (__m128)((__v4su)__a ^ (__v4su)__b);
498 }
499 
500 /// Compares two 32-bit float values in the low-order bits of both
501 ///    operands for equality.
502 ///
503 ///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
504 ///    low-order bits of a vector [4 x float].
505 ///    If either value in a comparison is NaN, returns false.
506 ///
507 /// \headerfile <x86intrin.h>
508 ///
509 /// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
510 ///
511 /// \param __a
512 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
513 ///    32 bits of this operand are used in the comparison.
514 /// \param __b
515 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
516 ///    32 bits of this operand are used in the comparison.
517 /// \returns A 128-bit vector of [4 x float] containing the comparison results
518 ///    in the low-order bits.
519 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpeq_ss(__m128 __a,__m128 __b)520 _mm_cmpeq_ss(__m128 __a, __m128 __b)
521 {
522   return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
523 }
524 
525 /// Compares each of the corresponding 32-bit float values of the
526 ///    128-bit vectors of [4 x float] for equality.
527 ///
528 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
529 ///    If either value in a comparison is NaN, returns false.
530 ///
531 /// \headerfile <x86intrin.h>
532 ///
533 /// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
534 ///
535 /// \param __a
536 ///    A 128-bit vector of [4 x float].
537 /// \param __b
538 ///    A 128-bit vector of [4 x float].
539 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
540 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpeq_ps(__m128 __a,__m128 __b)541 _mm_cmpeq_ps(__m128 __a, __m128 __b)
542 {
543   return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
544 }
545 
546 /// Compares two 32-bit float values in the low-order bits of both
547 ///    operands to determine if the value in the first operand is less than the
548 ///    corresponding value in the second operand.
549 ///
550 ///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
551 ///    low-order bits of a vector of [4 x float].
552 ///    If either value in a comparison is NaN, returns false.
553 ///
554 /// \headerfile <x86intrin.h>
555 ///
556 /// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
557 ///
558 /// \param __a
559 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
560 ///    32 bits of this operand are used in the comparison.
561 /// \param __b
562 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
563 ///    32 bits of this operand are used in the comparison.
564 /// \returns A 128-bit vector of [4 x float] containing the comparison results
565 ///    in the low-order bits.
566 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmplt_ss(__m128 __a,__m128 __b)567 _mm_cmplt_ss(__m128 __a, __m128 __b)
568 {
569   return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
570 }
571 
572 /// Compares each of the corresponding 32-bit float values of the
573 ///    128-bit vectors of [4 x float] to determine if the values in the first
574 ///    operand are less than those in the second operand.
575 ///
576 ///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
577 ///    If either value in a comparison is NaN, returns false.
578 ///
579 /// \headerfile <x86intrin.h>
580 ///
581 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
582 ///
583 /// \param __a
584 ///    A 128-bit vector of [4 x float].
585 /// \param __b
586 ///    A 128-bit vector of [4 x float].
587 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
588 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmplt_ps(__m128 __a,__m128 __b)589 _mm_cmplt_ps(__m128 __a, __m128 __b)
590 {
591   return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
592 }
593 
594 /// Compares two 32-bit float values in the low-order bits of both
595 ///    operands to determine if the value in the first operand is less than or
596 ///    equal to the corresponding value in the second operand.
597 ///
598 ///    The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true, in
599 ///    the low-order bits of a vector of [4 x float].
600 ///    If either value in a comparison is NaN, returns false.
601 ///
602 /// \headerfile <x86intrin.h>
603 ///
604 /// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
605 ///
606 /// \param __a
607 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
608 ///    32 bits of this operand are used in the comparison.
609 /// \param __b
610 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
611 ///    32 bits of this operand are used in the comparison.
612 /// \returns A 128-bit vector of [4 x float] containing the comparison results
613 ///    in the low-order bits.
614 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmple_ss(__m128 __a,__m128 __b)615 _mm_cmple_ss(__m128 __a, __m128 __b)
616 {
617   return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
618 }
619 
620 /// Compares each of the corresponding 32-bit float values of the
621 ///    128-bit vectors of [4 x float] to determine if the values in the first
622 ///    operand are less than or equal to those in the second operand.
623 ///
624 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
625 ///    If either value in a comparison is NaN, returns false.
626 ///
627 /// \headerfile <x86intrin.h>
628 ///
629 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
630 ///
631 /// \param __a
632 ///    A 128-bit vector of [4 x float].
633 /// \param __b
634 ///    A 128-bit vector of [4 x float].
635 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
636 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmple_ps(__m128 __a,__m128 __b)637 _mm_cmple_ps(__m128 __a, __m128 __b)
638 {
639   return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
640 }
641 
642 /// Compares two 32-bit float values in the low-order bits of both
643 ///    operands to determine if the value in the first operand is greater than
644 ///    the corresponding value in the second operand.
645 ///
646 ///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
647 ///    low-order bits of a vector of [4 x float].
648 ///    If either value in a comparison is NaN, returns false.
649 ///
650 /// \headerfile <x86intrin.h>
651 ///
652 /// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
653 ///
654 /// \param __a
655 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
656 ///    32 bits of this operand are used in the comparison.
657 /// \param __b
658 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
659 ///    32 bits of this operand are used in the comparison.
660 /// \returns A 128-bit vector of [4 x float] containing the comparison results
661 ///    in the low-order bits.
662 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpgt_ss(__m128 __a,__m128 __b)663 _mm_cmpgt_ss(__m128 __a, __m128 __b)
664 {
665   return (__m128)__builtin_shufflevector((__v4sf)__a,
666                                          (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
667                                          4, 1, 2, 3);
668 }
669 
670 /// Compares each of the corresponding 32-bit float values of the
671 ///    128-bit vectors of [4 x float] to determine if the values in the first
672 ///    operand are greater than those in the second operand.
673 ///
674 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
675 ///    If either value in a comparison is NaN, returns false.
676 ///
677 /// \headerfile <x86intrin.h>
678 ///
679 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
680 ///
681 /// \param __a
682 ///    A 128-bit vector of [4 x float].
683 /// \param __b
684 ///    A 128-bit vector of [4 x float].
685 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
686 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpgt_ps(__m128 __a,__m128 __b)687 _mm_cmpgt_ps(__m128 __a, __m128 __b)
688 {
689   return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
690 }
691 
692 /// Compares two 32-bit float values in the low-order bits of both
693 ///    operands to determine if the value in the first operand is greater than
694 ///    or equal to the corresponding value in the second operand.
695 ///
696 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
697 ///    low-order bits of a vector of [4 x float].
698 ///    If either value in a comparison is NaN, returns false.
699 ///
700 /// \headerfile <x86intrin.h>
701 ///
702 /// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
703 ///
704 /// \param __a
705 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
706 ///    32 bits of this operand are used in the comparison.
707 /// \param __b
708 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
709 ///    32 bits of this operand are used in the comparison.
710 /// \returns A 128-bit vector of [4 x float] containing the comparison results
711 ///    in the low-order bits.
712 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpge_ss(__m128 __a,__m128 __b)713 _mm_cmpge_ss(__m128 __a, __m128 __b)
714 {
715   return (__m128)__builtin_shufflevector((__v4sf)__a,
716                                          (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
717                                          4, 1, 2, 3);
718 }
719 
720 /// Compares each of the corresponding 32-bit float values of the
721 ///    128-bit vectors of [4 x float] to determine if the values in the first
722 ///    operand are greater than or equal to those in the second operand.
723 ///
724 ///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
725 ///    If either value in a comparison is NaN, returns false.
726 ///
727 /// \headerfile <x86intrin.h>
728 ///
729 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
730 ///
731 /// \param __a
732 ///    A 128-bit vector of [4 x float].
733 /// \param __b
734 ///    A 128-bit vector of [4 x float].
735 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
736 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpge_ps(__m128 __a,__m128 __b)737 _mm_cmpge_ps(__m128 __a, __m128 __b)
738 {
739   return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
740 }
741 
742 /// Compares two 32-bit float values in the low-order bits of both operands
743 ///    for inequality.
744 ///
745 ///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
746 ///    low-order bits of a vector of [4 x float].
747 ///    If either value in a comparison is NaN, returns true.
748 ///
749 /// \headerfile <x86intrin.h>
750 ///
751 /// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
752 ///   instructions.
753 ///
754 /// \param __a
755 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
756 ///    32 bits of this operand are used in the comparison.
757 /// \param __b
758 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
759 ///    32 bits of this operand are used in the comparison.
760 /// \returns A 128-bit vector of [4 x float] containing the comparison results
761 ///    in the low-order bits.
762 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpneq_ss(__m128 __a,__m128 __b)763 _mm_cmpneq_ss(__m128 __a, __m128 __b)
764 {
765   return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
766 }
767 
768 /// Compares each of the corresponding 32-bit float values of the
769 ///    128-bit vectors of [4 x float] for inequality.
770 ///
771 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
772 ///    If either value in a comparison is NaN, returns true.
773 ///
774 /// \headerfile <x86intrin.h>
775 ///
776 /// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
777 ///   instructions.
778 ///
779 /// \param __a
780 ///    A 128-bit vector of [4 x float].
781 /// \param __b
782 ///    A 128-bit vector of [4 x float].
783 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
784 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpneq_ps(__m128 __a,__m128 __b)785 _mm_cmpneq_ps(__m128 __a, __m128 __b)
786 {
787   return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
788 }
789 
790 /// Compares two 32-bit float values in the low-order bits of both
791 ///    operands to determine if the value in the first operand is not less than
792 ///    the corresponding value in the second operand.
793 ///
794 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
795 ///    low-order bits of a vector of [4 x float].
796 ///    If either value in a comparison is NaN, returns true.
797 ///
798 /// \headerfile <x86intrin.h>
799 ///
800 /// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
801 ///   instructions.
802 ///
803 /// \param __a
804 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
805 ///    32 bits of this operand are used in the comparison.
806 /// \param __b
807 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
808 ///    32 bits of this operand are used in the comparison.
809 /// \returns A 128-bit vector of [4 x float] containing the comparison results
810 ///    in the low-order bits.
811 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpnlt_ss(__m128 __a,__m128 __b)812 _mm_cmpnlt_ss(__m128 __a, __m128 __b)
813 {
814   return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
815 }
816 
817 /// Compares each of the corresponding 32-bit float values of the
818 ///    128-bit vectors of [4 x float] to determine if the values in the first
819 ///    operand are not less than those in the second operand.
820 ///
821 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
822 ///    If either value in a comparison is NaN, returns true.
823 ///
824 /// \headerfile <x86intrin.h>
825 ///
826 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
827 ///   instructions.
828 ///
829 /// \param __a
830 ///    A 128-bit vector of [4 x float].
831 /// \param __b
832 ///    A 128-bit vector of [4 x float].
833 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
834 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpnlt_ps(__m128 __a,__m128 __b)835 _mm_cmpnlt_ps(__m128 __a, __m128 __b)
836 {
837   return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
838 }
839 
840 /// Compares two 32-bit float values in the low-order bits of both
841 ///    operands to determine if the value in the first operand is not less than
842 ///    or equal to the corresponding value in the second operand.
843 ///
844 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
845 ///    low-order bits of a vector of [4 x float].
846 ///    If either value in a comparison is NaN, returns true.
847 ///
848 /// \headerfile <x86intrin.h>
849 ///
850 /// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
851 ///   instructions.
852 ///
853 /// \param __a
854 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
855 ///    32 bits of this operand are used in the comparison.
856 /// \param __b
857 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
858 ///    32 bits of this operand are used in the comparison.
859 /// \returns A 128-bit vector of [4 x float] containing the comparison results
860 ///    in the low-order bits.
861 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpnle_ss(__m128 __a,__m128 __b)862 _mm_cmpnle_ss(__m128 __a, __m128 __b)
863 {
864   return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
865 }
866 
867 /// Compares each of the corresponding 32-bit float values of the
868 ///    128-bit vectors of [4 x float] to determine if the values in the first
869 ///    operand are not less than or equal to those in the second operand.
870 ///
871 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
872 ///    If either value in a comparison is NaN, returns true.
873 ///
874 /// \headerfile <x86intrin.h>
875 ///
876 /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
877 ///   instructions.
878 ///
879 /// \param __a
880 ///    A 128-bit vector of [4 x float].
881 /// \param __b
882 ///    A 128-bit vector of [4 x float].
883 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
884 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpnle_ps(__m128 __a,__m128 __b)885 _mm_cmpnle_ps(__m128 __a, __m128 __b)
886 {
887   return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
888 }
889 
890 /// Compares two 32-bit float values in the low-order bits of both
891 ///    operands to determine if the value in the first operand is not greater
892 ///    than the corresponding value in the second operand.
893 ///
894 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
895 ///    low-order bits of a vector of [4 x float].
896 ///    If either value in a comparison is NaN, returns true.
897 ///
898 /// \headerfile <x86intrin.h>
899 ///
900 /// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
901 ///   instructions.
902 ///
903 /// \param __a
904 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
905 ///    32 bits of this operand are used in the comparison.
906 /// \param __b
907 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
908 ///    32 bits of this operand are used in the comparison.
909 /// \returns A 128-bit vector of [4 x float] containing the comparison results
910 ///    in the low-order bits.
911 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpngt_ss(__m128 __a,__m128 __b)912 _mm_cmpngt_ss(__m128 __a, __m128 __b)
913 {
914   return (__m128)__builtin_shufflevector((__v4sf)__a,
915                                          (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
916                                          4, 1, 2, 3);
917 }
918 
919 /// Compares each of the corresponding 32-bit float values of the
920 ///    128-bit vectors of [4 x float] to determine if the values in the first
921 ///    operand are not greater than those in the second operand.
922 ///
923 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
924 ///    If either value in a comparison is NaN, returns true.
925 ///
926 /// \headerfile <x86intrin.h>
927 ///
928 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
929 ///   instructions.
930 ///
931 /// \param __a
932 ///    A 128-bit vector of [4 x float].
933 /// \param __b
934 ///    A 128-bit vector of [4 x float].
935 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
936 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpngt_ps(__m128 __a,__m128 __b)937 _mm_cmpngt_ps(__m128 __a, __m128 __b)
938 {
939   return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
940 }
941 
942 /// Compares two 32-bit float values in the low-order bits of both
943 ///    operands to determine if the value in the first operand is not greater
944 ///    than or equal to the corresponding value in the second operand.
945 ///
946 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
947 ///    low-order bits of a vector of [4 x float].
948 ///    If either value in a comparison is NaN, returns true.
949 ///
950 /// \headerfile <x86intrin.h>
951 ///
952 /// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
953 ///   instructions.
954 ///
955 /// \param __a
956 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
957 ///    32 bits of this operand are used in the comparison.
958 /// \param __b
959 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
960 ///    32 bits of this operand are used in the comparison.
961 /// \returns A 128-bit vector of [4 x float] containing the comparison results
962 ///    in the low-order bits.
963 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpnge_ss(__m128 __a,__m128 __b)964 _mm_cmpnge_ss(__m128 __a, __m128 __b)
965 {
966   return (__m128)__builtin_shufflevector((__v4sf)__a,
967                                          (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
968                                          4, 1, 2, 3);
969 }
970 
971 /// Compares each of the corresponding 32-bit float values of the
972 ///    128-bit vectors of [4 x float] to determine if the values in the first
973 ///    operand are not greater than or equal to those in the second operand.
974 ///
975 ///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
976 ///    If either value in a comparison is NaN, returns true.
977 ///
978 /// \headerfile <x86intrin.h>
979 ///
980 /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
981 ///   instructions.
982 ///
983 /// \param __a
984 ///    A 128-bit vector of [4 x float].
985 /// \param __b
986 ///    A 128-bit vector of [4 x float].
987 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
988 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpnge_ps(__m128 __a,__m128 __b)989 _mm_cmpnge_ps(__m128 __a, __m128 __b)
990 {
991   return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
992 }
993 
994 /// Compares two 32-bit float values in the low-order bits of both
995 ///    operands to determine if the value in the first operand is ordered with
996 ///    respect to the corresponding value in the second operand.
997 ///
998 ///    A pair of floating-point values are ordered with respect to each
999 ///    other if neither value is a NaN. Each comparison returns 0x0 for false,
1000 ///    0xFFFFFFFF for true.
1001 ///
1002 /// \headerfile <x86intrin.h>
1003 ///
1004 /// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
1005 ///   instructions.
1006 ///
1007 /// \param __a
1008 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
1009 ///    32 bits of this operand are used in the comparison.
1010 /// \param __b
1011 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
1012 ///    32 bits of this operand are used in the comparison.
1013 /// \returns A 128-bit vector of [4 x float] containing the comparison results
1014 ///    in the low-order bits.
1015 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpord_ss(__m128 __a,__m128 __b)1016 _mm_cmpord_ss(__m128 __a, __m128 __b)
1017 {
1018   return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
1019 }
1020 
1021 /// Compares each of the corresponding 32-bit float values of the
1022 ///    128-bit vectors of [4 x float] to determine if the values in the first
1023 ///    operand are ordered with respect to those in the second operand.
1024 ///
1025 ///    A pair of floating-point values are ordered with respect to each
1026 ///    other if neither value is a NaN. Each comparison returns 0x0 for false,
1027 ///    0xFFFFFFFF for true.
1028 ///
1029 /// \headerfile <x86intrin.h>
1030 ///
1031 /// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
1032 ///   instructions.
1033 ///
1034 /// \param __a
1035 ///    A 128-bit vector of [4 x float].
1036 /// \param __b
1037 ///    A 128-bit vector of [4 x float].
1038 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
1039 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpord_ps(__m128 __a,__m128 __b)1040 _mm_cmpord_ps(__m128 __a, __m128 __b)
1041 {
1042   return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
1043 }
1044 
1045 /// Compares two 32-bit float values in the low-order bits of both
1046 ///    operands to determine if the value in the first operand is unordered
1047 ///    with respect to the corresponding value in the second operand.
1048 ///
1049 ///    A pair of double-precision values are unordered with respect to each
1050 ///    other if one or both values are NaN. Each comparison returns 0x0 for
1051 ///    false, 0xFFFFFFFF for true.
1052 ///
1053 /// \headerfile <x86intrin.h>
1054 ///
1055 /// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
1056 ///   instructions.
1057 ///
1058 /// \param __a
1059 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
1060 ///    32 bits of this operand are used in the comparison.
1061 /// \param __b
1062 ///    A 128-bit vector of [4 x float] containing one of the operands. The lower
1063 ///    32 bits of this operand are used in the comparison.
1064 /// \returns A 128-bit vector of [4 x float] containing the comparison results
1065 ///    in the low-order bits.
1066 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpunord_ss(__m128 __a,__m128 __b)1067 _mm_cmpunord_ss(__m128 __a, __m128 __b)
1068 {
1069   return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
1070 }
1071 
1072 /// Compares each of the corresponding 32-bit float values of the
1073 ///    128-bit vectors of [4 x float] to determine if the values in the first
1074 ///    operand are unordered with respect to those in the second operand.
1075 ///
1076 ///    A pair of double-precision values are unordered with respect to each
1077 ///    other if one or both values are NaN. Each comparison returns 0x0 for
1078 ///    false, 0xFFFFFFFFFFFFFFFF for true.
1079 ///
1080 /// \headerfile <x86intrin.h>
1081 ///
1082 /// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
1083 ///   instructions.
1084 ///
1085 /// \param __a
1086 ///    A 128-bit vector of [4 x float].
1087 /// \param __b
1088 ///    A 128-bit vector of [4 x float].
1089 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
1090 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpunord_ps(__m128 __a,__m128 __b)1091 _mm_cmpunord_ps(__m128 __a, __m128 __b)
1092 {
1093   return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
1094 }
1095 
1096 /// Compares two 32-bit float values in the low-order bits of both
1097 ///    operands for equality.
1098 ///
1099 ///    The comparison returns 0 for false, 1 for true. If either value in a
1100 ///    comparison is NaN, returns 0.
1101 ///
1102 /// \headerfile <x86intrin.h>
1103 ///
1104 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1105 ///   instructions.
1106 ///
1107 /// \param __a
1108 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1109 ///    used in the comparison.
1110 /// \param __b
1111 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1112 ///    used in the comparison.
1113 /// \returns An integer containing the comparison results.
1114 static __inline__ int __DEFAULT_FN_ATTRS
_mm_comieq_ss(__m128 __a,__m128 __b)1115 _mm_comieq_ss(__m128 __a, __m128 __b)
1116 {
1117   return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
1118 }
1119 
1120 /// Compares two 32-bit float values in the low-order bits of both
1121 ///    operands to determine if the first operand is less than the second
1122 ///    operand.
1123 ///
1124 ///    The comparison returns 0 for false, 1 for true. If either value in a
1125 ///    comparison is NaN, returns 0.
1126 ///
1127 /// \headerfile <x86intrin.h>
1128 ///
1129 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1130 ///   instructions.
1131 ///
1132 /// \param __a
1133 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1134 ///    used in the comparison.
1135 /// \param __b
1136 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1137 ///    used in the comparison.
1138 /// \returns An integer containing the comparison results.
1139 static __inline__ int __DEFAULT_FN_ATTRS
_mm_comilt_ss(__m128 __a,__m128 __b)1140 _mm_comilt_ss(__m128 __a, __m128 __b)
1141 {
1142   return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
1143 }
1144 
1145 /// Compares two 32-bit float values in the low-order bits of both
1146 ///    operands to determine if the first operand is less than or equal to the
1147 ///    second operand.
1148 ///
1149 ///    The comparison returns 0 for false, 1 for true. If either value in a
1150 ///    comparison is NaN, returns 0.
1151 ///
1152 /// \headerfile <x86intrin.h>
1153 ///
1154 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1155 ///
1156 /// \param __a
1157 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1158 ///    used in the comparison.
1159 /// \param __b
1160 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1161 ///    used in the comparison.
1162 /// \returns An integer containing the comparison results.
1163 static __inline__ int __DEFAULT_FN_ATTRS
_mm_comile_ss(__m128 __a,__m128 __b)1164 _mm_comile_ss(__m128 __a, __m128 __b)
1165 {
1166   return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
1167 }
1168 
1169 /// Compares two 32-bit float values in the low-order bits of both
1170 ///    operands to determine if the first operand is greater than the second
1171 ///    operand.
1172 ///
1173 ///    The comparison returns 0 for false, 1 for true. If either value in a
1174 ///    comparison is NaN, returns 0.
1175 ///
1176 /// \headerfile <x86intrin.h>
1177 ///
1178 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1179 ///
1180 /// \param __a
1181 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1182 ///    used in the comparison.
1183 /// \param __b
1184 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1185 ///    used in the comparison.
1186 /// \returns An integer containing the comparison results.
1187 static __inline__ int __DEFAULT_FN_ATTRS
_mm_comigt_ss(__m128 __a,__m128 __b)1188 _mm_comigt_ss(__m128 __a, __m128 __b)
1189 {
1190   return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
1191 }
1192 
1193 /// Compares two 32-bit float values in the low-order bits of both
1194 ///    operands to determine if the first operand is greater than or equal to
1195 ///    the second operand.
1196 ///
1197 ///    The comparison returns 0 for false, 1 for true. If either value in a
1198 ///    comparison is NaN, returns 0.
1199 ///
1200 /// \headerfile <x86intrin.h>
1201 ///
1202 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1203 ///
1204 /// \param __a
1205 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1206 ///    used in the comparison.
1207 /// \param __b
1208 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1209 ///    used in the comparison.
1210 /// \returns An integer containing the comparison results.
1211 static __inline__ int __DEFAULT_FN_ATTRS
_mm_comige_ss(__m128 __a,__m128 __b)1212 _mm_comige_ss(__m128 __a, __m128 __b)
1213 {
1214   return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
1215 }
1216 
1217 /// Compares two 32-bit float values in the low-order bits of both
1218 ///    operands to determine if the first operand is not equal to the second
1219 ///    operand.
1220 ///
1221 ///    The comparison returns 0 for false, 1 for true. If either value in a
1222 ///    comparison is NaN, returns 1.
1223 ///
1224 /// \headerfile <x86intrin.h>
1225 ///
1226 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1227 ///
1228 /// \param __a
1229 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1230 ///    used in the comparison.
1231 /// \param __b
1232 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1233 ///    used in the comparison.
1234 /// \returns An integer containing the comparison results.
1235 static __inline__ int __DEFAULT_FN_ATTRS
_mm_comineq_ss(__m128 __a,__m128 __b)1236 _mm_comineq_ss(__m128 __a, __m128 __b)
1237 {
1238   return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
1239 }
1240 
1241 /// Performs an unordered comparison of two 32-bit float values using
1242 ///    the low-order bits of both operands to determine equality.
1243 ///
1244 ///    The comparison returns 0 for false, 1 for true. If either value in a
1245 ///    comparison is NaN, returns 0.
1246 ///
1247 /// \headerfile <x86intrin.h>
1248 ///
1249 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1250 ///
1251 /// \param __a
1252 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1253 ///    used in the comparison.
1254 /// \param __b
1255 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1256 ///    used in the comparison.
1257 /// \returns An integer containing the comparison results.
1258 static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomieq_ss(__m128 __a,__m128 __b)1259 _mm_ucomieq_ss(__m128 __a, __m128 __b)
1260 {
1261   return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
1262 }
1263 
1264 /// Performs an unordered comparison of two 32-bit float values using
1265 ///    the low-order bits of both operands to determine if the first operand is
1266 ///    less than the second operand.
1267 ///
1268 ///    The comparison returns 0 for false, 1 for true. If either value in a
1269 ///    comparison is NaN, returns 0.
1270 ///
1271 /// \headerfile <x86intrin.h>
1272 ///
1273 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1274 ///
1275 /// \param __a
1276 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1277 ///    used in the comparison.
1278 /// \param __b
1279 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1280 ///    used in the comparison.
1281 /// \returns An integer containing the comparison results.
1282 static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomilt_ss(__m128 __a,__m128 __b)1283 _mm_ucomilt_ss(__m128 __a, __m128 __b)
1284 {
1285   return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
1286 }
1287 
1288 /// Performs an unordered comparison of two 32-bit float values using
1289 ///    the low-order bits of both operands to determine if the first operand is
1290 ///    less than or equal to the second operand.
1291 ///
1292 ///    The comparison returns 0 for false, 1 for true. If either value in a
1293 ///    comparison is NaN, returns 0.
1294 ///
1295 /// \headerfile <x86intrin.h>
1296 ///
1297 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1298 ///
1299 /// \param __a
1300 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1301 ///    used in the comparison.
1302 /// \param __b
1303 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1304 ///    used in the comparison.
1305 /// \returns An integer containing the comparison results.
1306 static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomile_ss(__m128 __a,__m128 __b)1307 _mm_ucomile_ss(__m128 __a, __m128 __b)
1308 {
1309   return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
1310 }
1311 
1312 /// Performs an unordered comparison of two 32-bit float values using
1313 ///    the low-order bits of both operands to determine if the first operand is
1314 ///    greater than the second operand.
1315 ///
1316 ///    The comparison returns 0 for false, 1 for true. If either value in a
1317 ///    comparison is NaN, returns 0.
1318 ///
1319 /// \headerfile <x86intrin.h>
1320 ///
1321 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1322 ///
1323 /// \param __a
1324 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1325 ///    used in the comparison.
1326 /// \param __b
1327 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1328 ///    used in the comparison.
1329 /// \returns An integer containing the comparison results.
1330 static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomigt_ss(__m128 __a,__m128 __b)1331 _mm_ucomigt_ss(__m128 __a, __m128 __b)
1332 {
1333   return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
1334 }
1335 
1336 /// Performs an unordered comparison of two 32-bit float values using
1337 ///    the low-order bits of both operands to determine if the first operand is
1338 ///    greater than or equal to the second operand.
1339 ///
1340 ///    The comparison returns 0 for false, 1 for true. If either value in a
1341 ///    comparison is NaN, returns 0.
1342 ///
1343 /// \headerfile <x86intrin.h>
1344 ///
1345 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1346 ///
1347 /// \param __a
1348 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1349 ///    used in the comparison.
1350 /// \param __b
1351 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1352 ///    used in the comparison.
1353 /// \returns An integer containing the comparison results.
1354 static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomige_ss(__m128 __a,__m128 __b)1355 _mm_ucomige_ss(__m128 __a, __m128 __b)
1356 {
1357   return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
1358 }
1359 
1360 /// Performs an unordered comparison of two 32-bit float values using
1361 ///    the low-order bits of both operands to determine inequality.
1362 ///
1363 ///    The comparison returns 0 for false, 1 for true. If either value in a
1364 ///    comparison is NaN, returns 0.
1365 ///
1366 /// \headerfile <x86intrin.h>
1367 ///
1368 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1369 ///
1370 /// \param __a
1371 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1372 ///    used in the comparison.
1373 /// \param __b
1374 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1375 ///    used in the comparison.
1376 /// \returns An integer containing the comparison results.
1377 static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomineq_ss(__m128 __a,__m128 __b)1378 _mm_ucomineq_ss(__m128 __a, __m128 __b)
1379 {
1380   return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
1381 }
1382 
1383 /// Converts a float value contained in the lower 32 bits of a vector of
1384 ///    [4 x float] into a 32-bit integer.
1385 ///
1386 ///    If the converted value does not fit in a 32-bit integer, raises a
1387 ///    floating-point invalid exception. If the exception is masked, returns
1388 ///    the most negative integer.
1389 ///
1390 /// \headerfile <x86intrin.h>
1391 ///
1392 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1393 ///   instructions.
1394 ///
1395 /// \param __a
1396 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1397 ///    used in the conversion.
1398 /// \returns A 32-bit integer containing the converted value.
1399 static __inline__ int __DEFAULT_FN_ATTRS
_mm_cvtss_si32(__m128 __a)1400 _mm_cvtss_si32(__m128 __a)
1401 {
1402   return __builtin_ia32_cvtss2si((__v4sf)__a);
1403 }
1404 
1405 /// Converts a float value contained in the lower 32 bits of a vector of
1406 ///    [4 x float] into a 32-bit integer.
1407 ///
1408 ///    If the converted value does not fit in a 32-bit integer, raises a
1409 ///    floating-point invalid exception. If the exception is masked, returns
1410 ///    the most negative integer.
1411 ///
1412 /// \headerfile <x86intrin.h>
1413 ///
1414 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1415 ///   instructions.
1416 ///
1417 /// \param __a
1418 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1419 ///    used in the conversion.
1420 /// \returns A 32-bit integer containing the converted value.
1421 static __inline__ int __DEFAULT_FN_ATTRS
_mm_cvt_ss2si(__m128 __a)1422 _mm_cvt_ss2si(__m128 __a)
1423 {
1424   return _mm_cvtss_si32(__a);
1425 }
1426 
1427 #ifdef __x86_64__
1428 
1429 /// Converts a float value contained in the lower 32 bits of a vector of
1430 ///    [4 x float] into a 64-bit integer.
1431 ///
1432 ///    If the converted value does not fit in a 32-bit integer, raises a
1433 ///    floating-point invalid exception. If the exception is masked, returns
1434 ///    the most negative integer.
1435 ///
1436 /// \headerfile <x86intrin.h>
1437 ///
1438 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1439 ///   instructions.
1440 ///
1441 /// \param __a
1442 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1443 ///    used in the conversion.
1444 /// \returns A 64-bit integer containing the converted value.
1445 static __inline__ long long __DEFAULT_FN_ATTRS
_mm_cvtss_si64(__m128 __a)1446 _mm_cvtss_si64(__m128 __a)
1447 {
1448   return __builtin_ia32_cvtss2si64((__v4sf)__a);
1449 }
1450 
1451 #endif
1452 
1453 /// Converts two low-order float values in a 128-bit vector of
1454 ///    [4 x float] into a 64-bit vector of [2 x i32].
1455 ///
1456 ///    If a converted value does not fit in a 32-bit integer, raises a
1457 ///    floating-point invalid exception. If the exception is masked, returns
1458 ///    the most negative integer.
1459 ///
1460 /// \headerfile <x86intrin.h>
1461 ///
1462 /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1463 ///
1464 /// \param __a
1465 ///    A 128-bit vector of [4 x float].
1466 /// \returns A 64-bit integer vector containing the converted values.
1467 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtps_pi32(__m128 __a)1468 _mm_cvtps_pi32(__m128 __a)
1469 {
1470   return __trunc64(__builtin_ia32_cvtps2dq((__v4sf)__zeroupper64(__a)));
1471 }
1472 
1473 /// Converts two low-order float values in a 128-bit vector of
1474 ///    [4 x float] into a 64-bit vector of [2 x i32].
1475 ///
1476 ///    If a converted value does not fit in a 32-bit integer, raises a
1477 ///    floating-point invalid exception. If the exception is masked, returns
1478 ///    the most negative integer.
1479 ///
1480 /// \headerfile <x86intrin.h>
1481 ///
1482 /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1483 ///
1484 /// \param __a
1485 ///    A 128-bit vector of [4 x float].
1486 /// \returns A 64-bit integer vector containing the converted values.
1487 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cvt_ps2pi(__m128 __a)1488 _mm_cvt_ps2pi(__m128 __a)
1489 {
1490   return _mm_cvtps_pi32(__a);
1491 }
1492 
1493 /// Converts the lower (first) element of a vector of [4 x float] into a signed
1494 ///    truncated (rounded toward zero) 32-bit integer.
1495 ///
1496 ///    If the converted value does not fit in a 32-bit integer, raises a
1497 ///    floating-point invalid exception. If the exception is masked, returns
1498 ///    the most negative integer.
1499 ///
1500 /// \headerfile <x86intrin.h>
1501 ///
1502 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1503 ///   instructions.
1504 ///
1505 /// \param __a
1506 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1507 ///    used in the conversion.
1508 /// \returns A 32-bit integer containing the converted value.
1509 static __inline__ int __DEFAULT_FN_ATTRS
_mm_cvttss_si32(__m128 __a)1510 _mm_cvttss_si32(__m128 __a)
1511 {
1512   return __builtin_ia32_cvttss2si((__v4sf)__a);
1513 }
1514 
1515 /// Converts the lower (first) element of a vector of [4 x float] into a signed
1516 ///    truncated (rounded toward zero) 32-bit integer.
1517 ///
1518 ///    If the converted value does not fit in a 32-bit integer, raises a
1519 ///    floating-point invalid exception. If the exception is masked, returns
1520 ///    the most negative integer.
1521 ///
1522 /// \headerfile <x86intrin.h>
1523 ///
1524 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1525 ///   instructions.
1526 ///
1527 /// \param __a
1528 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1529 ///    used in the conversion.
1530 /// \returns A 32-bit integer containing the converted value.
1531 static __inline__ int __DEFAULT_FN_ATTRS
_mm_cvtt_ss2si(__m128 __a)1532 _mm_cvtt_ss2si(__m128 __a)
1533 {
1534   return _mm_cvttss_si32(__a);
1535 }
1536 
1537 #ifdef __x86_64__
1538 /// Converts the lower (first) element of a vector of [4 x float] into a signed
1539 ///    truncated (rounded toward zero) 64-bit integer.
1540 ///
1541 ///    If the converted value does not fit in a 64-bit integer, raises a
1542 ///    floating-point invalid exception. If the exception is masked, returns
1543 ///    the most negative integer.
1544 ///
1545 /// \headerfile <x86intrin.h>
1546 ///
1547 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1548 ///   instructions.
1549 ///
1550 /// \param __a
1551 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1552 ///    used in the conversion.
1553 /// \returns A 64-bit integer containing the converted value.
1554 static __inline__ long long __DEFAULT_FN_ATTRS
_mm_cvttss_si64(__m128 __a)1555 _mm_cvttss_si64(__m128 __a)
1556 {
1557   return __builtin_ia32_cvttss2si64((__v4sf)__a);
1558 }
1559 #endif
1560 
1561 /// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
1562 ///    into two signed truncated (rounded toward zero) 32-bit integers,
1563 ///    returned in a 64-bit vector of [2 x i32].
1564 ///
1565 ///    If a converted value does not fit in a 32-bit integer, raises a
1566 ///    floating-point invalid exception. If the exception is masked, returns
1567 ///    the most negative integer.
1568 ///
1569 /// \headerfile <x86intrin.h>
1570 ///
1571 /// This intrinsic corresponds to the <c> CVTTPS2PI / VTTPS2PI </c>
1572 ///   instructions.
1573 ///
1574 /// \param __a
1575 ///    A 128-bit vector of [4 x float].
1576 /// \returns A 64-bit integer vector containing the converted values.
1577 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cvttps_pi32(__m128 __a)1578 _mm_cvttps_pi32(__m128 __a)
1579 {
1580   return __trunc64(__builtin_ia32_cvttps2dq((__v4sf)__zeroupper64(__a)));
1581 }
1582 
1583 /// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
1584 ///    into two signed truncated (rounded toward zero) 64-bit integers,
1585 ///    returned in a 64-bit vector of [2 x i32].
1586 ///
1587 ///    If a converted value does not fit in a 32-bit integer, raises a
1588 ///    floating-point invalid exception. If the exception is masked, returns
1589 ///    the most negative integer.
1590 ///
1591 /// \headerfile <x86intrin.h>
1592 ///
1593 /// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
1594 ///
1595 /// \param __a
1596 ///    A 128-bit vector of [4 x float].
1597 /// \returns A 64-bit integer vector containing the converted values.
1598 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtt_ps2pi(__m128 __a)1599 _mm_cvtt_ps2pi(__m128 __a)
1600 {
1601   return _mm_cvttps_pi32(__a);
1602 }
1603 
1604 /// Converts a 32-bit signed integer value into a floating point value
1605 ///    and writes it to the lower 32 bits of the destination. The remaining
1606 ///    higher order elements of the destination vector are copied from the
1607 ///    corresponding elements in the first operand.
1608 ///
1609 /// \headerfile <x86intrin.h>
1610 ///
1611 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1612 ///
1613 /// \param __a
1614 ///    A 128-bit vector of [4 x float].
1615 /// \param __b
1616 ///    A 32-bit signed integer operand containing the value to be converted.
1617 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1618 ///    converted value of the second operand. The upper 96 bits are copied from
1619 ///    the upper 96 bits of the first operand.
_mm_cvtsi32_ss(__m128 __a,int __b)1620 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvtsi32_ss(__m128 __a,
1621                                                                      int __b) {
1622   __a[0] = __b;
1623   return __a;
1624 }
1625 
1626 /// Converts a 32-bit signed integer value into a floating point value
1627 ///    and writes it to the lower 32 bits of the destination. The remaining
1628 ///    higher order elements of the destination are copied from the
1629 ///    corresponding elements in the first operand.
1630 ///
1631 /// \headerfile <x86intrin.h>
1632 ///
1633 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1634 ///
1635 /// \param __a
1636 ///    A 128-bit vector of [4 x float].
1637 /// \param __b
1638 ///    A 32-bit signed integer operand containing the value to be converted.
1639 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1640 ///    converted value of the second operand. The upper 96 bits are copied from
1641 ///    the upper 96 bits of the first operand.
_mm_cvt_si2ss(__m128 __a,int __b)1642 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cvt_si2ss(__m128 __a,
1643                                                                     int __b) {
1644   return _mm_cvtsi32_ss(__a, __b);
1645 }
1646 
1647 #ifdef __x86_64__
1648 
1649 /// Converts a 64-bit signed integer value into a floating point value
1650 ///    and writes it to the lower 32 bits of the destination. The remaining
1651 ///    higher order elements of the destination are copied from the
1652 ///    corresponding elements in the first operand.
1653 ///
1654 /// \headerfile <x86intrin.h>
1655 ///
1656 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1657 ///
1658 /// \param __a
1659 ///    A 128-bit vector of [4 x float].
1660 /// \param __b
1661 ///    A 64-bit signed integer operand containing the value to be converted.
1662 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1663 ///    converted value of the second operand. The upper 96 bits are copied from
1664 ///    the upper 96 bits of the first operand.
1665 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_cvtsi64_ss(__m128 __a,long long __b)1666 _mm_cvtsi64_ss(__m128 __a, long long __b) {
1667   __a[0] = __b;
1668   return __a;
1669 }
1670 
1671 #endif
1672 
1673 /// Converts two elements of a 64-bit vector of [2 x i32] into two
1674 ///    floating point values and writes them to the lower 64-bits of the
1675 ///    destination. The remaining higher order elements of the destination are
1676 ///    copied from the corresponding elements in the first operand.
1677 ///
1678 /// \headerfile <x86intrin.h>
1679 ///
1680 /// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1681 ///
1682 /// \param __a
1683 ///    A 128-bit vector of [4 x float].
1684 /// \param __b
1685 ///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
1686 ///    and written to the corresponding low-order elements in the destination.
1687 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1688 ///    converted value of the second operand. The upper 64 bits are copied from
1689 ///    the upper 64 bits of the first operand.
1690 static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtpi32_ps(__m128 __a,__m64 __b)1691 _mm_cvtpi32_ps(__m128 __a, __m64 __b)
1692 {
1693   return (__m128)__builtin_shufflevector(
1694       (__v4sf)__a,
1695       __builtin_convertvector((__v4si)__zext128(__b), __v4sf),
1696       4, 5, 2, 3);
1697 }
1698 
1699 /// Converts two elements of a 64-bit vector of [2 x i32] into two
1700 ///    floating point values and writes them to the lower 64-bits of the
1701 ///    destination. The remaining higher order elements of the destination are
1702 ///    copied from the corresponding elements in the first operand.
1703 ///
1704 /// \headerfile <x86intrin.h>
1705 ///
1706 /// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1707 ///
1708 /// \param __a
1709 ///    A 128-bit vector of [4 x float].
1710 /// \param __b
1711 ///    A 64-bit vector of [2 x i32]. The elements in this vector are converted
1712 ///    and written to the corresponding low-order elements in the destination.
1713 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1714 ///    converted value from the second operand. The upper 64 bits are copied
1715 ///    from the upper 64 bits of the first operand.
1716 static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvt_pi2ps(__m128 __a,__m64 __b)1717 _mm_cvt_pi2ps(__m128 __a, __m64 __b)
1718 {
1719   return _mm_cvtpi32_ps(__a, __b);
1720 }
1721 
1722 /// Extracts a float value contained in the lower 32 bits of a vector of
1723 ///    [4 x float].
1724 ///
1725 /// \headerfile <x86intrin.h>
1726 ///
1727 /// This intrinsic has no corresponding instruction.
1728 ///
1729 /// \param __a
1730 ///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1731 ///    used in the extraction.
1732 /// \returns A 32-bit float containing the extracted value.
1733 static __inline__ float __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_cvtss_f32(__m128 __a)1734 _mm_cvtss_f32(__m128 __a) {
1735   return __a[0];
1736 }
1737 
1738 /// Loads two packed float values from the address \a __p into the
1739 ///     high-order bits of a 128-bit vector of [4 x float]. The low-order bits
1740 ///     are copied from the low-order bits of the first operand.
1741 ///
1742 /// \headerfile <x86intrin.h>
1743 ///
1744 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1745 ///
1746 /// \param __a
1747 ///    A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
1748 ///    of the destination.
1749 /// \param __p
1750 ///    A pointer to two packed float values. Bits [63:0] are written to bits
1751 ///    [127:64] of the destination.
1752 /// \returns A 128-bit vector of [4 x float] containing the moved values.
1753 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_loadh_pi(__m128 __a,const __m64 * __p)1754 _mm_loadh_pi(__m128 __a, const __m64 *__p)
1755 {
1756   typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
1757   struct __mm_loadh_pi_struct {
1758     __mm_loadh_pi_v2f32 __u;
1759   } __attribute__((__packed__, __may_alias__));
1760   __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
1761   __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1762   return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
1763 }
1764 
1765 /// Loads two packed float values from the address \a __p into the
1766 ///    low-order bits of a 128-bit vector of [4 x float]. The high-order bits
1767 ///    are copied from the high-order bits of the first operand.
1768 ///
1769 /// \headerfile <x86intrin.h>
1770 ///
1771 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1772 ///
1773 /// \param __a
1774 ///    A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
1775 ///    [127:64] of the destination.
1776 /// \param __p
1777 ///    A pointer to two packed float values. Bits [63:0] are written to bits
1778 ///    [63:0] of the destination.
1779 /// \returns A 128-bit vector of [4 x float] containing the moved values.
1780 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_loadl_pi(__m128 __a,const __m64 * __p)1781 _mm_loadl_pi(__m128 __a, const __m64 *__p)
1782 {
1783   typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
1784   struct __mm_loadl_pi_struct {
1785     __mm_loadl_pi_v2f32 __u;
1786   } __attribute__((__packed__, __may_alias__));
1787   __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
1788   __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
1789   return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
1790 }
1791 
1792 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1793 ///    32 bits of the vector are initialized with the single-precision
1794 ///    floating-point value loaded from a specified memory location. The upper
1795 ///    96 bits are set to zero.
1796 ///
1797 /// \headerfile <x86intrin.h>
1798 ///
1799 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1800 ///
1801 /// \param __p
1802 ///    A pointer to a 32-bit memory location containing a single-precision
1803 ///    floating-point value.
1804 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1805 ///    lower 32 bits contain the value loaded from the memory location. The
1806 ///    upper 96 bits are set to zero.
1807 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_load_ss(const float * __p)1808 _mm_load_ss(const float *__p)
1809 {
1810   struct __mm_load_ss_struct {
1811     float __u;
1812   } __attribute__((__packed__, __may_alias__));
1813   float __u = ((const struct __mm_load_ss_struct*)__p)->__u;
1814   return __extension__ (__m128){ __u, 0, 0, 0 };
1815 }
1816 
1817 /// Loads a 32-bit float value and duplicates it to all four vector
1818 ///    elements of a 128-bit vector of [4 x float].
1819 ///
1820 /// \headerfile <x86intrin.h>
1821 ///
1822 /// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c>
1823 ///    instruction.
1824 ///
1825 /// \param __p
1826 ///    A pointer to a float value to be loaded and duplicated.
1827 /// \returns A 128-bit vector of [4 x float] containing the loaded and
1828 ///    duplicated values.
1829 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_load1_ps(const float * __p)1830 _mm_load1_ps(const float *__p)
1831 {
1832   struct __mm_load1_ps_struct {
1833     float __u;
1834   } __attribute__((__packed__, __may_alias__));
1835   float __u = ((const struct __mm_load1_ps_struct*)__p)->__u;
1836   return __extension__ (__m128){ __u, __u, __u, __u };
1837 }
1838 
/* Alternate spelling of _mm_load1_ps. */
#define _mm_load_ps1(p) _mm_load1_ps(p)
1840 
1841 /// Loads a 128-bit floating-point vector of [4 x float] from an aligned
1842 ///    memory location.
1843 ///
1844 /// \headerfile <x86intrin.h>
1845 ///
1846 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
1847 ///
1848 /// \param __p
1849 ///    A pointer to a 128-bit memory location. The address of the memory
1850 ///    location has to be 128-bit aligned.
1851 /// \returns A 128-bit vector of [4 x float] containing the loaded values.
1852 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_load_ps(const float * __p)1853 _mm_load_ps(const float *__p)
1854 {
1855   return *(const __m128*)__p;
1856 }
1857 
1858 /// Loads a 128-bit floating-point vector of [4 x float] from an
1859 ///    unaligned memory location.
1860 ///
1861 /// \headerfile <x86intrin.h>
1862 ///
1863 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
1864 ///
1865 /// \param __p
1866 ///    A pointer to a 128-bit memory location. The address of the memory
1867 ///    location does not have to be aligned.
1868 /// \returns A 128-bit vector of [4 x float] containing the loaded values.
1869 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_loadu_ps(const float * __p)1870 _mm_loadu_ps(const float *__p)
1871 {
1872   struct __loadu_ps {
1873     __m128_u __v;
1874   } __attribute__((__packed__, __may_alias__));
1875   return ((const struct __loadu_ps*)__p)->__v;
1876 }
1877 
1878 /// Loads four packed float values, in reverse order, from an aligned
1879 ///    memory location to 32-bit elements in a 128-bit vector of [4 x float].
1880 ///
1881 /// \headerfile <x86intrin.h>
1882 ///
1883 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
1884 ///    instruction.
1885 ///
1886 /// \param __p
1887 ///    A pointer to a 128-bit memory location. The address of the memory
1888 ///    location has to be 128-bit aligned.
1889 /// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
1890 ///    in reverse order.
1891 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_loadr_ps(const float * __p)1892 _mm_loadr_ps(const float *__p)
1893 {
1894   __m128 __a = _mm_load_ps(__p);
1895   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
1896 }
1897 
1898 /// Create a 128-bit vector of [4 x float] with undefined values.
1899 ///
1900 /// \headerfile <x86intrin.h>
1901 ///
1902 /// This intrinsic has no corresponding instruction.
1903 ///
1904 /// \returns A 128-bit vector of [4 x float] containing undefined values.
1905 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_undefined_ps(void)1906 _mm_undefined_ps(void)
1907 {
1908   return (__m128)__builtin_ia32_undef128();
1909 }
1910 
1911 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1912 ///    32 bits of the vector are initialized with the specified single-precision
1913 ///    floating-point value. The upper 96 bits are set to zero.
1914 ///
1915 /// \headerfile <x86intrin.h>
1916 ///
1917 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1918 ///
1919 /// \param __w
1920 ///    A single-precision floating-point value used to initialize the lower 32
1921 ///    bits of the result.
1922 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1923 ///    lower 32 bits contain the value provided in the source operand. The
1924 ///    upper 96 bits are set to zero.
1925 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set_ss(float __w)1926 _mm_set_ss(float __w) {
1927   return __extension__ (__m128){ __w, 0.0f, 0.0f, 0.0f };
1928 }
1929 
1930 /// Constructs a 128-bit floating-point vector of [4 x float], with each
1931 ///    of the four single-precision floating-point vector elements set to the
1932 ///    specified single-precision floating-point value.
1933 ///
1934 /// \headerfile <x86intrin.h>
1935 ///
1936 /// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1937 ///
1938 /// \param __w
1939 ///    A single-precision floating-point value used to initialize each vector
1940 ///    element of the result.
1941 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1942 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set1_ps(float __w)1943 _mm_set1_ps(float __w) {
1944   return __extension__ (__m128){ __w, __w, __w, __w };
1945 }
1946 
1947 /* Microsoft specific. */
1948 /// Constructs a 128-bit floating-point vector of [4 x float], with each
1949 ///    of the four single-precision floating-point vector elements set to the
1950 ///    specified single-precision floating-point value.
1951 ///
1952 /// \headerfile <x86intrin.h>
1953 ///
1954 /// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1955 ///
1956 /// \param __w
1957 ///    A single-precision floating-point value used to initialize each vector
1958 ///    element of the result.
1959 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1960 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set_ps1(float __w)1961 _mm_set_ps1(float __w) {
1962     return _mm_set1_ps(__w);
1963 }
1964 
1965 /// Constructs a 128-bit floating-point vector of [4 x float]
1966 ///    initialized with the specified single-precision floating-point values.
1967 ///
1968 /// \headerfile <x86intrin.h>
1969 ///
1970 /// This intrinsic is a utility function and does not correspond to a specific
1971 ///    instruction.
1972 ///
1973 /// \param __z
1974 ///    A single-precision floating-point value used to initialize bits [127:96]
1975 ///    of the result.
1976 /// \param __y
1977 ///    A single-precision floating-point value used to initialize bits [95:64]
1978 ///    of the result.
1979 /// \param __x
1980 ///    A single-precision floating-point value used to initialize bits [63:32]
1981 ///    of the result.
1982 /// \param __w
1983 ///    A single-precision floating-point value used to initialize bits [31:0]
1984 ///    of the result.
1985 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1986 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_set_ps(float __z,float __y,float __x,float __w)1987 _mm_set_ps(float __z, float __y, float __x, float __w) {
1988   return __extension__ (__m128){ __w, __x, __y, __z };
1989 }
1990 
1991 /// Constructs a 128-bit floating-point vector of [4 x float],
1992 ///    initialized in reverse order with the specified 32-bit single-precision
1993 ///    float-point values.
1994 ///
1995 /// \headerfile <x86intrin.h>
1996 ///
1997 /// This intrinsic is a utility function and does not correspond to a specific
1998 ///    instruction.
1999 ///
2000 /// \param __z
2001 ///    A single-precision floating-point value used to initialize bits [31:0]
2002 ///    of the result.
2003 /// \param __y
2004 ///    A single-precision floating-point value used to initialize bits [63:32]
2005 ///    of the result.
2006 /// \param __x
2007 ///    A single-precision floating-point value used to initialize bits [95:64]
2008 ///    of the result.
2009 /// \param __w
2010 ///    A single-precision floating-point value used to initialize bits [127:96]
2011 ///    of the result.
2012 /// \returns An initialized 128-bit floating-point vector of [4 x float].
2013 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_setr_ps(float __z,float __y,float __x,float __w)2014 _mm_setr_ps(float __z, float __y, float __x, float __w) {
2015   return __extension__ (__m128){ __z, __y, __x, __w };
2016 }
2017 
2018 /// Constructs a 128-bit floating-point vector of [4 x float] initialized
2019 ///    to zero.
2020 ///
2021 /// \headerfile <x86intrin.h>
2022 ///
2023 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
2024 ///
2025 /// \returns An initialized 128-bit floating-point vector of [4 x float] with
2026 ///    all elements set to zero.
2027 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_setzero_ps(void)2028 _mm_setzero_ps(void) {
2029   return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
2030 }
2031 
2032 /// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
2033 ///    memory location.
2034 ///
2035 /// \headerfile <x86intrin.h>
2036 ///
2037 /// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
2038 ///
2039 /// \param __p
2040 ///    A pointer to a 64-bit memory location.
2041 /// \param __a
2042 ///    A 128-bit vector of [4 x float] containing the values to be stored.
2043 static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeh_pi(__m64 * __p,__m128 __a)2044 _mm_storeh_pi(__m64 *__p, __m128 __a)
2045 {
2046   typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
2047   struct __mm_storeh_pi_struct {
2048     __mm_storeh_pi_v2f32 __u;
2049   } __attribute__((__packed__, __may_alias__));
2050   ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);
2051 }
2052 
2053 /// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
2054 ///     memory location.
2055 ///
2056 /// \headerfile <x86intrin.h>
2057 ///
2058 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
2059 ///
2060 /// \param __p
2061 ///    A pointer to a memory location that will receive the float values.
2062 /// \param __a
2063 ///    A 128-bit vector of [4 x float] containing the values to be stored.
2064 static __inline__ void __DEFAULT_FN_ATTRS
_mm_storel_pi(__m64 * __p,__m128 __a)2065 _mm_storel_pi(__m64 *__p, __m128 __a)
2066 {
2067   typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
2068   struct __mm_storeh_pi_struct {
2069     __mm_storeh_pi_v2f32 __u;
2070   } __attribute__((__packed__, __may_alias__));
2071   ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);
2072 }
2073 
2074 /// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
2075 ///     memory location.
2076 ///
2077 /// \headerfile <x86intrin.h>
2078 ///
2079 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
2080 ///
2081 /// \param __p
2082 ///    A pointer to a 32-bit memory location.
2083 /// \param __a
2084 ///    A 128-bit vector of [4 x float] containing the value to be stored.
2085 static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_ss(float * __p,__m128 __a)2086 _mm_store_ss(float *__p, __m128 __a)
2087 {
2088   struct __mm_store_ss_struct {
2089     float __u;
2090   } __attribute__((__packed__, __may_alias__));
2091   ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
2092 }
2093 
2094 /// Stores a 128-bit vector of [4 x float] to an unaligned memory
2095 ///    location.
2096 ///
2097 /// \headerfile <x86intrin.h>
2098 ///
2099 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
2100 ///
2101 /// \param __p
2102 ///    A pointer to a 128-bit memory location. The address of the memory
2103 ///    location does not have to be aligned.
2104 /// \param __a
2105 ///    A 128-bit vector of [4 x float] containing the values to be stored.
2106 static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeu_ps(float * __p,__m128 __a)2107 _mm_storeu_ps(float *__p, __m128 __a)
2108 {
2109   struct __storeu_ps {
2110     __m128_u __v;
2111   } __attribute__((__packed__, __may_alias__));
2112   ((struct __storeu_ps*)__p)->__v = __a;
2113 }
2114 
2115 /// Stores a 128-bit vector of [4 x float] into an aligned memory
2116 ///    location.
2117 ///
2118 /// \headerfile <x86intrin.h>
2119 ///
2120 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
2121 ///
2122 /// \param __p
2123 ///    A pointer to a 128-bit memory location. The address of the memory
2124 ///    location has to be 16-byte aligned.
2125 /// \param __a
2126 ///    A 128-bit vector of [4 x float] containing the values to be stored.
2127 static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_ps(float * __p,__m128 __a)2128 _mm_store_ps(float *__p, __m128 __a)
2129 {
2130   *(__m128*)__p = __a;
2131 }
2132 
2133 /// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2134 ///    four contiguous elements in an aligned memory location.
2135 ///
2136 /// \headerfile <x86intrin.h>
2137 ///
2138 /// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2139 ///    instruction.
2140 ///
2141 /// \param __p
2142 ///    A pointer to a 128-bit memory location.
2143 /// \param __a
2144 ///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2145 ///    of the four contiguous elements pointed by \a __p.
2146 static __inline__ void __DEFAULT_FN_ATTRS
_mm_store1_ps(float * __p,__m128 __a)2147 _mm_store1_ps(float *__p, __m128 __a)
2148 {
2149   __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
2150   _mm_store_ps(__p, __a);
2151 }
2152 
2153 /// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2154 ///    four contiguous elements in an aligned memory location.
2155 ///
2156 /// \headerfile <x86intrin.h>
2157 ///
2158 /// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2159 ///    instruction.
2160 ///
2161 /// \param __p
2162 ///    A pointer to a 128-bit memory location.
2163 /// \param __a
2164 ///    A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2165 ///    of the four contiguous elements pointed by \a __p.
2166 static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_ps1(float * __p,__m128 __a)2167 _mm_store_ps1(float *__p, __m128 __a)
2168 {
2169   _mm_store1_ps(__p, __a);
2170 }
2171 
2172 /// Stores float values from a 128-bit vector of [4 x float] to an
2173 ///    aligned memory location in reverse order.
2174 ///
2175 /// \headerfile <x86intrin.h>
2176 ///
2177 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
2178 ///    instruction.
2179 ///
2180 /// \param __p
2181 ///    A pointer to a 128-bit memory location. The address of the memory
2182 ///    location has to be 128-bit aligned.
2183 /// \param __a
2184 ///    A 128-bit vector of [4 x float] containing the values to be stored.
2185 static __inline__ void __DEFAULT_FN_ATTRS
_mm_storer_ps(float * __p,__m128 __a)2186 _mm_storer_ps(float *__p, __m128 __a)
2187 {
2188   __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
2189   _mm_store_ps(__p, __a);
2190 }
2191 
/* Prefetch hint selectors for _mm_prefetch. The non-MSVC _mm_prefetch macro
   in this header passes bit 2 of the selector as the second argument of
   __builtin_prefetch and bits [1:0] as the third. */
#define _MM_HINT_NTA 0
#define _MM_HINT_T2  1
#define _MM_HINT_T1  2
#define _MM_HINT_T0  3
#define _MM_HINT_ET1 6
#define _MM_HINT_ET0 7
2198 
2199 #ifndef _MSC_VER
// If _MSC_VER is defined, we use the builtin variant of _mm_prefetch.
// Otherwise, we provide this macro, which includes a cast, allowing the user
// to pass a pointer of any type. The builtin _mm_prefetch accepts char to
// match MSVC.
2203 
/// Loads one cache line of data from the specified address to a location
///    closer to the processor.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _mm_prefetch(const void *a, const int sel);
/// \endcode
///
/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
///
/// \param a
///    A pointer to a memory location containing a cache line of data. The
///    macro casts it to <c> const void * </c>, so any pointer type is
///    accepted.
/// \param sel
///    A predefined integer constant specifying the type of prefetch
///    operation: \n
///    _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
///    PREFETCHNTA instruction will be generated. \n
///    _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
///    be generated. \n
///    _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
///    be generated. \n
///    _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
///    be generated. \n
///    Bit 2 of \a sel is forwarded as the second argument of
///    __builtin_prefetch and bits [1:0] as the third.
#define _mm_prefetch(a, sel)                                                   \
  (__builtin_prefetch((const void *)(a), ((sel) >> 2) & 1, (sel) & 3))
2230 #endif
2231 
2232 /// Stores a 64-bit integer in the specified aligned memory location. To
2233 ///    minimize caching, the data is flagged as non-temporal (unlikely to be
2234 ///    used again soon).
2235 ///
2236 /// \headerfile <x86intrin.h>
2237 ///
2238 /// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
2239 ///
2240 /// \param __p
2241 ///    A pointer to an aligned memory location that receives the 64-bit value.
2242 /// \param __a
2243 ///    A 64-bit integer vector containing the value to be stored.
2244 static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_pi(void * __p,__m64 __a)2245 _mm_stream_pi(void *__p, __m64 __a)
2246 {
2247   __builtin_nontemporal_store(__a, (__m64 *)__p); // __p is accessed as an __m64 object
2248 }
2249 
2250 /// Moves packed float values from a 128-bit vector of [4 x float] to a
2251 ///    128-bit aligned memory location. To minimize caching, the data is flagged
2252 ///    as non-temporal (unlikely to be used again soon).
2253 ///
2254 /// \headerfile <x86intrin.h>
2255 ///
2256 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
2257 ///
2258 /// \param __p
2259 ///    A pointer to a 128-bit (16-byte) aligned memory location that will
2260 ///    receive the four single-precision floating-point values.
2261 /// \param __a
2262 ///    A 128-bit vector of [4 x float] containing the values to be moved.
2263 static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_ps(void * __p,__m128 __a)2264 _mm_stream_ps(void *__p, __m128 __a)
2265 {
2266   __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p); // __p is accessed as a __v4sf object
2267 }
2268 
2269 #if defined(__cplusplus)
2270 extern "C" {
2271 #endif
2272 
2273 /// Forces strong memory ordering (serialization) between store
2274 ///    instructions preceding this instruction and store instructions following
2275 ///    this instruction, ensuring the system completes all previous stores
2276 ///    before executing subsequent stores.
2277 ///
2278 ///    Only a prototype with C linkage is given here; no function body is
2279 ///    provided at this point in the header.
2280 ///
2281 /// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
2282 void _mm_sfence(void);
2283 
2284 #if defined(__cplusplus)
2285 } // extern "C"
2286 #endif
2287 
2288 /// Extracts a 16-bit element from a 64-bit vector of [4 x i16] and
2289 ///    returns it, as specified by the immediate integer operand.
2290 ///
2291 /// \headerfile <x86intrin.h>
2292 ///
2293 /// \code
2294 /// int _mm_extract_pi16(__m64 a, int n);
2295 /// \endcode
2296 ///
2297 /// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
2298 ///
2299 /// \param a
2300 ///    A 64-bit vector of [4 x i16].
2301 /// \param n
2302 ///    An immediate integer operand that determines which bits are extracted: \n
2303 ///    0: Bits [15:0] are copied to the destination. \n
2304 ///    1: Bits [31:16] are copied to the destination. \n
2305 ///    2: Bits [47:32] are copied to the destination. \n
2306 ///    3: Bits [63:48] are copied to the destination.
2307 /// \returns The extracted 16-bit element, zero-extended to an int.
2308 #define _mm_extract_pi16(a, n) \
2309   ((int)(unsigned short)__builtin_ia32_vec_ext_v4hi((__v4hi)(a), (int)(n)))
2310 
2311 /// Copies data from the 64-bit vector of [4 x i16] to the destination,
2312 ///    and inserts the lower 16-bits of an integer operand at the 16-bit offset
2313 ///    specified by the immediate operand \a n.
2314 ///
2315 /// \headerfile <x86intrin.h>
2316 ///
2317 /// \code
2318 /// __m64 _mm_insert_pi16(__m64 a, int d, int n);
2319 /// \endcode
2320 ///
2321 /// This intrinsic corresponds to the <c> PINSRW </c> instruction.
2322 ///
2323 /// \param a
2324 ///    A 64-bit vector of [4 x i16].
2325 /// \param d
2326 ///    An integer. The lower 16-bit value from this operand is written to the
2327 ///    destination at the offset specified by operand \a n.
2328 /// \param n
2329 ///    An immediate integer operand that determines which bits of the
2330 ///    destination receive the lower 16 bits of \a d: \n
2331 ///    0: Bits [15:0] of the destination are written. \n
2332 ///    1: Bits [31:16] of the destination are written. \n
2333 ///    2: Bits [47:32] of the destination are written. \n
2334 ///    3: Bits [63:48] of the destination are written. \n
2335 ///    The remaining bits in the destination are copied from the corresponding
2336 ///    bits in operand \a a.
2337 /// \returns A 64-bit integer vector containing the copied packed data from the
2338 ///    operands.
2339 #define _mm_insert_pi16(a, d, n) \
2340   ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)(a), (int)(d), (int)(n)))
2341 
2342 /// Compares each of the corresponding packed 16-bit integer values of
2343 ///    the 64-bit integer vectors, and writes the greater value to the
2344 ///    corresponding bits in the destination.
2345 ///
2346 /// \headerfile <x86intrin.h>
2347 ///
2348 /// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
2349 ///
2350 /// \param __a
2351 ///    A 64-bit integer vector containing one of the source operands.
2352 /// \param __b
2353 ///    A 64-bit integer vector containing one of the source operands.
2354 /// \returns A 64-bit integer vector holding the greater value of each pair
///    of corresponding elements.
2355 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_max_pi16(__m64 __a,__m64 __b)2356 _mm_max_pi16(__m64 __a, __m64 __b)
2357 {
2358   return (__m64)__builtin_elementwise_max((__v4hi)__a, (__v4hi)__b); // both operands viewed as __v4hi
2359 }
2360 
2361 /// Compares each of the corresponding packed 8-bit unsigned integer
2362 ///    values of the 64-bit integer vectors, and writes the greater value to the
2363 ///    corresponding bits in the destination.
2364 ///
2365 /// \headerfile <x86intrin.h>
2366 ///
2367 /// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
2368 ///
2369 /// \param __a
2370 ///    A 64-bit integer vector containing one of the source operands.
2371 /// \param __b
2372 ///    A 64-bit integer vector containing one of the source operands.
2373 /// \returns A 64-bit integer vector holding the greater value of each pair
///    of corresponding elements.
2374 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_max_pu8(__m64 __a,__m64 __b)2375 _mm_max_pu8(__m64 __a, __m64 __b)
2376 {
2377   return (__m64)__builtin_elementwise_max((__v8qu)__a, (__v8qu)__b); // both operands viewed as __v8qu
2378 }
2379 
2380 /// Compares each of the corresponding packed 16-bit integer values of
2381 ///    the 64-bit integer vectors, and writes the lesser value to the
2382 ///    corresponding bits in the destination.
2383 ///
2384 /// \headerfile <x86intrin.h>
2385 ///
2386 /// This intrinsic corresponds to the <c> PMINSW </c> instruction.
2387 ///
2388 /// \param __a
2389 ///    A 64-bit integer vector containing one of the source operands.
2390 /// \param __b
2391 ///    A 64-bit integer vector containing one of the source operands.
2392 /// \returns A 64-bit integer vector holding the lesser value of each pair
///    of corresponding elements.
2393 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_min_pi16(__m64 __a,__m64 __b)2394 _mm_min_pi16(__m64 __a, __m64 __b)
2395 {
2396   return (__m64)__builtin_elementwise_min((__v4hi)__a, (__v4hi)__b); // both operands viewed as __v4hi
2397 }
2398 
2399 /// Compares each of the corresponding packed 8-bit unsigned integer
2400 ///    values of the 64-bit integer vectors, and writes the lesser value to the
2401 ///    corresponding bits in the destination.
2402 ///
2403 /// \headerfile <x86intrin.h>
2404 ///
2405 /// This intrinsic corresponds to the <c> PMINUB </c> instruction.
2406 ///
2407 /// \param __a
2408 ///    A 64-bit integer vector containing one of the source operands.
2409 /// \param __b
2410 ///    A 64-bit integer vector containing one of the source operands.
2411 /// \returns A 64-bit integer vector holding the lesser value of each pair
///    of corresponding elements.
2412 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_min_pu8(__m64 __a,__m64 __b)2413 _mm_min_pu8(__m64 __a, __m64 __b)
2414 {
2415   return (__m64)__builtin_elementwise_min((__v8qu)__a, (__v8qu)__b); // both operands viewed as __v8qu
2416 }
2417 
2418 /// Takes the most significant bit from each 8-bit element in a 64-bit
2419 ///    integer vector to create an 8-bit mask value. Zero-extends the value to
2420 ///    32-bit integer and writes it to the destination.
2421 ///
2422 /// \headerfile <x86intrin.h>
2423 ///
2424 /// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
2425 ///
2426 /// \param __a
2427 ///    A 64-bit integer vector containing the values with bits to be extracted.
2428 /// \returns The most significant bit from each 8-bit element in \a __a,
2429 ///    written to bits [7:0].
2430 static __inline__ int __DEFAULT_FN_ATTRS_SSE2
_mm_movemask_pi8(__m64 __a)2431 _mm_movemask_pi8(__m64 __a)
2432 {
  // The 64-bit input is widened so that the 128-bit builtin can be used.
  // NOTE(review): __zext128 is defined elsewhere in this header; presumably it
  // zero-fills the upper 64 bits so that mask bits [15:8] are 0 - confirm.
2433   return __builtin_ia32_pmovmskb128((__v16qi)__zext128(__a));
2434 }
2435 
2436 /// Multiplies packed 16-bit unsigned integer values and writes the
2437 ///    high-order 16 bits of each 32-bit product to the corresponding bits in
2438 ///    the destination.
2439 ///
2440 /// \headerfile <x86intrin.h>
2441 ///
2442 /// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
2443 ///
2444 /// \param __a
2445 ///    A 64-bit integer vector containing one of the source operands.
2446 /// \param __b
2447 ///    A 64-bit integer vector containing one of the source operands.
2448 /// \returns A 64-bit integer vector containing the high-order 16 bits of each
///    product.
2449 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_mulhi_pu16(__m64 __a,__m64 __b)2450 _mm_mulhi_pu16(__m64 __a, __m64 __b)
2451 {
  // Both inputs are widened, multiplied with the 128-bit builtin, and the
  // result is narrowed back to 64 bits.
  // NOTE(review): __anyext128 and __trunc64 are defined elsewhere in this
  // header; presumably the upper half after __anyext128 is unspecified and
  // __trunc64 keeps the low 64 bits - confirm.
2452   return __trunc64(__builtin_ia32_pmulhuw128((__v8hi)__anyext128(__a),
2453                                              (__v8hi)__anyext128(__b)));
2454 }
2455 
2456 /// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
2457 ///    destination, as specified by the immediate value operand.
2458 ///
2459 /// \headerfile <x86intrin.h>
2460 ///
2461 /// \code
2462 /// __m64 _mm_shuffle_pi16(__m64 a, const int n);
2463 /// \endcode
2464 ///
2465 /// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
2466 ///
2467 /// \param a
2468 ///    A 64-bit integer vector containing the values to be shuffled.
2469 /// \param n
2470 ///    An immediate value containing an 8-bit value specifying which elements to
2471 ///    copy from \a a. The destinations within the 64-bit destination are
2472 ///    assigned values as follows: \n
2473 ///    Bits [1:0] are used to assign values to bits [15:0] in the
2474 ///    destination. \n
2475 ///    Bits [3:2] are used to assign values to bits [31:16] in the
2476 ///    destination. \n
2477 ///    Bits [5:4] are used to assign values to bits [47:32] in the
2478 ///    destination. \n
2479 ///    Bits [7:6] are used to assign values to bits [63:48] in the
2480 ///    destination. \n
2481 ///    Bit value assignments: \n
2482 ///    00: assigned from bits [15:0] of \a a. \n
2483 ///    01: assigned from bits [31:16] of \a a. \n
2484 ///    10: assigned from bits [47:32] of \a a. \n
2485 ///    11: assigned from bits [63:48] of \a a. \n
2486 ///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
2487 ///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
2488 ///    <c>[b6, b4, b2, b0]</c>. \n
///    Each 2-bit field is masked with 0x3 before use, so every shuffle index
///    lies in 0..3 and only elements of \a a are ever selected; the empty
///    second operand of the shuffle is never read.
2489 /// \returns A 64-bit integer vector containing the shuffled values.
2490 #define _mm_shuffle_pi16(a, n)                                                 \
2491   ((__m64)__builtin_shufflevector((__v4hi)(__m64)(a), __extension__(__v4hi){}, \
2492                                   (n) & 0x3, ((n) >> 2) & 0x3,                 \
2493                                   ((n) >> 4) & 0x3, ((n) >> 6) & 0x3))
2494 
2495 /// Conditionally copies the values from each 8-bit element in the first
2496 ///    64-bit integer vector operand to the specified memory location, as
2497 ///    specified by the most significant bit in the corresponding element in the
2498 ///    second 64-bit integer vector operand.
2499 ///
2500 ///    To minimize caching, the data is flagged as non-temporal
2501 ///    (unlikely to be used again soon).
2502 ///
2503 /// \headerfile <x86intrin.h>
2504 ///
2505 /// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
2506 ///
2507 /// \param __d
2508 ///    A 64-bit integer vector containing the values with elements to be copied.
2509 /// \param __n
2510 ///    A 64-bit integer vector operand. The most significant bit from each 8-bit
2511 ///    element determines whether the corresponding element in operand \a __d
2512 ///    is copied. If the most significant bit of a given element is 1, the
2513 ///    corresponding element in operand \a __d is copied.
2514 /// \param __p
2515 ///    A pointer to a 64-bit memory location that will receive the conditionally
2516 ///    copied integer values. The address of the memory location does not have
2517 ///    to be aligned.
2518 static __inline__ void __DEFAULT_FN_ATTRS_SSE2
_mm_maskmove_si64(__m64 __d,__m64 __n,char * __p)2519 _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
2520 {
2521   // This is complex, because we need to support the case where __p is pointing
2522   // within the last 15 to 8 bytes of a page. In that case, using a 128-bit
2523   // write might cause a trap where a 64-bit maskmovq would not. (Memory
2524   // locations not selected by the mask bits might still cause traps.)
  // The operation is carried out with the 128-bit maskmovdqu builtin, so data
  // and mask are first widened to 128 bits.
2525   __m128i __d128  = __anyext128(__d);
2526   __m128i __n128  = __zext128(__n);
  // The 0xfff mask and the constant 4096 assume 4 KiB pages. Page offsets
  // 4081..4088 are exactly those where the 8 bytes at __p stay inside the page
  // but the 16 bytes at __p would run past its end.
2527   if (((__SIZE_TYPE__)__p & 0xfff) >= 4096-15 &&
2528       ((__SIZE_TYPE__)__p & 0xfff) <= 4096-8) {
2529     // If there's a risk of spurious trap due to a 128-bit write, back up the
2530     // pointer by 8 bytes and shift values in registers to match.
2531     __p -= 8; // the 16-byte window now ends where the original 8 bytes end
2532     __d128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__d128, 8); // data moved up by 8 bytes
2533     __n128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__n128, 8); // mask moved up by 8 bytes
2534   }
2535 
2536   __builtin_ia32_maskmovdqu((__v16qi)__d128, (__v16qi)__n128, __p);
2537 }
2538 
2539 /// Computes the rounded averages of the packed unsigned 8-bit integer
2540 ///    values and writes the averages to the corresponding bits in the
2541 ///    destination.
2542 ///
2543 /// \headerfile <x86intrin.h>
2544 ///
2545 /// This intrinsic corresponds to the <c> PAVGB </c> instruction.
2546 ///
2547 /// \param __a
2548 ///    A 64-bit integer vector containing one of the source operands.
2549 /// \param __b
2550 ///    A 64-bit integer vector containing one of the source operands.
2551 /// \returns A 64-bit integer vector containing the averages of both operands.
2552 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_avg_pu8(__m64 __a,__m64 __b)2553 _mm_avg_pu8(__m64 __a, __m64 __b)
2554 {
  // Inputs are widened, averaged with the 128-bit builtin, then narrowed.
  // NOTE(review): __anyext128 and __trunc64 are defined elsewhere in this
  // header; presumably __trunc64 keeps the low 64 bits - confirm.
2555   return __trunc64(__builtin_ia32_pavgb128((__v16qi)__anyext128(__a),
2556                                            (__v16qi)__anyext128(__b)));
2557 }
2558 
2559 /// Computes the rounded averages of the packed unsigned 16-bit integer
2560 ///    values and writes the averages to the corresponding bits in the
2561 ///    destination.
2562 ///
2563 /// \headerfile <x86intrin.h>
2564 ///
2565 /// This intrinsic corresponds to the <c> PAVGW </c> instruction.
2566 ///
2567 /// \param __a
2568 ///    A 64-bit integer vector containing one of the source operands.
2569 /// \param __b
2570 ///    A 64-bit integer vector containing one of the source operands.
2571 /// \returns A 64-bit integer vector containing the averages of both operands.
2572 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_avg_pu16(__m64 __a,__m64 __b)2573 _mm_avg_pu16(__m64 __a, __m64 __b)
2574 {
  // Inputs are widened, averaged with the 128-bit builtin, then narrowed.
  // NOTE(review): __anyext128 and __trunc64 are defined elsewhere in this
  // header; presumably __trunc64 keeps the low 64 bits - confirm.
2575   return __trunc64(__builtin_ia32_pavgw128((__v8hi)__anyext128(__a),
2576                                            (__v8hi)__anyext128(__b)));
2577 }
2578 
2579 /// Subtracts the corresponding 8-bit unsigned integer values of the two
2580 ///    64-bit vector operands and computes the absolute value of each
2581 ///    difference. The sum of the 8 absolute differences is written to
2582 ///    bits [15:0] of the destination; the remaining bits [63:16] are cleared.
2583 ///
2584 /// \headerfile <x86intrin.h>
2585 ///
2586 /// This intrinsic corresponds to the <c> PSADBW </c> instruction.
2587 ///
2588 /// \param __a
2589 ///    A 64-bit integer vector containing one of the source operands.
2590 /// \param __b
2591 ///    A 64-bit integer vector containing one of the source operands.
2592 /// \returns A 64-bit integer vector whose lower 16 bits contain the sum of the
2593 ///    absolute differences between both operands. The upper bits are
2594 ///    cleared.
2595 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_sad_pu8(__m64 __a,__m64 __b)2596 _mm_sad_pu8(__m64 __a, __m64 __b)
2597 {
  // NOTE(review): __zext128 and __trunc64 are defined elsewhere in this
  // header; presumably both inputs get a zero-filled upper half and the low
  // 64 bits of the 128-bit result are returned - confirm.
2598   return __trunc64(__builtin_ia32_psadbw128((__v16qi)__zext128(__a),
2599                                             (__v16qi)__zext128(__b)));
2600 }
2601 
2602 #if defined(__cplusplus)
2603 extern "C" {
2604 #endif
2605 
2606 /// Returns the contents of the MXCSR register as a 32-bit unsigned
2607 ///    integer value.
2608 ///
2609 ///    There are several groups of macros associated with this
2610 ///    intrinsic, including:
2611 ///    <ul>
2612 ///    <li>
2613 ///      For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
2614 ///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
2615 ///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
2616 ///      _MM_GET_EXCEPTION_STATE().
2617 ///    </li>
2618 ///    <li>
2619 ///      For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
2620 ///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
2621 ///      There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
2622 ///    </li>
2623 ///    <li>
2624 ///      For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
2625 ///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
2626 ///      _MM_GET_ROUNDING_MODE().
2627 ///    </li>
2628 ///    <li>
2629 ///      For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
2630 ///      There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
2631 ///    </li>
2632 ///    <li>
2633 ///      For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
2634 ///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
2635 ///      _MM_GET_DENORMALS_ZERO_MODE().
2636 ///    </li>
2637 ///    </ul>
2638 ///
2639 ///    For example, the following expression checks if an overflow exception has
2640 ///    occurred:
2641 ///    \code
2642 ///      ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
2643 ///    \endcode
2644 ///
2645 ///    The following expression gets the current rounding mode:
2646 ///    \code
2647 ///      _MM_GET_ROUNDING_MODE()
2648 ///    \endcode
2649 ///
2650 /// \headerfile <x86intrin.h>
2651 ///
2652 /// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
2653 ///
2654 /// \returns A 32-bit unsigned integer containing the contents of the MXCSR
2655 ///    register.
2656 unsigned int _mm_getcsr(void); // prototype only, C linkage
2657 
2658 /// Sets the MXCSR register with the 32-bit unsigned integer value.
2659 ///
2660 ///    There are several groups of macros associated with this intrinsic,
2661 ///    including:
2662 ///    <ul>
2663 ///    <li>
2664 ///      For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
2665 ///      _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
2666 ///      _MM_EXCEPT_INEXACT. There is a convenience wrapper
2667 ///      _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
2668 ///    </li>
2669 ///    <li>
2670 ///      For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
2671 ///      _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
2672 ///      There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
2673 ///      of these macros.
2674 ///    </li>
2675 ///    <li>
2676 ///      For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
2677 ///      _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
2678 ///      _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
2679 ///    </li>
2680 ///    <li>
2681 ///      For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
2682 ///      There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
2683 ///      one of these macros.
2684 ///    </li>
2685 ///    <li>
2686 ///      For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
2687 ///      _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
2688 ///      _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
2689 ///    </li>
2690 ///    </ul>
2691 ///
2692 ///    For example, the following expression causes subsequent floating-point
2693 ///    operations to round up:
///    \code
2694 ///      _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP)
///    \endcode
2695 ///
2696 ///    The following example sets the DAZ and FTZ flags:
2697 ///    \code
2698 ///    void setFlags() {
2699 ///      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
2700 ///      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
2701 ///    }
2702 ///    \endcode
2703 ///
2704 /// \headerfile <x86intrin.h>
2705 ///
2706 /// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
2707 ///
2708 /// \param __i
2709 ///    A 32-bit unsigned integer value to be written to the MXCSR register.
2710 void _mm_setcsr(unsigned int __i); // prototype only, C linkage
2711 
2712 #if defined(__cplusplus)
2713 } // extern "C"
2714 #endif
2715 
2716 /// Selects 4 float values from the 128-bit operands of [4 x float], as
2717 ///    specified by the immediate value operand.
2718 ///
2719 /// \headerfile <x86intrin.h>
2720 ///
2721 /// \code
2722 /// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
2723 /// \endcode
2724 ///
2725 /// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
2726 ///
2727 /// \param a
2728 ///    A 128-bit vector of [4 x float].
2729 /// \param b
2730 ///    A 128-bit vector of [4 x float].
2731 /// \param mask
2732 ///    An immediate value containing an 8-bit value specifying which elements to
2733 ///    copy from \a a and \a b. \n
2734 ///    Bits [3:0] specify the values copied from operand \a a. \n
2735 ///    Bits [7:4] specify the values copied from operand \a b. \n
2736 ///    The destinations within the 128-bit destination are assigned values as
2737 ///    follows: \n
2738 ///    Bits [1:0] are used to assign values to bits [31:0] in the
2739 ///    destination. \n
2740 ///    Bits [3:2] are used to assign values to bits [63:32] in the
2741 ///    destination. \n
2742 ///    Bits [5:4] are used to assign values to bits [95:64] in the
2743 ///    destination. \n
2744 ///    Bits [7:6] are used to assign values to bits [127:96] in the
2745 ///    destination. \n
2746 ///    Bit value assignments: \n
2747 ///    00: Bits [31:0] copied from the specified operand. \n
2748 ///    01: Bits [63:32] copied from the specified operand. \n
2749 ///    10: Bits [95:64] copied from the specified operand. \n
2750 ///    11: Bits [127:96] copied from the specified operand. \n
2751 ///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
2752 ///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
2753 ///    <c>[b6, b4, b2, b0]</c>. \n
///    The macro casts \a mask to int before handing it to the builtin.
2754 /// \returns A 128-bit vector of [4 x float] containing the shuffled values.
2755 #define _mm_shuffle_ps(a, b, mask) \
2756   ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
2757                                  (int)(mask)))
2758 
2759 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
2760 ///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2761 ///
2762 /// \headerfile <x86intrin.h>
2763 ///
2764 /// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
2765 ///
2766 /// \param __a
2767 ///    A 128-bit vector of [4 x float]. \n
2768 ///    Bits [95:64] are written to bits [31:0] of the destination. \n
2769 ///    Bits [127:96] are written to bits [95:64] of the destination.
2770 /// \param __b
2771 ///    A 128-bit vector of [4 x float]. \n
2772 ///    Bits [95:64] are written to bits [63:32] of the destination. \n
2773 ///    Bits [127:96] are written to bits [127:96] of the destination.
2774 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2775 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_unpackhi_ps(__m128 __a,__m128 __b)2776 _mm_unpackhi_ps(__m128 __a, __m128 __b) {
  // Shuffle indices 0-3 select from __a and 4-7 from __b, so the result is
  // { __a[2], __b[2], __a[3], __b[3] }.
2777   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
2778 }
2779 
2780 /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
2781 ///    [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2782 ///
2783 /// \headerfile <x86intrin.h>
2784 ///
2785 /// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
2786 ///
2787 /// \param __a
2788 ///    A 128-bit vector of [4 x float]. \n
2789 ///    Bits [31:0] are written to bits [31:0] of the destination. \n
2790 ///    Bits [63:32] are written to bits [95:64] of the destination.
2791 /// \param __b
2792 ///    A 128-bit vector of [4 x float]. \n
2793 ///    Bits [31:0] are written to bits [63:32] of the destination. \n
2794 ///    Bits [63:32] are written to bits [127:96] of the destination.
2795 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
2796 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_unpacklo_ps(__m128 __a,__m128 __b)2797 _mm_unpacklo_ps(__m128 __a, __m128 __b) {
  // Shuffle indices 0-3 select from __a and 4-7 from __b, so the result is
  // { __a[0], __b[0], __a[1], __b[1] }.
2798   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
2799 }
2800 
2801 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2802 ///    32 bits are set to the lower 32 bits of the second parameter. The upper
2803 ///    96 bits are set to the upper 96 bits of the first parameter.
2804 ///
2805 /// \headerfile <x86intrin.h>
2806 ///
2807 /// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c>
2808 ///    instruction.
2809 ///
2810 /// \param __a
2811 ///    A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
2812 ///    written to the upper 96 bits of the result.
2813 /// \param __b
2814 ///    A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
2815 ///    written to the lower 32 bits of the result.
2816 /// \returns A 128-bit floating-point vector of [4 x float] equal to
///    { __b[0], __a[1], __a[2], __a[3] }.
2817 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_move_ss(__m128 __a,__m128 __b)2818 _mm_move_ss(__m128 __a, __m128 __b) {
2819   __a[0] = __b[0]; // __a is a by-value copy; only element 0 is overwritten
2820   return __a;
2821 }
2822 
2823 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2824 ///    64 bits are set to the upper 64 bits of the second parameter. The upper
2825 ///    64 bits are set to the upper 64 bits of the first parameter.
2826 ///
2827 /// \headerfile <x86intrin.h>
2828 ///
2829 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
2830 ///
2831 /// \param __a
2832 ///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2833 ///    written to the upper 64 bits of the result.
2834 /// \param __b
2835 ///    A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2836 ///    written to the lower 64 bits of the result.
2837 /// \returns A 128-bit floating-point vector of [4 x float].
2838 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_movehl_ps(__m128 __a,__m128 __b)2839 _mm_movehl_ps(__m128 __a, __m128 __b) {
  // Shuffle indices 0-3 select from __a and 4-7 from __b, so the result is
  // { __b[2], __b[3], __a[2], __a[3] }.
2840   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
2841 }
2842 
2843 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2844 ///    64 bits are set to the lower 64 bits of the first parameter. The upper
2845 ///    64 bits are set to the lower 64 bits of the second parameter.
2846 ///
2847 /// \headerfile <x86intrin.h>
2848 ///
2849 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
2850 ///
2851 /// \param __a
2852 ///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2853 ///    written to the lower 64 bits of the result.
2854 /// \param __b
2855 ///    A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2856 ///    written to the upper 64 bits of the result.
2857 /// \returns A 128-bit floating-point vector of [4 x float].
2858 static __inline__ __m128 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_movelh_ps(__m128 __a,__m128 __b)2859 _mm_movelh_ps(__m128 __a, __m128 __b) {
  // Shuffle indices 0-3 select from __a and 4-7 from __b, so the result is
  // { __a[0], __a[1], __b[0], __b[1] }.
2860   return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
2861 }
2862 
2863 /// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
2864 ///    float].
2865 ///
2866 /// \headerfile <x86intrin.h>
2867 ///
2868 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2869 ///
2870 /// \param __a
2871 ///    A 64-bit vector of [4 x i16]. Each element of the destination is
2872 ///    converted from the corresponding element in this operand.
2873 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2874 ///    values from the operand.
2875 static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtpi16_ps(__m64 __a)2876 _mm_cvtpi16_ps(__m64 __a)
2877 {
2878   return __builtin_convertvector((__v4hi)__a, __v4sf); // element-wise conversion of the __v4hi view
2879 }
2880 
2881 /// Converts a 64-bit vector of 16-bit unsigned integer values into a
2882 ///    128-bit vector of [4 x float].
2883 ///
2884 /// \headerfile <x86intrin.h>
2885 ///
2886 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2887 ///
2888 /// \param __a
2889 ///    A 64-bit vector of 16-bit unsigned integer values. Each element of the
2890 ///    destination is converted from the corresponding element in this operand.
2891 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2892 ///    values from the operand.
2893 static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtpu16_ps(__m64 __a)2894 _mm_cvtpu16_ps(__m64 __a)
2895 {
2896   return __builtin_convertvector((__v4hu)__a, __v4sf); // element-wise conversion of the __v4hu view
2897 }
2898 
2899 /// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
2900 ///    into a 128-bit vector of [4 x float].
2901 ///
2902 /// \headerfile <x86intrin.h>
2903 ///
2904 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2905 ///
2906 /// \param __a
2907 ///    A 64-bit vector of [8 x i8]. The elements of the destination are
2908 ///    converted from the corresponding lower 4 elements in this operand.
2909 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2910 ///    values from the operand.
2911 static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtpi8_ps(__m64 __a)2912 _mm_cvtpi8_ps(__m64 __a)
2913 {
  // The shuffle keeps elements 0..3 of __a (the empty second operand is never
  // selected); the resulting 4-element vector is then converted to float.
2914   return __builtin_convertvector(
2915       __builtin_shufflevector((__v8qs)__a, __extension__ (__v8qs){},
2916                               0, 1, 2, 3), __v4sf);
2917 }
2918 
2919 /// Converts the lower four unsigned 8-bit integer values from a 64-bit
2920 ///    vector of [8 x u8] into a 128-bit vector of [4 x float].
2921 ///
2922 /// \headerfile <x86intrin.h>
2923 ///
2924 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2925 ///
2926 /// \param __a
2927 ///    A 64-bit vector of unsigned 8-bit integer values. The elements of the
2928 ///    destination are converted from the corresponding lower 4 elements in
2929 ///    this operand.
2930 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2931 ///    values from the source operand.
2932 static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtpu8_ps(__m64 __a)2933 _mm_cvtpu8_ps(__m64 __a)
2934 {
  // The shuffle keeps elements 0..3 of __a (the empty second operand is never
  // selected); the resulting 4-element vector is then converted to float.
2935   return __builtin_convertvector(
2936       __builtin_shufflevector((__v8qu)__a, __extension__ (__v8qu){},
2937                               0, 1, 2, 3), __v4sf);
2938 }
2939 
2940 /// Converts the two 32-bit signed integer values from each 64-bit vector
2941 ///    operand of [2 x i32] into a 128-bit vector of [4 x float].
2942 ///
2943 /// \headerfile <x86intrin.h>
2944 ///
2945 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2946 ///
2947 /// \param __a
2948 ///    A 64-bit vector of [2 x i32]. The lower elements of the destination are
2949 ///    converted from the elements in this operand.
2950 /// \param __b
2951 ///    A 64-bit vector of [2 x i32]. The upper elements of the destination are
2952 ///    converted from the elements in this operand.
2953 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
2954 ///    copied and converted values from the first operand. The upper 64 bits
2955 ///    contain the copied and converted values from the second operand.
2956 static __inline__ __m128 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtpi32x2_ps(__m64 __a,__m64 __b)2957 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
2958 {
  // Shuffle indices 0,1 select from __a and 2,3 from __b, concatenating the two
  // inputs into { __a[0], __a[1], __b[0], __b[1] } before the conversion.
2959   return __builtin_convertvector(
2960       __builtin_shufflevector((__v2si)__a, (__v2si)__b,
2961                               0, 1, 2, 3), __v4sf);
2962 }
2963 
2964 /// Converts each single-precision floating-point element of a 128-bit
2965 ///    floating-point vector of [4 x float] into a 16-bit signed integer, and
2966 ///    packs the results into a 64-bit integer vector of [4 x i16].
2967 ///
2968 ///    If the floating-point element is NaN or infinity, or if the
2969 ///    floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
2970 ///    it is converted to 0x8000. Otherwise if the floating-point element is
2971 ///    greater than 0x7FFF, it is converted to 0x7FFF.
2972 ///
2973 /// \headerfile <x86intrin.h>
2974 ///
2975 /// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2976 ///
2977 /// \param __a
2978 ///    A 128-bit floating-point vector of [4 x float].
2979 /// \returns A 64-bit integer vector of [4 x i16] containing the converted
2980 ///    values.
2981 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtps_pi16(__m128 __a)2982 _mm_cvtps_pi16(__m128 __a)
2983 {
2984   return __trunc64(__builtin_ia32_packssdw128(
2985       (__v4si)__builtin_ia32_cvtps2dq((__v4sf)__a), (__v4si)_mm_setzero_ps()));
2986 }
2987 
2988 /// Converts each single-precision floating-point element of a 128-bit
2989 ///    floating-point vector of [4 x float] into an 8-bit signed integer, and
2990 ///    packs the results into the lower 32 bits of a 64-bit integer vector of
2991 ///    [8 x i8]. The upper 32 bits of the vector are set to 0.
2992 ///
2993 ///    If the floating-point element is NaN or infinity, or if the
2994 ///    floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
2995 ///    is converted to 0x80. Otherwise if the floating-point element is greater
2996 ///    than 0x7F, it is converted to 0x7F.
2997 ///
2998 /// \headerfile <x86intrin.h>
2999 ///
3000 /// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
3001 ///
3002 /// \param __a
3003 ///    128-bit floating-point vector of [4 x float].
3004 /// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
3005 ///    converted values and the uppper 32 bits are set to zero.
3006 static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2
_mm_cvtps_pi8(__m128 __a)3007 _mm_cvtps_pi8(__m128 __a)
3008 {
3009   __m64 __b, __c;
3010 
3011   __b = _mm_cvtps_pi16(__a);
3012   __c = _mm_setzero_si64();
3013 
3014   return _mm_packs_pi16(__b, __c);
3015 }
3016 
3017 /// Extracts the sign bits from each single-precision floating-point
3018 ///    element of a 128-bit floating-point vector of [4 x float] and returns the
3019 ///    sign bits in bits [0:3] of the result. Bits [31:4] of the result are set
3020 ///    to zero.
3021 ///
3022 /// \headerfile <x86intrin.h>
3023 ///
3024 /// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
3025 ///
3026 /// \param __a
3027 ///    A 128-bit floating-point vector of [4 x float].
3028 /// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
3029 ///    single-precision floating-point element of the parameter. Bits [31:4] are
3030 ///    set to zero.
3031 static __inline__ int __DEFAULT_FN_ATTRS
_mm_movemask_ps(__m128 __a)3032 _mm_movemask_ps(__m128 __a)
3033 {
3034   return __builtin_ia32_movmskps((__v4sf)__a);
3035 }
3036 
/* Compare: predicate values for the immediate operand of _mm_cmp_ps() and
 * _mm_cmp_ss(). */
#define _CMP_EQ_OQ    0x00 /* Equal (ordered, non-signaling)  */
#define _CMP_LT_OS    0x01 /* Less-than (ordered, signaling)  */
#define _CMP_LE_OS    0x02 /* Less-than-or-equal (ordered, signaling)  */
#define _CMP_UNORD_Q  0x03 /* Unordered (non-signaling)  */
#define _CMP_NEQ_UQ   0x04 /* Not-equal (unordered, non-signaling)  */
#define _CMP_NLT_US   0x05 /* Not-less-than (unordered, signaling)  */
#define _CMP_NLE_US   0x06 /* Not-less-than-or-equal (unordered, signaling)  */
#define _CMP_ORD_Q    0x07 /* Ordered (non-signaling)   */

/// Compares each of the corresponding values of two 128-bit vectors of
///    [4 x float], using the operation specified by the immediate integer
///    operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> (V)CMPPS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00 (_CMP_EQ_OQ): Equal (ordered, non-signaling) \n
///    0x01 (_CMP_LT_OS): Less-than (ordered, signaling) \n
///    0x02 (_CMP_LE_OS): Less-than-or-equal (ordered, signaling) \n
///    0x03 (_CMP_UNORD_Q): Unordered (non-signaling) \n
///    0x04 (_CMP_NEQ_UQ): Not-equal (unordered, non-signaling) \n
///    0x05 (_CMP_NLT_US): Not-less-than (unordered, signaling) \n
///    0x06 (_CMP_NLE_US): Not-less-than-or-equal (unordered, signaling) \n
///    0x07 (_CMP_ORD_Q): Ordered (non-signaling) \n
/// \returns A 128-bit vector of [4 x float] containing the comparison results.
#define _mm_cmp_ps(a, b, c)                                                    \
  ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (c)))

/// Compares the lowest-order (scalar) values of two 128-bit vectors of
///    [4 x float], using the operation specified by the immediate integer
///    operand.
///
///    The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the low
///    32 bits of the result. The upper 96 bits of the result are copied from
///    the first operand.
///    If either value in the comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> (V)CMPSS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float].
/// \param b
///    A 128-bit vector of [4 x float].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00 (_CMP_EQ_OQ): Equal (ordered, non-signaling) \n
///    0x01 (_CMP_LT_OS): Less-than (ordered, signaling) \n
///    0x02 (_CMP_LE_OS): Less-than-or-equal (ordered, signaling) \n
///    0x03 (_CMP_UNORD_Q): Unordered (non-signaling) \n
///    0x04 (_CMP_NEQ_UQ): Not-equal (unordered, non-signaling) \n
///    0x05 (_CMP_NLT_US): Not-less-than (unordered, signaling) \n
///    0x06 (_CMP_NLE_US): Not-less-than-or-equal (unordered, signaling) \n
///    0x07 (_CMP_ORD_Q): Ordered (non-signaling) \n
/// \returns A 128-bit vector of [4 x float] containing the comparison result
///    in the low-order bits.
#define _mm_cmp_ss(a, b, c)                                                    \
  ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (c)))

/* Declaration attribute that requests 16-byte alignment. */
#define _MM_ALIGN16 __attribute__((aligned(16)))

/* Builds an 8-bit shuffle immediate from four selectors: z lands in bits
 * [7:6], y in [5:4], x in [3:2] and w in [1:0]. The arguments are not masked,
 * so each one is expected to be in the range 0..3. */
#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))

/* Bit fields of the control/status word read by _mm_getcsr() and written by
 * _mm_setcsr(). */

/* Exception state flags (bits [5:0]). */
#define _MM_EXCEPT_INVALID    (0x0001U)
#define _MM_EXCEPT_DENORM     (0x0002U)
#define _MM_EXCEPT_DIV_ZERO   (0x0004U)
#define _MM_EXCEPT_OVERFLOW   (0x0008U)
#define _MM_EXCEPT_UNDERFLOW  (0x0010U)
#define _MM_EXCEPT_INEXACT    (0x0020U)
#define _MM_EXCEPT_MASK       (0x003fU)

/* Exception mask bits (bits [12:7]). */
#define _MM_MASK_INVALID      (0x0080U)
#define _MM_MASK_DENORM       (0x0100U)
#define _MM_MASK_DIV_ZERO     (0x0200U)
#define _MM_MASK_OVERFLOW     (0x0400U)
#define _MM_MASK_UNDERFLOW    (0x0800U)
#define _MM_MASK_INEXACT      (0x1000U)
#define _MM_MASK_MASK         (0x1f80U)

/* Rounding mode field (bits [14:13]). */
#define _MM_ROUND_NEAREST     (0x0000U)
#define _MM_ROUND_DOWN        (0x2000U)
#define _MM_ROUND_UP          (0x4000U)
#define _MM_ROUND_TOWARD_ZERO (0x6000U)
#define _MM_ROUND_MASK        (0x6000U)

/* Flush-to-zero mode (bit 15). */
#define _MM_FLUSH_ZERO_MASK   (0x8000U)
#define _MM_FLUSH_ZERO_ON     (0x8000U)
#define _MM_FLUSH_ZERO_OFF    (0x0000U)

/* Read one field: the current register value masked down to that field. */
#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)

/* Write one field: read-modify-write that clears the field and ORs in x.
 * x is not masked, so bits of x outside the field are written as well. */
#define _MM_SET_EXCEPTION_MASK(x)                                              \
  (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x)                                             \
  (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x)                                             \
  (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x)                                               \
  (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))

/* Transposes in place the 4x4 matrix whose rows are the four [4 x float]
 * vectors row0..row3. Each argument must be a modifiable __m128 lvalue; it is
 * evaluated more than once, so it must not have side effects.
 *
 * The temporaries use reserved double-underscore names so that they cannot
 * shadow caller variables (for example ones named tmp0..tmp3) that are passed
 * in as rows. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 __tmp3, __tmp2, __tmp1, __tmp0; \
  __tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  __tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  __tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  __tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(__tmp0, __tmp2); \
  (row1) = _mm_movehl_ps(__tmp2, __tmp0); \
  (row2) = _mm_movelh_ps(__tmp1, __tmp3); \
  (row3) = _mm_movehl_ps(__tmp3, __tmp1); \
} while (0)

/* Aliases for compatibility: each _m_* name expands to the corresponding
 * _mm_* intrinsic name. */
#define _m_pextrw _mm_extract_pi16
#define _m_pinsrw _mm_insert_pi16
#define _m_pmaxsw _mm_max_pi16
#define _m_pmaxub _mm_max_pu8
#define _m_pminsw _mm_min_pi16
#define _m_pminub _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw _mm_mulhi_pu16
#define _m_pshufw _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb _mm_avg_pu8
#define _m_pavgw _mm_avg_pu16
#define _m_psadbw _mm_sad_pu8
#define _m_ _mm_

3186 #undef __trunc64
3187 #undef __zext128
3188 #undef __anyext128
3189 #undef __zeroupper64
3190 #undef __DEFAULT_FN_ATTRS
3191 #undef __DEFAULT_FN_ATTRS_CONSTEXPR
3192 #undef __DEFAULT_FN_ATTRS_SSE2
3193 #undef __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
3194 
3195 /* Ugly hack for backwards-compatibility (compatible with gcc) */
3196 #if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
3197 #include <emmintrin.h>
3198 #endif
3199 
3200 #endif /* __XMMINTRIN_H */
3201