1 /*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10 #ifndef __XMMINTRIN_H
11 #define __XMMINTRIN_H
12
13 #if !defined(__i386__) && !defined(__x86_64__)
14 #error "This header is only meant to be used on x86 and x64 architecture"
15 #endif
16
17 #include <mmintrin.h>
18
/* GCC/Clang generic vector types used internally to implement the SSE
 * intrinsics. */
typedef int __v4si __attribute__((__vector_size__(16)));
typedef float __v4sf __attribute__((__vector_size__(16)));
/* Public [4 x float] vector type; naturally 16-byte aligned. */
typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));

/* Same layout as __m128 but with alignment 1, for unaligned accesses. */
typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));

/* Unsigned types */
typedef unsigned int __v4su __attribute__((__vector_size__(16)));
27
28 /* This header should only be included in a hosted environment as it depends on
29 * a standard library to provide allocation routines. */
30 #if __STDC_HOSTED__
31 #include <mm_malloc.h>
32 #endif
33
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("sse,no-evex512"), \
                 __min_vector_width__(128)))
/* Variant for intrinsics that also require the MMX feature and operate on
 * 64-bit vectors. */
#define __DEFAULT_FN_ATTRS_MMX \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("mmx,sse,no-evex512"), __min_vector_width__(64)))
41
42 /// Adds the 32-bit float values in the low-order bits of the operands.
43 ///
44 /// \headerfile <x86intrin.h>
45 ///
46 /// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
47 ///
48 /// \param __a
49 /// A 128-bit vector of [4 x float] containing one of the source operands.
50 /// The lower 32 bits of this operand are used in the calculation.
51 /// \param __b
52 /// A 128-bit vector of [4 x float] containing one of the source operands.
53 /// The lower 32 bits of this operand are used in the calculation.
54 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
55 /// of the lower 32 bits of both operands. The upper 96 bits are copied from
56 /// the upper 96 bits of the first source operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_add_ss(__m128 __a, __m128 __b)
{
  /* Scalar add in element 0 only; elements 1-3 pass through from __a. */
  __a[0] += __b[0];
  return __a;
}
63
64 /// Adds two 128-bit vectors of [4 x float], and returns the results of
65 /// the addition.
66 ///
67 /// \headerfile <x86intrin.h>
68 ///
69 /// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
70 ///
71 /// \param __a
72 /// A 128-bit vector of [4 x float] containing one of the source operands.
73 /// \param __b
74 /// A 128-bit vector of [4 x float] containing one of the source operands.
75 /// \returns A 128-bit vector of [4 x float] containing the sums of both
76 /// operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_add_ps(__m128 __a, __m128 __b)
{
  /* Element-wise add written with the vector extension; matches (V)ADDPS. */
  return (__m128)((__v4sf)__a + (__v4sf)__b);
}
82
83 /// Subtracts the 32-bit float value in the low-order bits of the second
84 /// operand from the corresponding value in the first operand.
85 ///
86 /// \headerfile <x86intrin.h>
87 ///
88 /// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
89 ///
90 /// \param __a
91 /// A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
92 /// of this operand are used in the calculation.
93 /// \param __b
94 /// A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
95 /// bits of this operand are used in the calculation.
96 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
97 /// difference of the lower 32 bits of both operands. The upper 96 bits are
98 /// copied from the upper 96 bits of the first source operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_sub_ss(__m128 __a, __m128 __b)
{
  /* Scalar subtract in element 0 only; elements 1-3 pass through from __a. */
  __a[0] -= __b[0];
  return __a;
}
105
106 /// Subtracts each of the values of the second operand from the first
107 /// operand, both of which are 128-bit vectors of [4 x float] and returns
108 /// the results of the subtraction.
109 ///
110 /// \headerfile <x86intrin.h>
111 ///
112 /// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
113 ///
114 /// \param __a
115 /// A 128-bit vector of [4 x float] containing the minuend.
116 /// \param __b
117 /// A 128-bit vector of [4 x float] containing the subtrahend.
118 /// \returns A 128-bit vector of [4 x float] containing the differences between
119 /// both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_sub_ps(__m128 __a, __m128 __b)
{
  /* Element-wise subtract via the vector extension; matches (V)SUBPS. */
  return (__m128)((__v4sf)__a - (__v4sf)__b);
}
125
126 /// Multiplies two 32-bit float values in the low-order bits of the
127 /// operands.
128 ///
129 /// \headerfile <x86intrin.h>
130 ///
131 /// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
132 ///
133 /// \param __a
134 /// A 128-bit vector of [4 x float] containing one of the source operands.
135 /// The lower 32 bits of this operand are used in the calculation.
136 /// \param __b
137 /// A 128-bit vector of [4 x float] containing one of the source operands.
138 /// The lower 32 bits of this operand are used in the calculation.
139 /// \returns A 128-bit vector of [4 x float] containing the product of the lower
140 /// 32 bits of both operands. The upper 96 bits are copied from the upper 96
141 /// bits of the first source operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mul_ss(__m128 __a, __m128 __b)
{
  /* Scalar multiply in element 0 only; elements 1-3 pass through from __a. */
  __a[0] *= __b[0];
  return __a;
}
148
149 /// Multiplies two 128-bit vectors of [4 x float] and returns the
150 /// results of the multiplication.
151 ///
152 /// \headerfile <x86intrin.h>
153 ///
154 /// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
155 ///
156 /// \param __a
157 /// A 128-bit vector of [4 x float] containing one of the source operands.
158 /// \param __b
159 /// A 128-bit vector of [4 x float] containing one of the source operands.
160 /// \returns A 128-bit vector of [4 x float] containing the products of both
161 /// operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_mul_ps(__m128 __a, __m128 __b)
{
  /* Element-wise multiply via the vector extension; matches (V)MULPS. */
  return (__m128)((__v4sf)__a * (__v4sf)__b);
}
167
168 /// Divides the value in the low-order 32 bits of the first operand by
169 /// the corresponding value in the second operand.
170 ///
171 /// \headerfile <x86intrin.h>
172 ///
173 /// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
174 ///
175 /// \param __a
176 /// A 128-bit vector of [4 x float] containing the dividend. The lower 32
177 /// bits of this operand are used in the calculation.
178 /// \param __b
179 /// A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
180 /// of this operand are used in the calculation.
181 /// \returns A 128-bit vector of [4 x float] containing the quotients of the
182 /// lower 32 bits of both operands. The upper 96 bits are copied from the
183 /// upper 96 bits of the first source operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_div_ss(__m128 __a, __m128 __b)
{
  /* Scalar divide in element 0 only; elements 1-3 pass through from __a. */
  __a[0] /= __b[0];
  return __a;
}
190
191 /// Divides two 128-bit vectors of [4 x float].
192 ///
193 /// \headerfile <x86intrin.h>
194 ///
195 /// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
196 ///
197 /// \param __a
198 /// A 128-bit vector of [4 x float] containing the dividend.
199 /// \param __b
200 /// A 128-bit vector of [4 x float] containing the divisor.
201 /// \returns A 128-bit vector of [4 x float] containing the quotients of both
202 /// operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_div_ps(__m128 __a, __m128 __b)
{
  /* Element-wise divide via the vector extension; matches (V)DIVPS. */
  return (__m128)((__v4sf)__a / (__v4sf)__b);
}
208
209 /// Calculates the square root of the value stored in the low-order bits
210 /// of a 128-bit vector of [4 x float].
211 ///
212 /// \headerfile <x86intrin.h>
213 ///
214 /// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
215 ///
216 /// \param __a
217 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
218 /// used in the calculation.
219 /// \returns A 128-bit vector of [4 x float] containing the square root of the
220 /// value in the low-order bits of the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_sqrt_ss(__m128 __a)
{
  /* Forwarded to the compiler builtin; corresponds to (V)SQRTSS. */
  return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
}
226
227 /// Calculates the square roots of the values stored in a 128-bit vector
228 /// of [4 x float].
229 ///
230 /// \headerfile <x86intrin.h>
231 ///
232 /// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
233 ///
234 /// \param __a
235 /// A 128-bit vector of [4 x float].
236 /// \returns A 128-bit vector of [4 x float] containing the square roots of the
237 /// values in the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_sqrt_ps(__m128 __a)
{
  /* Forwarded to the compiler builtin; corresponds to (V)SQRTPS. */
  return __builtin_ia32_sqrtps((__v4sf)__a);
}
243
244 /// Calculates the approximate reciprocal of the value stored in the
245 /// low-order bits of a 128-bit vector of [4 x float].
246 ///
247 /// \headerfile <x86intrin.h>
248 ///
249 /// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
250 ///
251 /// \param __a
252 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
253 /// used in the calculation.
254 /// \returns A 128-bit vector of [4 x float] containing the approximate
255 /// reciprocal of the value in the low-order bits of the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_rcp_ss(__m128 __a)
{
  /* Approximate scalar reciprocal; corresponds to (V)RCPSS, not IEEE-exact. */
  return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
}
261
262 /// Calculates the approximate reciprocals of the values stored in a
263 /// 128-bit vector of [4 x float].
264 ///
265 /// \headerfile <x86intrin.h>
266 ///
267 /// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
268 ///
269 /// \param __a
270 /// A 128-bit vector of [4 x float].
271 /// \returns A 128-bit vector of [4 x float] containing the approximate
272 /// reciprocals of the values in the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_rcp_ps(__m128 __a)
{
  /* Approximate per-element reciprocal; corresponds to (V)RCPPS. */
  return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
}
278
279 /// Calculates the approximate reciprocal of the square root of the value
280 /// stored in the low-order bits of a 128-bit vector of [4 x float].
281 ///
282 /// \headerfile <x86intrin.h>
283 ///
284 /// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
285 ///
286 /// \param __a
287 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
288 /// used in the calculation.
289 /// \returns A 128-bit vector of [4 x float] containing the approximate
290 /// reciprocal of the square root of the value in the low-order bits of the
291 /// operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_rsqrt_ss(__m128 __a)
{
  /* Approximate scalar 1/sqrt; corresponds to (V)RSQRTSS. */
  return __builtin_ia32_rsqrtss((__v4sf)__a);
}
297
298 /// Calculates the approximate reciprocals of the square roots of the
299 /// values stored in a 128-bit vector of [4 x float].
300 ///
301 /// \headerfile <x86intrin.h>
302 ///
303 /// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
304 ///
305 /// \param __a
306 /// A 128-bit vector of [4 x float].
307 /// \returns A 128-bit vector of [4 x float] containing the approximate
308 /// reciprocals of the square roots of the values in the operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_rsqrt_ps(__m128 __a)
{
  /* Approximate per-element 1/sqrt; corresponds to (V)RSQRTPS. */
  return __builtin_ia32_rsqrtps((__v4sf)__a);
}
314
315 /// Compares two 32-bit float values in the low-order bits of both
316 /// operands and returns the lesser value in the low-order bits of the
317 /// vector of [4 x float].
318 ///
319 /// If either value in a comparison is NaN, returns the value from \a __b.
320 ///
321 /// \headerfile <x86intrin.h>
322 ///
323 /// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
324 ///
325 /// \param __a
326 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
327 /// 32 bits of this operand are used in the comparison.
328 /// \param __b
329 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
330 /// 32 bits of this operand are used in the comparison.
331 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
332 /// minimum value between both operands. The upper 96 bits are copied from
333 /// the upper 96 bits of the first source operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_min_ss(__m128 __a, __m128 __b)
{
  /* Must use the builtin (not a C ternary) to keep MINSS's NaN semantics:
   * if either input is NaN, the value from __b is returned. */
  return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
}
339
340 /// Compares two 128-bit vectors of [4 x float] and returns the lesser
341 /// of each pair of values.
342 ///
343 /// If either value in a comparison is NaN, returns the value from \a __b.
344 ///
345 /// \headerfile <x86intrin.h>
346 ///
347 /// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
348 ///
349 /// \param __a
350 /// A 128-bit vector of [4 x float] containing one of the operands.
351 /// \param __b
352 /// A 128-bit vector of [4 x float] containing one of the operands.
353 /// \returns A 128-bit vector of [4 x float] containing the minimum values
354 /// between both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_min_ps(__m128 __a, __m128 __b)
{
  /* Builtin preserves MINPS's NaN semantics: NaN in either lane yields __b. */
  return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
}
360
361 /// Compares two 32-bit float values in the low-order bits of both
362 /// operands and returns the greater value in the low-order bits of a 128-bit
363 /// vector of [4 x float].
364 ///
365 /// If either value in a comparison is NaN, returns the value from \a __b.
366 ///
367 /// \headerfile <x86intrin.h>
368 ///
369 /// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
370 ///
371 /// \param __a
372 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
373 /// 32 bits of this operand are used in the comparison.
374 /// \param __b
375 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
376 /// 32 bits of this operand are used in the comparison.
377 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
378 /// maximum value between both operands. The upper 96 bits are copied from
379 /// the upper 96 bits of the first source operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_max_ss(__m128 __a, __m128 __b)
{
  /* Builtin preserves MAXSS's NaN semantics: NaN in either input yields __b. */
  return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
}
385
386 /// Compares two 128-bit vectors of [4 x float] and returns the greater
387 /// of each pair of values.
388 ///
389 /// If either value in a comparison is NaN, returns the value from \a __b.
390 ///
391 /// \headerfile <x86intrin.h>
392 ///
393 /// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
394 ///
395 /// \param __a
396 /// A 128-bit vector of [4 x float] containing one of the operands.
397 /// \param __b
398 /// A 128-bit vector of [4 x float] containing one of the operands.
399 /// \returns A 128-bit vector of [4 x float] containing the maximum values
400 /// between both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_max_ps(__m128 __a, __m128 __b)
{
  /* Builtin preserves MAXPS's NaN semantics: NaN in either lane yields __b. */
  return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
}
406
407 /// Performs a bitwise AND of two 128-bit vectors of [4 x float].
408 ///
409 /// \headerfile <x86intrin.h>
410 ///
411 /// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
412 ///
413 /// \param __a
414 /// A 128-bit vector containing one of the source operands.
415 /// \param __b
416 /// A 128-bit vector containing one of the source operands.
417 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
418 /// values between both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_and_ps(__m128 __a, __m128 __b)
{
  /* Reinterpret the float lanes as unsigned ints to express the bitwise AND. */
  return (__m128)((__v4su)__a & (__v4su)__b);
}
424
425 /// Performs a bitwise AND of two 128-bit vectors of [4 x float], using
426 /// the one's complement of the values contained in the first source
427 /// operand.
428 ///
429 /// \headerfile <x86intrin.h>
430 ///
431 /// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
432 ///
433 /// \param __a
434 /// A 128-bit vector of [4 x float] containing the first source operand. The
435 /// one's complement of this value is used in the bitwise AND.
436 /// \param __b
437 /// A 128-bit vector of [4 x float] containing the second source operand.
438 /// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
439 /// one's complement of the first operand and the values in the second
440 /// operand.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_andnot_ps(__m128 __a, __m128 __b)
{
  /* The complement applies to the FIRST operand, matching (V)ANDNPS. */
  return (__m128)(~(__v4su)__a & (__v4su)__b);
}
446
447 /// Performs a bitwise OR of two 128-bit vectors of [4 x float].
448 ///
449 /// \headerfile <x86intrin.h>
450 ///
451 /// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
452 ///
453 /// \param __a
454 /// A 128-bit vector of [4 x float] containing one of the source operands.
455 /// \param __b
456 /// A 128-bit vector of [4 x float] containing one of the source operands.
457 /// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
458 /// values between both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_or_ps(__m128 __a, __m128 __b)
{
  /* Reinterpret the float lanes as unsigned ints to express the bitwise OR. */
  return (__m128)((__v4su)__a | (__v4su)__b);
}
464
465 /// Performs a bitwise exclusive OR of two 128-bit vectors of
466 /// [4 x float].
467 ///
468 /// \headerfile <x86intrin.h>
469 ///
470 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
471 ///
472 /// \param __a
473 /// A 128-bit vector of [4 x float] containing one of the source operands.
474 /// \param __b
475 /// A 128-bit vector of [4 x float] containing one of the source operands.
476 /// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
477 /// of the values between both operands.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_xor_ps(__m128 __a, __m128 __b)
{
  /* Reinterpret the float lanes as unsigned ints to express the bitwise XOR. */
  return (__m128)((__v4su)__a ^ (__v4su)__b);
}
483
484 /// Compares two 32-bit float values in the low-order bits of both
485 /// operands for equality.
486 ///
487 /// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
488 /// low-order bits of a vector [4 x float].
489 /// If either value in a comparison is NaN, returns false.
490 ///
491 /// \headerfile <x86intrin.h>
492 ///
493 /// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
494 ///
495 /// \param __a
496 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
497 /// 32 bits of this operand are used in the comparison.
498 /// \param __b
499 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
500 /// 32 bits of this operand are used in the comparison.
501 /// \returns A 128-bit vector of [4 x float] containing the comparison results
502 /// in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpeq_ss(__m128 __a, __m128 __b)
{
  /* Element 0 becomes an all-ones/all-zeros mask; elements 1-3 come from __a. */
  return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
}
508
509 /// Compares each of the corresponding 32-bit float values of the
510 /// 128-bit vectors of [4 x float] for equality.
511 ///
512 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
513 /// If either value in a comparison is NaN, returns false.
514 ///
515 /// \headerfile <x86intrin.h>
516 ///
517 /// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
518 ///
519 /// \param __a
520 /// A 128-bit vector of [4 x float].
521 /// \param __b
522 /// A 128-bit vector of [4 x float].
523 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpeq_ps(__m128 __a, __m128 __b)
{
  /* Each lane becomes an all-ones (true) or all-zeros (false) 32-bit mask. */
  return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
}
529
530 /// Compares two 32-bit float values in the low-order bits of both
531 /// operands to determine if the value in the first operand is less than the
532 /// corresponding value in the second operand.
533 ///
534 /// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
535 /// low-order bits of a vector of [4 x float].
536 /// If either value in a comparison is NaN, returns false.
537 ///
538 /// \headerfile <x86intrin.h>
539 ///
540 /// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
541 ///
542 /// \param __a
543 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
544 /// 32 bits of this operand are used in the comparison.
545 /// \param __b
546 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
547 /// 32 bits of this operand are used in the comparison.
548 /// \returns A 128-bit vector of [4 x float] containing the comparison results
549 /// in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmplt_ss(__m128 __a, __m128 __b)
{
  /* Element 0 becomes the < mask; elements 1-3 come from __a. */
  return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
}
555
556 /// Compares each of the corresponding 32-bit float values of the
557 /// 128-bit vectors of [4 x float] to determine if the values in the first
558 /// operand are less than those in the second operand.
559 ///
/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
561 /// If either value in a comparison is NaN, returns false.
562 ///
563 /// \headerfile <x86intrin.h>
564 ///
565 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
566 ///
567 /// \param __a
568 /// A 128-bit vector of [4 x float].
569 /// \param __b
570 /// A 128-bit vector of [4 x float].
571 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmplt_ps(__m128 __a, __m128 __b)
{
  /* Each lane becomes an all-ones/all-zeros mask for __a[i] < __b[i]. */
  return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
}
577
578 /// Compares two 32-bit float values in the low-order bits of both
579 /// operands to determine if the value in the first operand is less than or
580 /// equal to the corresponding value in the second operand.
581 ///
/// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in
583 /// the low-order bits of a vector of [4 x float].
584 /// If either value in a comparison is NaN, returns false.
585 ///
586 /// \headerfile <x86intrin.h>
587 ///
588 /// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
589 ///
590 /// \param __a
591 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
592 /// 32 bits of this operand are used in the comparison.
593 /// \param __b
594 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
595 /// 32 bits of this operand are used in the comparison.
596 /// \returns A 128-bit vector of [4 x float] containing the comparison results
597 /// in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmple_ss(__m128 __a, __m128 __b)
{
  /* Element 0 becomes the <= mask; elements 1-3 come from __a. */
  return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
}
603
604 /// Compares each of the corresponding 32-bit float values of the
605 /// 128-bit vectors of [4 x float] to determine if the values in the first
606 /// operand are less than or equal to those in the second operand.
607 ///
608 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
609 /// If either value in a comparison is NaN, returns false.
610 ///
611 /// \headerfile <x86intrin.h>
612 ///
613 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
614 ///
615 /// \param __a
616 /// A 128-bit vector of [4 x float].
617 /// \param __b
618 /// A 128-bit vector of [4 x float].
619 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmple_ps(__m128 __a, __m128 __b)
{
  /* Each lane becomes an all-ones/all-zeros mask for __a[i] <= __b[i]. */
  return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
}
625
626 /// Compares two 32-bit float values in the low-order bits of both
627 /// operands to determine if the value in the first operand is greater than
628 /// the corresponding value in the second operand.
629 ///
630 /// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
631 /// low-order bits of a vector of [4 x float].
632 /// If either value in a comparison is NaN, returns false.
633 ///
634 /// \headerfile <x86intrin.h>
635 ///
636 /// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
637 ///
638 /// \param __a
639 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
640 /// 32 bits of this operand are used in the comparison.
641 /// \param __b
642 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
643 /// 32 bits of this operand are used in the comparison.
644 /// \returns A 128-bit vector of [4 x float] containing the comparison results
645 /// in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpgt_ss(__m128 __a, __m128 __b)
{
  /* a > b is computed as b < a (CMPLTSS with the operands swapped).  The
   * shufflevector then rebuilds the SS result: index 4 selects element 0 of
   * the compare result (the second vector), and indices 1-3 keep the upper
   * elements of __a. */
  return (__m128)__builtin_shufflevector((__v4sf)__a,
                                         (__v4sf)
                                         __builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
                                         4, 1, 2, 3);
}
653
654 /// Compares each of the corresponding 32-bit float values of the
655 /// 128-bit vectors of [4 x float] to determine if the values in the first
656 /// operand are greater than those in the second operand.
657 ///
658 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
659 /// If either value in a comparison is NaN, returns false.
660 ///
661 /// \headerfile <x86intrin.h>
662 ///
663 /// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
664 ///
665 /// \param __a
666 /// A 128-bit vector of [4 x float].
667 /// \param __b
668 /// A 128-bit vector of [4 x float].
669 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpgt_ps(__m128 __a, __m128 __b)
{
  /* a > b computed as b < a: CMPLTPS with the operands swapped. */
  return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
}
675
676 /// Compares two 32-bit float values in the low-order bits of both
677 /// operands to determine if the value in the first operand is greater than
678 /// or equal to the corresponding value in the second operand.
679 ///
680 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
681 /// low-order bits of a vector of [4 x float].
682 /// If either value in a comparison is NaN, returns false.
683 ///
684 /// \headerfile <x86intrin.h>
685 ///
686 /// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
687 ///
688 /// \param __a
689 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
690 /// 32 bits of this operand are used in the comparison.
691 /// \param __b
692 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
693 /// 32 bits of this operand are used in the comparison.
694 /// \returns A 128-bit vector of [4 x float] containing the comparison results
695 /// in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpge_ss(__m128 __a, __m128 __b)
{
  /* a >= b is computed as b <= a (CMPLESS with the operands swapped).  The
   * shufflevector rebuilds the SS result: index 4 selects element 0 of the
   * compare result, and indices 1-3 keep the upper elements of __a. */
  return (__m128)__builtin_shufflevector((__v4sf)__a,
                                         (__v4sf)
                                         __builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
                                         4, 1, 2, 3);
}
703
704 /// Compares each of the corresponding 32-bit float values of the
705 /// 128-bit vectors of [4 x float] to determine if the values in the first
706 /// operand are greater than or equal to those in the second operand.
707 ///
/// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
709 /// If either value in a comparison is NaN, returns false.
710 ///
711 /// \headerfile <x86intrin.h>
712 ///
713 /// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
714 ///
715 /// \param __a
716 /// A 128-bit vector of [4 x float].
717 /// \param __b
718 /// A 128-bit vector of [4 x float].
719 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpge_ps(__m128 __a, __m128 __b)
{
  /* a >= b computed as b <= a: CMPLEPS with the operands swapped. */
  return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
}
725
726 /// Compares two 32-bit float values in the low-order bits of both operands
727 /// for inequality.
728 ///
729 /// The comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
730 /// low-order bits of a vector of [4 x float].
731 /// If either value in a comparison is NaN, returns true.
732 ///
733 /// \headerfile <x86intrin.h>
734 ///
735 /// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
736 /// instructions.
737 ///
738 /// \param __a
739 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
740 /// 32 bits of this operand are used in the comparison.
741 /// \param __b
742 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
743 /// 32 bits of this operand are used in the comparison.
744 /// \returns A 128-bit vector of [4 x float] containing the comparison results
745 /// in the low-order bits.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpneq_ss(__m128 __a, __m128 __b)
{
  /* Element 0 becomes the != mask (true for NaN operands, per the doc above);
   * elements 1-3 come from __a. */
  return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
}
751
752 /// Compares each of the corresponding 32-bit float values of the
753 /// 128-bit vectors of [4 x float] for inequality.
754 ///
755 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
756 /// If either value in a comparison is NaN, returns true.
757 ///
758 /// \headerfile <x86intrin.h>
759 ///
760 /// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
761 /// instructions.
762 ///
763 /// \param __a
764 /// A 128-bit vector of [4 x float].
765 /// \param __b
766 /// A 128-bit vector of [4 x float].
767 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpneq_ps(__m128 __a, __m128 __b)
{
  /* Each lane becomes the != mask; NaN in either operand compares true. */
  return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
}
773
774 /// Compares two 32-bit float values in the low-order bits of both
775 /// operands to determine if the value in the first operand is not less than
776 /// the corresponding value in the second operand.
777 ///
778 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
779 /// low-order bits of a vector of [4 x float].
780 /// If either value in a comparison is NaN, returns true.
781 ///
782 /// \headerfile <x86intrin.h>
783 ///
784 /// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
785 /// instructions.
786 ///
787 /// \param __a
788 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
789 /// 32 bits of this operand are used in the comparison.
790 /// \param __b
791 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
792 /// 32 bits of this operand are used in the comparison.
793 /// \returns A 128-bit vector of [4 x float] containing the comparison results
794 /// in the low-order bits.
795 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpnlt_ss(__m128 __a,__m128 __b)796 _mm_cmpnlt_ss(__m128 __a, __m128 __b)
797 {
798 return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
799 }
800
801 /// Compares each of the corresponding 32-bit float values of the
802 /// 128-bit vectors of [4 x float] to determine if the values in the first
803 /// operand are not less than those in the second operand.
804 ///
805 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
806 /// If either value in a comparison is NaN, returns true.
807 ///
808 /// \headerfile <x86intrin.h>
809 ///
810 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
811 /// instructions.
812 ///
813 /// \param __a
814 /// A 128-bit vector of [4 x float].
815 /// \param __b
816 /// A 128-bit vector of [4 x float].
817 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
818 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpnlt_ps(__m128 __a,__m128 __b)819 _mm_cmpnlt_ps(__m128 __a, __m128 __b)
820 {
821 return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
822 }
823
824 /// Compares two 32-bit float values in the low-order bits of both
825 /// operands to determine if the value in the first operand is not less than
826 /// or equal to the corresponding value in the second operand.
827 ///
828 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
829 /// low-order bits of a vector of [4 x float].
830 /// If either value in a comparison is NaN, returns true.
831 ///
832 /// \headerfile <x86intrin.h>
833 ///
834 /// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
835 /// instructions.
836 ///
837 /// \param __a
838 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
839 /// 32 bits of this operand are used in the comparison.
840 /// \param __b
841 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
842 /// 32 bits of this operand are used in the comparison.
843 /// \returns A 128-bit vector of [4 x float] containing the comparison results
844 /// in the low-order bits.
845 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpnle_ss(__m128 __a,__m128 __b)846 _mm_cmpnle_ss(__m128 __a, __m128 __b)
847 {
848 return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
849 }
850
851 /// Compares each of the corresponding 32-bit float values of the
852 /// 128-bit vectors of [4 x float] to determine if the values in the first
853 /// operand are not less than or equal to those in the second operand.
854 ///
855 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
856 /// If either value in a comparison is NaN, returns true.
857 ///
858 /// \headerfile <x86intrin.h>
859 ///
860 /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
861 /// instructions.
862 ///
863 /// \param __a
864 /// A 128-bit vector of [4 x float].
865 /// \param __b
866 /// A 128-bit vector of [4 x float].
867 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
868 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpnle_ps(__m128 __a,__m128 __b)869 _mm_cmpnle_ps(__m128 __a, __m128 __b)
870 {
871 return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
872 }
873
874 /// Compares two 32-bit float values in the low-order bits of both
875 /// operands to determine if the value in the first operand is not greater
876 /// than the corresponding value in the second operand.
877 ///
878 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
879 /// low-order bits of a vector of [4 x float].
880 /// If either value in a comparison is NaN, returns true.
881 ///
882 /// \headerfile <x86intrin.h>
883 ///
884 /// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
885 /// instructions.
886 ///
887 /// \param __a
888 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
889 /// 32 bits of this operand are used in the comparison.
890 /// \param __b
891 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
892 /// 32 bits of this operand are used in the comparison.
893 /// \returns A 128-bit vector of [4 x float] containing the comparison results
894 /// in the low-order bits.
895 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpngt_ss(__m128 __a,__m128 __b)896 _mm_cmpngt_ss(__m128 __a, __m128 __b)
897 {
898 return (__m128)__builtin_shufflevector((__v4sf)__a,
899 (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
900 4, 1, 2, 3);
901 }
902
903 /// Compares each of the corresponding 32-bit float values of the
904 /// 128-bit vectors of [4 x float] to determine if the values in the first
905 /// operand are not greater than those in the second operand.
906 ///
907 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
908 /// If either value in a comparison is NaN, returns true.
909 ///
910 /// \headerfile <x86intrin.h>
911 ///
912 /// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
913 /// instructions.
914 ///
915 /// \param __a
916 /// A 128-bit vector of [4 x float].
917 /// \param __b
918 /// A 128-bit vector of [4 x float].
919 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
920 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpngt_ps(__m128 __a,__m128 __b)921 _mm_cmpngt_ps(__m128 __a, __m128 __b)
922 {
923 return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
924 }
925
926 /// Compares two 32-bit float values in the low-order bits of both
927 /// operands to determine if the value in the first operand is not greater
928 /// than or equal to the corresponding value in the second operand.
929 ///
930 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true, in the
931 /// low-order bits of a vector of [4 x float].
932 /// If either value in a comparison is NaN, returns true.
933 ///
934 /// \headerfile <x86intrin.h>
935 ///
936 /// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
937 /// instructions.
938 ///
939 /// \param __a
940 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
941 /// 32 bits of this operand are used in the comparison.
942 /// \param __b
943 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
944 /// 32 bits of this operand are used in the comparison.
945 /// \returns A 128-bit vector of [4 x float] containing the comparison results
946 /// in the low-order bits.
947 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpnge_ss(__m128 __a,__m128 __b)948 _mm_cmpnge_ss(__m128 __a, __m128 __b)
949 {
950 return (__m128)__builtin_shufflevector((__v4sf)__a,
951 (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
952 4, 1, 2, 3);
953 }
954
955 /// Compares each of the corresponding 32-bit float values of the
956 /// 128-bit vectors of [4 x float] to determine if the values in the first
957 /// operand are not greater than or equal to those in the second operand.
958 ///
959 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
960 /// If either value in a comparison is NaN, returns true.
961 ///
962 /// \headerfile <x86intrin.h>
963 ///
964 /// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
965 /// instructions.
966 ///
967 /// \param __a
968 /// A 128-bit vector of [4 x float].
969 /// \param __b
970 /// A 128-bit vector of [4 x float].
971 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
972 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpnge_ps(__m128 __a,__m128 __b)973 _mm_cmpnge_ps(__m128 __a, __m128 __b)
974 {
975 return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
976 }
977
978 /// Compares two 32-bit float values in the low-order bits of both
979 /// operands to determine if the value in the first operand is ordered with
980 /// respect to the corresponding value in the second operand.
981 ///
982 /// A pair of floating-point values are ordered with respect to each
983 /// other if neither value is a NaN. Each comparison returns 0x0 for false,
984 /// 0xFFFFFFFF for true.
985 ///
986 /// \headerfile <x86intrin.h>
987 ///
988 /// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
989 /// instructions.
990 ///
991 /// \param __a
992 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
993 /// 32 bits of this operand are used in the comparison.
994 /// \param __b
995 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
996 /// 32 bits of this operand are used in the comparison.
997 /// \returns A 128-bit vector of [4 x float] containing the comparison results
998 /// in the low-order bits.
999 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpord_ss(__m128 __a,__m128 __b)1000 _mm_cmpord_ss(__m128 __a, __m128 __b)
1001 {
1002 return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
1003 }
1004
1005 /// Compares each of the corresponding 32-bit float values of the
1006 /// 128-bit vectors of [4 x float] to determine if the values in the first
1007 /// operand are ordered with respect to those in the second operand.
1008 ///
1009 /// A pair of floating-point values are ordered with respect to each
1010 /// other if neither value is a NaN. Each comparison returns 0x0 for false,
1011 /// 0xFFFFFFFF for true.
1012 ///
1013 /// \headerfile <x86intrin.h>
1014 ///
1015 /// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
1016 /// instructions.
1017 ///
1018 /// \param __a
1019 /// A 128-bit vector of [4 x float].
1020 /// \param __b
1021 /// A 128-bit vector of [4 x float].
1022 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
1023 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpord_ps(__m128 __a,__m128 __b)1024 _mm_cmpord_ps(__m128 __a, __m128 __b)
1025 {
1026 return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
1027 }
1028
1029 /// Compares two 32-bit float values in the low-order bits of both
1030 /// operands to determine if the value in the first operand is unordered
1031 /// with respect to the corresponding value in the second operand.
1032 ///
1033 /// A pair of double-precision values are unordered with respect to each
1034 /// other if one or both values are NaN. Each comparison returns 0x0 for
1035 /// false, 0xFFFFFFFF for true.
1036 ///
1037 /// \headerfile <x86intrin.h>
1038 ///
1039 /// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
1040 /// instructions.
1041 ///
1042 /// \param __a
1043 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
1044 /// 32 bits of this operand are used in the comparison.
1045 /// \param __b
1046 /// A 128-bit vector of [4 x float] containing one of the operands. The lower
1047 /// 32 bits of this operand are used in the comparison.
1048 /// \returns A 128-bit vector of [4 x float] containing the comparison results
1049 /// in the low-order bits.
1050 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpunord_ss(__m128 __a,__m128 __b)1051 _mm_cmpunord_ss(__m128 __a, __m128 __b)
1052 {
1053 return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
1054 }
1055
1056 /// Compares each of the corresponding 32-bit float values of the
1057 /// 128-bit vectors of [4 x float] to determine if the values in the first
1058 /// operand are unordered with respect to those in the second operand.
1059 ///
1060 /// A pair of double-precision values are unordered with respect to each
1061 /// other if one or both values are NaN. Each comparison returns 0x0 for
1062 /// false, 0xFFFFFFFFFFFFFFFF for true.
1063 ///
1064 /// \headerfile <x86intrin.h>
1065 ///
1066 /// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
1067 /// instructions.
1068 ///
1069 /// \param __a
1070 /// A 128-bit vector of [4 x float].
1071 /// \param __b
1072 /// A 128-bit vector of [4 x float].
1073 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
1074 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cmpunord_ps(__m128 __a,__m128 __b)1075 _mm_cmpunord_ps(__m128 __a, __m128 __b)
1076 {
1077 return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
1078 }
1079
1080 /// Compares two 32-bit float values in the low-order bits of both
1081 /// operands for equality.
1082 ///
1083 /// The comparison returns 0 for false, 1 for true. If either value in a
1084 /// comparison is NaN, returns 0.
1085 ///
1086 /// \headerfile <x86intrin.h>
1087 ///
1088 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1089 /// instructions.
1090 ///
1091 /// \param __a
1092 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1093 /// used in the comparison.
1094 /// \param __b
1095 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1096 /// used in the comparison.
1097 /// \returns An integer containing the comparison results.
1098 static __inline__ int __DEFAULT_FN_ATTRS
_mm_comieq_ss(__m128 __a,__m128 __b)1099 _mm_comieq_ss(__m128 __a, __m128 __b)
1100 {
1101 return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
1102 }
1103
1104 /// Compares two 32-bit float values in the low-order bits of both
1105 /// operands to determine if the first operand is less than the second
1106 /// operand.
1107 ///
1108 /// The comparison returns 0 for false, 1 for true. If either value in a
1109 /// comparison is NaN, returns 0.
1110 ///
1111 /// \headerfile <x86intrin.h>
1112 ///
1113 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
1114 /// instructions.
1115 ///
1116 /// \param __a
1117 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1118 /// used in the comparison.
1119 /// \param __b
1120 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1121 /// used in the comparison.
1122 /// \returns An integer containing the comparison results.
1123 static __inline__ int __DEFAULT_FN_ATTRS
_mm_comilt_ss(__m128 __a,__m128 __b)1124 _mm_comilt_ss(__m128 __a, __m128 __b)
1125 {
1126 return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
1127 }
1128
1129 /// Compares two 32-bit float values in the low-order bits of both
1130 /// operands to determine if the first operand is less than or equal to the
1131 /// second operand.
1132 ///
1133 /// The comparison returns 0 for false, 1 for true. If either value in a
1134 /// comparison is NaN, returns 0.
1135 ///
1136 /// \headerfile <x86intrin.h>
1137 ///
1138 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1139 ///
1140 /// \param __a
1141 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1142 /// used in the comparison.
1143 /// \param __b
1144 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1145 /// used in the comparison.
1146 /// \returns An integer containing the comparison results.
1147 static __inline__ int __DEFAULT_FN_ATTRS
_mm_comile_ss(__m128 __a,__m128 __b)1148 _mm_comile_ss(__m128 __a, __m128 __b)
1149 {
1150 return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
1151 }
1152
1153 /// Compares two 32-bit float values in the low-order bits of both
1154 /// operands to determine if the first operand is greater than the second
1155 /// operand.
1156 ///
1157 /// The comparison returns 0 for false, 1 for true. If either value in a
1158 /// comparison is NaN, returns 0.
1159 ///
1160 /// \headerfile <x86intrin.h>
1161 ///
1162 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1163 ///
1164 /// \param __a
1165 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1166 /// used in the comparison.
1167 /// \param __b
1168 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1169 /// used in the comparison.
1170 /// \returns An integer containing the comparison results.
1171 static __inline__ int __DEFAULT_FN_ATTRS
_mm_comigt_ss(__m128 __a,__m128 __b)1172 _mm_comigt_ss(__m128 __a, __m128 __b)
1173 {
1174 return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
1175 }
1176
1177 /// Compares two 32-bit float values in the low-order bits of both
1178 /// operands to determine if the first operand is greater than or equal to
1179 /// the second operand.
1180 ///
1181 /// The comparison returns 0 for false, 1 for true. If either value in a
1182 /// comparison is NaN, returns 0.
1183 ///
1184 /// \headerfile <x86intrin.h>
1185 ///
1186 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1187 ///
1188 /// \param __a
1189 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1190 /// used in the comparison.
1191 /// \param __b
1192 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1193 /// used in the comparison.
1194 /// \returns An integer containing the comparison results.
1195 static __inline__ int __DEFAULT_FN_ATTRS
_mm_comige_ss(__m128 __a,__m128 __b)1196 _mm_comige_ss(__m128 __a, __m128 __b)
1197 {
1198 return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
1199 }
1200
1201 /// Compares two 32-bit float values in the low-order bits of both
1202 /// operands to determine if the first operand is not equal to the second
1203 /// operand.
1204 ///
1205 /// The comparison returns 0 for false, 1 for true. If either value in a
1206 /// comparison is NaN, returns 1.
1207 ///
1208 /// \headerfile <x86intrin.h>
1209 ///
1210 /// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
1211 ///
1212 /// \param __a
1213 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1214 /// used in the comparison.
1215 /// \param __b
1216 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1217 /// used in the comparison.
1218 /// \returns An integer containing the comparison results.
1219 static __inline__ int __DEFAULT_FN_ATTRS
_mm_comineq_ss(__m128 __a,__m128 __b)1220 _mm_comineq_ss(__m128 __a, __m128 __b)
1221 {
1222 return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
1223 }
1224
1225 /// Performs an unordered comparison of two 32-bit float values using
1226 /// the low-order bits of both operands to determine equality.
1227 ///
1228 /// The comparison returns 0 for false, 1 for true. If either value in a
1229 /// comparison is NaN, returns 0.
1230 ///
1231 /// \headerfile <x86intrin.h>
1232 ///
1233 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1234 ///
1235 /// \param __a
1236 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1237 /// used in the comparison.
1238 /// \param __b
1239 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1240 /// used in the comparison.
1241 /// \returns An integer containing the comparison results.
1242 static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomieq_ss(__m128 __a,__m128 __b)1243 _mm_ucomieq_ss(__m128 __a, __m128 __b)
1244 {
1245 return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
1246 }
1247
1248 /// Performs an unordered comparison of two 32-bit float values using
1249 /// the low-order bits of both operands to determine if the first operand is
1250 /// less than the second operand.
1251 ///
1252 /// The comparison returns 0 for false, 1 for true. If either value in a
1253 /// comparison is NaN, returns 0.
1254 ///
1255 /// \headerfile <x86intrin.h>
1256 ///
1257 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1258 ///
1259 /// \param __a
1260 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1261 /// used in the comparison.
1262 /// \param __b
1263 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1264 /// used in the comparison.
1265 /// \returns An integer containing the comparison results.
1266 static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomilt_ss(__m128 __a,__m128 __b)1267 _mm_ucomilt_ss(__m128 __a, __m128 __b)
1268 {
1269 return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
1270 }
1271
1272 /// Performs an unordered comparison of two 32-bit float values using
1273 /// the low-order bits of both operands to determine if the first operand is
1274 /// less than or equal to the second operand.
1275 ///
1276 /// The comparison returns 0 for false, 1 for true. If either value in a
1277 /// comparison is NaN, returns 0.
1278 ///
1279 /// \headerfile <x86intrin.h>
1280 ///
1281 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1282 ///
1283 /// \param __a
1284 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1285 /// used in the comparison.
1286 /// \param __b
1287 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1288 /// used in the comparison.
1289 /// \returns An integer containing the comparison results.
1290 static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomile_ss(__m128 __a,__m128 __b)1291 _mm_ucomile_ss(__m128 __a, __m128 __b)
1292 {
1293 return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
1294 }
1295
1296 /// Performs an unordered comparison of two 32-bit float values using
1297 /// the low-order bits of both operands to determine if the first operand is
1298 /// greater than the second operand.
1299 ///
1300 /// The comparison returns 0 for false, 1 for true. If either value in a
1301 /// comparison is NaN, returns 0.
1302 ///
1303 /// \headerfile <x86intrin.h>
1304 ///
1305 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1306 ///
1307 /// \param __a
1308 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1309 /// used in the comparison.
1310 /// \param __b
1311 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1312 /// used in the comparison.
1313 /// \returns An integer containing the comparison results.
1314 static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomigt_ss(__m128 __a,__m128 __b)1315 _mm_ucomigt_ss(__m128 __a, __m128 __b)
1316 {
1317 return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
1318 }
1319
1320 /// Performs an unordered comparison of two 32-bit float values using
1321 /// the low-order bits of both operands to determine if the first operand is
1322 /// greater than or equal to the second operand.
1323 ///
1324 /// The comparison returns 0 for false, 1 for true. If either value in a
1325 /// comparison is NaN, returns 0.
1326 ///
1327 /// \headerfile <x86intrin.h>
1328 ///
1329 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1330 ///
1331 /// \param __a
1332 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1333 /// used in the comparison.
1334 /// \param __b
1335 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1336 /// used in the comparison.
1337 /// \returns An integer containing the comparison results.
1338 static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomige_ss(__m128 __a,__m128 __b)1339 _mm_ucomige_ss(__m128 __a, __m128 __b)
1340 {
1341 return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
1342 }
1343
1344 /// Performs an unordered comparison of two 32-bit float values using
1345 /// the low-order bits of both operands to determine inequality.
1346 ///
1347 /// The comparison returns 0 for false, 1 for true. If either value in a
1348 /// comparison is NaN, returns 0.
1349 ///
1350 /// \headerfile <x86intrin.h>
1351 ///
1352 /// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
1353 ///
1354 /// \param __a
1355 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1356 /// used in the comparison.
1357 /// \param __b
1358 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1359 /// used in the comparison.
1360 /// \returns An integer containing the comparison results.
1361 static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomineq_ss(__m128 __a,__m128 __b)1362 _mm_ucomineq_ss(__m128 __a, __m128 __b)
1363 {
1364 return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
1365 }
1366
1367 /// Converts a float value contained in the lower 32 bits of a vector of
1368 /// [4 x float] into a 32-bit integer.
1369 ///
1370 /// If the converted value does not fit in a 32-bit integer, raises a
1371 /// floating-point invalid exception. If the exception is masked, returns
1372 /// the most negative integer.
1373 ///
1374 /// \headerfile <x86intrin.h>
1375 ///
1376 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1377 /// instructions.
1378 ///
1379 /// \param __a
1380 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1381 /// used in the conversion.
1382 /// \returns A 32-bit integer containing the converted value.
1383 static __inline__ int __DEFAULT_FN_ATTRS
_mm_cvtss_si32(__m128 __a)1384 _mm_cvtss_si32(__m128 __a)
1385 {
1386 return __builtin_ia32_cvtss2si((__v4sf)__a);
1387 }
1388
1389 /// Converts a float value contained in the lower 32 bits of a vector of
1390 /// [4 x float] into a 32-bit integer.
1391 ///
1392 /// If the converted value does not fit in a 32-bit integer, raises a
1393 /// floating-point invalid exception. If the exception is masked, returns
1394 /// the most negative integer.
1395 ///
1396 /// \headerfile <x86intrin.h>
1397 ///
1398 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1399 /// instructions.
1400 ///
1401 /// \param __a
1402 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1403 /// used in the conversion.
1404 /// \returns A 32-bit integer containing the converted value.
1405 static __inline__ int __DEFAULT_FN_ATTRS
_mm_cvt_ss2si(__m128 __a)1406 _mm_cvt_ss2si(__m128 __a)
1407 {
1408 return _mm_cvtss_si32(__a);
1409 }
1410
1411 #ifdef __x86_64__
1412
1413 /// Converts a float value contained in the lower 32 bits of a vector of
1414 /// [4 x float] into a 64-bit integer.
1415 ///
1416 /// If the converted value does not fit in a 32-bit integer, raises a
1417 /// floating-point invalid exception. If the exception is masked, returns
1418 /// the most negative integer.
1419 ///
1420 /// \headerfile <x86intrin.h>
1421 ///
1422 /// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
1423 /// instructions.
1424 ///
1425 /// \param __a
1426 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1427 /// used in the conversion.
1428 /// \returns A 64-bit integer containing the converted value.
1429 static __inline__ long long __DEFAULT_FN_ATTRS
_mm_cvtss_si64(__m128 __a)1430 _mm_cvtss_si64(__m128 __a)
1431 {
1432 return __builtin_ia32_cvtss2si64((__v4sf)__a);
1433 }
1434
1435 #endif
1436
1437 /// Converts two low-order float values in a 128-bit vector of
1438 /// [4 x float] into a 64-bit vector of [2 x i32].
1439 ///
1440 /// If a converted value does not fit in a 32-bit integer, raises a
1441 /// floating-point invalid exception. If the exception is masked, returns
1442 /// the most negative integer.
1443 ///
1444 /// \headerfile <x86intrin.h>
1445 ///
1446 /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1447 ///
1448 /// \param __a
1449 /// A 128-bit vector of [4 x float].
1450 /// \returns A 64-bit integer vector containing the converted values.
1451 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_cvtps_pi32(__m128 __a)1452 _mm_cvtps_pi32(__m128 __a)
1453 {
1454 return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
1455 }
1456
1457 /// Converts two low-order float values in a 128-bit vector of
1458 /// [4 x float] into a 64-bit vector of [2 x i32].
1459 ///
1460 /// If a converted value does not fit in a 32-bit integer, raises a
1461 /// floating-point invalid exception. If the exception is masked, returns
1462 /// the most negative integer.
1463 ///
1464 /// \headerfile <x86intrin.h>
1465 ///
1466 /// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
1467 ///
1468 /// \param __a
1469 /// A 128-bit vector of [4 x float].
1470 /// \returns A 64-bit integer vector containing the converted values.
1471 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_cvt_ps2pi(__m128 __a)1472 _mm_cvt_ps2pi(__m128 __a)
1473 {
1474 return _mm_cvtps_pi32(__a);
1475 }
1476
1477 /// Converts the lower (first) element of a vector of [4 x float] into a signed
1478 /// truncated (rounded toward zero) 32-bit integer.
1479 ///
1480 /// If the converted value does not fit in a 32-bit integer, raises a
1481 /// floating-point invalid exception. If the exception is masked, returns
1482 /// the most negative integer.
1483 ///
1484 /// \headerfile <x86intrin.h>
1485 ///
1486 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1487 /// instructions.
1488 ///
1489 /// \param __a
1490 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1491 /// used in the conversion.
1492 /// \returns A 32-bit integer containing the converted value.
1493 static __inline__ int __DEFAULT_FN_ATTRS
_mm_cvttss_si32(__m128 __a)1494 _mm_cvttss_si32(__m128 __a)
1495 {
1496 return __builtin_ia32_cvttss2si((__v4sf)__a);
1497 }
1498
1499 /// Converts the lower (first) element of a vector of [4 x float] into a signed
1500 /// truncated (rounded toward zero) 32-bit integer.
1501 ///
1502 /// If the converted value does not fit in a 32-bit integer, raises a
1503 /// floating-point invalid exception. If the exception is masked, returns
1504 /// the most negative integer.
1505 ///
1506 /// \headerfile <x86intrin.h>
1507 ///
1508 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1509 /// instructions.
1510 ///
1511 /// \param __a
1512 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1513 /// used in the conversion.
1514 /// \returns A 32-bit integer containing the converted value.
1515 static __inline__ int __DEFAULT_FN_ATTRS
_mm_cvtt_ss2si(__m128 __a)1516 _mm_cvtt_ss2si(__m128 __a)
1517 {
1518 return _mm_cvttss_si32(__a);
1519 }
1520
1521 #ifdef __x86_64__
1522 /// Converts the lower (first) element of a vector of [4 x float] into a signed
1523 /// truncated (rounded toward zero) 64-bit integer.
1524 ///
1525 /// If the converted value does not fit in a 64-bit integer, raises a
1526 /// floating-point invalid exception. If the exception is masked, returns
1527 /// the most negative integer.
1528 ///
1529 /// \headerfile <x86intrin.h>
1530 ///
1531 /// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
1532 /// instructions.
1533 ///
1534 /// \param __a
1535 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1536 /// used in the conversion.
1537 /// \returns A 64-bit integer containing the converted value.
1538 static __inline__ long long __DEFAULT_FN_ATTRS
_mm_cvttss_si64(__m128 __a)1539 _mm_cvttss_si64(__m128 __a)
1540 {
1541 return __builtin_ia32_cvttss2si64((__v4sf)__a);
1542 }
1543 #endif
1544
/// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
/// into two signed truncated (rounded toward zero) 32-bit integers,
/// returned in a 64-bit vector of [2 x i32].
///
/// If a converted value does not fit in a 32-bit integer, raises a
/// floating-point invalid exception. If the exception is masked, returns
/// the most negative integer.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float]. The lower two elements are used in the
/// conversion.
/// \returns A 64-bit integer vector containing the converted values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_cvttps_pi32(__m128 __a)
{
  return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
}
1566
/// Converts the lower (first) two elements of a 128-bit vector of [4 x float]
/// into two signed truncated (rounded toward zero) 32-bit integers,
/// returned in a 64-bit vector of [2 x i32].
///
/// If a converted value does not fit in a 32-bit integer, raises a
/// floating-point invalid exception. If the exception is masked, returns
/// the most negative integer.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float].
/// \returns A 64-bit integer vector containing the converted values.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_cvtt_ps2pi(__m128 __a)
{
  return _mm_cvttps_pi32(__a);
}
1587
1588 /// Converts a 32-bit signed integer value into a floating point value
1589 /// and writes it to the lower 32 bits of the destination. The remaining
1590 /// higher order elements of the destination vector are copied from the
1591 /// corresponding elements in the first operand.
1592 ///
1593 /// \headerfile <x86intrin.h>
1594 ///
1595 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1596 ///
1597 /// \param __a
1598 /// A 128-bit vector of [4 x float].
1599 /// \param __b
1600 /// A 32-bit signed integer operand containing the value to be converted.
1601 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1602 /// converted value of the second operand. The upper 96 bits are copied from
1603 /// the upper 96 bits of the first operand.
1604 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvtsi32_ss(__m128 __a,int __b)1605 _mm_cvtsi32_ss(__m128 __a, int __b)
1606 {
1607 __a[0] = __b;
1608 return __a;
1609 }
1610
1611 /// Converts a 32-bit signed integer value into a floating point value
1612 /// and writes it to the lower 32 bits of the destination. The remaining
1613 /// higher order elements of the destination are copied from the
1614 /// corresponding elements in the first operand.
1615 ///
1616 /// \headerfile <x86intrin.h>
1617 ///
1618 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1619 ///
1620 /// \param __a
1621 /// A 128-bit vector of [4 x float].
1622 /// \param __b
1623 /// A 32-bit signed integer operand containing the value to be converted.
1624 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1625 /// converted value of the second operand. The upper 96 bits are copied from
1626 /// the upper 96 bits of the first operand.
1627 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvt_si2ss(__m128 __a,int __b)1628 _mm_cvt_si2ss(__m128 __a, int __b)
1629 {
1630 return _mm_cvtsi32_ss(__a, __b);
1631 }
1632
1633 #ifdef __x86_64__
1634
1635 /// Converts a 64-bit signed integer value into a floating point value
1636 /// and writes it to the lower 32 bits of the destination. The remaining
1637 /// higher order elements of the destination are copied from the
1638 /// corresponding elements in the first operand.
1639 ///
1640 /// \headerfile <x86intrin.h>
1641 ///
1642 /// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
1643 ///
1644 /// \param __a
1645 /// A 128-bit vector of [4 x float].
1646 /// \param __b
1647 /// A 64-bit signed integer operand containing the value to be converted.
1648 /// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
1649 /// converted value of the second operand. The upper 96 bits are copied from
1650 /// the upper 96 bits of the first operand.
1651 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvtsi64_ss(__m128 __a,long long __b)1652 _mm_cvtsi64_ss(__m128 __a, long long __b)
1653 {
1654 __a[0] = __b;
1655 return __a;
1656 }
1657
1658 #endif
1659
1660 /// Converts two elements of a 64-bit vector of [2 x i32] into two
1661 /// floating point values and writes them to the lower 64-bits of the
1662 /// destination. The remaining higher order elements of the destination are
1663 /// copied from the corresponding elements in the first operand.
1664 ///
1665 /// \headerfile <x86intrin.h>
1666 ///
1667 /// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1668 ///
1669 /// \param __a
1670 /// A 128-bit vector of [4 x float].
1671 /// \param __b
1672 /// A 64-bit vector of [2 x i32]. The elements in this vector are converted
1673 /// and written to the corresponding low-order elements in the destination.
1674 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1675 /// converted value of the second operand. The upper 64 bits are copied from
1676 /// the upper 64 bits of the first operand.
1677 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
_mm_cvtpi32_ps(__m128 __a,__m64 __b)1678 _mm_cvtpi32_ps(__m128 __a, __m64 __b)
1679 {
1680 return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
1681 }
1682
1683 /// Converts two elements of a 64-bit vector of [2 x i32] into two
1684 /// floating point values and writes them to the lower 64-bits of the
1685 /// destination. The remaining higher order elements of the destination are
1686 /// copied from the corresponding elements in the first operand.
1687 ///
1688 /// \headerfile <x86intrin.h>
1689 ///
1690 /// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
1691 ///
1692 /// \param __a
1693 /// A 128-bit vector of [4 x float].
1694 /// \param __b
1695 /// A 64-bit vector of [2 x i32]. The elements in this vector are converted
1696 /// and written to the corresponding low-order elements in the destination.
1697 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1698 /// converted value from the second operand. The upper 64 bits are copied
1699 /// from the upper 64 bits of the first operand.
1700 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
_mm_cvt_pi2ps(__m128 __a,__m64 __b)1701 _mm_cvt_pi2ps(__m128 __a, __m64 __b)
1702 {
1703 return _mm_cvtpi32_ps(__a, __b);
1704 }
1705
1706 /// Extracts a float value contained in the lower 32 bits of a vector of
1707 /// [4 x float].
1708 ///
1709 /// \headerfile <x86intrin.h>
1710 ///
1711 /// This intrinsic has no corresponding instruction.
1712 ///
1713 /// \param __a
1714 /// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
1715 /// used in the extraction.
1716 /// \returns A 32-bit float containing the extracted value.
1717 static __inline__ float __DEFAULT_FN_ATTRS
_mm_cvtss_f32(__m128 __a)1718 _mm_cvtss_f32(__m128 __a)
1719 {
1720 return __a[0];
1721 }
1722
/// Loads two packed float values from the address \a __p into the
/// high-order bits of a 128-bit vector of [4 x float]. The low-order bits
/// are copied from the low-order bits of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
/// of the destination.
/// \param __p
/// A pointer to two packed float values. Bits [63:0] are written to bits
/// [127:64] of the destination.
/// \returns A 128-bit vector of [4 x float] containing the moved values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_loadh_pi(__m128 __a, const __m64 *__p)
{
  /* The packed, may_alias wrapper struct performs an 8-byte load with no
     alignment requirement and without violating strict aliasing. */
  typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
  struct __mm_loadh_pi_struct {
    __mm_loadh_pi_v2f32 __u;
  } __attribute__((__packed__, __may_alias__));
  __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
  /* Widen the 2-element vector to 4 elements, then combine __a's low half
     (indices 0,1) with the loaded pair (indices 4,5) as the high half. */
  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
  return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
}
1749
/// Loads two packed float values from the address \a __p into the
/// low-order bits of a 128-bit vector of [4 x float]. The high-order bits
/// are copied from the high-order bits of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
/// [127:64] of the destination.
/// \param __p
/// A pointer to two packed float values. Bits [63:0] are written to bits
/// [63:0] of the destination.
/// \returns A 128-bit vector of [4 x float] containing the moved values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_loadl_pi(__m128 __a, const __m64 *__p)
{
  /* The packed, may_alias wrapper struct performs an 8-byte load with no
     alignment requirement and without violating strict aliasing. */
  typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
  struct __mm_loadl_pi_struct {
    __mm_loadl_pi_v2f32 __u;
  } __attribute__((__packed__, __may_alias__));
  __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
  /* Widen the 2-element vector to 4 elements, then take the loaded pair
     (indices 4,5) as the low half and __a's high half (indices 2,3). */
  __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
  return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
}
1776
/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
/// 32 bits of the vector are initialized with the single-precision
/// floating-point value loaded from a specified memory location. The upper
/// 96 bits are set to zero.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
///
/// \param __p
/// A pointer to a 32-bit memory location containing a single-precision
/// floating-point value. The address does not need to be aligned.
/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
/// lower 32 bits contain the value loaded from the memory location. The
/// upper 96 bits are set to zero.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_load_ss(const float *__p)
{
  /* Packed, may_alias struct: unaligned, aliasing-safe 4-byte load. */
  struct __mm_load_ss_struct {
    float __u;
  } __attribute__((__packed__, __may_alias__));
  float __u = ((const struct __mm_load_ss_struct*)__p)->__u;
  return __extension__ (__m128){ __u, 0, 0, 0 };
}
1801
/// Loads a 32-bit float value and duplicates it to all four vector
/// elements of a 128-bit vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c>
/// instruction.
///
/// \param __p
/// A pointer to a float value to be loaded and duplicated. The address
/// does not need to be aligned.
/// \returns A 128-bit vector of [4 x float] containing the loaded and
/// duplicated values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_load1_ps(const float *__p)
{
  /* Packed, may_alias struct: unaligned, aliasing-safe 4-byte load. */
  struct __mm_load1_ps_struct {
    float __u;
  } __attribute__((__packed__, __may_alias__));
  float __u = ((const struct __mm_load1_ps_struct*)__p)->__u;
  return __extension__ (__m128){ __u, __u, __u, __u };
}

/// Alias of _mm_load1_ps: loads one float and replicates it to all four
/// elements of the result.
#define _mm_load_ps1(p) _mm_load1_ps(p)
1825
1826 /// Loads a 128-bit floating-point vector of [4 x float] from an aligned
1827 /// memory location.
1828 ///
1829 /// \headerfile <x86intrin.h>
1830 ///
1831 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
1832 ///
1833 /// \param __p
1834 /// A pointer to a 128-bit memory location. The address of the memory
1835 /// location has to be 128-bit aligned.
1836 /// \returns A 128-bit vector of [4 x float] containing the loaded values.
1837 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_load_ps(const float * __p)1838 _mm_load_ps(const float *__p)
1839 {
1840 return *(const __m128*)__p;
1841 }
1842
/// Loads a 128-bit floating-point vector of [4 x float] from an
/// unaligned memory location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
///
/// \param __p
/// A pointer to a 128-bit memory location. The address of the memory
/// location does not have to be aligned.
/// \returns A 128-bit vector of [4 x float] containing the loaded values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_loadu_ps(const float *__p)
{
  /* __m128_u is the 1-byte-aligned variant of __m128; wrapping it in a
     packed, may_alias struct makes the unaligned load aliasing-safe. */
  struct __loadu_ps {
    __m128_u __v;
  } __attribute__((__packed__, __may_alias__));
  return ((const struct __loadu_ps*)__p)->__v;
}
1862
1863 /// Loads four packed float values, in reverse order, from an aligned
1864 /// memory location to 32-bit elements in a 128-bit vector of [4 x float].
1865 ///
1866 /// \headerfile <x86intrin.h>
1867 ///
1868 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
1869 /// instruction.
1870 ///
1871 /// \param __p
1872 /// A pointer to a 128-bit memory location. The address of the memory
1873 /// location has to be 128-bit aligned.
1874 /// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
1875 /// in reverse order.
1876 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_loadr_ps(const float * __p)1877 _mm_loadr_ps(const float *__p)
1878 {
1879 __m128 __a = _mm_load_ps(__p);
1880 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
1881 }
1882
/// Create a 128-bit vector of [4 x float] with undefined values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic has no corresponding instruction.
///
/// \returns A 128-bit vector of [4 x float] containing undefined values.
///    The contents must not be relied upon.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_undefined_ps(void)
{
  return (__m128)__builtin_ia32_undef128();
}
1895
1896 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
1897 /// 32 bits of the vector are initialized with the specified single-precision
1898 /// floating-point value. The upper 96 bits are set to zero.
1899 ///
1900 /// \headerfile <x86intrin.h>
1901 ///
1902 /// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
1903 ///
1904 /// \param __w
1905 /// A single-precision floating-point value used to initialize the lower 32
1906 /// bits of the result.
1907 /// \returns An initialized 128-bit floating-point vector of [4 x float]. The
1908 /// lower 32 bits contain the value provided in the source operand. The
1909 /// upper 96 bits are set to zero.
1910 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_set_ss(float __w)1911 _mm_set_ss(float __w)
1912 {
1913 return __extension__ (__m128){ __w, 0.0f, 0.0f, 0.0f };
1914 }
1915
1916 /// Constructs a 128-bit floating-point vector of [4 x float], with each
1917 /// of the four single-precision floating-point vector elements set to the
1918 /// specified single-precision floating-point value.
1919 ///
1920 /// \headerfile <x86intrin.h>
1921 ///
1922 /// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1923 ///
1924 /// \param __w
1925 /// A single-precision floating-point value used to initialize each vector
1926 /// element of the result.
1927 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1928 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_set1_ps(float __w)1929 _mm_set1_ps(float __w)
1930 {
1931 return __extension__ (__m128){ __w, __w, __w, __w };
1932 }
1933
1934 /* Microsoft specific. */
1935 /// Constructs a 128-bit floating-point vector of [4 x float], with each
1936 /// of the four single-precision floating-point vector elements set to the
1937 /// specified single-precision floating-point value.
1938 ///
1939 /// \headerfile <x86intrin.h>
1940 ///
1941 /// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
1942 ///
1943 /// \param __w
1944 /// A single-precision floating-point value used to initialize each vector
1945 /// element of the result.
1946 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1947 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_set_ps1(float __w)1948 _mm_set_ps1(float __w)
1949 {
1950 return _mm_set1_ps(__w);
1951 }
1952
1953 /// Constructs a 128-bit floating-point vector of [4 x float]
1954 /// initialized with the specified single-precision floating-point values.
1955 ///
1956 /// \headerfile <x86intrin.h>
1957 ///
1958 /// This intrinsic is a utility function and does not correspond to a specific
1959 /// instruction.
1960 ///
1961 /// \param __z
1962 /// A single-precision floating-point value used to initialize bits [127:96]
1963 /// of the result.
1964 /// \param __y
1965 /// A single-precision floating-point value used to initialize bits [95:64]
1966 /// of the result.
1967 /// \param __x
1968 /// A single-precision floating-point value used to initialize bits [63:32]
1969 /// of the result.
1970 /// \param __w
1971 /// A single-precision floating-point value used to initialize bits [31:0]
1972 /// of the result.
1973 /// \returns An initialized 128-bit floating-point vector of [4 x float].
1974 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_set_ps(float __z,float __y,float __x,float __w)1975 _mm_set_ps(float __z, float __y, float __x, float __w)
1976 {
1977 return __extension__ (__m128){ __w, __x, __y, __z };
1978 }
1979
1980 /// Constructs a 128-bit floating-point vector of [4 x float],
1981 /// initialized in reverse order with the specified 32-bit single-precision
1982 /// float-point values.
1983 ///
1984 /// \headerfile <x86intrin.h>
1985 ///
1986 /// This intrinsic is a utility function and does not correspond to a specific
1987 /// instruction.
1988 ///
1989 /// \param __z
1990 /// A single-precision floating-point value used to initialize bits [31:0]
1991 /// of the result.
1992 /// \param __y
1993 /// A single-precision floating-point value used to initialize bits [63:32]
1994 /// of the result.
1995 /// \param __x
1996 /// A single-precision floating-point value used to initialize bits [95:64]
1997 /// of the result.
1998 /// \param __w
1999 /// A single-precision floating-point value used to initialize bits [127:96]
2000 /// of the result.
2001 /// \returns An initialized 128-bit floating-point vector of [4 x float].
2002 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_setr_ps(float __z,float __y,float __x,float __w)2003 _mm_setr_ps(float __z, float __y, float __x, float __w)
2004 {
2005 return __extension__ (__m128){ __z, __y, __x, __w };
2006 }
2007
2008 /// Constructs a 128-bit floating-point vector of [4 x float] initialized
2009 /// to zero.
2010 ///
2011 /// \headerfile <x86intrin.h>
2012 ///
2013 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
2014 ///
2015 /// \returns An initialized 128-bit floating-point vector of [4 x float] with
2016 /// all elements set to zero.
2017 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_setzero_ps(void)2018 _mm_setzero_ps(void)
2019 {
2020 return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
2021 }
2022
/// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
/// memory location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
///
/// \param __p
/// A pointer to a 64-bit memory location. The address does not need to be
/// aligned.
/// \param __a
/// A 128-bit vector of [4 x float] containing the values to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeh_pi(__m64 *__p, __m128 __a)
{
  /* The packed, may_alias wrapper struct performs an 8-byte store with no
     alignment requirement and without violating strict aliasing. */
  typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
  struct __mm_storeh_pi_struct {
    __mm_storeh_pi_v2f32 __u;
  } __attribute__((__packed__, __may_alias__));
  /* Elements 2 and 3 are the upper 64 bits of __a. */
  ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);
}
2043
2044 /// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
2045 /// memory location.
2046 ///
2047 /// \headerfile <x86intrin.h>
2048 ///
2049 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
2050 ///
2051 /// \param __p
2052 /// A pointer to a memory location that will receive the float values.
2053 /// \param __a
2054 /// A 128-bit vector of [4 x float] containing the values to be stored.
2055 static __inline__ void __DEFAULT_FN_ATTRS
_mm_storel_pi(__m64 * __p,__m128 __a)2056 _mm_storel_pi(__m64 *__p, __m128 __a)
2057 {
2058 typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
2059 struct __mm_storeh_pi_struct {
2060 __mm_storeh_pi_v2f32 __u;
2061 } __attribute__((__packed__, __may_alias__));
2062 ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);
2063 }
2064
/// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
/// memory location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
///
/// \param __p
/// A pointer to a 32-bit memory location. The address does not need to be
/// aligned.
/// \param __a
/// A 128-bit vector of [4 x float] containing the value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_ss(float *__p, __m128 __a)
{
  /* Packed, may_alias struct: unaligned, aliasing-safe 4-byte store. */
  struct __mm_store_ss_struct {
    float __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
}
2084
/// Stores a 128-bit vector of [4 x float] to an unaligned memory
/// location.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
///
/// \param __p
/// A pointer to a 128-bit memory location. The address of the memory
/// location does not have to be aligned.
/// \param __a
/// A 128-bit vector of [4 x float] containing the values to be stored.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeu_ps(float *__p, __m128 __a)
{
  /* __m128_u is the 1-byte-aligned variant of __m128; wrapping it in a
     packed, may_alias struct makes the unaligned store aliasing-safe. */
  struct __storeu_ps {
    __m128_u __v;
  } __attribute__((__packed__, __may_alias__));
  ((struct __storeu_ps*)__p)->__v = __a;
}
2105
2106 /// Stores a 128-bit vector of [4 x float] into an aligned memory
2107 /// location.
2108 ///
2109 /// \headerfile <x86intrin.h>
2110 ///
2111 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
2112 ///
2113 /// \param __p
2114 /// A pointer to a 128-bit memory location. The address of the memory
2115 /// location has to be 16-byte aligned.
2116 /// \param __a
2117 /// A 128-bit vector of [4 x float] containing the values to be stored.
2118 static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_ps(float * __p,__m128 __a)2119 _mm_store_ps(float *__p, __m128 __a)
2120 {
2121 *(__m128*)__p = __a;
2122 }
2123
2124 /// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
2125 /// four contiguous elements in an aligned memory location.
2126 ///
2127 /// \headerfile <x86intrin.h>
2128 ///
2129 /// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
2130 /// instruction.
2131 ///
2132 /// \param __p
2133 /// A pointer to a 128-bit memory location.
2134 /// \param __a
2135 /// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
2136 /// of the four contiguous elements pointed by \a __p.
2137 static __inline__ void __DEFAULT_FN_ATTRS
_mm_store1_ps(float * __p,__m128 __a)2138 _mm_store1_ps(float *__p, __m128 __a)
2139 {
2140 __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
2141 _mm_store_ps(__p, __a);
2142 }
2143
/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
/// four contiguous elements in an aligned memory location. Alias of
/// _mm_store1_ps.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
/// instruction.
///
/// \param __p
/// A pointer to a 128-bit memory location.
/// \param __a
/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
/// of the four contiguous elements pointed by \a __p.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_ps1(float *__p, __m128 __a)
{
  _mm_store1_ps(__p, __a);
}
2162
2163 /// Stores float values from a 128-bit vector of [4 x float] to an
2164 /// aligned memory location in reverse order.
2165 ///
2166 /// \headerfile <x86intrin.h>
2167 ///
2168 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
2169 /// instruction.
2170 ///
2171 /// \param __p
2172 /// A pointer to a 128-bit memory location. The address of the memory
2173 /// location has to be 128-bit aligned.
2174 /// \param __a
2175 /// A 128-bit vector of [4 x float] containing the values to be stored.
2176 static __inline__ void __DEFAULT_FN_ATTRS
_mm_storer_ps(float * __p,__m128 __a)2177 _mm_storer_ps(float *__p, __m128 __a)
2178 {
2179 __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
2180 _mm_store_ps(__p, __a);
2181 }
2182
/* Cache-locality hint constants for _mm_prefetch. Bit 2 distinguishes the
   ET* ("with intent to write") hints from the plain T*/NTA hints. */
#define _MM_HINT_ET0 7
#define _MM_HINT_ET1 6
#define _MM_HINT_T0  3
#define _MM_HINT_T1  2
#define _MM_HINT_T2  1
#define _MM_HINT_NTA 0
2189
#ifndef _MSC_VER
/* FIXME: We have to #define this because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

/// Loads one cache line of data from the specified address to a location
/// closer to the processor.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// void _mm_prefetch(const void *a, const int sel);
/// \endcode
///
/// This intrinsic corresponds to the <c> PREFETCHNTA, PREFETCHT0,
/// PREFETCHT1, </c> or <c> PREFETCHT2 </c> instruction, as selected by
/// \a sel.
///
/// \param a
/// A pointer to a memory location containing a cache line of data.
/// \param sel
/// A predefined integer constant specifying the type of prefetch
/// operation: \n
/// _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
/// PREFETCHNTA instruction will be generated. \n
/// _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
/// be generated. \n
/// _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
/// be generated. \n
/// _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
/// be generated. \n
/// NOTE(review): _MM_HINT_ET0/_MM_HINT_ET1 set bit 2, which this macro
/// passes as __builtin_prefetch's read/write argument (a write prefetch);
/// confirm the intended instruction mapping for the ET hints.
#define _mm_prefetch(a, sel) (__builtin_prefetch((const void *)(a), \
                                                 ((sel) >> 2) & 1, (sel) & 0x3))
#endif
2221
/// Stores a 64-bit integer in the specified aligned memory location. To
/// minimize caching, the data is flagged as non-temporal (unlikely to be
/// used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
///
/// \param __p
/// A pointer to an aligned memory location used to store the register value.
/// \param __a
/// A 64-bit integer containing the value to be stored.
static __inline__ void __DEFAULT_FN_ATTRS_MMX
_mm_stream_pi(void *__p, __m64 __a)
{
  __builtin_ia32_movntq((__m64 *)__p, __a);
}
2239
/// Moves packed float values from a 128-bit vector of [4 x float] to a
/// 128-bit aligned memory location. To minimize caching, the data is flagged
/// as non-temporal (unlikely to be used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
///
/// \param __p
/// A pointer to a 128-bit aligned memory location that will receive the
/// single-precision floating-point values.
/// \param __a
/// A 128-bit vector of [4 x float] containing the values to be moved.
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_ps(void *__p, __m128 __a)
{
  /* __builtin_nontemporal_store emits the streaming (non-temporal) form. */
  __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
}
2258
#if defined(__cplusplus)
extern "C" {
#endif

/// Forces strong memory ordering (serialization) between store
/// instructions preceding this instruction and store instructions following
/// this instruction, ensuring the system completes all previous stores
/// before executing subsequent stores.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
///
/* Declared only; presumably resolved as a compiler builtin — no definition
   appears in this header. */
void _mm_sfence(void);

#if defined(__cplusplus)
} // extern "C"
#endif
2277
/// Extracts 16-bit element from a 64-bit vector of [4 x i16] and
/// returns it, as specified by the immediate integer operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_pi16(__m64 a, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
///
/// \param a
/// A 64-bit vector of [4 x i16].
/// \param n
/// An immediate integer operand that determines which bits are extracted: \n
/// 0: Bits [15:0] are copied to the destination. \n
/// 1: Bits [31:16] are copied to the destination. \n
/// 2: Bits [47:32] are copied to the destination. \n
/// 3: Bits [63:48] are copied to the destination.
/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
/* Macro arguments are parenthesized in the expansion so expressions such
   as casts or arithmetic passed as 'a'/'n' bind as the caller intends. */
#define _mm_extract_pi16(a, n) \
  ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)(a), (int)(n)))
2300
/// Copies data from the 64-bit vector of [4 x i16] to the destination,
/// and inserts the lower 16-bits of an integer operand at the 16-bit offset
/// specified by the immediate operand \a n.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
/// \endcode
///
/// This intrinsic corresponds to the <c> PINSRW </c> instruction.
///
/// \param a
/// A 64-bit vector of [4 x i16].
/// \param d
/// An integer. The lower 16-bit value from this operand is written to the
/// destination at the offset specified by operand \a n.
/// \param n
/// An immediate integer operand that determines which bits are to be used
/// in the destination. \n
/// 0: Bits [15:0] are copied to the destination. \n
/// 1: Bits [31:16] are copied to the destination. \n
/// 2: Bits [47:32] are copied to the destination. \n
/// 3: Bits [63:48] are copied to the destination. \n
/// The remaining bits in the destination are copied from the corresponding
/// bits in operand \a a.
/// \returns A 64-bit integer vector containing the copied packed data from the
/// operands.
/* Macro arguments are parenthesized in the expansion so expressions such
   as casts or arithmetic passed as arguments bind as the caller intends. */
#define _mm_insert_pi16(a, d, n) \
  ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)(a), (int)(d), (int)(n)))
2331
/// Compares each of the corresponding packed signed 16-bit integer values
/// of the 64-bit integer vectors, and writes the greater value to the
/// corresponding bits in the destination.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
///
/// \param __a
/// A 64-bit integer vector containing one of the source operands.
/// \param __b
/// A 64-bit integer vector containing one of the source operands.
/// \returns A 64-bit integer vector containing the per-element maxima.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_max_pi16(__m64 __a, __m64 __b)
{
  return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
}
2350
2351 /// Compares each of the corresponding packed 8-bit unsigned integer
2352 /// values of the 64-bit integer vectors, and writes the greater value to the
2353 /// corresponding bits in the destination.
2354 ///
2355 /// \headerfile <x86intrin.h>
2356 ///
2357 /// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
2358 ///
2359 /// \param __a
2360 /// A 64-bit integer vector containing one of the source operands.
2361 /// \param __b
2362 /// A 64-bit integer vector containing one of the source operands.
2363 /// \returns A 64-bit integer vector containing the comparison results.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_max_pu8(__m64 __a, __m64 __b)
{
  /* Elementwise unsigned 8-bit maximum, lowered directly to PMAXUB. */
  return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
}
2369
2370 /// Compares each of the corresponding packed 16-bit integer values of
2371 /// the 64-bit integer vectors, and writes the lesser value to the
2372 /// corresponding bits in the destination.
2373 ///
2374 /// \headerfile <x86intrin.h>
2375 ///
2376 /// This intrinsic corresponds to the <c> PMINSW </c> instruction.
2377 ///
2378 /// \param __a
2379 /// A 64-bit integer vector containing one of the source operands.
2380 /// \param __b
2381 /// A 64-bit integer vector containing one of the source operands.
2382 /// \returns A 64-bit integer vector containing the comparison results.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_min_pi16(__m64 __a, __m64 __b)
{
  /* Elementwise signed 16-bit minimum, lowered directly to PMINSW. */
  return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
}
2388
2389 /// Compares each of the corresponding packed 8-bit unsigned integer
2390 /// values of the 64-bit integer vectors, and writes the lesser value to the
2391 /// corresponding bits in the destination.
2392 ///
2393 /// \headerfile <x86intrin.h>
2394 ///
2395 /// This intrinsic corresponds to the <c> PMINUB </c> instruction.
2396 ///
2397 /// \param __a
2398 /// A 64-bit integer vector containing one of the source operands.
2399 /// \param __b
2400 /// A 64-bit integer vector containing one of the source operands.
2401 /// \returns A 64-bit integer vector containing the comparison results.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_min_pu8(__m64 __a, __m64 __b)
{
  /* Elementwise unsigned 8-bit minimum, lowered directly to PMINUB. */
  return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
}
2407
2408 /// Takes the most significant bit from each 8-bit element in a 64-bit
2409 /// integer vector to create an 8-bit mask value. Zero-extends the value to
2410 /// 32-bit integer and writes it to the destination.
2411 ///
2412 /// \headerfile <x86intrin.h>
2413 ///
2414 /// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
2415 ///
2416 /// \param __a
2417 /// A 64-bit integer vector containing the values with bits to be extracted.
2418 /// \returns The most significant bit from each 8-bit element in \a __a,
2419 /// written to bits [7:0].
static __inline__ int __DEFAULT_FN_ATTRS_MMX
_mm_movemask_pi8(__m64 __a)
{
  /* PMOVMSKB: gather the eight byte sign bits into bits [7:0] of a GPR. */
  return __builtin_ia32_pmovmskb((__v8qi)__a);
}
2425
2426 /// Multiplies packed 16-bit unsigned integer values and writes the
2427 /// high-order 16 bits of each 32-bit product to the corresponding bits in
2428 /// the destination.
2429 ///
2430 /// \headerfile <x86intrin.h>
2431 ///
2432 /// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
2433 ///
2434 /// \param __a
2435 /// A 64-bit integer vector containing one of the source operands.
2436 /// \param __b
2437 /// A 64-bit integer vector containing one of the source operands.
2438 /// \returns A 64-bit integer vector containing the products of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_mulhi_pu16(__m64 __a, __m64 __b)
{
  /* PMULHUW: high 16 bits of each unsigned 16x16 -> 32-bit product. */
  return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
}
2444
2445 /// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
2446 /// destination, as specified by the immediate value operand.
2447 ///
2448 /// \headerfile <x86intrin.h>
2449 ///
2450 /// \code
2451 /// __m64 _mm_shuffle_pi16(__m64 a, const int n);
2452 /// \endcode
2453 ///
2454 /// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
2455 ///
2456 /// \param a
2457 /// A 64-bit integer vector containing the values to be shuffled.
2458 /// \param n
2459 /// An immediate value containing an 8-bit value specifying which elements to
2460 /// copy from \a a. The destinations within the 64-bit destination are
2461 /// assigned values as follows: \n
2462 /// Bits [1:0] are used to assign values to bits [15:0] in the
2463 /// destination. \n
2464 /// Bits [3:2] are used to assign values to bits [31:16] in the
2465 /// destination. \n
2466 /// Bits [5:4] are used to assign values to bits [47:32] in the
2467 /// destination. \n
2468 /// Bits [7:6] are used to assign values to bits [63:48] in the
2469 /// destination. \n
2470 /// Bit value assignments: \n
2471 /// 00: assigned from bits [15:0] of \a a. \n
2472 /// 01: assigned from bits [31:16] of \a a. \n
2473 /// 10: assigned from bits [47:32] of \a a. \n
2474 /// 11: assigned from bits [63:48] of \a a. \n
2475 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
2476 /// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
2477 /// <c>[b6, b4, b2, b0]</c>.
2478 /// \returns A 64-bit integer vector containing the shuffled values.
/* Implemented as a macro: PSHUFW requires the shuffle selector \a n to be an
   integer constant expression. */
#define _mm_shuffle_pi16(a, n) \
  ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
2481
2482 /// Conditionally copies the values from each 8-bit element in the first
2483 /// 64-bit integer vector operand to the specified memory location, as
2484 /// specified by the most significant bit in the corresponding element in the
2485 /// second 64-bit integer vector operand.
2486 ///
2487 /// To minimize caching, the data is flagged as non-temporal
2488 /// (unlikely to be used again soon).
2489 ///
2490 /// \headerfile <x86intrin.h>
2491 ///
2492 /// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
2493 ///
2494 /// \param __d
2495 /// A 64-bit integer vector containing the values with elements to be copied.
2496 /// \param __n
2497 /// A 64-bit integer vector operand. The most significant bit from each 8-bit
2498 /// element determines whether the corresponding element in operand \a __d
2499 /// is copied. If the most significant bit of a given element is 1, the
2500 /// corresponding element in operand \a __d is copied.
2501 /// \param __p
2502 /// A pointer to a 64-bit memory location that will receive the conditionally
2503 /// copied integer values. The address of the memory location does not have
2504 /// to be aligned.
static __inline__ void __DEFAULT_FN_ATTRS_MMX
_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
{
  /* MASKMOVQ: non-temporal, byte-masked store of __d to *__p. A byte is
     written only when the top bit of the corresponding lane of __n is set. */
  __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
}
2510
2511 /// Computes the rounded averages of the packed unsigned 8-bit integer
2512 /// values and writes the averages to the corresponding bits in the
2513 /// destination.
2514 ///
2515 /// \headerfile <x86intrin.h>
2516 ///
2517 /// This intrinsic corresponds to the <c> PAVGB </c> instruction.
2518 ///
2519 /// \param __a
2520 /// A 64-bit integer vector containing one of the source operands.
2521 /// \param __b
2522 /// A 64-bit integer vector containing one of the source operands.
2523 /// \returns A 64-bit integer vector containing the averages of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_avg_pu8(__m64 __a, __m64 __b)
{
  /* PAVGB: elementwise unsigned 8-bit rounded average. */
  return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
}
2529
2530 /// Computes the rounded averages of the packed unsigned 16-bit integer
2531 /// values and writes the averages to the corresponding bits in the
2532 /// destination.
2533 ///
2534 /// \headerfile <x86intrin.h>
2535 ///
2536 /// This intrinsic corresponds to the <c> PAVGW </c> instruction.
2537 ///
2538 /// \param __a
2539 /// A 64-bit integer vector containing one of the source operands.
2540 /// \param __b
2541 /// A 64-bit integer vector containing one of the source operands.
2542 /// \returns A 64-bit integer vector containing the averages of both operands.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_avg_pu16(__m64 __a, __m64 __b)
{
  /* PAVGW: elementwise unsigned 16-bit rounded average. */
  return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
}
2548
2549 /// Subtracts the corresponding 8-bit unsigned integer values of the two
///    64-bit vector operands and computes the absolute value for each of the
///    differences. Then the sum of the 8 absolute differences is written to
///    the bits [15:0] of the destination; the remaining bits [63:16] are cleared.
2553 ///
2554 /// \headerfile <x86intrin.h>
2555 ///
2556 /// This intrinsic corresponds to the <c> PSADBW </c> instruction.
2557 ///
2558 /// \param __a
2559 /// A 64-bit integer vector containing one of the source operands.
2560 /// \param __b
2561 /// A 64-bit integer vector containing one of the source operands.
2562 /// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
2563 /// sets of absolute differences between both operands. The upper bits are
2564 /// cleared.
static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_sad_pu8(__m64 __a, __m64 __b)
{
  /* PSADBW: sum of the 8 absolute byte differences in bits [15:0];
     bits [63:16] of the result are zero. */
  return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
}
2570
2571 #if defined(__cplusplus)
2572 extern "C" {
2573 #endif
2574
2575 /// Returns the contents of the MXCSR register as a 32-bit unsigned
2576 /// integer value.
2577 ///
2578 /// There are several groups of macros associated with this
2579 /// intrinsic, including:
2580 /// <ul>
2581 /// <li>
2582 /// For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
2583 /// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
2584 /// _MM_EXCEPT_INEXACT. There is a convenience wrapper
2585 /// _MM_GET_EXCEPTION_STATE().
2586 /// </li>
2587 /// <li>
2588 /// For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
2589 /// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
2590 /// There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
2591 /// </li>
2592 /// <li>
2593 /// For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
2594 /// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
2595 /// _MM_GET_ROUNDING_MODE().
2596 /// </li>
2597 /// <li>
2598 /// For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
2599 /// There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
2600 /// </li>
2601 /// <li>
2602 /// For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
2603 /// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
2604 /// _MM_GET_DENORMALS_ZERO_MODE().
2605 /// </li>
2606 /// </ul>
2607 ///
2608 /// For example, the following expression checks if an overflow exception has
2609 /// occurred:
2610 /// \code
2611 /// ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
2612 /// \endcode
2613 ///
2614 /// The following expression gets the current rounding mode:
2615 /// \code
2616 /// _MM_GET_ROUNDING_MODE()
2617 /// \endcode
2618 ///
2619 /// \headerfile <x86intrin.h>
2620 ///
2621 /// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
2622 ///
2623 /// \returns A 32-bit unsigned integer containing the contents of the MXCSR
2624 /// register.
2625 unsigned int _mm_getcsr(void);
2626
2627 /// Sets the MXCSR register with the 32-bit unsigned integer value.
2628 ///
2629 /// There are several groups of macros associated with this intrinsic,
2630 /// including:
2631 /// <ul>
2632 /// <li>
2633 /// For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
2634 /// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
2635 /// _MM_EXCEPT_INEXACT. There is a convenience wrapper
2636 /// _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
2637 /// </li>
2638 /// <li>
2639 /// For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
2640 /// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
2641 /// There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
2642 /// of these macros.
2643 /// </li>
2644 /// <li>
2645 /// For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
2646 /// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
2647 /// _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
2648 /// </li>
2649 /// <li>
2650 /// For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
2651 /// There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
2652 /// one of these macros.
2653 /// </li>
2654 /// <li>
2655 /// For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
2656 /// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
2657 /// _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
2658 /// </li>
2659 /// </ul>
2660 ///
2661 /// For example, the following expression causes subsequent floating-point
2662 /// operations to round up:
2663 /// _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP)
2664 ///
2665 /// The following example sets the DAZ and FTZ flags:
2666 /// \code
2667 /// void setFlags() {
2668 /// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
2669 /// _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
2670 /// }
2671 /// \endcode
2672 ///
2673 /// \headerfile <x86intrin.h>
2674 ///
2675 /// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
2676 ///
2677 /// \param __i
2678 /// A 32-bit unsigned integer value to be written to the MXCSR register.
2679 void _mm_setcsr(unsigned int __i);
2680
2681 #if defined(__cplusplus)
2682 } // extern "C"
2683 #endif
2684
2685 /// Selects 4 float values from the 128-bit operands of [4 x float], as
2686 /// specified by the immediate value operand.
2687 ///
2688 /// \headerfile <x86intrin.h>
2689 ///
2690 /// \code
2691 /// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
2692 /// \endcode
2693 ///
2694 /// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
2695 ///
2696 /// \param a
2697 /// A 128-bit vector of [4 x float].
2698 /// \param b
2699 /// A 128-bit vector of [4 x float].
2700 /// \param mask
2701 /// An immediate value containing an 8-bit value specifying which elements to
2702 /// copy from \a a and \a b. \n
2703 /// Bits [3:0] specify the values copied from operand \a a. \n
2704 /// Bits [7:4] specify the values copied from operand \a b. \n
2705 /// The destinations within the 128-bit destination are assigned values as
2706 /// follows: \n
2707 /// Bits [1:0] are used to assign values to bits [31:0] in the
2708 /// destination. \n
2709 /// Bits [3:2] are used to assign values to bits [63:32] in the
2710 /// destination. \n
2711 /// Bits [5:4] are used to assign values to bits [95:64] in the
2712 /// destination. \n
2713 /// Bits [7:6] are used to assign values to bits [127:96] in the
2714 /// destination. \n
2715 /// Bit value assignments: \n
2716 /// 00: Bits [31:0] copied from the specified operand. \n
2717 /// 01: Bits [63:32] copied from the specified operand. \n
2718 /// 10: Bits [95:64] copied from the specified operand. \n
2719 /// 11: Bits [127:96] copied from the specified operand. \n
2720 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
2721 /// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
2722 /// <c>[b6, b4, b2, b0]</c>.
2723 /// \returns A 128-bit vector of [4 x float] containing the shuffled values.
/* Implemented as a macro: SHUFPS requires the selector \a mask to be an
   integer constant expression. */
#define _mm_shuffle_ps(a, b, mask) \
  ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
                                 (int)(mask)))
2727
2728 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
2729 /// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2730 ///
2731 /// \headerfile <x86intrin.h>
2732 ///
2733 /// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
2734 ///
2735 /// \param __a
2736 /// A 128-bit vector of [4 x float]. \n
2737 /// Bits [95:64] are written to bits [31:0] of the destination. \n
2738 /// Bits [127:96] are written to bits [95:64] of the destination.
2739 /// \param __b
2740 /// A 128-bit vector of [4 x float].
2741 /// Bits [95:64] are written to bits [63:32] of the destination. \n
2742 /// Bits [127:96] are written to bits [127:96] of the destination.
2743 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_unpackhi_ps(__m128 __a, __m128 __b)
{
  /* Interleave the upper halves: result = {a2, b2, a3, b3}
     (indices 4-7 select lanes of __b). */
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
}
2749
2750 /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
2751 /// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
2752 ///
2753 /// \headerfile <x86intrin.h>
2754 ///
2755 /// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
2756 ///
2757 /// \param __a
2758 /// A 128-bit vector of [4 x float]. \n
2759 /// Bits [31:0] are written to bits [31:0] of the destination. \n
2760 /// Bits [63:32] are written to bits [95:64] of the destination.
2761 /// \param __b
2762 /// A 128-bit vector of [4 x float]. \n
2763 /// Bits [31:0] are written to bits [63:32] of the destination. \n
2764 /// Bits [63:32] are written to bits [127:96] of the destination.
2765 /// \returns A 128-bit vector of [4 x float] containing the interleaved values.
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_unpacklo_ps(__m128 __a, __m128 __b)
{
  /* Interleave the lower halves: result = {a0, b0, a1, b1}
     (indices 4-7 select lanes of __b). */
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
}
2771
2772 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2773 /// 32 bits are set to the lower 32 bits of the second parameter. The upper
2774 /// 96 bits are set to the upper 96 bits of the first parameter.
2775 ///
2776 /// \headerfile <x86intrin.h>
2777 ///
2778 /// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c>
2779 /// instruction.
2780 ///
2781 /// \param __a
2782 /// A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
2783 /// written to the upper 96 bits of the result.
2784 /// \param __b
2785 /// A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
2786 /// written to the lower 32 bits of the result.
2787 /// \returns A 128-bit floating-point vector of [4 x float].
2788 static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_move_ss(__m128 __a,__m128 __b)2789 _mm_move_ss(__m128 __a, __m128 __b)
2790 {
2791 __a[0] = __b[0];
2792 return __a;
2793 }
2794
2795 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2796 /// 64 bits are set to the upper 64 bits of the second parameter. The upper
2797 /// 64 bits are set to the upper 64 bits of the first parameter.
2798 ///
2799 /// \headerfile <x86intrin.h>
2800 ///
2801 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
2802 ///
2803 /// \param __a
2804 /// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2805 /// written to the upper 64 bits of the result.
2806 /// \param __b
2807 /// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
2808 /// written to the lower 64 bits of the result.
2809 /// \returns A 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_movehl_ps(__m128 __a, __m128 __b)
{
  /* Result = {b2, b3, a2, a3} (indices 4-7 select lanes of __b). */
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
}
2815
2816 /// Constructs a 128-bit floating-point vector of [4 x float]. The lower
2817 /// 64 bits are set to the lower 64 bits of the first parameter. The upper
2818 /// 64 bits are set to the lower 64 bits of the second parameter.
2819 ///
2820 /// \headerfile <x86intrin.h>
2821 ///
2822 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
2823 ///
2824 /// \param __a
2825 /// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2826 /// written to the lower 64 bits of the result.
2827 /// \param __b
2828 /// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
2829 /// written to the upper 64 bits of the result.
2830 /// \returns A 128-bit floating-point vector of [4 x float].
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_movelh_ps(__m128 __a, __m128 __b)
{
  /* Result = {a0, a1, b0, b1} (indices 4-7 select lanes of __b). */
  return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
}
2836
2837 /// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
2838 /// float].
2839 ///
2840 /// \headerfile <x86intrin.h>
2841 ///
2842 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2843 ///
2844 /// \param __a
2845 /// A 64-bit vector of [4 x i16]. The elements of the destination are copied
2846 /// from the corresponding elements in this operand.
2847 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2848 /// values from the operand.
2849 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
_mm_cvtpi16_ps(__m64 __a)2850 _mm_cvtpi16_ps(__m64 __a)
2851 {
2852 __m64 __b, __c;
2853 __m128 __r;
2854
2855 __b = _mm_setzero_si64();
2856 __b = _mm_cmpgt_pi16(__b, __a);
2857 __c = _mm_unpackhi_pi16(__a, __b);
2858 __r = _mm_setzero_ps();
2859 __r = _mm_cvtpi32_ps(__r, __c);
2860 __r = _mm_movelh_ps(__r, __r);
2861 __c = _mm_unpacklo_pi16(__a, __b);
2862 __r = _mm_cvtpi32_ps(__r, __c);
2863
2864 return __r;
2865 }
2866
2867 /// Converts a 64-bit vector of 16-bit unsigned integer values into a
2868 /// 128-bit vector of [4 x float].
2869 ///
2870 /// \headerfile <x86intrin.h>
2871 ///
2872 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2873 ///
2874 /// \param __a
2875 /// A 64-bit vector of 16-bit unsigned integer values. The elements of the
2876 /// destination are copied from the corresponding elements in this operand.
2877 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2878 /// values from the operand.
2879 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
_mm_cvtpu16_ps(__m64 __a)2880 _mm_cvtpu16_ps(__m64 __a)
2881 {
2882 __m64 __b, __c;
2883 __m128 __r;
2884
2885 __b = _mm_setzero_si64();
2886 __c = _mm_unpackhi_pi16(__a, __b);
2887 __r = _mm_setzero_ps();
2888 __r = _mm_cvtpi32_ps(__r, __c);
2889 __r = _mm_movelh_ps(__r, __r);
2890 __c = _mm_unpacklo_pi16(__a, __b);
2891 __r = _mm_cvtpi32_ps(__r, __c);
2892
2893 return __r;
2894 }
2895
2896 /// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
2897 /// into a 128-bit vector of [4 x float].
2898 ///
2899 /// \headerfile <x86intrin.h>
2900 ///
2901 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2902 ///
2903 /// \param __a
2904 /// A 64-bit vector of [8 x i8]. The elements of the destination are copied
2905 /// from the corresponding lower 4 elements in this operand.
2906 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2907 /// values from the operand.
2908 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
_mm_cvtpi8_ps(__m64 __a)2909 _mm_cvtpi8_ps(__m64 __a)
2910 {
2911 __m64 __b;
2912
2913 __b = _mm_setzero_si64();
2914 __b = _mm_cmpgt_pi8(__b, __a);
2915 __b = _mm_unpacklo_pi8(__a, __b);
2916
2917 return _mm_cvtpi16_ps(__b);
2918 }
2919
2920 /// Converts the lower four unsigned 8-bit integer values from a 64-bit
2921 /// vector of [8 x u8] into a 128-bit vector of [4 x float].
2922 ///
2923 /// \headerfile <x86intrin.h>
2924 ///
2925 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2926 ///
2927 /// \param __a
2928 /// A 64-bit vector of unsigned 8-bit integer values. The elements of the
2929 /// destination are copied from the corresponding lower 4 elements in this
2930 /// operand.
2931 /// \returns A 128-bit vector of [4 x float] containing the copied and converted
2932 /// values from the source operand.
2933 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
_mm_cvtpu8_ps(__m64 __a)2934 _mm_cvtpu8_ps(__m64 __a)
2935 {
2936 __m64 __b;
2937
2938 __b = _mm_setzero_si64();
2939 __b = _mm_unpacklo_pi8(__a, __b);
2940
2941 return _mm_cvtpi16_ps(__b);
2942 }
2943
2944 /// Converts the two 32-bit signed integer values from each 64-bit vector
2945 /// operand of [2 x i32] into a 128-bit vector of [4 x float].
2946 ///
2947 /// \headerfile <x86intrin.h>
2948 ///
2949 /// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
2950 ///
2951 /// \param __a
2952 /// A 64-bit vector of [2 x i32]. The lower elements of the destination are
2953 /// copied from the elements in this operand.
2954 /// \param __b
2955 /// A 64-bit vector of [2 x i32]. The upper elements of the destination are
2956 /// copied from the elements in this operand.
2957 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
2958 /// copied and converted values from the first operand. The upper 64 bits
2959 /// contain the copied and converted values from the second operand.
2960 static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
_mm_cvtpi32x2_ps(__m64 __a,__m64 __b)2961 _mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
2962 {
2963 __m128 __c;
2964
2965 __c = _mm_setzero_ps();
2966 __c = _mm_cvtpi32_ps(__c, __b);
2967 __c = _mm_movelh_ps(__c, __c);
2968
2969 return _mm_cvtpi32_ps(__c, __a);
2970 }
2971
2972 /// Converts each single-precision floating-point element of a 128-bit
2973 /// floating-point vector of [4 x float] into a 16-bit signed integer, and
2974 /// packs the results into a 64-bit integer vector of [4 x i16].
2975 ///
2976 /// If the floating-point element is NaN or infinity, or if the
2977 /// floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
2978 /// it is converted to 0x8000. Otherwise if the floating-point element is
2979 /// greater than 0x7FFF, it is converted to 0x7FFF.
2980 ///
2981 /// \headerfile <x86intrin.h>
2982 ///
2983 /// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
2984 ///
2985 /// \param __a
2986 /// A 128-bit floating-point vector of [4 x float].
2987 /// \returns A 64-bit integer vector of [4 x i16] containing the converted
2988 /// values.
2989 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_cvtps_pi16(__m128 __a)2990 _mm_cvtps_pi16(__m128 __a)
2991 {
2992 __m64 __b, __c;
2993
2994 __b = _mm_cvtps_pi32(__a);
2995 __a = _mm_movehl_ps(__a, __a);
2996 __c = _mm_cvtps_pi32(__a);
2997
2998 return _mm_packs_pi32(__b, __c);
2999 }
3000
3001 /// Converts each single-precision floating-point element of a 128-bit
3002 /// floating-point vector of [4 x float] into an 8-bit signed integer, and
3003 /// packs the results into the lower 32 bits of a 64-bit integer vector of
3004 /// [8 x i8]. The upper 32 bits of the vector are set to 0.
3005 ///
3006 /// If the floating-point element is NaN or infinity, or if the
3007 /// floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
3008 /// is converted to 0x80. Otherwise if the floating-point element is greater
3009 /// than 0x7F, it is converted to 0x7F.
3010 ///
3011 /// \headerfile <x86intrin.h>
3012 ///
3013 /// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
3014 ///
3015 /// \param __a
3016 /// 128-bit floating-point vector of [4 x float].
3017 /// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
///    converted values and the upper 32 bits are set to zero.
3019 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
_mm_cvtps_pi8(__m128 __a)3020 _mm_cvtps_pi8(__m128 __a)
3021 {
3022 __m64 __b, __c;
3023
3024 __b = _mm_cvtps_pi16(__a);
3025 __c = _mm_setzero_si64();
3026
3027 return _mm_packs_pi16(__b, __c);
3028 }
3029
3030 /// Extracts the sign bits from each single-precision floating-point
3031 /// element of a 128-bit floating-point vector of [4 x float] and returns the
3032 /// sign bits in bits [0:3] of the result. Bits [31:4] of the result are set
3033 /// to zero.
3034 ///
3035 /// \headerfile <x86intrin.h>
3036 ///
3037 /// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
3038 ///
3039 /// \param __a
3040 /// A 128-bit floating-point vector of [4 x float].
3041 /// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
3042 /// single-precision floating-point element of the parameter. Bits [31:4] are
3043 /// set to zero.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_movemask_ps(__m128 __a)
{
  /* MOVMSKPS: gather the four float sign bits into bits [3:0] of a GPR. */
  return __builtin_ia32_movmskps((__v4sf)__a);
}
3049
/* Comparison predicates for use as the immediate operand of _mm_cmp_ps and
   _mm_cmp_ss; only these eight are defined for SSE. */
#define _CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */
#define _CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */
#define _CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */
#define _CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */
#define _CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */
#define _CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */
#define _CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */
#define _CMP_ORD_Q 0x07 /* Ordered (non-signaling) */
3059
3060 /// Compares each of the corresponding values of two 128-bit vectors of
3061 /// [4 x float], using the operation specified by the immediate integer
3062 /// operand.
3063 ///
3064 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3065 /// If either value in a comparison is NaN, comparisons that are ordered
3066 /// return false, and comparisons that are unordered return true.
3067 ///
3068 /// \headerfile <x86intrin.h>
3069 ///
3070 /// \code
3071 /// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
3072 /// \endcode
3073 ///
3074 /// This intrinsic corresponds to the <c> (V)CMPPS </c> instruction.
3075 ///
3076 /// \param a
3077 /// A 128-bit vector of [4 x float].
3078 /// \param b
3079 /// A 128-bit vector of [4 x float].
3080 /// \param c
3081 /// An immediate integer operand, with bits [4:0] specifying which comparison
3082 /// operation to use: \n
3083 /// 0x00: Equal (ordered, non-signaling) \n
3084 /// 0x01: Less-than (ordered, signaling) \n
3085 /// 0x02: Less-than-or-equal (ordered, signaling) \n
3086 /// 0x03: Unordered (non-signaling) \n
3087 /// 0x04: Not-equal (unordered, non-signaling) \n
3088 /// 0x05: Not-less-than (unordered, signaling) \n
3089 /// 0x06: Not-less-than-or-equal (unordered, signaling) \n
3090 /// 0x07: Ordered (non-signaling) \n
3091 /// \returns A 128-bit vector of [4 x float] containing the comparison results.
/* Implemented as a macro: CMPPS requires the predicate \a c to be an
   integer constant expression (see the _CMP_* values above). */
#define _mm_cmp_ps(a, b, c) \
  ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (c)))
3094
/// Compares the lowest 32-bit float values of two 128-bit vectors of
/// [4 x float], using the operation specified by the immediate integer
/// operand.
///
/// The comparison yields 0x0 for false, 0xFFFFFFFF for true.
/// If either value in a comparison is NaN, comparisons that are ordered
/// return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> (V)CMPSS </c> instruction.
///
/// \param a
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \param b
///    A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
///    used in the comparison.
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling) \n
/// \returns A 128-bit vector of [4 x float] containing the comparison result
///    in the low-order 32 bits; the upper 96 bits are copied from the upper
///    96 bits of \a a.
#define _mm_cmp_ss(a, b, c) \
  ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (c)))
3129
3130 #define _MM_ALIGN16 __attribute__((aligned(16)))
3131
3132 #define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
3133
/* MXCSR exception status flags (bits 5:0): sticky bits recording which
   floating-point exceptions have occurred. */
#define _MM_EXCEPT_INVALID (0x0001U)
#define _MM_EXCEPT_DENORM (0x0002U)
#define _MM_EXCEPT_DIV_ZERO (0x0004U)
#define _MM_EXCEPT_OVERFLOW (0x0008U)
#define _MM_EXCEPT_UNDERFLOW (0x0010U)
#define _MM_EXCEPT_INEXACT (0x0020U)
#define _MM_EXCEPT_MASK (0x003fU)

/* MXCSR exception mask bits (bits 12:7): a set bit suppresses the
   corresponding exception. */
#define _MM_MASK_INVALID (0x0080U)
#define _MM_MASK_DENORM (0x0100U)
#define _MM_MASK_DIV_ZERO (0x0200U)
#define _MM_MASK_OVERFLOW (0x0400U)
#define _MM_MASK_UNDERFLOW (0x0800U)
#define _MM_MASK_INEXACT (0x1000U)
#define _MM_MASK_MASK (0x1f80U)

/* MXCSR rounding-control field (bits 14:13). */
#define _MM_ROUND_NEAREST (0x0000U)
#define _MM_ROUND_DOWN (0x2000U)
#define _MM_ROUND_UP (0x4000U)
#define _MM_ROUND_TOWARD_ZERO (0x6000U)
#define _MM_ROUND_MASK (0x6000U)

/* MXCSR flush-to-zero control (bit 15). */
#define _MM_FLUSH_ZERO_MASK (0x8000U)
#define _MM_FLUSH_ZERO_ON (0x8000U)
#define _MM_FLUSH_ZERO_OFF (0x0000U)
3159
/* Extract a single MXCSR field by masking the value read via _mm_getcsr(). */
#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)

/* Replace one MXCSR field via a read-modify-write of the whole register:
   clear the field with the corresponding *_MASK, then OR in the new value.
   The caller must pass a value confined to that field's bits. */
#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
3169
/* Transposes, in place, the 4x4 matrix of floats whose rows are row0..row3:
   interleave low/high halves pairwise, then recombine with movelh/movehl.
   Each rowN argument is expanded (read and assigned) more than once, so the
   arguments must be side-effect-free lvalues. */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
  do { \
    __m128 tmp3, tmp2, tmp1, tmp0; \
    tmp0 = _mm_unpacklo_ps((row0), (row1)); \
    tmp2 = _mm_unpacklo_ps((row2), (row3)); \
    tmp1 = _mm_unpackhi_ps((row0), (row1)); \
    tmp3 = _mm_unpackhi_ps((row2), (row3)); \
    (row0) = _mm_movelh_ps(tmp0, tmp2); \
    (row1) = _mm_movehl_ps(tmp2, tmp0); \
    (row2) = _mm_movelh_ps(tmp1, tmp3); \
    (row3) = _mm_movehl_ps(tmp3, tmp1); \
  } while (0)
3182
/* Aliases for compatibility: the historical "_m_*" spellings of the MMX
   integer intrinsics defined earlier in this header. */
#define _m_pextrw _mm_extract_pi16
#define _m_pinsrw _mm_insert_pi16
#define _m_pmaxsw _mm_max_pi16
#define _m_pmaxub _mm_max_pu8
#define _m_pminsw _mm_min_pi16
#define _m_pminub _mm_min_pu8
#define _m_pmovmskb _mm_movemask_pi8
#define _m_pmulhuw _mm_mulhi_pu16
#define _m_pshufw _mm_shuffle_pi16
#define _m_maskmovq _mm_maskmove_si64
#define _m_pavgb _mm_avg_pu8
#define _m_pavgw _mm_avg_pu16
#define _m_psadbw _mm_sad_pu8
/* Token alias for gcc compatibility. Note this only expands the exact
   token `_m_`; it does not rewrite `_m_`-prefixed identifiers. */
#define _m_ _mm_
3198
/* The attribute macros are internal to this header; remove them so they do
   not leak into user code. */
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_MMX

/* Ugly hack for backwards-compatibility (compatible with gcc): when SSE2 is
   available, including <xmmintrin.h> also exposes the SSE2 intrinsics, but
   not when building the intrinsics module itself. */
#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
#include <emmintrin.h>
#endif
3206
3207 #endif /* __XMMINTRIN_H */
3208