/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __EMMINTRIN_H
#define __EMMINTRIN_H

#if !defined(__i386__) && !defined(__x86_64__)
#error "This header is only meant to be used on x86 and x64 architecture"
#endif

#include <xmmintrin.h>

typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));

typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef long long __m128i_u
    __attribute__((__vector_size__(16), __aligned__(1)));

/* Type defines. */
typedef double __v2df __attribute__((__vector_size__(16)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

/* Unsigned types */
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v16qs __attribute__((__vector_size__(16)));

#ifdef __SSE2__
/* Both _Float16 and __bf16 require SSE2 to be enabled. */
typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));

typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16)));
typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
#endif

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("sse2,no-evex512"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_MMX \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("mmx,sse2,no-evex512"), __min_vector_width__(64)))

/// Adds lower double-precision values in both operands and returns the
/// sum in the lower 64 bits of the result. The upper 64 bits of the result
/// are copied from the upper double-precision value of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing one of the source operands.
/// \param __b
/// A 128-bit vector of [2 x double] containing one of the source operands.
/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
/// sum of the lower 64 bits of both operands. The upper 64 bits are copied
/// from the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_add_sd(__m128d __a, __m128d __b) {
  __a[0] += __b[0];
  return __a;
}
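
/* Illustrative usage sketch (not part of the upstream header): the scalar
 * form only adds the low elements; element 1 of the first operand passes
 * through unchanged.
 *
 *   __m128d __x = _mm_set_pd(7.0, 1.0);   // { 1.0, 7.0 }
 *   __m128d __y = _mm_set_pd(9.0, 2.0);   // { 2.0, 9.0 }
 *   __m128d __r = _mm_add_sd(__x, __y);   // { 3.0, 7.0 }
 */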

/// Adds two 128-bit vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing one of the source operands.
/// \param __b
/// A 128-bit vector of [2 x double] containing one of the source operands.
/// \returns A 128-bit vector of [2 x double] containing the sums of both
/// operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_add_pd(__m128d __a, __m128d __b) {
  return (__m128d)((__v2df)__a + (__v2df)__b);
}

/// Subtracts the lower double-precision value of the second operand
/// from the lower double-precision value of the first operand and returns
/// the difference in the lower 64 bits of the result. The upper 64 bits of
/// the result are copied from the upper double-precision value of the first
/// operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing the minuend.
/// \param __b
/// A 128-bit vector of [2 x double] containing the subtrahend.
/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
/// difference of the lower 64 bits of both operands. The upper 64 bits are
/// copied from the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sub_sd(__m128d __a, __m128d __b) {
  __a[0] -= __b[0];
  return __a;
}

/// Subtracts two 128-bit vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing the minuend.
/// \param __b
/// A 128-bit vector of [2 x double] containing the subtrahend.
/// \returns A 128-bit vector of [2 x double] containing the differences between
/// both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sub_pd(__m128d __a, __m128d __b) {
  return (__m128d)((__v2df)__a - (__v2df)__b);
}

/// Multiplies lower double-precision values in both operands and returns
/// the product in the lower 64 bits of the result. The upper 64 bits of the
/// result are copied from the upper double-precision value of the first
/// operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing one of the source operands.
/// \param __b
/// A 128-bit vector of [2 x double] containing one of the source operands.
/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
/// product of the lower 64 bits of both operands. The upper 64 bits are
/// copied from the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mul_sd(__m128d __a, __m128d __b) {
  __a[0] *= __b[0];
  return __a;
}

/// Multiplies two 128-bit vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing one of the operands.
/// \param __b
/// A 128-bit vector of [2 x double] containing one of the operands.
/// \returns A 128-bit vector of [2 x double] containing the products of both
/// operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mul_pd(__m128d __a, __m128d __b) {
  return (__m128d)((__v2df)__a * (__v2df)__b);
}

/// Divides the lower double-precision value of the first operand by the
/// lower double-precision value of the second operand and returns the
/// quotient in the lower 64 bits of the result. The upper 64 bits of the
/// result are copied from the upper double-precision value of the first
/// operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing the dividend.
/// \param __b
/// A 128-bit vector of [2 x double] containing the divisor.
/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
/// quotient of the lower 64 bits of both operands. The upper 64 bits are
/// copied from the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_div_sd(__m128d __a, __m128d __b) {
  __a[0] /= __b[0];
  return __a;
}

/// Performs an element-by-element division of two 128-bit vectors of
/// [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing the dividend.
/// \param __b
/// A 128-bit vector of [2 x double] containing the divisor.
/// \returns A 128-bit vector of [2 x double] containing the quotients of both
/// operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_div_pd(__m128d __a, __m128d __b) {
  return (__m128d)((__v2df)__a / (__v2df)__b);
}

/// Calculates the square root of the lower double-precision value of
/// the second operand and returns it in the lower 64 bits of the result.
/// The upper 64 bits of the result are copied from the upper
/// double-precision value of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing one of the operands. The
/// upper 64 bits of this operand are copied to the upper 64 bits of the
/// result.
/// \param __b
/// A 128-bit vector of [2 x double] containing one of the operands. The
/// square root is calculated using the lower 64 bits of this operand.
/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
/// square root of the lower 64 bits of operand \a __b, and whose upper 64
/// bits are copied from the upper 64 bits of operand \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sqrt_sd(__m128d __a, __m128d __b) {
  __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
  return __extension__(__m128d){__c[0], __a[1]};
}
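
/* Usage sketch (illustrative only): the square root is taken from __b while
 * the upper lane comes from __a, so passing the same vector twice gives a
 * "sqrt of the low element, keep the high element" operation.
 *
 *   __m128d __v = _mm_set_pd(5.0, 4.0);    // { 4.0, 5.0 }
 *   __m128d __r = _mm_sqrt_sd(__v, __v);   // { 2.0, 5.0 }
 */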

/// Calculates the square root of each of the two values stored in a
/// 128-bit vector of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double].
/// \returns A 128-bit vector of [2 x double] containing the square roots of the
/// values in the operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) {
  return __builtin_ia32_sqrtpd((__v2df)__a);
}

/// Compares lower 64-bit double-precision values of both operands, and
/// returns the lesser of the pair of values in the lower 64-bits of the
/// result. The upper 64 bits of the result are copied from the upper
/// double-precision value of the first operand.
///
/// If either value in a comparison is NaN, returns the value from \a __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing one of the operands. The
/// lower 64 bits of this operand are used in the comparison.
/// \param __b
/// A 128-bit vector of [2 x double] containing one of the operands. The
/// lower 64 bits of this operand are used in the comparison.
/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
/// minimum value between both operands. The upper 64 bits are copied from
/// the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_min_sd(__m128d __a, __m128d __b) {
  return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
}

/// Performs element-by-element comparison of the two 128-bit vectors of
/// [2 x double] and returns a vector containing the lesser of each pair of
/// values.
///
/// If either value in a comparison is NaN, returns the value from \a __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing one of the operands.
/// \param __b
/// A 128-bit vector of [2 x double] containing one of the operands.
/// \returns A 128-bit vector of [2 x double] containing the minimum values
/// between both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_min_pd(__m128d __a, __m128d __b) {
  return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
}
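
/* Note on NaN handling (sketch, mirroring the documented MINPD rule above):
 * the operation is not symmetric, so the result depends on operand order.
 *
 *   __m128d __n = _mm_set_pd(__builtin_nan(""), 1.0);   // { 1.0, NaN }
 *   _mm_min_pd(__n, _mm_set1_pd(2.0));   // { 1.0, 2.0 } (NaN lane takes __b)
 *   _mm_min_pd(_mm_set1_pd(2.0), __n);   // { 1.0, NaN } (NaN lane takes __b)
 */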

/// Compares lower 64-bit double-precision values of both operands, and
/// returns the greater of the pair of values in the lower 64-bits of the
/// result. The upper 64 bits of the result are copied from the upper
/// double-precision value of the first operand.
///
/// If either value in a comparison is NaN, returns the value from \a __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing one of the operands. The
/// lower 64 bits of this operand are used in the comparison.
/// \param __b
/// A 128-bit vector of [2 x double] containing one of the operands. The
/// lower 64 bits of this operand are used in the comparison.
/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
/// maximum value between both operands. The upper 64 bits are copied from
/// the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_max_sd(__m128d __a, __m128d __b) {
  return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
}

/// Performs element-by-element comparison of the two 128-bit vectors of
/// [2 x double] and returns a vector containing the greater of each pair
/// of values.
///
/// If either value in a comparison is NaN, returns the value from \a __b.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing one of the operands.
/// \param __b
/// A 128-bit vector of [2 x double] containing one of the operands.
/// \returns A 128-bit vector of [2 x double] containing the maximum values
/// between both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_max_pd(__m128d __a, __m128d __b) {
  return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
}

/// Performs a bitwise AND of two 128-bit vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing one of the source operands.
/// \param __b
/// A 128-bit vector of [2 x double] containing one of the source operands.
/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
/// values between both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_and_pd(__m128d __a, __m128d __b) {
  return (__m128d)((__v2du)__a & (__v2du)__b);
}

/// Performs a bitwise AND of two 128-bit vectors of [2 x double], using
/// the one's complement of the values contained in the first source operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing the left source operand. The
/// one's complement of this value is used in the bitwise AND.
/// \param __b
/// A 128-bit vector of [2 x double] containing the right source operand.
/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
/// values in the second operand and the one's complement of the first
/// operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_andnot_pd(__m128d __a, __m128d __b) {
  return (__m128d)(~(__v2du)__a & (__v2du)__b);
}
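
/* Usage sketch (illustrative only): a common idiom is clearing the sign bit
 * of every element, i.e. a vectorized fabs(), by AND-NOT-ing with a mask
 * that has only the sign bits set.
 *
 *   __m128d __sign = _mm_set1_pd(-0.0);            // sign bits only
 *   __m128d __abs  = _mm_andnot_pd(__sign, __x);   // |__x| element-wise
 */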

/// Performs a bitwise OR of two 128-bit vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing one of the source operands.
/// \param __b
/// A 128-bit vector of [2 x double] containing one of the source operands.
/// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
/// values between both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_or_pd(__m128d __a, __m128d __b) {
  return (__m128d)((__v2du)__a | (__v2du)__b);
}

/// Performs a bitwise XOR of two 128-bit vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double] containing one of the source operands.
/// \param __b
/// A 128-bit vector of [2 x double] containing one of the source operands.
/// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
/// values between both operands.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_xor_pd(__m128d __a, __m128d __b) {
  return (__m128d)((__v2du)__a ^ (__v2du)__b);
}

/// Compares each of the corresponding double-precision values of the
/// 128-bit vectors of [2 x double] for equality.
///
/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double].
/// \param __b
/// A 128-bit vector of [2 x double].
/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpeq_pd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
}
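
/* Usage sketch (illustrative only): the all-ones/all-zeros masks produced by
 * the packed comparisons are typically combined with the bitwise intrinsics
 * above to build a branchless per-element select, "__r = __m ? __a : __b".
 *
 *   __m128d __m = _mm_cmpeq_pd(__a, __b);
 *   __m128d __r = _mm_or_pd(_mm_and_pd(__m, __a), _mm_andnot_pd(__m, __b));
 */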

/// Compares each of the corresponding double-precision values of the
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are less than those in the second operand.
///
/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double].
/// \param __b
/// A 128-bit vector of [2 x double].
/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmplt_pd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
}

/// Compares each of the corresponding double-precision values of the
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are less than or equal to those in the second operand.
///
/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double].
/// \param __b
/// A 128-bit vector of [2 x double].
/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmple_pd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
}

/// Compares each of the corresponding double-precision values of the
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are greater than those in the second operand.
///
/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double].
/// \param __b
/// A 128-bit vector of [2 x double].
/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpgt_pd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
}

/// Compares each of the corresponding double-precision values of the
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are greater than or equal to those in the second operand.
///
/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double].
/// \param __b
/// A 128-bit vector of [2 x double].
/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpge_pd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
}

/// Compares each of the corresponding double-precision values of the
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are ordered with respect to those in the second operand.
///
/// A pair of double-precision values are ordered with respect to each
/// other if neither value is a NaN. Each comparison returns 0x0 for false,
/// 0xFFFFFFFFFFFFFFFF for true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double].
/// \param __b
/// A 128-bit vector of [2 x double].
/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpord_pd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
}

/// Compares each of the corresponding double-precision values of the
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are unordered with respect to those in the second operand.
///
/// A pair of double-precision values are unordered with respect to each
/// other if one or both values are NaN. Each comparison returns 0x0 for
/// false, 0xFFFFFFFFFFFFFFFF for true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
/// instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double].
/// \param __b
/// A 128-bit vector of [2 x double].
/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpunord_pd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
}

/// Compares each of the corresponding double-precision values of the
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are unequal to those in the second operand.
///
/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double].
/// \param __b
/// A 128-bit vector of [2 x double].
/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpneq_pd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
}

/// Compares each of the corresponding double-precision values of the
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are not less than those in the second operand.
///
/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double].
/// \param __b
/// A 128-bit vector of [2 x double].
/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnlt_pd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
}

/// Compares each of the corresponding double-precision values of the
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are not less than or equal to those in the second operand.
///
/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double].
/// \param __b
/// A 128-bit vector of [2 x double].
/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnle_pd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
}

/// Compares each of the corresponding double-precision values of the
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are not greater than those in the second operand.
///
/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double].
/// \param __b
/// A 128-bit vector of [2 x double].
/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpngt_pd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
}

/// Compares each of the corresponding double-precision values of the
/// 128-bit vectors of [2 x double] to determine if the values in the first
/// operand are not greater than or equal to those in the second operand.
///
/// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double].
/// \param __b
/// A 128-bit vector of [2 x double].
/// \returns A 128-bit vector containing the comparison results.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnge_pd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
}

/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] for equality.
///
/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __b.
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contains the comparison
/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpeq_sd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
}

/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is less than the corresponding value in
/// the second parameter.
///
/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __b.
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contains the comparison
/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmplt_sd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
}

/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is less than or equal to the
/// corresponding value in the second parameter.
///
/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __b.
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contains the comparison
/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmple_sd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
}

/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is greater than the corresponding value
/// in the second parameter.
///
/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __b.
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contains the comparison
/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpgt_sd(__m128d __a, __m128d __b) {
  __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
  return __extension__(__m128d){__c[0], __a[1]};
}
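
/* Implementation note (descriptive, not from the upstream comments): there is
 * no separate "greater-than" encoding for CMPSD, so the helper above
 * evaluates __b < __a instead. Because the builtin then carries the upper 64
 * bits of __b, the result is rebuilt as {__c[0], __a[1]} so that element 1
 * still comes from __a, as the scalar intrinsics document. */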

/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is greater than or equal to the
/// corresponding value in the second parameter.
///
/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns false.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __b.
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contains the comparison
/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpge_sd(__m128d __a, __m128d __b) {
  __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
  return __extension__(__m128d){__c[0], __a[1]};
}

/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is ordered with respect to the
/// corresponding value in the second parameter.
///
/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
/// of double-precision values are ordered with respect to each other if
/// neither value is a NaN.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __b.
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contains the comparison
/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpord_sd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
}

/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is unordered with respect to the
/// corresponding value in the second parameter.
///
/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
/// of double-precision values are unordered with respect to each other if
/// one or both values are NaN.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
/// instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __b.
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contains the comparison
/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpunord_sd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
}

/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is unequal to the corresponding value in
/// the second parameter.
///
/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __b.
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contains the comparison
/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpneq_sd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
}

/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is not less than the corresponding
/// value in the second parameter.
///
/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __b.
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contains the comparison
/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnlt_sd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
}

/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is not less than or equal to the
/// corresponding value in the second parameter.
///
/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __b.
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contains the comparison
/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnle_sd(__m128d __a, __m128d __b) {
  return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
}

/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is not greater than the corresponding
/// value in the second parameter.
///
/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __b.
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contains the comparison
/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpngt_sd(__m128d __a, __m128d __b) {
  __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
  return __extension__(__m128d){__c[0], __a[1]};
}

/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is not greater than or equal to the
/// corresponding value in the second parameter.
///
/// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
/// If either value in a comparison is NaN, returns true.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __b.
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns A 128-bit vector. The lower 64 bits contains the comparison
/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnge_sd(__m128d __a, __m128d __b) {
  __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
  return __extension__(__m128d){__c[0], __a[1]};
}

/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] for equality.
///
/// The comparison returns 0 for false, 1 for true. If either value in a
/// comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __b.
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comieq_sd(__m128d __a, __m128d __b) {
  return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
}
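
/* Usage sketch (illustrative only): unlike the _mm_cmp*_sd family, the
 * COMI/UCOMI intrinsics return an ordinary int, so they can be used directly
 * in scalar control flow.
 *
 *   if (_mm_comieq_sd(__x, __y)) {
 *     // low elements compared equal (0 is returned when either is NaN)
 *   }
 */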

/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is less than the corresponding value in
/// the second parameter.
///
/// The comparison returns 0 for false, 1 for true. If either value in a
/// comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __b.
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comilt_sd(__m128d __a, __m128d __b) {
  return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
}

/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is less than or equal to the
/// corresponding value in the second parameter.
///
/// The comparison returns 0 for false, 1 for true. If either value in a
/// comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __b.
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comile_sd(__m128d __a, __m128d __b) {
  return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
}

/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is greater than the corresponding value
/// in the second parameter.
///
/// The comparison returns 0 for false, 1 for true. If either value in a
/// comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __b.
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comigt_sd(__m128d __a, __m128d __b) {
  return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
}

/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is greater than or equal to the
/// corresponding value in the second parameter.
///
/// The comparison returns 0 for false, 1 for true. If either value in a
/// comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __b.
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comige_sd(__m128d __a, __m128d __b) {
  return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
}

/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] to determine if
/// the value in the first parameter is unequal to the corresponding value in
/// the second parameter.
///
/// The comparison returns 0 for false, 1 for true. If either value in a
/// comparison is NaN, returns 1.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __b.
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comineq_sd(__m128d __a, __m128d __b) {
  return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
}

/// Compares the lower double-precision floating-point values in each of
/// the two 128-bit floating-point vectors of [2 x double] for equality.
///
/// The comparison returns 0 for false, 1 for true. If either value in a
/// comparison is NaN, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
///
/// \param __a
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __b.
/// \param __b
/// A 128-bit vector of [2 x double]. The lower double-precision value is
/// compared to the lower double-precision value of \a __a.
/// \returns An integer containing the comparison results.
static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomieq_sd(__m128d __a, __m128d __b) {
  return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
}
1142
1143 /// Compares the lower double-precision floating-point values in each of
1144 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1145 /// the value in the first parameter is less than the corresponding value in
1146 /// the second parameter.
1147 ///
1148 /// The comparison returns 0 for false, 1 for true. If either value in a
1149 /// comparison is NaN, returns 0.
1150 ///
1151 /// \headerfile <x86intrin.h>
1152 ///
1153 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1154 ///
1155 /// \param __a
1156 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1157 /// compared to the lower double-precision value of \a __b.
1158 /// \param __b
1159 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1160 /// compared to the lower double-precision value of \a __a.
1161 /// \returns An integer containing the comparison results.
_mm_ucomilt_sd(__m128d __a,__m128d __b)1162 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a,
1163 __m128d __b) {
1164 return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
1165 }
1166
1167 /// Compares the lower double-precision floating-point values in each of
1168 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1169 /// the value in the first parameter is less than or equal to the
1170 /// corresponding value in the second parameter.
1171 ///
1172 /// The comparison returns 0 for false, 1 for true. If either value in a
1173 /// comparison is NaN, returns 0.
1174 ///
1175 /// \headerfile <x86intrin.h>
1176 ///
1177 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1178 ///
1179 /// \param __a
1180 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1181 /// compared to the lower double-precision value of \a __b.
1182 /// \param __b
1183 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1184 /// compared to the lower double-precision value of \a __a.
1185 /// \returns An integer containing the comparison results.
_mm_ucomile_sd(__m128d __a,__m128d __b)1186 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a,
1187 __m128d __b) {
1188 return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
1189 }
1190
1191 /// Compares the lower double-precision floating-point values in each of
1192 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1193 /// the value in the first parameter is greater than the corresponding value
1194 /// in the second parameter.
1195 ///
1196 /// The comparison returns 0 for false, 1 for true. If either value in a
1197 /// comparison is NaN, returns 0.
1198 ///
1199 /// \headerfile <x86intrin.h>
1200 ///
1201 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1202 ///
1203 /// \param __a
1204 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1205 /// compared to the lower double-precision value of \a __b.
1206 /// \param __b
1207 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1208 /// compared to the lower double-precision value of \a __a.
1209 /// \returns An integer containing the comparison results.
_mm_ucomigt_sd(__m128d __a,__m128d __b)1210 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a,
1211 __m128d __b) {
1212 return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
1213 }
1214
1215 /// Compares the lower double-precision floating-point values in each of
1216 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1217 /// the value in the first parameter is greater than or equal to the
1218 /// corresponding value in the second parameter.
1219 ///
1220 /// The comparison returns 0 for false, 1 for true. If either value in a
1221 /// comparison is NaN, returns 0.
1222 ///
1223 /// \headerfile <x86intrin.h>
1224 ///
1225 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1226 ///
1227 /// \param __a
1228 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1229 /// compared to the lower double-precision value of \a __b.
1230 /// \param __b
1231 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1232 /// compared to the lower double-precision value of \a __a.
1233 /// \returns An integer containing the comparison results.
_mm_ucomige_sd(__m128d __a,__m128d __b)1234 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a,
1235 __m128d __b) {
1236 return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
1237 }
1238
1239 /// Compares the lower double-precision floating-point values in each of
1240 /// the two 128-bit floating-point vectors of [2 x double] to determine if
1241 /// the value in the first parameter is unequal to the corresponding value in
1242 /// the second parameter.
1243 ///
1244 /// The comparison returns 0 for false, 1 for true. If either value in a
1245 /// comparison is NaN, returns 1.
1246 ///
1247 /// \headerfile <x86intrin.h>
1248 ///
1249 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
1250 ///
1251 /// \param __a
1252 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1253 /// compared to the lower double-precision value of \a __b.
1254 /// \param __b
1255 /// A 128-bit vector of [2 x double]. The lower double-precision value is
1256 /// compared to the lower double-precision value of \a __a.
1257 /// \returns An integer containing the comparison result.
1258 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a,
1259 __m128d __b) {
1260 return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
1261 }
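
/* Illustrative sketch (not part of the original header): with a NaN operand
 * the ordered-style predicates above return 0, while _mm_ucomineq_sd returns
 * 1, because "not equal" is true for unordered operands. The helper name is
 * hypothetical. */
static __inline__ int __DEFAULT_FN_ATTRS __example_ucomi_nan(void) {
  __m128d __x = __extension__(__m128d){__builtin_nan(""), 0.0};
  __m128d __y = __extension__(__m128d){1.0, 0.0};
  int __lt = _mm_ucomilt_sd(__x, __y);  /* 0: comparison with NaN is unordered */
  int __ne = _mm_ucomineq_sd(__x, __y); /* 1: unordered operands compare unequal */
  return __lt + __ne;                   /* evaluates to 1 */
}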
1262
1263 /// Converts the two double-precision floating-point elements of a
1264 /// 128-bit vector of [2 x double] into two single-precision floating-point
1265 /// values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
1266 /// The upper 64 bits of the result vector are set to zero.
1267 ///
1268 /// \headerfile <x86intrin.h>
1269 ///
1270 /// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
1271 ///
1272 /// \param __a
1273 /// A 128-bit vector of [2 x double].
1274 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
1275 /// converted values. The upper 64 bits are set to zero.
1276 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a) {
1277 return __builtin_ia32_cvtpd2ps((__v2df)__a);
1278 }
1279
1280 /// Converts the lower two single-precision floating-point elements of a
1281 /// 128-bit vector of [4 x float] into two double-precision floating-point
1282 /// values, returned in a 128-bit vector of [2 x double]. The upper two
1283 /// elements of the input vector are unused.
1284 ///
1285 /// \headerfile <x86intrin.h>
1286 ///
1287 /// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
1288 ///
1289 /// \param __a
1290 /// A 128-bit vector of [4 x float]. The lower two single-precision
1291 /// floating-point elements are converted to double-precision values. The
1292 /// upper two elements are unused.
1293 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1294 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a) {
1295 return (__m128d) __builtin_convertvector(
1296 __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
1297 }
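
/* Illustrative sketch (not part of the original header): narrowing a
 * [2 x double] vector packs the results into the low two float lanes and
 * zeroes the high two; widening reads only the low two float lanes back. The
 * helper name is hypothetical. */
static __inline__ __m128d __DEFAULT_FN_ATTRS __example_pd_ps_roundtrip(void) {
  __m128d __d = __extension__(__m128d){1.25, -2.5};
  __m128 __f = _mm_cvtpd_ps(__d); /* {1.25f, -2.5f, 0.0f, 0.0f} */
  return _mm_cvtps_pd(__f);       /* {1.25, -2.5} */
}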
1298
1299 /// Converts the lower two integer elements of a 128-bit vector of
1300 /// [4 x i32] into two double-precision floating-point values, returned in a
1301 /// 128-bit vector of [2 x double].
1302 ///
1303 /// The upper two elements of the input vector are unused.
1304 ///
1305 /// \headerfile <x86intrin.h>
1306 ///
1307 /// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
1308 ///
1309 /// \param __a
1310 /// A 128-bit integer vector of [4 x i32]. The lower two integer elements are
1311 /// converted to double-precision values.
1312 ///
1313 /// The upper two elements are unused.
1314 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1315 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a) {
1316 return (__m128d) __builtin_convertvector(
1317 __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
1318 }
1319
1320 /// Converts the two double-precision floating-point elements of a
1321 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1322 /// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
1323 /// 64 bits of the result vector are set to zero.
1324 ///
1325 /// If a converted value does not fit in a 32-bit integer, raises a
1326 /// floating-point invalid exception. If the exception is masked, returns
1327 /// the most negative integer.
1328 ///
1329 /// \headerfile <x86intrin.h>
1330 ///
1331 /// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
1332 ///
1333 /// \param __a
1334 /// A 128-bit vector of [2 x double].
1335 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1336 /// converted values. The upper 64 bits are set to zero.
1337 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a) {
1338 return __builtin_ia32_cvtpd2dq((__v2df)__a);
1339 }
1340
1341 /// Converts the low-order element of a 128-bit vector of [2 x double]
1342 /// into a 32-bit signed integer value.
1343 ///
1344 /// If the converted value does not fit in a 32-bit integer, raises a
1345 /// floating-point invalid exception. If the exception is masked, returns
1346 /// the most negative integer.
1347 ///
1348 /// \headerfile <x86intrin.h>
1349 ///
1350 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
1351 ///
1352 /// \param __a
1353 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1354 /// conversion.
1355 /// \returns A 32-bit signed integer containing the converted value.
1356 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a) {
1357 return __builtin_ia32_cvtsd2si((__v2df)__a);
1358 }
1359
1360 /// Converts the lower double-precision floating-point element of a
1361 /// 128-bit vector of [2 x double], in the second parameter, into a
1362 /// single-precision floating-point value, returned in the lower 32 bits of a
1363 /// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are
1364 /// copied from the upper 96 bits of the first parameter.
1365 ///
1366 /// \headerfile <x86intrin.h>
1367 ///
1368 /// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
1369 ///
1370 /// \param __a
1371 /// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
1372 /// copied to the upper 96 bits of the result.
1373 /// \param __b
1374 /// A 128-bit vector of [2 x double]. The lower double-precision
1375 /// floating-point element is used in the conversion.
1376 /// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
1377 /// converted value from the second parameter. The upper 96 bits are copied
1378 /// from the upper 96 bits of the first parameter.
1379 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a,
1380 __m128d __b) {
1381 return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
1382 }
1383
1384 /// Converts a 32-bit signed integer value, in the second parameter, into
1385 /// a double-precision floating-point value, returned in the lower 64 bits of
1386 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1387 /// are copied from the upper 64 bits of the first parameter.
1388 ///
1389 /// \headerfile <x86intrin.h>
1390 ///
1391 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
1392 ///
1393 /// \param __a
1394 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1395 /// copied to the upper 64 bits of the result.
1396 /// \param __b
1397 /// A 32-bit signed integer containing the value to be converted.
1398 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1399 /// converted value from the second parameter. The upper 64 bits are copied
1400 /// from the upper 64 bits of the first parameter.
1401 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a,
1402 int __b) {
1403 __a[0] = __b;
1404 return __a;
1405 }
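
/* Illustrative sketch (not part of the original header): only the low lane
 * receives the converted integer; the upper double of the first argument is
 * carried through unchanged. The helper name is hypothetical. */
static __inline__ __m128d __DEFAULT_FN_ATTRS __example_cvtsi32_sd(void) {
  __m128d __a = __extension__(__m128d){0.0, 99.0};
  return _mm_cvtsi32_sd(__a, 7); /* {7.0, 99.0} */
}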
1406
1407 /// Converts the lower single-precision floating-point element of a
1408 /// 128-bit vector of [4 x float], in the second parameter, into a
1409 /// double-precision floating-point value, returned in the lower 64 bits of
1410 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
1411 /// are copied from the upper 64 bits of the first parameter.
1412 ///
1413 /// \headerfile <x86intrin.h>
1414 ///
1415 /// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
1416 ///
1417 /// \param __a
1418 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
1419 /// copied to the upper 64 bits of the result.
1420 /// \param __b
1421 /// A 128-bit vector of [4 x float]. The lower single-precision
1422 /// floating-point element is used in the conversion.
1423 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
1424 /// converted value from the second parameter. The upper 64 bits are copied
1425 /// from the upper 64 bits of the first parameter.
1426 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a,
1427 __m128 __b) {
1428 __a[0] = __b[0];
1429 return __a;
1430 }
1431
1432 /// Converts the two double-precision floating-point elements of a
1433 /// 128-bit vector of [2 x double] into two signed truncated (rounded
1434 /// toward zero) 32-bit integer values, returned in the lower 64 bits
1435 /// of a 128-bit vector of [4 x i32].
1436 ///
1437 /// If a converted value does not fit in a 32-bit integer, raises a
1438 /// floating-point invalid exception. If the exception is masked, returns
1439 /// the most negative integer.
1440 ///
1441 /// \headerfile <x86intrin.h>
1442 ///
1443 /// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
1444 /// instruction.
1445 ///
1446 /// \param __a
1447 /// A 128-bit vector of [2 x double].
1448 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
1449 /// converted values. The upper 64 bits are set to zero.
1450 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a) {
1451 return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
1452 }
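
/* Illustrative sketch (not part of the original header): _mm_cvtpd_epi32 uses
 * the current rounding mode (round-to-nearest-even by default), while
 * _mm_cvttpd_epi32 always truncates toward zero. The helper name is
 * hypothetical. */
static __inline__ __m128i __DEFAULT_FN_ATTRS __example_round_vs_truncate(void) {
  __m128d __d = __extension__(__m128d){2.5, -1.7};
  __m128i __nearest = _mm_cvtpd_epi32(__d);    /* {2, -2, 0, 0} in the default mode */
  __m128i __truncated = _mm_cvttpd_epi32(__d); /* {2, -1, 0, 0} */
  /* The packed-subtract intrinsics appear later in this header, so the
   * underlying vector extension is used to form the difference. */
  return (__m128i)((__v4si)__nearest - (__v4si)__truncated); /* {0, -1, 0, 0} */
}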
1453
1454 /// Converts the low-order element of a [2 x double] vector into a 32-bit
1455 /// signed truncated (rounded toward zero) integer value.
1456 ///
1457 /// If the converted value does not fit in a 32-bit integer, raises a
1458 /// floating-point invalid exception. If the exception is masked, returns
1459 /// the most negative integer.
1460 ///
1461 /// \headerfile <x86intrin.h>
1462 ///
1463 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
1464 /// instruction.
1465 ///
1466 /// \param __a
1467 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
1468 /// conversion.
1469 /// \returns A 32-bit signed integer containing the converted value.
1470 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) {
1471 return __builtin_ia32_cvttsd2si((__v2df)__a);
1472 }
1473
1474 /// Converts the two double-precision floating-point elements of a
1475 /// 128-bit vector of [2 x double] into two signed 32-bit integer values,
1476 /// returned in a 64-bit vector of [2 x i32].
1477 ///
1478 /// If a converted value does not fit in a 32-bit integer, raises a
1479 /// floating-point invalid exception. If the exception is masked, returns
1480 /// the most negative integer.
1481 ///
1482 /// \headerfile <x86intrin.h>
1483 ///
1484 /// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
1485 ///
1486 /// \param __a
1487 /// A 128-bit vector of [2 x double].
1488 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
1489 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) {
1490 return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
1491 }
1492
1493 /// Converts the two double-precision floating-point elements of a
1494 /// 128-bit vector of [2 x double] into two signed truncated (rounded toward
1495 /// zero) 32-bit integer values, returned in a 64-bit vector of [2 x i32].
1496 ///
1497 /// If a converted value does not fit in a 32-bit integer, raises a
1498 /// floating-point invalid exception. If the exception is masked, returns
1499 /// the most negative integer.
1500 ///
1501 /// \headerfile <x86intrin.h>
1502 ///
1503 /// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
1504 ///
1505 /// \param __a
1506 /// A 128-bit vector of [2 x double].
1507 /// \returns A 64-bit vector of [2 x i32] containing the converted values.
1508 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a) {
1509 return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
1510 }
1511
1512 /// Converts the two signed 32-bit integer elements of a 64-bit vector of
1513 /// [2 x i32] into two double-precision floating-point values, returned in a
1514 /// 128-bit vector of [2 x double].
1515 ///
1516 /// \headerfile <x86intrin.h>
1517 ///
1518 /// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
1519 ///
1520 /// \param __a
1521 /// A 64-bit vector of [2 x i32].
1522 /// \returns A 128-bit vector of [2 x double] containing the converted values.
1523 static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a) {
1524 return __builtin_ia32_cvtpi2pd((__v2si)__a);
1525 }
1526
1527 /// Returns the low-order element of a 128-bit vector of [2 x double] as
1528 /// a double-precision floating-point value.
1529 ///
1530 /// \headerfile <x86intrin.h>
1531 ///
1532 /// This intrinsic has no corresponding instruction.
1533 ///
1534 /// \param __a
1535 /// A 128-bit vector of [2 x double]. The lower 64 bits are returned.
1536 /// \returns A double-precision floating-point value copied from the lower 64
1537 /// bits of \a __a.
1538 static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a) {
1539 return __a[0];
1540 }
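
/* Illustrative sketch (not part of the original header): reading the low lane
 * is a plain element access, so no conversion instruction is required. The
 * helper name is hypothetical. */
static __inline__ double __DEFAULT_FN_ATTRS __example_low_lane(void) {
  __m128d __d = __extension__(__m128d){3.5, -1.0};
  return _mm_cvtsd_f64(__d); /* 3.5 */
}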
1541
1542 /// Loads a 128-bit floating-point vector of [2 x double] from an aligned
1543 /// memory location.
1544 ///
1545 /// \headerfile <x86intrin.h>
1546 ///
1547 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
1548 ///
1549 /// \param __dp
1550 /// A pointer to a 128-bit memory location. The address of the memory
1551 /// location has to be 16-byte aligned.
1552 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
1553 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp) {
1554 return *(const __m128d *)__dp;
1555 }
1556
1557 /// Loads a double-precision floating-point value from a specified memory
1558 /// location and duplicates it to both vector elements of a 128-bit vector of
1559 /// [2 x double].
1560 ///
1561 /// \headerfile <x86intrin.h>
1562 ///
1563 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
1564 ///
1565 /// \param __dp
1566 /// A pointer to a memory location containing a double-precision value.
1567 /// \returns A 128-bit vector of [2 x double] containing the loaded and
1568 /// duplicated values.
1569 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp) {
1570 struct __mm_load1_pd_struct {
1571 double __u;
1572 } __attribute__((__packed__, __may_alias__));
1573 double __u = ((const struct __mm_load1_pd_struct *)__dp)->__u;
1574 return __extension__(__m128d){__u, __u};
1575 }
1576
1577 #define _mm_load_pd1(dp) _mm_load1_pd(dp)
1578
1579 /// Loads two double-precision values, in reverse order, from an aligned
1580 /// memory location into a 128-bit vector of [2 x double].
1581 ///
1582 /// \headerfile <x86intrin.h>
1583 ///
1584 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
1585 /// needed shuffling instructions. In AVX mode, the shuffling may be combined
1586 /// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
1587 ///
1588 /// \param __dp
1589 /// A 16-byte aligned pointer to an array of double-precision values to be
1590 /// loaded in reverse order.
1591 /// \returns A 128-bit vector of [2 x double] containing the reversed loaded
1592 /// values.
1593 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp) {
1594 __m128d __u = *(const __m128d *)__dp;
1595 return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
1596 }
1597
1598 /// Loads a 128-bit floating-point vector of [2 x double] from an
1599 /// unaligned memory location.
1600 ///
1601 /// \headerfile <x86intrin.h>
1602 ///
1603 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1604 ///
1605 /// \param __dp
1606 /// A pointer to a 128-bit memory location. The address of the memory
1607 /// location does not have to be aligned.
1608 /// \returns A 128-bit vector of [2 x double] containing the loaded values.
1609 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp) {
1610 struct __loadu_pd {
1611 __m128d_u __v;
1612 } __attribute__((__packed__, __may_alias__));
1613 return ((const struct __loadu_pd *)__dp)->__v;
1614 }
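
/* Illustrative sketch (not part of the original header): _mm_load_pd requires
 * a 16-byte aligned address, while _mm_loadu_pd accepts any address, e.g. a
 * pointer into the middle of an array. The helper name is hypothetical. */
static __inline__ __m128d __DEFAULT_FN_ATTRS
__example_loads(double const *__aligned16, double const *__anywhere) {
  __m128d __x = _mm_load_pd(__aligned16); /* source must be 16-byte aligned */
  __m128d __y = _mm_loadu_pd(__anywhere); /* no alignment requirement */
  return _mm_add_pd(__x, __y);
}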
1615
1616 /// Loads a 64-bit integer value to the low element of a 128-bit integer
1617 /// vector and clears the upper element.
1618 ///
1619 /// \headerfile <x86intrin.h>
1620 ///
1621 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1622 ///
1623 /// \param __a
1624 /// A pointer to a 64-bit memory location. The address of the memory
1625 /// location does not have to be aligned.
1626 /// \returns A 128-bit vector of [2 x i64] containing the loaded value.
1627 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a) {
1628 struct __loadu_si64 {
1629 long long __v;
1630 } __attribute__((__packed__, __may_alias__));
1631 long long __u = ((const struct __loadu_si64 *)__a)->__v;
1632 return __extension__(__m128i)(__v2di){__u, 0LL};
1633 }
1634
1635 /// Loads a 32-bit integer value to the low element of a 128-bit integer
1636 /// vector and clears the upper elements.
1637 ///
1638 /// \headerfile <x86intrin.h>
1639 ///
1640 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
1641 ///
1642 /// \param __a
1643 /// A pointer to a 32-bit memory location. The address of the memory
1644 /// location does not have to be aligned.
1645 /// \returns A 128-bit vector of [4 x i32] containing the loaded value.
1646 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si32(void const *__a) {
1647 struct __loadu_si32 {
1648 int __v;
1649 } __attribute__((__packed__, __may_alias__));
1650 int __u = ((const struct __loadu_si32 *)__a)->__v;
1651 return __extension__(__m128i)(__v4si){__u, 0, 0, 0};
1652 }
1653
1654 /// Loads a 16-bit integer value to the low element of a 128-bit integer
1655 /// vector and clears the upper elements.
1656 ///
1657 /// \headerfile <x86intrin.h>
1658 ///
1659 /// This intrinsic does not correspond to a specific instruction.
1660 ///
1661 /// \param __a
1662 /// A pointer to a 16-bit memory location. The address of the memory
1663 /// location does not have to be aligned.
1664 /// \returns A 128-bit vector of [8 x i16] containing the loaded value.
1665 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si16(void const *__a) {
1666 struct __loadu_si16 {
1667 short __v;
1668 } __attribute__((__packed__, __may_alias__));
1669 short __u = ((const struct __loadu_si16 *)__a)->__v;
1670 return __extension__(__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
1671 }
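
/* Illustrative sketch (not part of the original header): the _mm_loadu_siNN
 * loads read exactly NN bits from a possibly unaligned address and zero the
 * remaining lanes, which is convenient for parsing byte streams. The helper
 * name is hypothetical. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
__example_partial_loads(void const *__p) {
  __m128i __q = _mm_loadu_si64(__p); /* low 64 bits loaded, upper 64 bits zero */
  __m128i __d = _mm_loadu_si32(__p); /* low 32 bits loaded, rest zero */
  /* The packed-add intrinsics appear later in this header, so the underlying
   * vector extension is used to combine the two results. */
  return (__m128i)((__v2di)__q + (__v2di)__d);
}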
1672
1673 /// Loads a 64-bit double-precision value to the low element of a
1674 /// 128-bit vector of [2 x double] and clears the upper element.
1675 ///
1676 /// \headerfile <x86intrin.h>
1677 ///
1678 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1679 ///
1680 /// \param __dp
1681 /// A pointer to a memory location containing a double-precision value.
1682 /// The address of the memory location does not have to be aligned.
1683 /// \returns A 128-bit vector of [2 x double] containing the loaded value.
1684 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp) {
1685 struct __mm_load_sd_struct {
1686 double __u;
1687 } __attribute__((__packed__, __may_alias__));
1688 double __u = ((const struct __mm_load_sd_struct *)__dp)->__u;
1689 return __extension__(__m128d){__u, 0};
1690 }
1691
1692 /// Loads a double-precision value into the high-order bits of a 128-bit
1693 /// vector of [2 x double]. The low-order bits are copied from the low-order
1694 /// bits of the first operand.
1695 ///
1696 /// \headerfile <x86intrin.h>
1697 ///
1698 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
1699 ///
1700 /// \param __a
1701 /// A 128-bit vector of [2 x double]. \n
1702 /// Bits [63:0] are written to bits [63:0] of the result.
1703 /// \param __dp
1704 /// A pointer to a 64-bit memory location containing a double-precision
1705 /// floating-point value that is loaded. The loaded value is written to bits
1706 /// [127:64] of the result. The address of the memory location does not have
1707 /// to be aligned.
1708 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1709 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a,
1710 double const *__dp) {
1711 struct __mm_loadh_pd_struct {
1712 double __u;
1713 } __attribute__((__packed__, __may_alias__));
1714 double __u = ((const struct __mm_loadh_pd_struct *)__dp)->__u;
1715 return __extension__(__m128d){__a[0], __u};
1716 }
1717
1718 /// Loads a double-precision value into the low-order bits of a 128-bit
1719 /// vector of [2 x double]. The high-order bits are copied from the
1720 /// high-order bits of the first operand.
1721 ///
1722 /// \headerfile <x86intrin.h>
1723 ///
1724 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
1725 ///
1726 /// \param __a
1727 /// A 128-bit vector of [2 x double]. \n
1728 /// Bits [127:64] are written to bits [127:64] of the result.
1729 /// \param __dp
1730 /// A pointer to a 64-bit memory location containing a double-precision
1731 /// floating-point value that is loaded. The loaded value is written to bits
1732 /// [63:0] of the result. The address of the memory location does not have to
1733 /// be aligned.
1734 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1735 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a,
1736 double const *__dp) {
1737 struct __mm_loadl_pd_struct {
1738 double __u;
1739 } __attribute__((__packed__, __may_alias__));
1740 double __u = ((const struct __mm_loadl_pd_struct *)__dp)->__u;
1741 return __extension__(__m128d){__u, __a[1]};
1742 }
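
/* Illustrative sketch (not part of the original header): _mm_load_sd followed
 * by _mm_loadh_pd gathers two doubles from unrelated, possibly unaligned
 * addresses into one vector. The helper name is hypothetical. */
static __inline__ __m128d __DEFAULT_FN_ATTRS
__example_gather_pair(double const *__lo, double const *__hi) {
  __m128d __v = _mm_load_sd(__lo); /* {*__lo, 0.0} */
  return _mm_loadh_pd(__v, __hi);  /* {*__lo, *__hi} */
}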
1743
1744 /// Constructs a 128-bit floating-point vector of [2 x double] with
1745 /// unspecified content. This could be used as an argument to another
1746 /// intrinsic function where the argument is required but the value is not
1747 /// actually used.
1748 ///
1749 /// \headerfile <x86intrin.h>
1750 ///
1751 /// This intrinsic has no corresponding instruction.
1752 ///
1753 /// \returns A 128-bit floating-point vector of [2 x double] with unspecified
1754 /// content.
1755 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void) {
1756 return (__m128d)__builtin_ia32_undef128();
1757 }
1758
1759 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1760 /// 64 bits of the vector are initialized with the specified double-precision
1761 /// floating-point value. The upper 64 bits are set to zero.
1762 ///
1763 /// \headerfile <x86intrin.h>
1764 ///
1765 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
1766 ///
1767 /// \param __w
1768 /// A double-precision floating-point value used to initialize the lower 64
1769 /// bits of the result.
1770 /// \returns An initialized 128-bit floating-point vector of [2 x double]. The
1771 /// lower 64 bits contain the value of the parameter. The upper 64 bits are
1772 /// set to zero.
1773 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w) {
1774 return __extension__(__m128d){__w, 0.0};
1775 }
1776
1777 /// Constructs a 128-bit floating-point vector of [2 x double], with each
1778 /// of the two double-precision floating-point vector elements set to the
1779 /// specified double-precision floating-point value.
1780 ///
1781 /// \headerfile <x86intrin.h>
1782 ///
1783 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1784 ///
1785 /// \param __w
1786 /// A double-precision floating-point value used to initialize each vector
1787 /// element of the result.
1788 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1789 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w) {
1790 return __extension__(__m128d){__w, __w};
1791 }
1792
1793 /// Constructs a 128-bit floating-point vector of [2 x double], with each
1794 /// of the two double-precision floating-point vector elements set to the
1795 /// specified double-precision floating-point value.
1796 ///
1797 /// \headerfile <x86intrin.h>
1798 ///
1799 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
1800 ///
1801 /// \param __w
1802 /// A double-precision floating-point value used to initialize each vector
1803 /// element of the result.
1804 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1805 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w) {
1806 return _mm_set1_pd(__w);
1807 }
1808
1809 /// Constructs a 128-bit floating-point vector of [2 x double]
1810 /// initialized with the specified double-precision floating-point values.
1811 ///
1812 /// \headerfile <x86intrin.h>
1813 ///
1814 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1815 ///
1816 /// \param __w
1817 /// A double-precision floating-point value used to initialize the upper 64
1818 /// bits of the result.
1819 /// \param __x
1820 /// A double-precision floating-point value used to initialize the lower 64
1821 /// bits of the result.
1822 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1823 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w,
1824 double __x) {
1825 return __extension__(__m128d){__x, __w};
1826 }
1827
1828 /// Constructs a 128-bit floating-point vector of [2 x double],
1829 /// initialized in reverse order with the specified double-precision
1830 /// floating-point values.
1831 ///
1832 /// \headerfile <x86intrin.h>
1833 ///
1834 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
1835 ///
1836 /// \param __w
1837 /// A double-precision floating-point value used to initialize the lower 64
1838 /// bits of the result.
1839 /// \param __x
1840 /// A double-precision floating-point value used to initialize the upper 64
1841 /// bits of the result.
1842 /// \returns An initialized 128-bit floating-point vector of [2 x double].
1843 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w,
1844 double __x) {
1845 return __extension__(__m128d){__w, __x};
1846 }
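
/* Illustrative sketch (not part of the original header): _mm_set_pd lists the
 * elements from the high lane down, while _mm_setr_pd lists them from the low
 * lane up, so the two calls below construct the same vector. The helper name
 * is hypothetical. */
static __inline__ __m128d __DEFAULT_FN_ATTRS __example_set_order(void) {
  __m128d __a = _mm_set_pd(2.0, 1.0);  /* lane 0 = 1.0, lane 1 = 2.0 */
  __m128d __b = _mm_setr_pd(1.0, 2.0); /* lane 0 = 1.0, lane 1 = 2.0 */
  return _mm_add_pd(__a, __b);         /* {2.0, 4.0} */
}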
1847
1848 /// Constructs a 128-bit floating-point vector of [2 x double]
1849 /// initialized to zero.
1850 ///
1851 /// \headerfile <x86intrin.h>
1852 ///
1853 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
1854 ///
1855 /// \returns An initialized 128-bit floating-point vector of [2 x double] with
1856 /// all elements set to zero.
1857 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void) {
1858 return __extension__(__m128d){0.0, 0.0};
1859 }
1860
1861 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower
1862 /// 64 bits are set to the lower 64 bits of the second parameter. The upper
1863 /// 64 bits are set to the upper 64 bits of the first parameter.
1864 ///
1865 /// \headerfile <x86intrin.h>
1866 ///
1867 /// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
1868 ///
1869 /// \param __a
1870 /// A 128-bit vector of [2 x double]. The upper 64 bits are written to the
1871 /// upper 64 bits of the result.
1872 /// \param __b
1873 /// A 128-bit vector of [2 x double]. The lower 64 bits are written to the
1874 /// lower 64 bits of the result.
1875 /// \returns A 128-bit vector of [2 x double] containing the moved values.
1876 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a,
1877 __m128d __b) {
1878 __a[0] = __b[0];
1879 return __a;
1880 }
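
/* Illustrative sketch (not part of the original header): _mm_move_sd merges
 * the low lane of the second argument with the high lane of the first. The
 * helper name is hypothetical. */
static __inline__ __m128d __DEFAULT_FN_ATTRS __example_move_sd(void) {
  __m128d __a = _mm_set_pd(10.0, 11.0); /* {11.0, 10.0} */
  __m128d __b = _mm_set_pd(20.0, 21.0); /* {21.0, 20.0} */
  return _mm_move_sd(__a, __b);         /* {21.0, 10.0} */
}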
1881
1882 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
1883 /// memory location.
1884 ///
1885 /// \headerfile <x86intrin.h>
1886 ///
1887 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
1888 ///
1889 /// \param __dp
1890 /// A pointer to a 64-bit memory location.
1891 /// \param __a
1892 /// A 128-bit vector of [2 x double] containing the value to be stored.
1893 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp,
1894 __m128d __a) {
1895 struct __mm_store_sd_struct {
1896 double __u;
1897 } __attribute__((__packed__, __may_alias__));
1898 ((struct __mm_store_sd_struct *)__dp)->__u = __a[0];
1899 }
1900
1901 /// Moves packed double-precision values from a 128-bit vector of
1902 /// [2 x double] to a memory location.
1903 ///
1904 /// \headerfile <x86intrin.h>
1905 ///
1906 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
1907 ///
1908 /// \param __dp
1909 /// A pointer to an aligned memory location that can store two
1910 /// double-precision values.
1911 /// \param __a
1912 /// A packed 128-bit vector of [2 x double] containing the values to be
1913 /// moved.
1914 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp,
1915 __m128d __a) {
1916 *(__m128d *)__dp = __a;
1917 }
1918
1919 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1920 /// the upper and lower 64 bits of a memory location.
1921 ///
1922 /// \headerfile <x86intrin.h>
1923 ///
1924 /// This intrinsic corresponds to the
1925 /// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1926 ///
1927 /// \param __dp
1928 /// A pointer to a memory location that can store two double-precision
1929 /// values.
1930 /// \param __a
1931 /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1932 /// of the values in \a __dp.
1933 static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp,
1934 __m128d __a) {
1935 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
1936 _mm_store_pd(__dp, __a);
1937 }
1938
1939 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
1940 /// the upper and lower 64 bits of a memory location.
1941 ///
1942 /// \headerfile <x86intrin.h>
1943 ///
1944 /// This intrinsic corresponds to the
1945 /// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
1946 ///
1947 /// \param __dp
1948 /// A pointer to a memory location that can store two double-precision
1949 /// values.
1950 /// \param __a
1951 /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
1952 /// of the values in \a __dp.
1953 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp,
1954 __m128d __a) {
1955 _mm_store1_pd(__dp, __a);
1956 }
1957
1958 /// Stores a 128-bit vector of [2 x double] into an unaligned memory
1959 /// location.
1960 ///
1961 /// \headerfile <x86intrin.h>
1962 ///
1963 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
1964 ///
1965 /// \param __dp
1966 /// A pointer to a 128-bit memory location. The address of the memory
1967 /// location does not have to be aligned.
1968 /// \param __a
1969 /// A 128-bit vector of [2 x double] containing the values to be stored.
1970 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp,
1971 __m128d __a) {
1972 struct __storeu_pd {
1973 __m128d_u __v;
1974 } __attribute__((__packed__, __may_alias__));
1975 ((struct __storeu_pd *)__dp)->__v = __a;
1976 }
1977
1978 /// Stores two double-precision values, in reverse order, from a 128-bit
1979 /// vector of [2 x double] to a 16-byte aligned memory location.
1980 ///
1981 /// \headerfile <x86intrin.h>
1982 ///
1983 /// This intrinsic corresponds to a shuffling instruction followed by a
1984 /// <c> VMOVAPD / MOVAPD </c> instruction.
1985 ///
1986 /// \param __dp
1987 /// A pointer to a 16-byte aligned memory location that can store two
1988 /// double-precision values.
1989 /// \param __a
1990 /// A 128-bit vector of [2 x double] containing the values to be reversed and
1991 /// stored.
1992 static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp,
1993 __m128d __a) {
1994 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
1995 *(__m128d *)__dp = __a;
1996 }
1997
1998 /// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
1999 /// memory location.
2000 ///
2001 /// \headerfile <x86intrin.h>
2002 ///
2003 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
2004 ///
2005 /// \param __dp
2006 /// A pointer to a 64-bit memory location.
2007 /// \param __a
2008 /// A 128-bit vector of [2 x double] containing the value to be stored.
2009 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp,
2010 __m128d __a) {
2011 struct __mm_storeh_pd_struct {
2012 double __u;
2013 } __attribute__((__packed__, __may_alias__));
2014 ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[1];
2015 }
2016
2017 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
2018 /// memory location.
2019 ///
2020 /// \headerfile <x86intrin.h>
2021 ///
2022 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
2023 ///
2024 /// \param __dp
2025 /// A pointer to a 64-bit memory location.
2026 /// \param __a
2027 /// A 128-bit vector of [2 x double] containing the value to be stored.
2028 static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp,
2029 __m128d __a) {
2030 struct __mm_storeh_pd_struct {
2031 double __u;
2032 } __attribute__((__packed__, __may_alias__));
2033 ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[0];
2034 }
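
/* Illustrative sketch (not part of the original header): _mm_storel_pd and
 * _mm_storeh_pd scatter the two lanes of a vector to independent, possibly
 * unaligned addresses. The helper name is hypothetical. */
static __inline__ void __DEFAULT_FN_ATTRS
__example_split_store(double *__lo, double *__hi, __m128d __a) {
  _mm_storel_pd(__lo, __a); /* *__lo = __a[0] */
  _mm_storeh_pd(__hi, __a); /* *__hi = __a[1] */
}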
2035
2036 /// Adds the corresponding elements of two 128-bit vectors of [16 x i8],
2037 /// saving the lower 8 bits of each sum in the corresponding element of a
2038 /// 128-bit result vector of [16 x i8].
2039 ///
2040 /// The integer elements of both parameters can be either signed or unsigned.
2041 ///
2042 /// \headerfile <x86intrin.h>
2043 ///
2044 /// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
2045 ///
2046 /// \param __a
2047 /// A 128-bit vector of [16 x i8].
2048 /// \param __b
2049 /// A 128-bit vector of [16 x i8].
2050 /// \returns A 128-bit vector of [16 x i8] containing the sums of both
2051 /// parameters.
2052 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a,
2053 __m128i __b) {
2054 return (__m128i)((__v16qu)__a + (__v16qu)__b);
2055 }
2056
2057 /// Adds the corresponding elements of two 128-bit vectors of [8 x i16],
2058 /// saving the lower 16 bits of each sum in the corresponding element of a
2059 /// 128-bit result vector of [8 x i16].
2060 ///
2061 /// The integer elements of both parameters can be either signed or unsigned.
2062 ///
2063 /// \headerfile <x86intrin.h>
2064 ///
2065 /// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
2066 ///
2067 /// \param __a
2068 /// A 128-bit vector of [8 x i16].
2069 /// \param __b
2070 /// A 128-bit vector of [8 x i16].
2071 /// \returns A 128-bit vector of [8 x i16] containing the sums of both
2072 /// parameters.
2073 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a,
2074 __m128i __b) {
2075 return (__m128i)((__v8hu)__a + (__v8hu)__b);
2076 }
2077
2078 /// Adds the corresponding elements of two 128-bit vectors of [4 x i32],
2079 /// saving the lower 32 bits of each sum in the corresponding element of a
2080 /// 128-bit result vector of [4 x i32].
2081 ///
2082 /// The integer elements of both parameters can be either signed or unsigned.
2083 ///
2084 /// \headerfile <x86intrin.h>
2085 ///
2086 /// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
2087 ///
2088 /// \param __a
2089 /// A 128-bit vector of [4 x i32].
2090 /// \param __b
2091 /// A 128-bit vector of [4 x i32].
2092 /// \returns A 128-bit vector of [4 x i32] containing the sums of both
2093 /// parameters.
2094 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a,
2095 __m128i __b) {
2096 return (__m128i)((__v4su)__a + (__v4su)__b);
2097 }
2098
2099 /// Adds two signed or unsigned 64-bit integer values, returning the
2100 /// lower 64 bits of the sum.
2101 ///
2102 /// \headerfile <x86intrin.h>
2103 ///
2104 /// This intrinsic corresponds to the <c> PADDQ </c> instruction.
2105 ///
2106 /// \param __a
2107 /// A 64-bit integer.
2108 /// \param __b
2109 /// A 64-bit integer.
2110 /// \returns A 64-bit integer containing the sum of both parameters.
2111 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a,
2112 __m64 __b) {
2113 return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
2114 }
2115
2116 /// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
2117 /// saving the lower 64 bits of each sum in the corresponding element of a
2118 /// 128-bit result vector of [2 x i64].
2119 ///
2120 /// The integer elements of both parameters can be either signed or unsigned.
2121 ///
2122 /// \headerfile <x86intrin.h>
2123 ///
2124 /// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
2125 ///
2126 /// \param __a
2127 /// A 128-bit vector of [2 x i64].
2128 /// \param __b
2129 /// A 128-bit vector of [2 x i64].
2130 /// \returns A 128-bit vector of [2 x i64] containing the sums of both
2131 /// parameters.
2132 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a,
2133 __m128i __b) {
2134 return (__m128i)((__v2du)__a + (__v2du)__b);
2135 }
2136
2137 /// Adds, with saturation, the corresponding elements of two 128-bit
2138 /// signed [16 x i8] vectors, saving each sum in the corresponding element
2139 /// of a 128-bit result vector of [16 x i8].
2140 ///
2141 /// Positive sums greater than 0x7F are saturated to 0x7F. Negative sums
2142 /// less than -0x80 are saturated to -0x80.
2143 ///
2144 /// \headerfile <x86intrin.h>
2145 ///
2146 /// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
2147 ///
2148 /// \param __a
2149 /// A 128-bit signed [16 x i8] vector.
2150 /// \param __b
2151 /// A 128-bit signed [16 x i8] vector.
2152 /// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
2153 /// both parameters.
2154 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a,
2155 __m128i __b) {
2156 return (__m128i)__builtin_elementwise_add_sat((__v16qs)__a, (__v16qs)__b);
2157 }
2158
2159 /// Adds, with saturation, the corresponding elements of two 128-bit
2160 /// signed [8 x i16] vectors, saving each sum in the corresponding element
2161 /// of a 128-bit result vector of [8 x i16].
2162 ///
2163 /// Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums
2164 /// less than -0x8000 are saturated to -0x8000.
2165 ///
2166 /// \headerfile <x86intrin.h>
2167 ///
2168 /// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
2169 ///
2170 /// \param __a
2171 /// A 128-bit signed [8 x i16] vector.
2172 /// \param __b
2173 /// A 128-bit signed [8 x i16] vector.
2174 /// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
2175 /// both parameters.
2176 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a,
2177 __m128i __b) {
2178 return (__m128i)__builtin_elementwise_add_sat((__v8hi)__a, (__v8hi)__b);
2179 }
2180
2181 /// Adds, with saturation, the corresponding elements of two 128-bit
2182 /// unsigned [16 x i8] vectors, saving each sum in the corresponding element
2183 /// of a 128-bit result vector of [16 x i8].
2184 ///
2185 /// Sums greater than 0xFF are saturated to 0xFF. Because both operands are
2186 /// unsigned, the sums cannot be negative and no lower saturation occurs.
2187 ///
2188 /// \headerfile <x86intrin.h>
2189 ///
2190 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
2191 ///
2192 /// \param __a
2193 /// A 128-bit unsigned [16 x i8] vector.
2194 /// \param __b
2195 /// A 128-bit unsigned [16 x i8] vector.
2196 /// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
2197 /// of both parameters.
2198 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a,
2199 __m128i __b) {
2200 return (__m128i)__builtin_elementwise_add_sat((__v16qu)__a, (__v16qu)__b);
2201 }
2202
2203 /// Adds, with saturation, the corresponding elements of two 128-bit
2204 /// unsigned [8 x i16] vectors, saving each sum in the corresponding element
2205 /// of a 128-bit result vector of [8 x i16].
2206 ///
2207 /// Sums greater than 0xFFFF are saturated to 0xFFFF. Because both operands
2208 /// are unsigned, the sums cannot be negative and no lower saturation occurs.
2209 ///
2210 /// \headerfile <x86intrin.h>
2211 ///
2212 /// This intrinsic corresponds to the <c> VPADDUSW / PADDUSW </c> instruction.
2213 ///
2214 /// \param __a
2215 /// A 128-bit unsigned [8 x i16] vector.
2216 /// \param __b
2217 /// A 128-bit unsigned [8 x i16] vector.
2218 /// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
2219 /// of both parameters.
2220 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a,
2221 __m128i __b) {
2222 return (__m128i)__builtin_elementwise_add_sat((__v8hu)__a, (__v8hu)__b);
2223 }
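
/* Illustrative sketch (not part of the original header): _mm_add_epi8 wraps
 * modulo 256 while _mm_adds_epu8 clamps at 0xFF; for example, 0xF0 + 0x20
 * yields 0x10 with the former and 0xFF with the latter. The helper name is
 * hypothetical. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
__example_wrap_vs_saturate(__m128i __a, __m128i __b) {
  __m128i __wrapped = _mm_add_epi8(__a, __b);  /* keeps the low 8 bits of each sum */
  __m128i __clamped = _mm_adds_epu8(__a, __b); /* clamps each sum to [0, 0xFF] */
  /* Nonzero lanes mark where wrapping occurred; the packed-subtract
   * intrinsics appear later in this header, so the underlying vector
   * extension is used here. */
  return (__m128i)((__v16qu)__clamped - (__v16qu)__wrapped);
}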
2224
2225 /// Computes the rounded averages of corresponding elements of two
2226 /// 128-bit unsigned [16 x i8] vectors, saving each result in the
2227 /// corresponding element of a 128-bit result vector of [16 x i8].
2228 ///
2229 /// \headerfile <x86intrin.h>
2230 ///
2231 /// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
2232 ///
2233 /// \param __a
2234 /// A 128-bit unsigned [16 x i8] vector.
2235 /// \param __b
2236 /// A 128-bit unsigned [16 x i8] vector.
2237 /// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
2238 /// averages of both parameters.
2239 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a,
2240 __m128i __b) {
2241 return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
2242 }
2243
2244 /// Computes the rounded averages of corresponding elements of two
2245 /// 128-bit unsigned [8 x i16] vectors, saving each result in the
2246 /// corresponding element of a 128-bit result vector of [8 x i16].
2247 ///
2248 /// \headerfile <x86intrin.h>
2249 ///
2250 /// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
2251 ///
2252 /// \param __a
2253 /// A 128-bit unsigned [8 x i16] vector.
2254 /// \param __b
2255 /// A 128-bit unsigned [8 x i16] vector.
2256 /// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
2257 /// averages of both parameters.
2258 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a,
2259 __m128i __b) {
2260 return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
2261 }
2262
2263 /// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
2264 /// vectors, producing eight intermediate 32-bit signed integer products, and
2265 /// adds the consecutive pairs of 32-bit products to form a 128-bit signed
2266 /// [4 x i32] vector.
2267 ///
2268 /// For example, bits [15:0] of both parameters are multiplied producing a
2269 /// 32-bit product, bits [31:16] of both parameters are multiplied producing
2270 /// a 32-bit product, and the sum of those two products becomes bits [31:0]
2271 /// of the result.
2272 ///
2273 /// \headerfile <x86intrin.h>
2274 ///
2275 /// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
2276 ///
2277 /// \param __a
2278 /// A 128-bit signed [8 x i16] vector.
2279 /// \param __b
2280 /// A 128-bit signed [8 x i16] vector.
2281 /// \returns A 128-bit signed [4 x i32] vector containing the sums of products
2282 /// of both parameters.
2283 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a,
2284 __m128i __b) {
2285 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
2286 }
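
/* Illustrative sketch (not part of the original header): _mm_madd_epi16 is the
 * usual building block for 16-bit dot products; each 32-bit result lane holds
 * a[2i]*b[2i] + a[2i+1]*b[2i+1], and the four lanes can then be reduced with
 * further adds. The helper name is hypothetical. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
__example_dot_step(__m128i __a, __m128i __b) {
  return _mm_madd_epi16(__a, __b); /* four partial sums of products */
}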
2287
2288 /// Compares corresponding elements of two 128-bit signed [8 x i16]
2289 /// vectors, saving the greater value from each comparison in the
2290 /// corresponding element of a 128-bit result vector of [8 x i16].
2291 ///
2292 /// \headerfile <x86intrin.h>
2293 ///
2294 /// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
2295 ///
2296 /// \param __a
2297 /// A 128-bit signed [8 x i16] vector.
2298 /// \param __b
2299 /// A 128-bit signed [8 x i16] vector.
2300 /// \returns A 128-bit signed [8 x i16] vector containing the greater value of
2301 /// each comparison.
2302 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a,
2303 __m128i __b) {
2304 return (__m128i)__builtin_elementwise_max((__v8hi)__a, (__v8hi)__b);
2305 }
2306
2307 /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2308 /// vectors, saving the greater value from each comparison in the
2309 /// corresponding element of a 128-bit result vector of [16 x i8].
2310 ///
2311 /// \headerfile <x86intrin.h>
2312 ///
2313 /// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
2314 ///
2315 /// \param __a
2316 /// A 128-bit unsigned [16 x i8] vector.
2317 /// \param __b
2318 /// A 128-bit unsigned [16 x i8] vector.
2319 /// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
2320 /// each comparison.
2321 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a,
2322 __m128i __b) {
2323 return (__m128i)__builtin_elementwise_max((__v16qu)__a, (__v16qu)__b);
2324 }
2325
2326 /// Compares corresponding elements of two 128-bit signed [8 x i16]
2327 /// vectors, saving the smaller value from each comparison in the
2328 /// corresponding element of a 128-bit result vector of [8 x i16].
2329 ///
2330 /// \headerfile <x86intrin.h>
2331 ///
2332 /// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
2333 ///
2334 /// \param __a
2335 /// A 128-bit signed [8 x i16] vector.
2336 /// \param __b
2337 /// A 128-bit signed [8 x i16] vector.
2338 /// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
2339 /// each comparison.
2340 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a,
2341 __m128i __b) {
2342 return (__m128i)__builtin_elementwise_min((__v8hi)__a, (__v8hi)__b);
2343 }
2344
2345 /// Compares corresponding elements of two 128-bit unsigned [16 x i8]
2346 /// vectors, saving the smaller value from each comparison in the
2347 /// corresponding element of a 128-bit result vector of [16 x i8].
2348 ///
2349 /// \headerfile <x86intrin.h>
2350 ///
2351 /// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
2352 ///
2353 /// \param __a
2354 /// A 128-bit unsigned [16 x i8] vector.
2355 /// \param __b
2356 /// A 128-bit unsigned [16 x i8] vector.
2357 /// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
2358 /// each comparison.
2359 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a,
2360 __m128i __b) {
2361 return (__m128i)__builtin_elementwise_min((__v16qu)__a, (__v16qu)__b);
2362 }
2363
2364 /// Multiplies the corresponding elements of two signed [8 x i16]
2365 /// vectors, saving the upper 16 bits of each 32-bit product in the
2366 /// corresponding element of a 128-bit signed [8 x i16] result vector.
2367 ///
2368 /// \headerfile <x86intrin.h>
2369 ///
2370 /// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
2371 ///
2372 /// \param __a
2373 /// A 128-bit signed [8 x i16] vector.
2374 /// \param __b
2375 /// A 128-bit signed [8 x i16] vector.
2376 /// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
2377 /// each of the eight 32-bit products.
2378 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a,
2379 __m128i __b) {
2380 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
2381 }
2382
2383 /// Multiplies the corresponding elements of two unsigned [8 x i16]
2384 /// vectors, saving the upper 16 bits of each 32-bit product in the
2385 /// corresponding element of a 128-bit unsigned [8 x i16] result vector.
2386 ///
2387 /// \headerfile <x86intrin.h>
2388 ///
2389 /// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
2390 ///
2391 /// \param __a
2392 /// A 128-bit unsigned [8 x i16] vector.
2393 /// \param __b
2394 /// A 128-bit unsigned [8 x i16] vector.
2395 /// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
2396 /// of each of the eight 32-bit products.
2397 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a,
2398 __m128i __b) {
2399 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
2400 }
2401
2402 /// Multiplies the corresponding elements of two signed [8 x i16]
2403 /// vectors, saving the lower 16 bits of each 32-bit product in the
2404 /// corresponding element of a 128-bit signed [8 x i16] result vector.
2405 ///
2406 /// \headerfile <x86intrin.h>
2407 ///
2408 /// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
2409 ///
2410 /// \param __a
2411 /// A 128-bit signed [8 x i16] vector.
2412 /// \param __b
2413 /// A 128-bit signed [8 x i16] vector.
2414 /// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
2415 /// each of the eight 32-bit products.
2416 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a,
2417 __m128i __b) {
2418 return (__m128i)((__v8hu)__a * (__v8hu)__b);
2419 }
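
/* Illustrative sketch (not part of the original header): pairing
 * _mm_mullo_epi16 with _mm_mulhi_epi16 recovers full 32-bit signed products;
 * interleaving the low four lane pairs yields a [4 x i32] vector of complete
 * products. The helper name is hypothetical, and since the unpack intrinsics
 * that would normally do the interleave appear later in this header, the
 * shuffle builtin is used directly. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
__example_full_products_lo(__m128i __a, __m128i __b) {
  __m128i __lo = _mm_mullo_epi16(__a, __b); /* lower 16 bits of each product */
  __m128i __hi = _mm_mulhi_epi16(__a, __b); /* upper 16 bits of each product */
  return (__m128i)__builtin_shufflevector((__v8hi)__lo, (__v8hi)__hi, 0, 8, 1,
                                          9, 2, 10, 3, 11);
}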
2420
2421 /// Multiplies 32-bit unsigned integer values contained in the lower bits
2422 /// of the two 64-bit integer vectors and returns the 64-bit unsigned
2423 /// product.
2424 ///
2425 /// \headerfile <x86intrin.h>
2426 ///
2427 /// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
2428 ///
2429 /// \param __a
2430 /// A 64-bit integer containing one of the source operands.
2431 /// \param __b
2432 /// A 64-bit integer containing one of the source operands.
2433 /// \returns A 64-bit integer vector containing the product of both operands.
2434 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a,
2435 __m64 __b) {
2436 return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
2437 }
2438
2439 /// Multiplies 32-bit unsigned integer values contained in the lower
2440 /// bits of the corresponding elements of two [2 x i64] vectors, and returns
2441 /// the 64-bit products in the corresponding elements of a [2 x i64] vector.
2442 ///
2443 /// \headerfile <x86intrin.h>
2444 ///
2445 /// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
2446 ///
2447 /// \param __a
2448 /// A [2 x i64] vector containing one of the source operands.
2449 /// \param __b
2450 /// A [2 x i64] vector containing one of the source operands.
2451 /// \returns A [2 x i64] vector containing the product of both operands.
2452 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a,
2453 __m128i __b) {
2454 return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
2455 }
2456
2457 /// Computes the absolute differences of corresponding 8-bit integer
2458 /// values in two 128-bit vectors. Sums the first 8 absolute differences, and
2459 /// separately sums the second 8 absolute differences. Packs these two
2460 /// unsigned 16-bit integer sums into the upper and lower elements of a
2461 /// [2 x i64] vector.
2462 ///
2463 /// \headerfile <x86intrin.h>
2464 ///
2465 /// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
2466 ///
2467 /// \param __a
2468 /// A 128-bit integer vector containing one of the source operands.
2469 /// \param __b
2470 /// A 128-bit integer vector containing one of the source operands.
2471 /// \returns A [2 x i64] vector containing the sums of the sets of absolute
2472 /// differences between both operands.
2473 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a,
2474 __m128i __b) {
2475 return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
2476 }
2477
2478 /// Subtracts the corresponding 8-bit integer values in the operands.
2479 ///
2480 /// \headerfile <x86intrin.h>
2481 ///
2482 /// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
2483 ///
2484 /// \param __a
2485 /// A 128-bit integer vector containing the minuends.
2486 /// \param __b
2487 /// A 128-bit integer vector containing the subtrahends.
2488 /// \returns A 128-bit integer vector containing the differences of the values
2489 /// in the operands.
2490 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a,
2491 __m128i __b) {
2492 return (__m128i)((__v16qu)__a - (__v16qu)__b);
2493 }
2494
2495 /// Subtracts the corresponding 16-bit integer values in the operands.
2496 ///
2497 /// \headerfile <x86intrin.h>
2498 ///
2499 /// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
2500 ///
2501 /// \param __a
2502 /// A 128-bit integer vector containing the minuends.
2503 /// \param __b
2504 /// A 128-bit integer vector containing the subtrahends.
2505 /// \returns A 128-bit integer vector containing the differences of the values
2506 /// in the operands.
2507 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a,
2508 __m128i __b) {
2509 return (__m128i)((__v8hu)__a - (__v8hu)__b);
2510 }
2511
2512 /// Subtracts the corresponding 32-bit integer values in the operands.
2513 ///
2514 /// \headerfile <x86intrin.h>
2515 ///
2516 /// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
2517 ///
2518 /// \param __a
2519 /// A 128-bit integer vector containing the minuends.
2520 /// \param __b
2521 /// A 128-bit integer vector containing the subtrahends.
2522 /// \returns A 128-bit integer vector containing the differences of the values
2523 /// in the operands.
2524 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a,
2525 __m128i __b) {
2526 return (__m128i)((__v4su)__a - (__v4su)__b);
2527 }
2528
2529 /// Subtracts signed or unsigned 64-bit integer values and writes the
2530 /// difference to the corresponding bits in the destination.
2531 ///
2532 /// \headerfile <x86intrin.h>
2533 ///
2534 /// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
2535 ///
2536 /// \param __a
2537 /// A 64-bit integer vector containing the minuend.
2538 /// \param __b
2539 /// A 64-bit integer vector containing the subtrahend.
2540 /// \returns A 64-bit integer vector containing the difference of the values in
2541 /// the operands.
2542 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a,
2543 __m64 __b) {
2544 return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
2545 }
2546
2547 /// Subtracts the corresponding elements of two [2 x i64] vectors.
2548 ///
2549 /// \headerfile <x86intrin.h>
2550 ///
2551 /// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
2552 ///
2553 /// \param __a
2554 /// A 128-bit integer vector containing the minuends.
2555 /// \param __b
2556 /// A 128-bit integer vector containing the subtrahends.
2557 /// \returns A 128-bit integer vector containing the differences of the values
2558 /// in the operands.
2559 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a,
2560 __m128i __b) {
2561 return (__m128i)((__v2du)__a - (__v2du)__b);
2562 }
2563
2564 /// Subtracts, with saturation, corresponding 8-bit signed integer values in
2565 /// the input and returns the differences in the corresponding bytes in the
2566 /// destination.
2567 ///
2568 /// Differences greater than 0x7F are saturated to 0x7F, and differences
2569 /// less than -0x80 are saturated to -0x80.
2570 ///
2571 /// \headerfile <x86intrin.h>
2572 ///
2573 /// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
2574 ///
2575 /// \param __a
2576 /// A 128-bit integer vector containing the minuends.
2577 /// \param __b
2578 /// A 128-bit integer vector containing the subtrahends.
2579 /// \returns A 128-bit integer vector containing the differences of the values
2580 /// in the operands.
2581 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a,
2582 __m128i __b) {
2583 return (__m128i)__builtin_elementwise_sub_sat((__v16qs)__a, (__v16qs)__b);
2584 }
2585
2586 /// Subtracts, with saturation, corresponding 16-bit signed integer values in
2587 /// the input and returns the differences in the corresponding elements of the
2588 /// destination.
2589 ///
2590 /// Differences greater than 0x7FFF are saturated to 0x7FFF, and differences
2591 /// less than -0x8000 are saturated to -0x8000.
2592 ///
2593 /// \headerfile <x86intrin.h>
2594 ///
2595 /// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
2596 ///
2597 /// \param __a
2598 /// A 128-bit integer vector containing the minuends.
2599 /// \param __b
2600 /// A 128-bit integer vector containing the subtrahends.
2601 /// \returns A 128-bit integer vector containing the differences of the values
2602 /// in the operands.
2603 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a,
2604 __m128i __b) {
2605 return (__m128i)__builtin_elementwise_sub_sat((__v8hi)__a, (__v8hi)__b);
2606 }
2607
2608 /// Subtracts, with saturation, corresponding 8-bit unsigned integer values in
2609 /// the input and returns the differences in the corresponding bytes in the
2610 /// destination.
2611 ///
2612 /// Differences less than 0x00 are saturated to 0x00.
2613 ///
2614 /// \headerfile <x86intrin.h>
2615 ///
2616 /// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
2617 ///
2618 /// \param __a
2619 /// A 128-bit integer vector containing the minuends.
2620 /// \param __b
2621 /// A 128-bit integer vector containing the subtrahends.
2622 /// \returns A 128-bit integer vector containing the unsigned integer
2623 /// differences of the values in the operands.
2624 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a,
2625 __m128i __b) {
2626 return (__m128i)__builtin_elementwise_sub_sat((__v16qu)__a, (__v16qu)__b);
2627 }
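
/* Illustrative usage sketch (editorial addition, not part of the original
 * header; the helper name is hypothetical): a per-byte absolute difference
 * built from two saturating subtractions. For each byte at most one of the
 * two results is nonzero, so an ordinary add reassembles |__x - __y| without
 * overflow. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
__example_abs_diff_epu8(__m128i __x, __m128i __y) {
  return _mm_add_epi8(_mm_subs_epu8(__x, __y), _mm_subs_epu8(__y, __x));
}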
2628
2629 /// Subtracts, with saturation, corresponding 16-bit unsigned integer values in
2630 /// the input and returns the differences in the corresponding elements of the
2631 /// destination.
2632 ///
2633 /// Differences less than 0x0000 are saturated to 0x0000.
2634 ///
2635 /// \headerfile <x86intrin.h>
2636 ///
2637 /// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
2638 ///
2639 /// \param __a
2640 /// A 128-bit integer vector containing the minuends.
2641 /// \param __b
2642 /// A 128-bit integer vector containing the subtrahends.
2643 /// \returns A 128-bit integer vector containing the unsigned integer
2644 /// differences of the values in the operands.
2645 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a,
2646 __m128i __b) {
2647 return (__m128i)__builtin_elementwise_sub_sat((__v8hu)__a, (__v8hu)__b);
2648 }
2649
2650 /// Performs a bitwise AND of two 128-bit integer vectors.
2651 ///
2652 /// \headerfile <x86intrin.h>
2653 ///
2654 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
2655 ///
2656 /// \param __a
2657 /// A 128-bit integer vector containing one of the source operands.
2658 /// \param __b
2659 /// A 128-bit integer vector containing one of the source operands.
2660 /// \returns A 128-bit integer vector containing the bitwise AND of the values
2661 /// in both operands.
2662 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a,
2663 __m128i __b) {
2664 return (__m128i)((__v2du)__a & (__v2du)__b);
2665 }
2666
2667 /// Performs a bitwise AND of two 128-bit integer vectors, using the
2668 /// one's complement of the values contained in the first source operand.
2669 ///
2670 /// \headerfile <x86intrin.h>
2671 ///
2672 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
2673 ///
2674 /// \param __a
2675 /// A 128-bit vector containing the left source operand. The one's complement
2676 /// of this value is used in the bitwise AND.
2677 /// \param __b
2678 /// A 128-bit vector containing the right source operand.
2679 /// \returns A 128-bit integer vector containing the bitwise AND of the one's
2680 /// complement of the first operand and the values in the second operand.
2681 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a,
2682 __m128i __b) {
2683 return (__m128i)(~(__v2du)__a & (__v2du)__b);
2684 }

2685 /// Performs a bitwise OR of two 128-bit integer vectors.
2686 ///
2687 /// \headerfile <x86intrin.h>
2688 ///
2689 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
2690 ///
2691 /// \param __a
2692 /// A 128-bit integer vector containing one of the source operands.
2693 /// \param __b
2694 /// A 128-bit integer vector containing one of the source operands.
2695 /// \returns A 128-bit integer vector containing the bitwise OR of the values
2696 /// in both operands.
2697 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a,
2698 __m128i __b) {
2699 return (__m128i)((__v2du)__a | (__v2du)__b);
2700 }
2701
2702 /// Performs a bitwise exclusive OR of two 128-bit integer vectors.
2703 ///
2704 /// \headerfile <x86intrin.h>
2705 ///
2706 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
2707 ///
2708 /// \param __a
2709 /// A 128-bit integer vector containing one of the source operands.
2710 /// \param __b
2711 /// A 128-bit integer vector containing one of the source operands.
2712 /// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
2713 /// values in both operands.
2714 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a,
2715 __m128i __b) {
2716 return (__m128i)((__v2du)__a ^ (__v2du)__b);
2717 }
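
/* Illustrative usage sketch (editorial addition, not part of the original
 * header; the helper name is hypothetical): the classic SSE2 bitwise select.
 * Where a bit of __mask is 1 the result takes the bit from __x, otherwise
 * from __y. __mask is typically produced by one of the compare intrinsics. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
__example_select_si128(__m128i __mask, __m128i __x, __m128i __y) {
  return _mm_or_si128(_mm_and_si128(__mask, __x),
                      _mm_andnot_si128(__mask, __y));
}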
2718
2719 /// Left-shifts the 128-bit integer vector operand by the specified
2720 /// number of bytes. Low-order bits are cleared.
2721 ///
2722 /// \headerfile <x86intrin.h>
2723 ///
2724 /// \code
2725 /// __m128i _mm_slli_si128(__m128i a, const int imm);
2726 /// \endcode
2727 ///
2728 /// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
2729 ///
2730 /// \param a
2731 /// A 128-bit integer vector containing the source operand.
2732 /// \param imm
2733 /// An immediate value specifying the number of bytes to left-shift operand
2734 /// \a a.
2735 /// \returns A 128-bit integer vector containing the left-shifted value.
2736 #define _mm_slli_si128(a, imm) \
2737 ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \
2738 (int)(imm)))
2739
2740 #define _mm_bslli_si128(a, imm) \
2741 ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \
2742 (int)(imm)))
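
/* Illustrative usage sketch (editorial addition, not part of the original
 * header; the helper name is hypothetical). The shift count is a byte count,
 * must be an integer constant expression, and counts greater than 15 yield a
 * zero vector. Shifting left by 2 bytes moves every element toward the
 * high-order end of the vector. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
__example_shift_up_two_bytes(__m128i __v) {
  return _mm_slli_si128(__v, 2);
}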
2743
2744 /// Left-shifts each 16-bit value in the 128-bit integer vector operand
2745 /// by the specified number of bits. Low-order bits are cleared.
2746 ///
2747 /// \headerfile <x86intrin.h>
2748 ///
2749 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2750 ///
2751 /// \param __a
2752 /// A 128-bit integer vector containing the source operand.
2753 /// \param __count
2754 /// An integer value specifying the number of bits to left-shift each value
2755 /// in operand \a __a.
2756 /// \returns A 128-bit integer vector containing the left-shifted values.
2757 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a,
2758 int __count) {
2759 return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
2760 }
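
/* Illustrative usage sketch (editorial addition, not part of the original
 * header; the helper name is hypothetical): a left shift by 3 multiplies each
 * 16-bit lane by 8, discarding any bits shifted out of the lane. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
__example_mul8_epi16(__m128i __v) {
  return _mm_slli_epi16(__v, 3);
}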
2761
2762 /// Left-shifts each 16-bit value in the 128-bit integer vector operand
2763 /// by the specified number of bits. Low-order bits are cleared.
2764 ///
2765 /// \headerfile <x86intrin.h>
2766 ///
2767 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
2768 ///
2769 /// \param __a
2770 /// A 128-bit integer vector containing the source operand.
2771 /// \param __count
2772 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2773 /// to left-shift each value in operand \a __a.
2774 /// \returns A 128-bit integer vector containing the left-shifted values.
2775 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a,
2776 __m128i __count) {
2777 return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
2778 }
2779
2780 /// Left-shifts each 32-bit value in the 128-bit integer vector operand
2781 /// by the specified number of bits. Low-order bits are cleared.
2782 ///
2783 /// \headerfile <x86intrin.h>
2784 ///
2785 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2786 ///
2787 /// \param __a
2788 /// A 128-bit integer vector containing the source operand.
2789 /// \param __count
2790 /// An integer value specifying the number of bits to left-shift each value
2791 /// in operand \a __a.
2792 /// \returns A 128-bit integer vector containing the left-shifted values.
2793 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a,
2794 int __count) {
2795 return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
2796 }
2797
2798 /// Left-shifts each 32-bit value in the 128-bit integer vector operand
2799 /// by the specified number of bits. Low-order bits are cleared.
2800 ///
2801 /// \headerfile <x86intrin.h>
2802 ///
2803 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
2804 ///
2805 /// \param __a
2806 /// A 128-bit integer vector containing the source operand.
2807 /// \param __count
2808 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2809 /// to left-shift each value in operand \a __a.
2810 /// \returns A 128-bit integer vector containing the left-shifted values.
2811 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a,
2812 __m128i __count) {
2813 return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
2814 }
2815
2816 /// Left-shifts each 64-bit value in the 128-bit integer vector operand
2817 /// by the specified number of bits. Low-order bits are cleared.
2818 ///
2819 /// \headerfile <x86intrin.h>
2820 ///
2821 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2822 ///
2823 /// \param __a
2824 /// A 128-bit integer vector containing the source operand.
2825 /// \param __count
2826 /// An integer value specifying the number of bits to left-shift each value
2827 /// in operand \a __a.
2828 /// \returns A 128-bit integer vector containing the left-shifted values.
2829 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a,
2830 int __count) {
2831 return __builtin_ia32_psllqi128((__v2di)__a, __count);
2832 }
2833
2834 /// Left-shifts each 64-bit value in the 128-bit integer vector operand
2835 /// by the specified number of bits. Low-order bits are cleared.
2836 ///
2837 /// \headerfile <x86intrin.h>
2838 ///
2839 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
2840 ///
2841 /// \param __a
2842 /// A 128-bit integer vector containing the source operand.
2843 /// \param __count
2844 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2845 /// to left-shift each value in operand \a __a.
2846 /// \returns A 128-bit integer vector containing the left-shifted values.
2847 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a,
2848 __m128i __count) {
2849 return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
2850 }
2851
2852 /// Right-shifts each 16-bit value in the 128-bit integer vector operand
2853 /// by the specified number of bits. High-order bits are filled with the sign
2854 /// bit of the initial value.
2855 ///
2856 /// \headerfile <x86intrin.h>
2857 ///
2858 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2859 ///
2860 /// \param __a
2861 /// A 128-bit integer vector containing the source operand.
2862 /// \param __count
2863 /// An integer value specifying the number of bits to right-shift each value
2864 /// in operand \a __a.
2865 /// \returns A 128-bit integer vector containing the right-shifted values.
2866 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a,
2867 int __count) {
2868 return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
2869 }
2870
2871 /// Right-shifts each 16-bit value in the 128-bit integer vector operand
2872 /// by the specified number of bits. High-order bits are filled with the sign
2873 /// bit of the initial value.
2874 ///
2875 /// \headerfile <x86intrin.h>
2876 ///
2877 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
2878 ///
2879 /// \param __a
2880 /// A 128-bit integer vector containing the source operand.
2881 /// \param __count
2882 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2883 /// to right-shift each value in operand \a __a.
2884 /// \returns A 128-bit integer vector containing the right-shifted values.
2885 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a,
2886 __m128i __count) {
2887 return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
2888 }
2889
2890 /// Right-shifts each 32-bit value in the 128-bit integer vector operand
2891 /// by the specified number of bits. High-order bits are filled with the sign
2892 /// bit of the initial value.
2893 ///
2894 /// \headerfile <x86intrin.h>
2895 ///
2896 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2897 ///
2898 /// \param __a
2899 /// A 128-bit integer vector containing the source operand.
2900 /// \param __count
2901 /// An integer value specifying the number of bits to right-shift each value
2902 /// in operand \a __a.
2903 /// \returns A 128-bit integer vector containing the right-shifted values.
2904 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a,
2905 int __count) {
2906 return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
2907 }
2908
2909 /// Right-shifts each 32-bit value in the 128-bit integer vector operand
2910 /// by the specified number of bits. High-order bits are filled with the sign
2911 /// bit of the initial value.
2912 ///
2913 /// \headerfile <x86intrin.h>
2914 ///
2915 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
2916 ///
2917 /// \param __a
2918 /// A 128-bit integer vector containing the source operand.
2919 /// \param __count
2920 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2921 /// to right-shift each value in operand \a __a.
2922 /// \returns A 128-bit integer vector containing the right-shifted values.
2923 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a,
2924 __m128i __count) {
2925 return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
2926 }
2927
2928 /// Right-shifts the 128-bit integer vector operand by the specified
2929 /// number of bytes. High-order bits are cleared.
2930 ///
2931 /// \headerfile <x86intrin.h>
2932 ///
2933 /// \code
2934 /// __m128i _mm_srli_si128(__m128i a, const int imm);
2935 /// \endcode
2936 ///
2937 /// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
2938 ///
2939 /// \param a
2940 /// A 128-bit integer vector containing the source operand.
2941 /// \param imm
2942 /// An immediate value specifying the number of bytes to right-shift operand
2943 /// \a a.
2944 /// \returns A 128-bit integer vector containing the right-shifted value.
2945 #define _mm_srli_si128(a, imm) \
2946 ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \
2947 (int)(imm)))
2948
2949 #define _mm_bsrli_si128(a, imm) \
2950 ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \
2951 (int)(imm)))
2952
2953 /// Right-shifts each 16-bit value in the 128-bit integer vector
2954 /// operand by the specified number of bits. High-order bits are cleared.
2955 ///
2956 /// \headerfile <x86intrin.h>
2957 ///
2958 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2959 ///
2960 /// \param __a
2961 /// A 128-bit integer vector containing the source operand.
2962 /// \param __count
2963 /// An integer value specifying the number of bits to right-shift each value
2964 /// in operand \a __a.
2965 /// \returns A 128-bit integer vector containing the right-shifted values.
2966 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a,
2967 int __count) {
2968 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
2969 }
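
/* Illustrative usage sketch (editorial addition, not part of the original
 * header; the helper names are hypothetical): the arithmetic shift
 * (_mm_srai_epi16) replicates the sign bit and so divides signed lanes by 4,
 * rounding toward negative infinity, while the logical shift (_mm_srli_epi16)
 * shifts in zeros and treats the lanes as unsigned. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
__example_div4_signed_epi16(__m128i __v) {
  return _mm_srai_epi16(__v, 2);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS
__example_div4_unsigned_epi16(__m128i __v) {
  return _mm_srli_epi16(__v, 2);
}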
2970
2971 /// Right-shifts each 16-bit value in the 128-bit integer vector
2972 /// operand by the specified number of bits. High-order bits are cleared.
2973 ///
2974 /// \headerfile <x86intrin.h>
2975 ///
2976 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
2977 ///
2978 /// \param __a
2979 /// A 128-bit integer vector containing the source operand.
2980 /// \param __count
2981 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
2982 /// to right-shift each value in operand \a __a.
2983 /// \returns A 128-bit integer vector containing the right-shifted values.
2984 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a,
2985 __m128i __count) {
2986 return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
2987 }
2988
2989 /// Right-shifts each 32-bit value in the 128-bit integer vector
2990 /// operand by the specified number of bits. High-order bits are cleared.
2991 ///
2992 /// \headerfile <x86intrin.h>
2993 ///
2994 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
2995 ///
2996 /// \param __a
2997 /// A 128-bit integer vector containing the source operand.
2998 /// \param __count
2999 /// An integer value specifying the number of bits to right-shift each value
3000 /// in operand \a __a.
3001 /// \returns A 128-bit integer vector containing the right-shifted values.
3002 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a,
3003 int __count) {
3004 return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
3005 }
3006
3007 /// Right-shifts each 32-bit value in the 128-bit integer vector
3008 /// operand by the specified number of bits. High-order bits are cleared.
3009 ///
3010 /// \headerfile <x86intrin.h>
3011 ///
3012 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
3013 ///
3014 /// \param __a
3015 /// A 128-bit integer vector containing the source operand.
3016 /// \param __count
3017 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
3018 /// to right-shift each value in operand \a __a.
3019 /// \returns A 128-bit integer vector containing the right-shifted values.
3020 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a,
3021 __m128i __count) {
3022 return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
3023 }
3024
3025 /// Right-shifts each 64-bit value in the 128-bit integer vector
3026 /// operand by the specified number of bits. High-order bits are cleared.
3027 ///
3028 /// \headerfile <x86intrin.h>
3029 ///
3030 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
3031 ///
3032 /// \param __a
3033 /// A 128-bit integer vector containing the source operand.
3034 /// \param __count
3035 /// An integer value specifying the number of bits to right-shift each value
3036 /// in operand \a __a.
3037 /// \returns A 128-bit integer vector containing the right-shifted values.
3038 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a,
3039 int __count) {
3040 return __builtin_ia32_psrlqi128((__v2di)__a, __count);
3041 }
3042
3043 /// Right-shifts each 64-bit value in the 128-bit integer vector
3044 /// operand by the specified number of bits. High-order bits are cleared.
3045 ///
3046 /// \headerfile <x86intrin.h>
3047 ///
3048 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
3049 ///
3050 /// \param __a
3051 /// A 128-bit integer vector containing the source operand.
3052 /// \param __count
3053 /// A 128-bit integer vector in which bits [63:0] specify the number of bits
3054 /// to right-shift each value in operand \a __a.
3055 /// \returns A 128-bit integer vector containing the right-shifted values.
3056 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a,
3057 __m128i __count) {
3058 return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
3059 }
3060
3061 /// Compares each of the corresponding 8-bit values of the 128-bit
3062 /// integer vectors for equality.
3063 ///
3064 /// Each comparison returns 0x0 for false, 0xFF for true.
3065 ///
3066 /// \headerfile <x86intrin.h>
3067 ///
3068 /// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
3069 ///
3070 /// \param __a
3071 /// A 128-bit integer vector.
3072 /// \param __b
3073 /// A 128-bit integer vector.
3074 /// \returns A 128-bit integer vector containing the comparison results.
3075 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a,
3076 __m128i __b) {
3077 return (__m128i)((__v16qi)__a == (__v16qi)__b);
3078 }
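
/* Illustrative usage sketch (editorial addition, not part of the original
 * header; the helper name is hypothetical): a comparison result is an
 * all-ones or all-zeros mask per lane, so ANDing it with __x keeps only the
 * bytes of __x that are equal to the corresponding bytes of __y and zeroes
 * the rest. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
__example_keep_equal_bytes(__m128i __x, __m128i __y) {
  return _mm_and_si128(__x, _mm_cmpeq_epi8(__x, __y));
}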
3079
3080 /// Compares each of the corresponding 16-bit values of the 128-bit
3081 /// integer vectors for equality.
3082 ///
3083 /// Each comparison returns 0x0 for false, 0xFFFF for true.
3084 ///
3085 /// \headerfile <x86intrin.h>
3086 ///
3087 /// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
3088 ///
3089 /// \param __a
3090 /// A 128-bit integer vector.
3091 /// \param __b
3092 /// A 128-bit integer vector.
3093 /// \returns A 128-bit integer vector containing the comparison results.
3094 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a,
3095 __m128i __b) {
3096 return (__m128i)((__v8hi)__a == (__v8hi)__b);
3097 }
3098
3099 /// Compares each of the corresponding 32-bit values of the 128-bit
3100 /// integer vectors for equality.
3101 ///
3102 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3103 ///
3104 /// \headerfile <x86intrin.h>
3105 ///
3106 /// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
3107 ///
3108 /// \param __a
3109 /// A 128-bit integer vector.
3110 /// \param __b
3111 /// A 128-bit integer vector.
3112 /// \returns A 128-bit integer vector containing the comparison results.
3113 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a,
3114 __m128i __b) {
3115 return (__m128i)((__v4si)__a == (__v4si)__b);
3116 }
3117
3118 /// Compares each of the corresponding signed 8-bit values of the 128-bit
3119 /// integer vectors to determine if the values in the first operand are
3120 /// greater than those in the second operand.
3121 ///
3122 /// Each comparison returns 0x0 for false, 0xFF for true.
3123 ///
3124 /// \headerfile <x86intrin.h>
3125 ///
3126 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3127 ///
3128 /// \param __a
3129 /// A 128-bit integer vector.
3130 /// \param __b
3131 /// A 128-bit integer vector.
3132 /// \returns A 128-bit integer vector containing the comparison results.
3133 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a,
3134 __m128i __b) {
3135 /* This function always performs a signed comparison, but __v16qi is a char
3136 which may be signed or unsigned, so use __v16qs. */
3137 return (__m128i)((__v16qs)__a > (__v16qs)__b);
3138 }
3139
3140 /// Compares each of the corresponding signed 16-bit values of the
3141 /// 128-bit integer vectors to determine if the values in the first operand
3142 /// are greater than those in the second operand.
3143 ///
3144 /// Each comparison returns 0x0 for false, 0xFFFF for true.
3145 ///
3146 /// \headerfile <x86intrin.h>
3147 ///
3148 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3149 ///
3150 /// \param __a
3151 /// A 128-bit integer vector.
3152 /// \param __b
3153 /// A 128-bit integer vector.
3154 /// \returns A 128-bit integer vector containing the comparison results.
3155 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a,
3156 __m128i __b) {
3157 return (__m128i)((__v8hi)__a > (__v8hi)__b);
3158 }
3159
3160 /// Compares each of the corresponding signed 32-bit values of the
3161 /// 128-bit integer vectors to determine if the values in the first operand
3162 /// are greater than those in the second operand.
3163 ///
3164 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3165 ///
3166 /// \headerfile <x86intrin.h>
3167 ///
3168 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3169 ///
3170 /// \param __a
3171 /// A 128-bit integer vector.
3172 /// \param __b
3173 /// A 128-bit integer vector.
3174 /// \returns A 128-bit integer vector containing the comparison results.
3175 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a,
3176 __m128i __b) {
3177 return (__m128i)((__v4si)__a > (__v4si)__b);
3178 }
3179
3180 /// Compares each of the corresponding signed 8-bit values of the 128-bit
3181 /// integer vectors to determine if the values in the first operand are less
3182 /// than those in the second operand.
3183 ///
3184 /// Each comparison returns 0x0 for false, 0xFF for true.
3185 ///
3186 /// \headerfile <x86intrin.h>
3187 ///
3188 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
3189 ///
3190 /// \param __a
3191 /// A 128-bit integer vector.
3192 /// \param __b
3193 /// A 128-bit integer vector.
3194 /// \returns A 128-bit integer vector containing the comparison results.
3195 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a,
3196 __m128i __b) {
3197 return _mm_cmpgt_epi8(__b, __a);
3198 }
3199
3200 /// Compares each of the corresponding signed 16-bit values of the
3201 /// 128-bit integer vectors to determine if the values in the first operand
3202 /// are less than those in the second operand.
3203 ///
3204 /// Each comparison returns 0x0 for false, 0xFFFF for true.
3205 ///
3206 /// \headerfile <x86intrin.h>
3207 ///
3208 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
3209 ///
3210 /// \param __a
3211 /// A 128-bit integer vector.
3212 /// \param __b
3213 /// A 128-bit integer vector.
3214 /// \returns A 128-bit integer vector containing the comparison results.
3215 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a,
3216 __m128i __b) {
3217 return _mm_cmpgt_epi16(__b, __a);
3218 }
3219
3220 /// Compares each of the corresponding signed 32-bit values of the
3221 /// 128-bit integer vectors to determine if the values in the first operand
3222 /// are less than those in the second operand.
3223 ///
3224 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true.
3225 ///
3226 /// \headerfile <x86intrin.h>
3227 ///
3228 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
3229 ///
3230 /// \param __a
3231 /// A 128-bit integer vector.
3232 /// \param __b
3233 /// A 128-bit integer vector.
3234 /// \returns A 128-bit integer vector containing the comparison results.
3235 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a,
3236 __m128i __b) {
3237 return _mm_cmpgt_epi32(__b, __a);
3238 }
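
/* Illustrative usage sketch (editorial addition, not part of the original
 * header; the helper name is hypothetical): SSE2 has no packed 32-bit minimum
 * instruction, so a signed per-lane minimum is commonly assembled from a
 * compare mask and the bitwise-select idiom shown earlier. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
__example_min_epi32(__m128i __x, __m128i __y) {
  __m128i __mask = _mm_cmplt_epi32(__x, __y); /* all-ones where __x < __y */
  return _mm_or_si128(_mm_and_si128(__mask, __x),
                      _mm_andnot_si128(__mask, __y));
}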
3239
3240 #ifdef __x86_64__
3241 /// Converts a 64-bit signed integer value from the second operand into a
3242 /// double-precision value and returns it in the lower element of a [2 x
3243 /// double] vector; the upper element of the returned vector is copied from
3244 /// the upper element of the first operand.
3245 ///
3246 /// \headerfile <x86intrin.h>
3247 ///
3248 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
3249 ///
3250 /// \param __a
3251 /// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
3252 /// copied to the upper 64 bits of the destination.
3253 /// \param __b
3254 /// A 64-bit signed integer operand containing the value to be converted.
3255 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
3256 /// converted value of the second operand. The upper 64 bits are copied from
3257 /// the upper 64 bits of the first operand.
3258 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi64_sd(__m128d __a,
3259 long long __b) {
3260 __a[0] = __b;
3261 return __a;
3262 }
3263
3264 /// Converts the first (lower) element of a vector of [2 x double] into a
3265 /// 64-bit signed integer value.
3266 ///
3267 /// If the converted value does not fit in a 64-bit integer, raises a
3268 /// floating-point invalid exception. If the exception is masked, returns
3269 /// the most negative integer.
3270 ///
3271 /// \headerfile <x86intrin.h>
3272 ///
3273 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
3274 ///
3275 /// \param __a
3276 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3277 /// conversion.
3278 /// \returns A 64-bit signed integer containing the converted value.
3279 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsd_si64(__m128d __a) {
3280 return __builtin_ia32_cvtsd2si64((__v2df)__a);
3281 }
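
/* Illustrative usage sketch (editorial addition, not part of the original
 * header; the helper name is hypothetical and the code is only meaningful in
 * 64-bit mode): rounding the lower double to a whole number by converting to
 * a 64-bit integer (using the current rounding mode, round-to-nearest-even by
 * default) and converting back. */
static __inline__ __m128d __DEFAULT_FN_ATTRS
__example_round_lower_double(__m128d __v) {
  return _mm_cvtsi64_sd(__v, _mm_cvtsd_si64(__v));
}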
3282
3283 /// Converts the first (lower) element of a vector of [2 x double] into a
3284 /// 64-bit signed truncated (rounded toward zero) integer value.
3285 ///
3286 /// If a converted value does not fit in a 64-bit integer, raises a
3287 /// floating-point invalid exception. If the exception is masked, returns
3288 /// the most negative integer.
3289 ///
3290 /// \headerfile <x86intrin.h>
3291 ///
3292 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
3293 /// instruction.
3294 ///
3295 /// \param __a
3296 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
3297 /// conversion.
3298 /// \returns A 64-bit signed integer containing the converted value.
3299 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvttsd_si64(__m128d __a) {
3300 return __builtin_ia32_cvttsd2si64((__v2df)__a);
3301 }
3302 #endif
3303
3304 /// Converts a vector of [4 x i32] into a vector of [4 x float].
3305 ///
3306 /// \headerfile <x86intrin.h>
3307 ///
3308 /// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
3309 ///
3310 /// \param __a
3311 /// A 128-bit integer vector.
3312 /// \returns A 128-bit vector of [4 x float] containing the converted values.
3313 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a) {
3314 return (__m128) __builtin_convertvector((__v4si)__a, __v4sf);
3315 }
3316
3317 /// Converts a vector of [4 x float] into a vector of [4 x i32].
3318 ///
3319 /// If a converted value does not fit in a 32-bit integer, raises a
3320 /// floating-point invalid exception. If the exception is masked, returns
3321 /// the most negative integer.
3322 ///
3323 /// \headerfile <x86intrin.h>
3324 ///
3325 /// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
3326 ///
3327 /// \param __a
3328 /// A 128-bit vector of [4 x float].
3329 /// \returns A 128-bit integer vector of [4 x i32] containing the converted
3330 /// values.
3331 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a) {
3332 return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
3333 }
3334
3335 /// Converts a vector of [4 x float] into four signed truncated (rounded toward
3336 /// zero) 32-bit integers, returned in a vector of [4 x i32].
3337 ///
3338 /// If a converted value does not fit in a 32-bit integer, raises a
3339 /// floating-point invalid exception. If the exception is masked, returns
3340 /// the most negative integer.
3341 ///
3342 /// \headerfile <x86intrin.h>
3343 ///
3344 /// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
3345 /// instruction.
3346 ///
3347 /// \param __a
3348 /// A 128-bit vector of [4 x float].
3349 /// \returns A 128-bit vector of [4 x i32] containing the converted values.
3350 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a) {
3351 return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
3352 }
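
/* Illustrative usage sketch (editorial addition, not part of the original
 * header; the helper name is hypothetical): _mm_cvtps_epi32 rounds according
 * to the current rounding mode (round-to-nearest-even by default), while
 * _mm_cvttps_epi32 always truncates toward zero. The difference below is
 * nonzero in any lane where the two conversions disagree, e.g. for 1.5f. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
__example_round_minus_truncate(__m128 __v) {
  return _mm_sub_epi32(_mm_cvtps_epi32(__v), _mm_cvttps_epi32(__v));
}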
3353
3354 /// Returns a vector of [4 x i32] where the lowest element is the input
3355 /// operand and the remaining elements are zero.
3356 ///
3357 /// \headerfile <x86intrin.h>
3358 ///
3359 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3360 ///
3361 /// \param __a
3362 /// A 32-bit signed integer operand.
3363 /// \returns A 128-bit vector of [4 x i32].
3364 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a) {
3365 return __extension__(__m128i)(__v4si){__a, 0, 0, 0};
3366 }
3367
3368 /// Returns a vector of [2 x i64] where the lower element is the input
3369 /// operand and the upper element is zero.
3370 ///
3371 /// \headerfile <x86intrin.h>
3372 ///
3373 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction
3374 /// in 64-bit mode.
3375 ///
3376 /// \param __a
3377 /// A 64-bit signed integer operand containing the value to be converted.
3378 /// \returns A 128-bit vector of [2 x i64] containing the converted value.
3379 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a) {
3380 return __extension__(__m128i)(__v2di){__a, 0};
3381 }
3382
3383 /// Moves the least significant 32 bits of a vector of [4 x i32] to a
3384 /// 32-bit signed integer value.
3385 ///
3386 /// \headerfile <x86intrin.h>
3387 ///
3388 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3389 ///
3390 /// \param __a
3391 /// A vector of [4 x i32]. The least significant 32 bits are moved to the
3392 /// destination.
3393 /// \returns A 32-bit signed integer containing the moved value.
3394 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a) {
3395 __v4si __b = (__v4si)__a;
3396 return __b[0];
3397 }
3398
3399 /// Moves the least significant 64 bits of a vector of [2 x i64] to a
3400 /// 64-bit signed integer value.
3401 ///
3402 /// \headerfile <x86intrin.h>
3403 ///
3404 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3405 ///
3406 /// \param __a
3407 /// A vector of [2 x i64]. The least significant 64 bits are moved to the
3408 /// destination.
3409 /// \returns A 64-bit signed integer containing the moved value.
3410 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a) {
3411 return __a[0];
3412 }
3413
3414 /// Moves packed integer values from an aligned 128-bit memory location
3415 /// to elements in a 128-bit integer vector.
3416 ///
3417 /// \headerfile <x86intrin.h>
3418 ///
3419 /// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
3420 ///
3421 /// \param __p
3422 /// An aligned pointer to a memory location containing integer values.
3423 /// \returns A 128-bit integer vector containing the moved values.
3424 static __inline__ __m128i __DEFAULT_FN_ATTRS
3425 _mm_load_si128(__m128i const *__p) {
3426 return *__p;
3427 }
3428
3429 /// Moves packed integer values from an unaligned 128-bit memory location
3430 /// to elements in a 128-bit integer vector.
3431 ///
3432 /// \headerfile <x86intrin.h>
3433 ///
3434 /// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
3435 ///
3436 /// \param __p
3437 /// A pointer to a memory location containing integer values.
3438 /// \returns A 128-bit integer vector containing the moved values.
3439 static __inline__ __m128i __DEFAULT_FN_ATTRS
3440 _mm_loadu_si128(__m128i_u const *__p) {
3441 struct __loadu_si128 {
3442 __m128i_u __v;
3443 } __attribute__((__packed__, __may_alias__));
3444 return ((const struct __loadu_si128 *)__p)->__v;
3445 }
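
/* Illustrative usage sketch (editorial addition, not part of the original
 * header; the helper name is hypothetical): loading 16 bytes from an
 * arbitrarily aligned buffer. _mm_load_si128 requires a 16-byte-aligned
 * address, whereas _mm_loadu_si128 places no alignment requirement on __p. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
__example_load_16_bytes(const void *__p) {
  return _mm_loadu_si128((const __m128i_u *)__p);
}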
3446
3447 /// Returns a vector of [2 x i64] where the lower element is taken from
3448 /// the lower element of the operand, and the upper element is zero.
3449 ///
3450 /// \headerfile <x86intrin.h>
3451 ///
3452 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3453 ///
3454 /// \param __p
3455 /// A pointer to a 64-bit memory location. The 64-bit value at this location
3456 /// is copied to bits [63:0] of the destination.
3457 /// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
3458 /// moved value. The higher order bits are cleared.
3459 static __inline__ __m128i __DEFAULT_FN_ATTRS
3460 _mm_loadl_epi64(__m128i_u const *__p) {
3461 struct __mm_loadl_epi64_struct {
3462 long long __u;
3463 } __attribute__((__packed__, __may_alias__));
3464 return __extension__(__m128i){
3465 ((const struct __mm_loadl_epi64_struct *)__p)->__u, 0};
3466 }
3467
3468 /// Generates a 128-bit vector of [4 x i32] with unspecified content.
3469 /// This could be used as an argument to another intrinsic function where the
3470 /// argument is required but the value is not actually used.
3471 ///
3472 /// \headerfile <x86intrin.h>
3473 ///
3474 /// This intrinsic has no corresponding instruction.
3475 ///
3476 /// \returns A 128-bit vector of [4 x i32] with unspecified content.
3477 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void) {
3478 return (__m128i)__builtin_ia32_undef128();
3479 }
3480
3481 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3482 /// the specified 64-bit integer values.
3483 ///
3484 /// \headerfile <x86intrin.h>
3485 ///
3486 /// This intrinsic is a utility function and does not correspond to a specific
3487 /// instruction.
3488 ///
3489 /// \param __q1
3490 /// A 64-bit integer value used to initialize the upper 64 bits of the
3491 /// destination vector of [2 x i64].
3492 /// \param __q0
3493 /// A 64-bit integer value used to initialize the lower 64 bits of the
3494 /// destination vector of [2 x i64].
3495 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
3496 /// provided in the operands.
3497 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1,
3498 long long __q0) {
3499 return __extension__(__m128i)(__v2di){__q0, __q1};
3500 }
3501
3502 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
3503 /// the specified 64-bit integer values.
3504 ///
3505 /// \headerfile <x86intrin.h>
3506 ///
3507 /// This intrinsic is a utility function and does not correspond to a specific
3508 /// instruction.
3509 ///
3510 /// \param __q1
3511 /// A 64-bit integer value used to initialize the upper 64 bits of the
3512 /// destination vector of [2 x i64].
3513 /// \param __q0
3514 /// A 64-bit integer value used to initialize the lower 64 bits of the
3515 /// destination vector of [2 x i64].
3516 /// \returns An initialized 128-bit vector of [2 x i64] containing the values
3517 /// provided in the operands.
3518 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1,
3519 __m64 __q0) {
3520 return _mm_set_epi64x((long long)__q1, (long long)__q0);
3521 }
3522
3523 /// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
3524 /// the specified 32-bit integer values.
3525 ///
3526 /// \headerfile <x86intrin.h>
3527 ///
3528 /// This intrinsic is a utility function and does not correspond to a specific
3529 /// instruction.
3530 ///
3531 /// \param __i3
3532 /// A 32-bit integer value used to initialize bits [127:96] of the
3533 /// destination vector.
3534 /// \param __i2
3535 /// A 32-bit integer value used to initialize bits [95:64] of the destination
3536 /// vector.
3537 /// \param __i1
3538 /// A 32-bit integer value used to initialize bits [63:32] of the destination
3539 /// vector.
3540 /// \param __i0
3541 /// A 32-bit integer value used to initialize bits [31:0] of the destination
3542 /// vector.
3543 /// \returns An initialized 128-bit vector of [4 x i32] containing the values
3544 /// provided in the operands.
3545 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2,
3546 int __i1, int __i0) {
3547 return __extension__(__m128i)(__v4si){__i0, __i1, __i2, __i3};
3548 }
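
/* Illustrative usage sketch (editorial addition, not part of the original
 * header; the helper name is hypothetical). Note the argument order: the
 * first argument of _mm_set_epi32 initializes the highest element, so the
 * vector below holds 0 in bits [31:0] up through 3 in bits [127:96];
 * _mm_setr_epi32 takes the same values in the reverse (memory) order. */
static __inline__ __m128i __DEFAULT_FN_ATTRS
__example_lane_indices_epi32(void) {
  return _mm_set_epi32(3, 2, 1, 0);
}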
3549
3550 /// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
3551 /// the specified 16-bit integer values.
3552 ///
3553 /// \headerfile <x86intrin.h>
3554 ///
3555 /// This intrinsic is a utility function and does not correspond to a specific
3556 /// instruction.
3557 ///
3558 /// \param __w7
3559 /// A 16-bit integer value used to initialize bits [127:112] of the
3560 /// destination vector.
3561 /// \param __w6
3562 /// A 16-bit integer value used to initialize bits [111:96] of the
3563 /// destination vector.
3564 /// \param __w5
3565 /// A 16-bit integer value used to initialize bits [95:80] of the destination
3566 /// vector.
3567 /// \param __w4
3568 /// A 16-bit integer value used to initialize bits [79:64] of the destination
3569 /// vector.
3570 /// \param __w3
3571 /// A 16-bit integer value used to initialize bits [63:48] of the destination
3572 /// vector.
3573 /// \param __w2
3574 /// A 16-bit integer value used to initialize bits [47:32] of the destination
3575 /// vector.
3576 /// \param __w1
3577 /// A 16-bit integer value used to initialize bits [31:16] of the destination
3578 /// vector.
3579 /// \param __w0
3580 /// A 16-bit integer value used to initialize bits [15:0] of the destination
3581 /// vector.
3582 /// \returns An initialized 128-bit vector of [8 x i16] containing the values
3583 /// provided in the operands.
3584 static __inline__ __m128i __DEFAULT_FN_ATTRS
3585 _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3,
3586 short __w2, short __w1, short __w0) {
3587 return __extension__(__m128i)(__v8hi){__w0, __w1, __w2, __w3,
3588 __w4, __w5, __w6, __w7};
3589 }
3590
3591 /// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
3592 /// the specified 8-bit integer values.
3593 ///
3594 /// \headerfile <x86intrin.h>
3595 ///
3596 /// This intrinsic is a utility function and does not correspond to a specific
3597 /// instruction.
3598 ///
3599 /// \param __b15
3600 /// Initializes bits [127:120] of the destination vector.
3601 /// \param __b14
3602 /// Initializes bits [119:112] of the destination vector.
3603 /// \param __b13
3604 /// Initializes bits [111:104] of the destination vector.
3605 /// \param __b12
3606 /// Initializes bits [103:96] of the destination vector.
3607 /// \param __b11
3608 /// Initializes bits [95:88] of the destination vector.
3609 /// \param __b10
3610 /// Initializes bits [87:80] of the destination vector.
3611 /// \param __b9
3612 /// Initializes bits [79:72] of the destination vector.
3613 /// \param __b8
3614 /// Initializes bits [71:64] of the destination vector.
3615 /// \param __b7
3616 /// Initializes bits [63:56] of the destination vector.
3617 /// \param __b6
3618 /// Initializes bits [55:48] of the destination vector.
3619 /// \param __b5
3620 /// Initializes bits [47:40] of the destination vector.
3621 /// \param __b4
3622 /// Initializes bits [39:32] of the destination vector.
3623 /// \param __b3
3624 /// Initializes bits [31:24] of the destination vector.
3625 /// \param __b2
3626 /// Initializes bits [23:16] of the destination vector.
3627 /// \param __b1
3628 /// Initializes bits [15:8] of the destination vector.
3629 /// \param __b0
3630 /// Initializes bits [7:0] of the destination vector.
3631 /// \returns An initialized 128-bit vector of [16 x i8] containing the values
3632 /// provided in the operands.
3633 static __inline__ __m128i __DEFAULT_FN_ATTRS
3634 _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11,
3635 char __b10, char __b9, char __b8, char __b7, char __b6, char __b5,
3636 char __b4, char __b3, char __b2, char __b1, char __b0) {
3637 return __extension__(__m128i)(__v16qi){
3638 __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7,
3639 __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15};
3640 }
3641
3642 /// Initializes both values in a 128-bit integer vector with the
3643 /// specified 64-bit integer value.
3644 ///
3645 /// \headerfile <x86intrin.h>
3646 ///
3647 /// This intrinsic is a utility function and does not correspond to a specific
3648 /// instruction.
3649 ///
3650 /// \param __q
3651 /// Integer value used to initialize the elements of the destination integer
3652 /// vector.
3653 /// \returns An initialized 128-bit integer vector of [2 x i64] with both
3654 /// elements containing the value provided in the operand.
3655 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q) {
3656 return _mm_set_epi64x(__q, __q);
3657 }
3658
3659 /// Initializes both values in a 128-bit vector of [2 x i64] with the
3660 /// specified 64-bit value.
3661 ///
3662 /// \headerfile <x86intrin.h>
3663 ///
3664 /// This intrinsic is a utility function and does not correspond to a specific
3665 /// instruction.
3666 ///
3667 /// \param __q
3668 /// A 64-bit value used to initialize the elements of the destination integer
3669 /// vector.
3670 /// \returns An initialized 128-bit vector of [2 x i64] with all elements
3671 /// containing the value provided in the operand.
3672 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q) {
3673 return _mm_set_epi64(__q, __q);
3674 }
3675
3676 /// Initializes all values in a 128-bit vector of [4 x i32] with the
3677 /// specified 32-bit value.
3678 ///
3679 /// \headerfile <x86intrin.h>
3680 ///
3681 /// This intrinsic is a utility function and does not correspond to a specific
3682 /// instruction.
3683 ///
3684 /// \param __i
3685 /// A 32-bit value used to initialize the elements of the destination integer
3686 /// vector.
3687 /// \returns An initialized 128-bit vector of [4 x i32] with all elements
3688 /// containing the value provided in the operand.
3689 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i) {
3690 return _mm_set_epi32(__i, __i, __i, __i);
3691 }
3692
3693 /// Initializes all values in a 128-bit vector of [8 x i16] with the
3694 /// specified 16-bit value.
3695 ///
3696 /// \headerfile <x86intrin.h>
3697 ///
3698 /// This intrinsic is a utility function and does not correspond to a specific
3699 /// instruction.
3700 ///
3701 /// \param __w
3702 /// A 16-bit value used to initialize the elements of the destination integer
3703 /// vector.
3704 /// \returns An initialized 128-bit vector of [8 x i16] with all elements
3705 /// containing the value provided in the operand.
3706 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w) {
3707 return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w);
3708 }
3709
3710 /// Initializes all values in a 128-bit vector of [16 x i8] with the
3711 /// specified 8-bit value.
3712 ///
3713 /// \headerfile <x86intrin.h>
3714 ///
3715 /// This intrinsic is a utility function and does not correspond to a specific
3716 /// instruction.
3717 ///
3718 /// \param __b
3719 /// An 8-bit value used to initialize the elements of the destination integer
3720 /// vector.
3721 /// \returns An initialized 128-bit vector of [16 x i8] with all elements
3722 /// containing the value provided in the operand.
3723 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b) {
3724 return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b,
3725 __b, __b, __b, __b, __b);
3726 }
3727
3728 /// Constructs a 128-bit integer vector, initialized in reverse order
3729 /// with the specified 64-bit integral values.
3730 ///
3731 /// \headerfile <x86intrin.h>
3732 ///
3733 /// This intrinsic does not correspond to a specific instruction.
3734 ///
3735 /// \param __q0
3736 /// A 64-bit integral value used to initialize the lower 64 bits of the
3737 /// result.
3738 /// \param __q1
3739 /// A 64-bit integral value used to initialize the upper 64 bits of the
3740 /// result.
3741 /// \returns An initialized 128-bit integer vector.
3742 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0,
3743 __m64 __q1) {
3744 return _mm_set_epi64(__q1, __q0);
3745 }
3746
3747 /// Constructs a 128-bit integer vector, initialized in reverse order
3748 /// with the specified 32-bit integral values.
3749 ///
3750 /// \headerfile <x86intrin.h>
3751 ///
3752 /// This intrinsic is a utility function and does not correspond to a specific
3753 /// instruction.
3754 ///
3755 /// \param __i0
3756 /// A 32-bit integral value used to initialize bits [31:0] of the result.
3757 /// \param __i1
3758 /// A 32-bit integral value used to initialize bits [63:32] of the result.
3759 /// \param __i2
3760 /// A 32-bit integral value used to initialize bits [95:64] of the result.
3761 /// \param __i3
3762 /// A 32-bit integral value used to initialize bits [127:96] of the result.
3763 /// \returns An initialized 128-bit integer vector.
3764 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1,
3765 int __i2,
3766 int __i3) {
3767 return _mm_set_epi32(__i3, __i2, __i1, __i0);
3768 }
3769
3770 /// Constructs a 128-bit integer vector, initialized in reverse order
3771 /// with the specified 16-bit integral values.
3772 ///
3773 /// \headerfile <x86intrin.h>
3774 ///
3775 /// This intrinsic is a utility function and does not correspond to a specific
3776 /// instruction.
3777 ///
3778 /// \param __w0
3779 /// A 16-bit integral value used to initialize bits [15:0] of the result.
3780 /// \param __w1
3781 /// A 16-bit integral value used to initialize bits [31:16] of the result.
3782 /// \param __w2
3783 /// A 16-bit integral value used to initialize bits [47:32] of the result.
3784 /// \param __w3
3785 /// A 16-bit integral value used to initialize bits [63:48] of the result.
3786 /// \param __w4
3787 /// A 16-bit integral value used to initialize bits [79:64] of the result.
3788 /// \param __w5
3789 /// A 16-bit integral value used to initialize bits [95:80] of the result.
3790 /// \param __w6
3791 /// A 16-bit integral value used to initialize bits [111:96] of the result.
3792 /// \param __w7
3793 /// A 16-bit integral value used to initialize bits [127:112] of the result.
3794 /// \returns An initialized 128-bit integer vector.
3795 static __inline__ __m128i __DEFAULT_FN_ATTRS
3796 _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4,
3797 short __w5, short __w6, short __w7) {
3798 return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0);
3799 }
3800
3801 /// Constructs a 128-bit integer vector, initialized in reverse order
3802 /// with the specified 8-bit integral values.
3803 ///
3804 /// \headerfile <x86intrin.h>
3805 ///
3806 /// This intrinsic is a utility function and does not correspond to a specific
3807 /// instruction.
3808 ///
3809 /// \param __b0
3810 /// An 8-bit integral value used to initialize bits [7:0] of the result.
3811 /// \param __b1
3812 /// An 8-bit integral value used to initialize bits [15:8] of the result.
3813 /// \param __b2
3814 /// An 8-bit integral value used to initialize bits [23:16] of the result.
3815 /// \param __b3
3816 /// An 8-bit integral value used to initialize bits [31:24] of the result.
3817 /// \param __b4
3818 /// An 8-bit integral value used to initialize bits [39:32] of the result.
3819 /// \param __b5
3820 /// An 8-bit integral value used to initialize bits [47:40] of the result.
3821 /// \param __b6
3822 /// An 8-bit integral value used to initialize bits [55:48] of the result.
3823 /// \param __b7
3824 /// An 8-bit integral value used to initialize bits [63:56] of the result.
3825 /// \param __b8
3826 /// An 8-bit integral value used to initialize bits [71:64] of the result.
3827 /// \param __b9
3828 /// An 8-bit integral value used to initialize bits [79:72] of the result.
3829 /// \param __b10
3830 /// An 8-bit integral value used to initialize bits [87:80] of the result.
3831 /// \param __b11
3832 /// An 8-bit integral value used to initialize bits [95:88] of the result.
3833 /// \param __b12
3834 /// An 8-bit integral value used to initialize bits [103:96] of the result.
3835 /// \param __b13
3836 /// An 8-bit integral value used to initialize bits [111:104] of the result.
3837 /// \param __b14
3838 /// An 8-bit integral value used to initialize bits [119:112] of the result.
3839 /// \param __b15
3840 /// An 8-bit integral value used to initialize bits [127:120] of the result.
3841 /// \returns An initialized 128-bit integer vector.
3842 static __inline__ __m128i __DEFAULT_FN_ATTRS
3843 _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
3844 char __b6, char __b7, char __b8, char __b9, char __b10,
3845 char __b11, char __b12, char __b13, char __b14, char __b15) {
3846 return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8,
3847 __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
3848 }
3849
3850 /// Creates a 128-bit integer vector initialized to zero.
3851 ///
3852 /// \headerfile <x86intrin.h>
3853 ///
3854 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
3855 ///
3856 /// \returns An initialized 128-bit integer vector with all elements set to
3857 /// zero.
3858 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void) {
3859 return __extension__(__m128i)(__v2di){0LL, 0LL};
3860 }
3861
3862 /// Stores a 128-bit integer vector to a memory location aligned on a
3863 /// 128-bit boundary.
3864 ///
3865 /// \headerfile <x86intrin.h>
3866 ///
3867 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
3868 ///
3869 /// \param __p
3870 /// A pointer to an aligned memory location that will receive the integer
3871 /// values.
3872 /// \param __b
3873 /// A 128-bit integer vector containing the values to be moved.
3874 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p,
3875 __m128i __b) {
3876 *__p = __b;
3877 }
3878
3879 /// Stores a 128-bit integer vector to an unaligned memory location.
3880 ///
3881 /// \headerfile <x86intrin.h>
3882 ///
3883 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
3884 ///
3885 /// \param __p
3886 /// A pointer to a memory location that will receive the integer values.
3887 /// \param __b
3888 /// A 128-bit integer vector containing the values to be moved.
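///
/// For example, an illustrative sketch of a store to a deliberately
/// misaligned address (the buffer and offset are assumptions for the
/// example):
/// \code
///   unsigned char buf[24];
///   _mm_storeu_si128((__m128i_u *)(buf + 3), _mm_set1_epi8(0x5A));
/// \endcode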
3889 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p,
3890 __m128i __b) {
3891 struct __storeu_si128 {
3892 __m128i_u __v;
3893 } __attribute__((__packed__, __may_alias__));
3894 ((struct __storeu_si128 *)__p)->__v = __b;
3895 }
3896
3897 /// Stores a 64-bit integer value from the low element of a 128-bit integer
3898 /// vector.
3899 ///
3900 /// \headerfile <x86intrin.h>
3901 ///
3902 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
3903 ///
3904 /// \param __p
3905 /// A pointer to a 64-bit memory location. The address of the memory
3906 /// location does not have to be aligned.
3907 /// \param __b
3908 /// A 128-bit integer vector containing the value to be stored.
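///
/// For example, an illustrative sketch copying the low 64 bits into a scalar
/// (the variable names are assumptions for the example):
/// \code
///   long long lo;
///   _mm_storeu_si64(&lo, _mm_set_epi64x(7, 42)); // lo == 42
/// \endcode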
3909 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si64(void *__p,
3910 __m128i __b) {
3911 struct __storeu_si64 {
3912 long long __v;
3913 } __attribute__((__packed__, __may_alias__));
3914 ((struct __storeu_si64 *)__p)->__v = ((__v2di)__b)[0];
3915 }
3916
3917 /// Stores a 32-bit integer value from the low element of a 128-bit integer
3918 /// vector.
3919 ///
3920 /// \headerfile <x86intrin.h>
3921 ///
3922 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
3923 ///
3924 /// \param __p
3925 /// A pointer to a 32-bit memory location. The address of the memory
3926 /// location does not have to be aligned.
3927 /// \param __b
3928 /// A 128-bit integer vector containing the value to be stored.
3929 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si32(void *__p,
3930 __m128i __b) {
3931 struct __storeu_si32 {
3932 int __v;
3933 } __attribute__((__packed__, __may_alias__));
3934 ((struct __storeu_si32 *)__p)->__v = ((__v4si)__b)[0];
3935 }
3936
3937 /// Stores a 16-bit integer value from the low element of a 128-bit integer
3938 /// vector.
3939 ///
3940 /// \headerfile <x86intrin.h>
3941 ///
3942 /// This intrinsic does not correspond to a specific instruction.
3943 ///
3944 /// \param __p
3945 /// A pointer to a 16-bit memory location. The address of the memory
3946 /// location does not have to be aligned.
3947 /// \param __b
3948 /// A 128-bit integer vector containing the value to be stored.
3949 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si16(void *__p,
3950 __m128i __b) {
3951 struct __storeu_si16 {
3952 short __v;
3953 } __attribute__((__packed__, __may_alias__));
3954 ((struct __storeu_si16 *)__p)->__v = ((__v8hi)__b)[0];
3955 }
3956
3957 /// Moves bytes selected by the mask from the first operand to the
3958 /// specified unaligned memory location. When a mask bit is 1, the
3959 /// corresponding byte is written, otherwise it is not written.
3960 ///
3961 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
3962 /// used again soon). Exception and trap behavior for elements not selected
3963 /// for storage to memory are implementation dependent.
3964 ///
3965 /// \headerfile <x86intrin.h>
3966 ///
3967 /// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
3968 /// instruction.
3969 ///
3970 /// \param __d
3971 /// A 128-bit integer vector containing the values to be moved.
3972 /// \param __n
3973 /// A 128-bit integer vector containing the mask. The most significant bit of
3974 ///    each byte is the mask bit for the corresponding byte.
3975 /// \param __p
3976 /// A pointer to an unaligned 128-bit memory location where the specified
3977 /// values are moved.
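///
/// For example, an illustrative sketch that writes only the even-indexed
/// bytes (the output buffer is an assumption for the example):
/// \code
///   char out[16] = {0};
///   __m128i data = _mm_set1_epi8(0x7F);
///   // Only bytes whose corresponding mask byte has its most significant
///   // bit set are written to memory.
///   __m128i mask = _mm_setr_epi8(-1, 0, -1, 0, -1, 0, -1, 0,
///                                -1, 0, -1, 0, -1, 0, -1, 0);
///   _mm_maskmoveu_si128(data, mask, out);
/// \endcode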
3978 static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d,
3979 __m128i __n,
3980 char *__p) {
3981 __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
3982 }
3983
3984 /// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
3985 /// a memory location.
3986 ///
3987 /// \headerfile <x86intrin.h>
3988 ///
3989 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
3990 ///
3991 /// \param __p
3992 /// A pointer to a 64-bit memory location that will receive the lower 64 bits
3993 /// of the integer vector parameter.
3994 /// \param __a
3995 /// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
3996 /// value to be stored.
3997 static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p,
3998 __m128i __a) {
3999 struct __mm_storel_epi64_struct {
4000 long long __u;
4001 } __attribute__((__packed__, __may_alias__));
4002 ((struct __mm_storel_epi64_struct *)__p)->__u = __a[0];
4003 }
4004
4005 /// Stores a 128-bit floating point vector of [2 x double] to a 128-bit
4006 /// aligned memory location.
4007 ///
4008 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
4009 /// used again soon).
4010 ///
4011 /// \headerfile <x86intrin.h>
4012 ///
4013 /// This intrinsic corresponds to the <c> VMOVNTPD / MOVNTPD </c> instruction.
4014 ///
4015 /// \param __p
4016 /// A pointer to the 128-bit aligned memory location used to store the value.
4017 /// \param __a
4018 /// A vector of [2 x double] containing the 64-bit values to be stored.
4019 static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(void *__p,
4020 __m128d __a) {
4021 __builtin_nontemporal_store((__v2df)__a, (__v2df *)__p);
4022 }
4023
4024 /// Stores a 128-bit integer vector to a 128-bit aligned memory location.
4025 ///
4026 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
4027 /// used again soon).
4028 ///
4029 /// \headerfile <x86intrin.h>
4030 ///
4031 /// This intrinsic corresponds to the <c> VMOVNTDQ / MOVNTDQ </c> instruction.
4032 ///
4033 /// \param __p
4034 /// A pointer to the 128-bit aligned memory location used to store the value.
4035 /// \param __a
4036 /// A 128-bit integer vector containing the values to be stored.
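///
/// For example, an illustrative sketch that fills a 16-byte-aligned buffer
/// with non-temporal stores and then fences (the helper name, destination,
/// and count are assumptions for the example):
/// \code
///   static void fill_stream(__m128i *dst, __m128i value, int count) {
///     for (int i = 0; i < count; ++i)
///       _mm_stream_si128(dst + i, value);
///     _mm_sfence(); // order the streamed stores before later stores
///   }
/// \endcode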
4037 static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(void *__p,
4038 __m128i __a) {
4039 __builtin_nontemporal_store((__v2di)__a, (__v2di *)__p);
4040 }
4041
4042 /// Stores a 32-bit integer value in the specified memory location.
4043 ///
4044 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
4045 /// used again soon).
4046 ///
4047 /// \headerfile <x86intrin.h>
4048 ///
4049 /// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
4050 ///
4051 /// \param __p
4052 /// A pointer to the 32-bit memory location used to store the value.
4053 /// \param __a
4054 /// A 32-bit integer containing the value to be stored.
4055 static __inline__ void
4056 __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
4057 _mm_stream_si32(void *__p, int __a) {
4058 __builtin_ia32_movnti((int *)__p, __a);
4059 }
4060
4061 #ifdef __x86_64__
4062 /// Stores a 64-bit integer value in the specified memory location.
4063 ///
4064 /// To minimize caching, the data is flagged as non-temporal (unlikely to be
4065 /// used again soon).
4066 ///
4067 /// \headerfile <x86intrin.h>
4068 ///
4069 /// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
4070 ///
4071 /// \param __p
4072 /// A pointer to the 64-bit memory location used to store the value.
4073 /// \param __a
4074 /// A 64-bit integer containing the value to be stored.
4075 static __inline__ void
4076 __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
4077 _mm_stream_si64(void *__p, long long __a) {
4078 __builtin_ia32_movnti64((long long *)__p, __a);
4079 }
4080 #endif
4081
4082 #if defined(__cplusplus)
4083 extern "C" {
4084 #endif
4085
4086 /// The cache line containing \a __p is flushed and invalidated from all
4087 /// caches in the coherency domain.
4088 ///
4089 /// \headerfile <x86intrin.h>
4090 ///
4091 /// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
4092 ///
4093 /// \param __p
4094 /// A pointer to the memory location used to identify the cache line to be
4095 /// flushed.
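///
/// For example, an illustrative sketch that flushes every cache line spanned
/// by a buffer, assuming 64-byte cache lines (the helper name, buffer, and
/// line size are assumptions for the example):
/// \code
///   static void flush_buffer(const char *p, unsigned n) {
///     for (unsigned i = 0; i < n; i += 64)
///       _mm_clflush(p + i);
///     _mm_mfence(); // order the flushes with respect to later accesses
///   }
/// \endcode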
4096 void _mm_clflush(void const *__p);
4097
4098 /// Forces strong memory ordering (serialization) between load
4099 /// instructions preceding this instruction and load instructions following
4100 /// this instruction, ensuring the system completes all previous loads before
4101 /// executing subsequent loads.
4102 ///
4103 /// \headerfile <x86intrin.h>
4104 ///
4105 /// This intrinsic corresponds to the <c> LFENCE </c> instruction.
4106 ///
4107 void _mm_lfence(void);
4108
4109 /// Forces strong memory ordering (serialization) between load and store
4110 /// instructions preceding this instruction and load and store instructions
4111 /// following this instruction, ensuring that the system completes all
4112 /// previous memory accesses before executing subsequent memory accesses.
4113 ///
4114 /// \headerfile <x86intrin.h>
4115 ///
4116 /// This intrinsic corresponds to the <c> MFENCE </c> instruction.
4117 ///
4118 void _mm_mfence(void);
4119
4120 #if defined(__cplusplus)
4121 } // extern "C"
4122 #endif
4123
4124 /// Converts, with saturation, 16-bit signed integers from both 128-bit integer
4125 /// vector operands into 8-bit signed integers, and packs the results into
4126 /// the destination.
4127 ///
4128 /// Positive values greater than 0x7F are saturated to 0x7F. Negative values
4129 /// less than 0x80 are saturated to 0x80.
4130 ///
4131 /// \headerfile <x86intrin.h>
4132 ///
4133 /// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
4134 ///
4135 /// \param __a
4136 /// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4137 /// written to the lower 64 bits of the result.
4138 /// \param __b
4139 /// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4140 /// written to the higher 64 bits of the result.
4141 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
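///
/// For example, an illustrative sketch of the saturation behavior:
/// \code
///   __m128i a = _mm_setr_epi16(1, 2, 300, -300, 0, 0, 0, 0);
///   // The low four bytes of r are 1, 2, 127 (0x7F), and -128 (0x80);
///   // 300 and -300 saturate to the signed 8-bit limits.
///   __m128i r = _mm_packs_epi16(a, _mm_setzero_si128());
/// \endcode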
4142 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a,
4143 __m128i __b) {
4144 return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
4145 }
4146
4147 /// Converts, with saturation, 32-bit signed integers from both 128-bit integer
4148 /// vector operands into 16-bit signed integers, and packs the results into
4149 /// the destination.
4150 ///
4151 /// Positive values greater than 0x7FFF are saturated to 0x7FFF. Negative
4152 /// values less than 0x8000 are saturated to 0x8000.
4153 ///
4154 /// \headerfile <x86intrin.h>
4155 ///
4156 /// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
4157 ///
4158 /// \param __a
4159 /// A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values
4160 /// are written to the lower 64 bits of the result.
4161 /// \param __b
4162 /// A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values
4163 /// are written to the higher 64 bits of the result.
4164 /// \returns A 128-bit vector of [8 x i16] containing the converted values.
4165 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a,
4166 __m128i __b) {
4167 return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
4168 }
4169
4170 /// Converts, with saturation, 16-bit signed integers from both 128-bit integer
4171 /// vector operands into 8-bit unsigned integers, and packs the results into
4172 /// the destination.
4173 ///
4174 /// Values greater than 0xFF are saturated to 0xFF. Values less than 0x00
4175 /// are saturated to 0x00.
4176 ///
4177 /// \headerfile <x86intrin.h>
4178 ///
4179 /// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
4180 ///
4181 /// \param __a
4182 /// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4183 /// written to the lower 64 bits of the result.
4184 /// \param __b
4185 /// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are
4186 /// written to the higher 64 bits of the result.
4187 /// \returns A 128-bit vector of [16 x i8] containing the converted values.
4188 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a,
4189 __m128i __b) {
4190 return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
4191 }
4192
4193 /// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
4194 /// the immediate-value parameter as a selector.
4195 ///
4196 /// \headerfile <x86intrin.h>
4197 ///
4198 /// \code
4199 /// __m128i _mm_extract_epi16(__m128i a, const int imm);
4200 /// \endcode
4201 ///
4202 /// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
4203 ///
4204 /// \param a
4205 /// A 128-bit integer vector.
4206 /// \param imm
4207 ///    An immediate value. Bits [2:0] select values from \a a to be assigned
4208 ///    to bits [15:0] of the result. \n
4209 /// 000: assign values from bits [15:0] of \a a. \n
4210 /// 001: assign values from bits [31:16] of \a a. \n
4211 /// 010: assign values from bits [47:32] of \a a. \n
4212 /// 011: assign values from bits [63:48] of \a a. \n
4213 /// 100: assign values from bits [79:64] of \a a. \n
4214 /// 101: assign values from bits [95:80] of \a a. \n
4215 /// 110: assign values from bits [111:96] of \a a. \n
4216 /// 111: assign values from bits [127:112] of \a a.
4217 /// \returns An integer, whose lower 16 bits are selected from the 128-bit
4218 /// integer vector parameter and the remaining bits are assigned zeros.
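///
/// For example, an illustrative sketch (the selector must be an integer
/// constant expression):
/// \code
///   __m128i v = _mm_setr_epi16(10, 11, 12, 13, 14, 15, 16, 17);
///   int x = _mm_extract_epi16(v, 3); // x == 13
/// \endcode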
4219 #define _mm_extract_epi16(a, imm) \
4220 ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
4221 (int)(imm)))
4222
4223 /// Constructs a 128-bit integer vector by first making a copy of the
4224 /// 128-bit integer vector parameter, and then inserting the lower 16 bits
4225 /// of an integer parameter into an offset specified by the immediate-value
4226 /// parameter.
4227 ///
4228 /// \headerfile <x86intrin.h>
4229 ///
4230 /// \code
4231 /// __m128i _mm_insert_epi16(__m128i a, int b, const int imm);
4232 /// \endcode
4233 ///
4234 /// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
4235 ///
4236 /// \param a
4237 /// A 128-bit integer vector of [8 x i16]. This vector is copied to the
4238 /// result and then one of the eight elements in the result is replaced by
4239 /// the lower 16 bits of \a b.
4240 /// \param b
4241 /// An integer. The lower 16 bits of this parameter are written to the
4242 /// result beginning at an offset specified by \a imm.
4243 /// \param imm
4244 /// An immediate value specifying the bit offset in the result at which the
4245 /// lower 16 bits of \a b are written.
4246 /// \returns A 128-bit integer vector containing the constructed values.
4247 #define _mm_insert_epi16(a, b, imm) \
4248 ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \
4249 (int)(imm)))
4250
4251 /// Copies the values of the most significant bits from each 8-bit
4252 /// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
4253 /// value, zero-extends the value, and writes it to the destination.
4254 ///
4255 /// \headerfile <x86intrin.h>
4256 ///
4257 /// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
4258 ///
4259 /// \param __a
4260 /// A 128-bit integer vector containing the values with bits to be extracted.
4261 /// \returns The most significant bits from each 8-bit element in \a __a,
4262 /// written to bits [15:0]. The other bits are assigned zeros.
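///
/// For example, an illustrative sketch that tests which bytes of a vector are
/// zero by combining a byte compare with this intrinsic:
/// \code
///   __m128i v = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8,
///                             9, 10, 11, 0, 13, 14, 15, 16);
///   int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(v, _mm_setzero_si128()));
///   // mask == 0x0800: only the byte at index 11 compared equal to zero.
/// \endcode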
4263 static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a) {
4264 return __builtin_ia32_pmovmskb128((__v16qi)__a);
4265 }
4266
4267 /// Constructs a 128-bit integer vector by shuffling four 32-bit
4268 /// elements of a 128-bit integer vector parameter, using the immediate-value
4269 /// parameter as a specifier.
4270 ///
4271 /// \headerfile <x86intrin.h>
4272 ///
4273 /// \code
4274 /// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
4275 /// \endcode
4276 ///
4277 /// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
4278 ///
4279 /// \param a
4280 /// A 128-bit integer vector containing the values to be copied.
4281 /// \param imm
4282 /// An immediate value containing an 8-bit value specifying which elements to
4283 ///    copy from \a a. The destinations within the 128-bit destination are assigned
4284 /// values as follows: \n
4285 /// Bits [1:0] are used to assign values to bits [31:0] of the result. \n
4286 /// Bits [3:2] are used to assign values to bits [63:32] of the result. \n
4287 /// Bits [5:4] are used to assign values to bits [95:64] of the result. \n
4288 /// Bits [7:6] are used to assign values to bits [127:96] of the result. \n
4289 /// Bit value assignments: \n
4290 /// 00: assign values from bits [31:0] of \a a. \n
4291 /// 01: assign values from bits [63:32] of \a a. \n
4292 /// 10: assign values from bits [95:64] of \a a. \n
4293 /// 11: assign values from bits [127:96] of \a a. \n
4294 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4295 /// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4296 /// <c>[b6, b4, b2, b0]</c>.
4297 /// \returns A 128-bit integer vector containing the shuffled values.
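///
/// For example, an illustrative sketch that reverses the four 32-bit elements
/// using the \c _MM_SHUFFLE macro:
/// \code
///   __m128i v = _mm_setr_epi32(0, 1, 2, 3);
///   // Selects element 3 for bits [31:0], element 2 for bits [63:32],
///   // element 1 for bits [95:64], and element 0 for bits [127:96].
///   __m128i r = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3));
/// \endcode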
4298 #define _mm_shuffle_epi32(a, imm) \
4299 ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))
4300
4301 /// Constructs a 128-bit integer vector by shuffling four lower 16-bit
4302 /// elements of a 128-bit integer vector of [8 x i16], using the immediate
4303 /// value parameter as a specifier.
4304 ///
4305 /// \headerfile <x86intrin.h>
4306 ///
4307 /// \code
4308 /// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
4309 /// \endcode
4310 ///
4311 /// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
4312 ///
4313 /// \param a
4314 /// A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
4315 /// [127:64] of the result.
4316 /// \param imm
4317 /// An 8-bit immediate value specifying which elements to copy from \a a. \n
4318 /// Bits[1:0] are used to assign values to bits [15:0] of the result. \n
4319 /// Bits[3:2] are used to assign values to bits [31:16] of the result. \n
4320 /// Bits[5:4] are used to assign values to bits [47:32] of the result. \n
4321 /// Bits[7:6] are used to assign values to bits [63:48] of the result. \n
4322 /// Bit value assignments: \n
4323 /// 00: assign values from bits [15:0] of \a a. \n
4324 /// 01: assign values from bits [31:16] of \a a. \n
4325 /// 10: assign values from bits [47:32] of \a a. \n
4326 /// 11: assign values from bits [63:48] of \a a. \n
4327 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4328 /// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4329 /// <c>[b6, b4, b2, b0]</c>.
4330 /// \returns A 128-bit integer vector containing the shuffled values.
4331 #define _mm_shufflelo_epi16(a, imm) \
4332 ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))
4333
4334 /// Constructs a 128-bit integer vector by shuffling four upper 16-bit
4335 /// elements of a 128-bit integer vector of [8 x i16], using the immediate
4336 /// value parameter as a specifier.
4337 ///
4338 /// \headerfile <x86intrin.h>
4339 ///
4340 /// \code
4341 /// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
4342 /// \endcode
4343 ///
4344 /// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
4345 ///
4346 /// \param a
4347 /// A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
4348 /// [63:0] of the result.
4349 /// \param imm
4350 /// An 8-bit immediate value specifying which elements to copy from \a a. \n
4351 /// Bits[1:0] are used to assign values to bits [79:64] of the result. \n
4352 /// Bits[3:2] are used to assign values to bits [95:80] of the result. \n
4353 /// Bits[5:4] are used to assign values to bits [111:96] of the result. \n
4354 /// Bits[7:6] are used to assign values to bits [127:112] of the result. \n
4355 /// Bit value assignments: \n
4356 /// 00: assign values from bits [79:64] of \a a. \n
4357 /// 01: assign values from bits [95:80] of \a a. \n
4358 /// 10: assign values from bits [111:96] of \a a. \n
4359 /// 11: assign values from bits [127:112] of \a a. \n
4360 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
4361 /// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
4362 /// <c>[b6, b4, b2, b0]</c>.
4363 /// \returns A 128-bit integer vector containing the shuffled values.
4364 #define _mm_shufflehi_epi16(a, imm) \
4365 ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))
4366
4367 /// Unpacks the high-order (index 8-15) values from two 128-bit vectors
4368 /// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4369 ///
4370 /// \headerfile <x86intrin.h>
4371 ///
4372 /// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
4373 /// instruction.
4374 ///
4375 /// \param __a
4376 /// A 128-bit vector of [16 x i8].
4377 /// Bits [71:64] are written to bits [7:0] of the result. \n
4378 /// Bits [79:72] are written to bits [23:16] of the result. \n
4379 /// Bits [87:80] are written to bits [39:32] of the result. \n
4380 /// Bits [95:88] are written to bits [55:48] of the result. \n
4381 /// Bits [103:96] are written to bits [71:64] of the result. \n
4382 /// Bits [111:104] are written to bits [87:80] of the result. \n
4383 /// Bits [119:112] are written to bits [103:96] of the result. \n
4384 /// Bits [127:120] are written to bits [119:112] of the result.
4385 /// \param __b
4386 /// A 128-bit vector of [16 x i8]. \n
4387 /// Bits [71:64] are written to bits [15:8] of the result. \n
4388 /// Bits [79:72] are written to bits [31:24] of the result. \n
4389 /// Bits [87:80] are written to bits [47:40] of the result. \n
4390 /// Bits [95:88] are written to bits [63:56] of the result. \n
4391 /// Bits [103:96] are written to bits [79:72] of the result. \n
4392 /// Bits [111:104] are written to bits [95:88] of the result. \n
4393 /// Bits [119:112] are written to bits [111:104] of the result. \n
4394 /// Bits [127:120] are written to bits [127:120] of the result.
4395 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4396 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a,
4397 __m128i __b) {
4398 return (__m128i)__builtin_shufflevector(
4399 (__v16qi)__a, (__v16qi)__b, 8, 16 + 8, 9, 16 + 9, 10, 16 + 10, 11,
4400 16 + 11, 12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15);
4401 }
4402
4403 /// Unpacks the high-order (index 4-7) values from two 128-bit vectors of
4404 /// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
4405 ///
4406 /// \headerfile <x86intrin.h>
4407 ///
4408 /// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
4409 /// instruction.
4410 ///
4411 /// \param __a
4412 /// A 128-bit vector of [8 x i16].
4413 /// Bits [79:64] are written to bits [15:0] of the result. \n
4414 /// Bits [95:80] are written to bits [47:32] of the result. \n
4415 /// Bits [111:96] are written to bits [79:64] of the result. \n
4416 /// Bits [127:112] are written to bits [111:96] of the result.
4417 /// \param __b
4418 /// A 128-bit vector of [8 x i16].
4419 /// Bits [79:64] are written to bits [31:16] of the result. \n
4420 /// Bits [95:80] are written to bits [63:48] of the result. \n
4421 /// Bits [111:96] are written to bits [95:80] of the result. \n
4422 /// Bits [127:112] are written to bits [127:112] of the result.
4423 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4424 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a,
4425 __m128i __b) {
4426 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8 + 4, 5,
4427 8 + 5, 6, 8 + 6, 7, 8 + 7);
4428 }
4429
4430 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
4431 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4432 ///
4433 /// \headerfile <x86intrin.h>
4434 ///
4435 /// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
4436 /// instruction.
4437 ///
4438 /// \param __a
4439 /// A 128-bit vector of [4 x i32]. \n
4440 /// Bits [95:64] are written to bits [31:0] of the destination. \n
4441 /// Bits [127:96] are written to bits [95:64] of the destination.
4442 /// \param __b
4443 /// A 128-bit vector of [4 x i32]. \n
4444 ///    Bits [95:64] are written to bits [63:32] of the destination. \n
4445 /// Bits [127:96] are written to bits [127:96] of the destination.
4446 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4447 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a,
4448 __m128i __b) {
4449 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4 + 2, 3,
4450 4 + 3);
4451 }
4452
4453 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4454 /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4455 ///
4456 /// \headerfile <x86intrin.h>
4457 ///
4458 /// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
4459 /// instruction.
4460 ///
4461 /// \param __a
4462 /// A 128-bit vector of [2 x i64]. \n
4463 /// Bits [127:64] are written to bits [63:0] of the destination.
4464 /// \param __b
4465 /// A 128-bit vector of [2 x i64]. \n
4466 /// Bits [127:64] are written to bits [127:64] of the destination.
4467 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4468 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a,
4469 __m128i __b) {
4470 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2 + 1);
4471 }
4472
4473 /// Unpacks the low-order (index 0-7) values from two 128-bit vectors of
4474 /// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
4475 ///
4476 /// \headerfile <x86intrin.h>
4477 ///
4478 /// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
4479 /// instruction.
4480 ///
4481 /// \param __a
4482 /// A 128-bit vector of [16 x i8]. \n
4483 /// Bits [7:0] are written to bits [7:0] of the result. \n
4484 /// Bits [15:8] are written to bits [23:16] of the result. \n
4485 /// Bits [23:16] are written to bits [39:32] of the result. \n
4486 /// Bits [31:24] are written to bits [55:48] of the result. \n
4487 /// Bits [39:32] are written to bits [71:64] of the result. \n
4488 /// Bits [47:40] are written to bits [87:80] of the result. \n
4489 /// Bits [55:48] are written to bits [103:96] of the result. \n
4490 /// Bits [63:56] are written to bits [119:112] of the result.
4491 /// \param __b
4492 /// A 128-bit vector of [16 x i8].
4493 /// Bits [7:0] are written to bits [15:8] of the result. \n
4494 /// Bits [15:8] are written to bits [31:24] of the result. \n
4495 /// Bits [23:16] are written to bits [47:40] of the result. \n
4496 /// Bits [31:24] are written to bits [63:56] of the result. \n
4497 /// Bits [39:32] are written to bits [79:72] of the result. \n
4498 /// Bits [47:40] are written to bits [95:88] of the result. \n
4499 /// Bits [55:48] are written to bits [111:104] of the result. \n
4500 /// Bits [63:56] are written to bits [127:120] of the result.
4501 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
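///
/// For example, an illustrative sketch of the common idiom that widens
/// unsigned 8-bit elements to 16 bits by interleaving with zero:
/// \code
///   __m128i bytes = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
///                                 8, 9, 10, 11, 12, 13, 14, 15);
///   // Each of the low eight bytes is paired with a zero byte above it,
///   // producing the 16-bit values 0, 1, 2, 3, 4, 5, 6, 7.
///   __m128i lo_u16 = _mm_unpacklo_epi8(bytes, _mm_setzero_si128());
/// \endcode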
4502 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a,
4503 __m128i __b) {
4504 return (__m128i)__builtin_shufflevector(
4505 (__v16qi)__a, (__v16qi)__b, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 4,
4506 16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7);
4507 }
4508
4509 /// Unpacks the low-order (index 0-3) values from each of the two 128-bit
4510 /// vectors of [8 x i16] and interleaves them into a 128-bit vector of
4511 /// [8 x i16].
4512 ///
4513 /// \headerfile <x86intrin.h>
4514 ///
4515 /// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
4516 /// instruction.
4517 ///
4518 /// \param __a
4519 /// A 128-bit vector of [8 x i16].
4520 /// Bits [15:0] are written to bits [15:0] of the result. \n
4521 /// Bits [31:16] are written to bits [47:32] of the result. \n
4522 /// Bits [47:32] are written to bits [79:64] of the result. \n
4523 /// Bits [63:48] are written to bits [111:96] of the result.
4524 /// \param __b
4525 /// A 128-bit vector of [8 x i16].
4526 /// Bits [15:0] are written to bits [31:16] of the result. \n
4527 /// Bits [31:16] are written to bits [63:48] of the result. \n
4528 /// Bits [47:32] are written to bits [95:80] of the result. \n
4529 /// Bits [63:48] are written to bits [127:112] of the result.
4530 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
4531 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a,
4532 __m128i __b) {
4533 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8 + 0, 1,
4534 8 + 1, 2, 8 + 2, 3, 8 + 3);
4535 }
4536
4537 /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
4538 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
4539 ///
4540 /// \headerfile <x86intrin.h>
4541 ///
4542 /// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
4543 /// instruction.
4544 ///
4545 /// \param __a
4546 /// A 128-bit vector of [4 x i32]. \n
4547 /// Bits [31:0] are written to bits [31:0] of the destination. \n
4548 /// Bits [63:32] are written to bits [95:64] of the destination.
4549 /// \param __b
4550 /// A 128-bit vector of [4 x i32]. \n
4551 ///    Bits [31:0] are written to bits [63:32] of the destination. \n
4552 /// Bits [63:32] are written to bits [127:96] of the destination.
4553 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
4554 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a,
4555 __m128i __b) {
4556 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4 + 0, 1,
4557 4 + 1);
4558 }
4559
4560 /// Unpacks the low-order 64-bit elements from two 128-bit vectors of
4561 /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
4562 ///
4563 /// \headerfile <x86intrin.h>
4564 ///
4565 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
4566 /// instruction.
4567 ///
4568 /// \param __a
4569 /// A 128-bit vector of [2 x i64]. \n
4570 /// Bits [63:0] are written to bits [63:0] of the destination. \n
4571 /// \param __b
4572 /// A 128-bit vector of [2 x i64]. \n
4573 /// Bits [63:0] are written to bits [127:64] of the destination. \n
4574 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
4575 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a,
4576 __m128i __b) {
4577 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2 + 0);
4578 }
4579
4580 /// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
4581 /// integer.
4582 ///
4583 /// \headerfile <x86intrin.h>
4584 ///
4585 /// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
4586 ///
4587 /// \param __a
4588 /// A 128-bit integer vector operand. The lower 64 bits are moved to the
4589 /// destination.
4590 /// \returns A 64-bit integer containing the lower 64 bits of the parameter.
4591 static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a) {
4592 return (__m64)__a[0];
4593 }
4594
4595 /// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
4596 /// upper bits.
4597 ///
4598 /// \headerfile <x86intrin.h>
4599 ///
4600 /// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
4601 ///
4602 /// \param __a
4603 /// A 64-bit value.
4604 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4605 /// the operand. The upper 64 bits are assigned zeros.
4606 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a) {
4607 return __extension__(__m128i)(__v2di){(long long)__a, 0};
4608 }
4609
4610 /// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
4611 /// integer vector, zeroing the upper bits.
4612 ///
4613 /// \headerfile <x86intrin.h>
4614 ///
4615 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
4616 ///
4617 /// \param __a
4618 /// A 128-bit integer vector operand. The lower 64 bits are moved to the
4619 /// destination.
4620 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from
4621 /// the operand. The upper 64 bits are assigned zeros.
4622 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a) {
4623 return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2);
4624 }
4625
4626 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of
4627 /// [2 x double] and interleaves them into a 128-bit vector of [2 x
4628 /// double].
4629 ///
4630 /// \headerfile <x86intrin.h>
4631 ///
4632 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
4633 ///
4634 /// \param __a
4635 /// A 128-bit vector of [2 x double]. \n
4636 /// Bits [127:64] are written to bits [63:0] of the destination.
4637 /// \param __b
4638 /// A 128-bit vector of [2 x double]. \n
4639 /// Bits [127:64] are written to bits [127:64] of the destination.
4640 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4641 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a,
4642 __m128d __b) {
4643 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2 + 1);
4644 }
4645
4646 /// Unpacks the low-order 64-bit elements from two 128-bit vectors
4647 /// of [2 x double] and interleaves them into a 128-bit vector of [2 x
4648 /// double].
4649 ///
4650 /// \headerfile <x86intrin.h>
4651 ///
4652 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
4653 ///
4654 /// \param __a
4655 /// A 128-bit vector of [2 x double]. \n
4656 /// Bits [63:0] are written to bits [63:0] of the destination.
4657 /// \param __b
4658 /// A 128-bit vector of [2 x double]. \n
4659 /// Bits [63:0] are written to bits [127:64] of the destination.
4660 /// \returns A 128-bit vector of [2 x double] containing the interleaved values.
4661 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a,
4662 __m128d __b) {
4663 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2 + 0);
4664 }
4665
4666 /// Extracts the sign bits of the double-precision values in the 128-bit
4667 /// vector of [2 x double], zero-extends the value, and writes it to the
4668 /// low-order bits of the destination.
4669 ///
4670 /// \headerfile <x86intrin.h>
4671 ///
4672 /// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
4673 ///
4674 /// \param __a
4675 /// A 128-bit vector of [2 x double] containing the values with sign bits to
4676 /// be extracted.
4677 /// \returns The sign bits from each of the double-precision elements in \a __a,
4678 /// written to bits [1:0]. The remaining bits are assigned values of zero.
4679 static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a) {
4680 return __builtin_ia32_movmskpd((__v2df)__a);
4681 }
4682
4683 /// Constructs a 128-bit floating-point vector of [2 x double] from two
4684 /// 128-bit vector parameters of [2 x double], using the immediate-value
4685 /// parameter as a specifier.
4686 ///
4687 /// \headerfile <x86intrin.h>
4688 ///
4689 /// \code
4690 /// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
4691 /// \endcode
4692 ///
4693 /// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
4694 ///
4695 /// \param a
4696 /// A 128-bit vector of [2 x double].
4697 /// \param b
4698 /// A 128-bit vector of [2 x double].
4699 /// \param i
4700 /// An 8-bit immediate value. The least significant two bits specify which
4701 /// elements to copy from \a a and \a b: \n
4702 /// Bit[0] = 0: lower element of \a a copied to lower element of result. \n
4703 /// Bit[0] = 1: upper element of \a a copied to lower element of result. \n
4704 /// Bit[1] = 0: lower element of \a b copied to upper element of result. \n
4705 /// Bit[1] = 1: upper element of \a b copied to upper element of result. \n
4706 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE2 macro.
4707 /// <c>_MM_SHUFFLE2(b1, b0)</c> can create a 2-bit mask of the form
4708 /// <c>[b1, b0]</c>.
4709 /// \returns A 128-bit vector of [2 x double] containing the shuffled values.
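///
/// For example, an illustrative sketch that swaps the two elements of a
/// vector using the \c _MM_SHUFFLE2 macro:
/// \code
///   __m128d v = _mm_set_pd(2.0, 1.0); // upper element 2.0, lower element 1.0
///   // r holds 2.0 in its lower element and 1.0 in its upper element.
///   __m128d r = _mm_shuffle_pd(v, v, _MM_SHUFFLE2(0, 1));
/// \endcode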
4710 #define _mm_shuffle_pd(a, b, i) \
4711 ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
4712 (int)(i)))
4713
4714 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4715 /// floating-point vector of [4 x float].
4716 ///
4717 /// \headerfile <x86intrin.h>
4718 ///
4719 /// This intrinsic has no corresponding instruction.
4720 ///
4721 /// \param __a
4722 /// A 128-bit floating-point vector of [2 x double].
4723 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
4724 /// bitwise pattern as the parameter.
4725 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a) {
4726 return (__m128)__a;
4727 }
4728
4729 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
4730 /// integer vector.
4731 ///
4732 /// \headerfile <x86intrin.h>
4733 ///
4734 /// This intrinsic has no corresponding instruction.
4735 ///
4736 /// \param __a
4737 /// A 128-bit floating-point vector of [2 x double].
4738 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
4739 /// parameter.
4740 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a) {
4741 return (__m128i)__a;
4742 }
4743
4744 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4745 /// floating-point vector of [2 x double].
4746 ///
4747 /// \headerfile <x86intrin.h>
4748 ///
4749 /// This intrinsic has no corresponding instruction.
4750 ///
4751 /// \param __a
4752 /// A 128-bit floating-point vector of [4 x float].
4753 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
4754 /// bitwise pattern as the parameter.
4755 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a) {
4756 return (__m128d)__a;
4757 }
4758
4759 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
4760 /// integer vector.
4761 ///
4762 /// \headerfile <x86intrin.h>
4763 ///
4764 /// This intrinsic has no corresponding instruction.
4765 ///
4766 /// \param __a
4767 /// A 128-bit floating-point vector of [4 x float].
4768 /// \returns A 128-bit integer vector containing the same bitwise pattern as the
4769 /// parameter.
4770 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a) {
4771 return (__m128i)__a;
4772 }
4773
4774 /// Casts a 128-bit integer vector into a 128-bit floating-point vector
4775 /// of [4 x float].
4776 ///
4777 /// \headerfile <x86intrin.h>
4778 ///
4779 /// This intrinsic has no corresponding instruction.
4780 ///
4781 /// \param __a
4782 /// A 128-bit integer vector.
4783 /// \returns A 128-bit floating-point vector of [4 x float] containing the same
4784 /// bitwise pattern as the parameter.
4785 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a) {
4786 return (__m128)__a;
4787 }
4788
4789 /// Casts a 128-bit integer vector into a 128-bit floating-point vector
4790 /// of [2 x double].
4791 ///
4792 /// \headerfile <x86intrin.h>
4793 ///
4794 /// This intrinsic has no corresponding instruction.
4795 ///
4796 /// \param __a
4797 /// A 128-bit integer vector.
4798 /// \returns A 128-bit floating-point vector of [2 x double] containing the same
4799 /// bitwise pattern as the parameter.
4800 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a) {
4801 return (__m128d)__a;
4802 }
4803
4804 /// Compares each of the corresponding double-precision values of two
4805 /// 128-bit vectors of [2 x double], using the operation specified by the
4806 /// immediate integer operand.
4807 ///
4808 /// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
4809 /// If either value in a comparison is NaN, comparisons that are ordered
4810 /// return false, and comparisons that are unordered return true.
4811 ///
4812 /// \headerfile <x86intrin.h>
4813 ///
4814 /// \code
4815 /// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
4816 /// \endcode
4817 ///
4818 /// This intrinsic corresponds to the <c> (V)CMPPD </c> instruction.
4819 ///
4820 /// \param a
4821 /// A 128-bit vector of [2 x double].
4822 /// \param b
4823 /// A 128-bit vector of [2 x double].
4824 /// \param c
4825 /// An immediate integer operand, with bits [4:0] specifying which comparison
4826 /// operation to use: \n
4827 /// 0x00: Equal (ordered, non-signaling) \n
4828 /// 0x01: Less-than (ordered, signaling) \n
4829 /// 0x02: Less-than-or-equal (ordered, signaling) \n
4830 /// 0x03: Unordered (non-signaling) \n
4831 /// 0x04: Not-equal (unordered, non-signaling) \n
4832 /// 0x05: Not-less-than (unordered, signaling) \n
4833 /// 0x06: Not-less-than-or-equal (unordered, signaling) \n
4834 /// 0x07: Ordered (non-signaling) \n
4835 /// \returns A 128-bit vector of [2 x double] containing the comparison results.
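///
/// For example, an illustrative sketch using the less-than predicate (0x01),
/// which behaves like \c _mm_cmplt_pd:
/// \code
///   __m128d a = _mm_set_pd(4.0, 1.0);
///   __m128d b = _mm_set_pd(3.0, 2.0);
///   // The lower element compares 1.0 < 2.0 (true, all ones); the upper
///   // element compares 4.0 < 3.0 (false, all zeros).
///   __m128d r = _mm_cmp_pd(a, b, 0x01);
/// \endcode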
4836 #define _mm_cmp_pd(a, b, c) \
4837 ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
4838 (c)))
4839
4840 /// Compares each of the corresponding scalar double-precision values of
4841 /// two 128-bit vectors of [2 x double], using the operation specified by the
4842 /// immediate integer operand.
4843 ///
4844 /// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
4845 /// If either value in a comparison is NaN, comparisons that are ordered
4846 /// return false, and comparisons that are unordered return true.
4847 ///
4848 /// \headerfile <x86intrin.h>
4849 ///
4850 /// \code
4851 /// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
4852 /// \endcode
4853 ///
4854 /// This intrinsic corresponds to the <c> (V)CMPSD </c> instruction.
4855 ///
4856 /// \param a
4857 /// A 128-bit vector of [2 x double].
4858 /// \param b
4859 /// A 128-bit vector of [2 x double].
4860 /// \param c
4861 /// An immediate integer operand, with bits [4:0] specifying which comparison
4862 /// operation to use: \n
4863 /// 0x00: Equal (ordered, non-signaling) \n
4864 /// 0x01: Less-than (ordered, signaling) \n
4865 /// 0x02: Less-than-or-equal (ordered, signaling) \n
4866 /// 0x03: Unordered (non-signaling) \n
4867 /// 0x04: Not-equal (unordered, non-signaling) \n
4868 /// 0x05: Not-less-than (unordered, signaling) \n
4869 /// 0x06: Not-less-than-or-equal (unordered, signaling) \n
4870 /// 0x07: Ordered (non-signaling) \n
4871 /// \returns A 128-bit vector of [2 x double] containing the comparison results.
4872 #define _mm_cmp_sd(a, b, c) \
4873 ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
4874 (c)))
4875
4876 #if defined(__cplusplus)
4877 extern "C" {
4878 #endif
4879
4880 /// Indicates that a spin loop is being executed for the purposes of
4881 /// optimizing power consumption during the loop.
4882 ///
4883 /// \headerfile <x86intrin.h>
4884 ///
4885 /// This intrinsic corresponds to the <c> PAUSE </c> instruction.
4886 ///
4887 void _mm_pause(void);
4888
4889 #if defined(__cplusplus)
4890 } // extern "C"
4891 #endif
4892 #undef __DEFAULT_FN_ATTRS
4893 #undef __DEFAULT_FN_ATTRS_MMX
4894
4895 #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
4896
4897 #define _MM_DENORMALS_ZERO_ON (0x0040U)
4898 #define _MM_DENORMALS_ZERO_OFF (0x0000U)
4899
4900 #define _MM_DENORMALS_ZERO_MASK (0x0040U)
4901
4902 #define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
4903 #define _MM_SET_DENORMALS_ZERO_MODE(x) \
4904 (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
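
/* For example, an illustrative sketch that enables denormals-are-zero (DAZ)
 * mode for the current thread and later restores the previous setting (the
 * variable name is an assumption for the example):
 *
 *   unsigned int prev = _MM_GET_DENORMALS_ZERO_MODE();
 *   _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
 *   // denormal source operands are now treated as zero
 *   _MM_SET_DENORMALS_ZERO_MODE(prev);
 */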
4905
4906 #endif /* __EMMINTRIN_H */
4907