1 /*===---- emmintrin.h - SSE2 intrinsics ------------------------------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 */ 9 10 #ifndef __EMMINTRIN_H 11 #define __EMMINTRIN_H 12 13 #if !defined(__i386__) && !defined(__x86_64__) 14 #error "This header is only meant to be used on x86 and x64 architecture" 15 #endif 16 17 #include <xmmintrin.h> 18 19 typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16))); 20 typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16))); 21 22 typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1))); 23 typedef long long __m128i_u __attribute__((__vector_size__(16), __aligned__(1))); 24 25 /* Type defines. */ 26 typedef double __v2df __attribute__ ((__vector_size__ (16))); 27 typedef long long __v2di __attribute__ ((__vector_size__ (16))); 28 typedef short __v8hi __attribute__((__vector_size__(16))); 29 typedef char __v16qi __attribute__((__vector_size__(16))); 30 31 /* Unsigned types */ 32 typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16))); 33 typedef unsigned short __v8hu __attribute__((__vector_size__(16))); 34 typedef unsigned char __v16qu __attribute__((__vector_size__(16))); 35 36 /* We need an explicitly signed variant for char. Note that this shouldn't 37 * appear in the interface though. */ 38 typedef signed char __v16qs __attribute__((__vector_size__(16))); 39 40 /* Define the default attributes for the functions in this file. 
*/ 41 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2"), __min_vector_width__(128))) 42 #define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse2"), __min_vector_width__(64))) 43 44 /// Adds lower double-precision values in both operands and returns the 45 /// sum in the lower 64 bits of the result. The upper 64 bits of the result 46 /// are copied from the upper double-precision value of the first operand. 47 /// 48 /// \headerfile <x86intrin.h> 49 /// 50 /// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction. 51 /// 52 /// \param __a 53 /// A 128-bit vector of [2 x double] containing one of the source operands. 54 /// \param __b 55 /// A 128-bit vector of [2 x double] containing one of the source operands. 56 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 57 /// sum of the lower 64 bits of both operands. The upper 64 bits are copied 58 /// from the upper 64 bits of the first source operand. 59 static __inline__ __m128d __DEFAULT_FN_ATTRS 60 _mm_add_sd(__m128d __a, __m128d __b) 61 { 62 __a[0] += __b[0]; 63 return __a; 64 } 65 66 /// Adds two 128-bit vectors of [2 x double]. 67 /// 68 /// \headerfile <x86intrin.h> 69 /// 70 /// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction. 71 /// 72 /// \param __a 73 /// A 128-bit vector of [2 x double] containing one of the source operands. 74 /// \param __b 75 /// A 128-bit vector of [2 x double] containing one of the source operands. 76 /// \returns A 128-bit vector of [2 x double] containing the sums of both 77 /// operands. 
78 static __inline__ __m128d __DEFAULT_FN_ATTRS 79 _mm_add_pd(__m128d __a, __m128d __b) 80 { 81 return (__m128d)((__v2df)__a + (__v2df)__b); 82 } 83 84 /// Subtracts the lower double-precision value of the second operand 85 /// from the lower double-precision value of the first operand and returns 86 /// the difference in the lower 64 bits of the result. The upper 64 bits of 87 /// the result are copied from the upper double-precision value of the first 88 /// operand. 89 /// 90 /// \headerfile <x86intrin.h> 91 /// 92 /// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction. 93 /// 94 /// \param __a 95 /// A 128-bit vector of [2 x double] containing the minuend. 96 /// \param __b 97 /// A 128-bit vector of [2 x double] containing the subtrahend. 98 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 99 /// difference of the lower 64 bits of both operands. The upper 64 bits are 100 /// copied from the upper 64 bits of the first source operand. 101 static __inline__ __m128d __DEFAULT_FN_ATTRS 102 _mm_sub_sd(__m128d __a, __m128d __b) 103 { 104 __a[0] -= __b[0]; 105 return __a; 106 } 107 108 /// Subtracts two 128-bit vectors of [2 x double]. 109 /// 110 /// \headerfile <x86intrin.h> 111 /// 112 /// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction. 113 /// 114 /// \param __a 115 /// A 128-bit vector of [2 x double] containing the minuend. 116 /// \param __b 117 /// A 128-bit vector of [2 x double] containing the subtrahend. 118 /// \returns A 128-bit vector of [2 x double] containing the differences between 119 /// both operands. 120 static __inline__ __m128d __DEFAULT_FN_ATTRS 121 _mm_sub_pd(__m128d __a, __m128d __b) 122 { 123 return (__m128d)((__v2df)__a - (__v2df)__b); 124 } 125 126 /// Multiplies lower double-precision values in both operands and returns 127 /// the product in the lower 64 bits of the result. 
The upper 64 bits of the 128 /// result are copied from the upper double-precision value of the first 129 /// operand. 130 /// 131 /// \headerfile <x86intrin.h> 132 /// 133 /// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction. 134 /// 135 /// \param __a 136 /// A 128-bit vector of [2 x double] containing one of the source operands. 137 /// \param __b 138 /// A 128-bit vector of [2 x double] containing one of the source operands. 139 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 140 /// product of the lower 64 bits of both operands. The upper 64 bits are 141 /// copied from the upper 64 bits of the first source operand. 142 static __inline__ __m128d __DEFAULT_FN_ATTRS 143 _mm_mul_sd(__m128d __a, __m128d __b) 144 { 145 __a[0] *= __b[0]; 146 return __a; 147 } 148 149 /// Multiplies two 128-bit vectors of [2 x double]. 150 /// 151 /// \headerfile <x86intrin.h> 152 /// 153 /// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction. 154 /// 155 /// \param __a 156 /// A 128-bit vector of [2 x double] containing one of the operands. 157 /// \param __b 158 /// A 128-bit vector of [2 x double] containing one of the operands. 159 /// \returns A 128-bit vector of [2 x double] containing the products of both 160 /// operands. 161 static __inline__ __m128d __DEFAULT_FN_ATTRS 162 _mm_mul_pd(__m128d __a, __m128d __b) 163 { 164 return (__m128d)((__v2df)__a * (__v2df)__b); 165 } 166 167 /// Divides the lower double-precision value of the first operand by the 168 /// lower double-precision value of the second operand and returns the 169 /// quotient in the lower 64 bits of the result. The upper 64 bits of the 170 /// result are copied from the upper double-precision value of the first 171 /// operand. 172 /// 173 /// \headerfile <x86intrin.h> 174 /// 175 /// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction. 
176 /// 177 /// \param __a 178 /// A 128-bit vector of [2 x double] containing the dividend. 179 /// \param __b 180 /// A 128-bit vector of [2 x double] containing divisor. 181 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 182 /// quotient of the lower 64 bits of both operands. The upper 64 bits are 183 /// copied from the upper 64 bits of the first source operand. 184 static __inline__ __m128d __DEFAULT_FN_ATTRS 185 _mm_div_sd(__m128d __a, __m128d __b) 186 { 187 __a[0] /= __b[0]; 188 return __a; 189 } 190 191 /// Performs an element-by-element division of two 128-bit vectors of 192 /// [2 x double]. 193 /// 194 /// \headerfile <x86intrin.h> 195 /// 196 /// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction. 197 /// 198 /// \param __a 199 /// A 128-bit vector of [2 x double] containing the dividend. 200 /// \param __b 201 /// A 128-bit vector of [2 x double] containing the divisor. 202 /// \returns A 128-bit vector of [2 x double] containing the quotients of both 203 /// operands. 204 static __inline__ __m128d __DEFAULT_FN_ATTRS 205 _mm_div_pd(__m128d __a, __m128d __b) 206 { 207 return (__m128d)((__v2df)__a / (__v2df)__b); 208 } 209 210 /// Calculates the square root of the lower double-precision value of 211 /// the second operand and returns it in the lower 64 bits of the result. 212 /// The upper 64 bits of the result are copied from the upper 213 /// double-precision value of the first operand. 214 /// 215 /// \headerfile <x86intrin.h> 216 /// 217 /// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction. 218 /// 219 /// \param __a 220 /// A 128-bit vector of [2 x double] containing one of the operands. The 221 /// upper 64 bits of this operand are copied to the upper 64 bits of the 222 /// result. 223 /// \param __b 224 /// A 128-bit vector of [2 x double] containing one of the operands. The 225 /// square root is calculated using the lower 64 bits of this operand. 
226 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 227 /// square root of the lower 64 bits of operand \a __b, and whose upper 64 228 /// bits are copied from the upper 64 bits of operand \a __a. 229 static __inline__ __m128d __DEFAULT_FN_ATTRS 230 _mm_sqrt_sd(__m128d __a, __m128d __b) 231 { 232 __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b); 233 return __extension__ (__m128d) { __c[0], __a[1] }; 234 } 235 236 /// Calculates the square root of the each of two values stored in a 237 /// 128-bit vector of [2 x double]. 238 /// 239 /// \headerfile <x86intrin.h> 240 /// 241 /// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction. 242 /// 243 /// \param __a 244 /// A 128-bit vector of [2 x double]. 245 /// \returns A 128-bit vector of [2 x double] containing the square roots of the 246 /// values in the operand. 247 static __inline__ __m128d __DEFAULT_FN_ATTRS 248 _mm_sqrt_pd(__m128d __a) 249 { 250 return __builtin_ia32_sqrtpd((__v2df)__a); 251 } 252 253 /// Compares lower 64-bit double-precision values of both operands, and 254 /// returns the lesser of the pair of values in the lower 64-bits of the 255 /// result. The upper 64 bits of the result are copied from the upper 256 /// double-precision value of the first operand. 257 /// 258 /// \headerfile <x86intrin.h> 259 /// 260 /// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction. 261 /// 262 /// \param __a 263 /// A 128-bit vector of [2 x double] containing one of the operands. The 264 /// lower 64 bits of this operand are used in the comparison. 265 /// \param __b 266 /// A 128-bit vector of [2 x double] containing one of the operands. The 267 /// lower 64 bits of this operand are used in the comparison. 268 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 269 /// minimum value between both operands. The upper 64 bits are copied from 270 /// the upper 64 bits of the first source operand. 
271 static __inline__ __m128d __DEFAULT_FN_ATTRS 272 _mm_min_sd(__m128d __a, __m128d __b) 273 { 274 return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b); 275 } 276 277 /// Performs element-by-element comparison of the two 128-bit vectors of 278 /// [2 x double] and returns the vector containing the lesser of each pair of 279 /// values. 280 /// 281 /// \headerfile <x86intrin.h> 282 /// 283 /// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction. 284 /// 285 /// \param __a 286 /// A 128-bit vector of [2 x double] containing one of the operands. 287 /// \param __b 288 /// A 128-bit vector of [2 x double] containing one of the operands. 289 /// \returns A 128-bit vector of [2 x double] containing the minimum values 290 /// between both operands. 291 static __inline__ __m128d __DEFAULT_FN_ATTRS 292 _mm_min_pd(__m128d __a, __m128d __b) 293 { 294 return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b); 295 } 296 297 /// Compares lower 64-bit double-precision values of both operands, and 298 /// returns the greater of the pair of values in the lower 64-bits of the 299 /// result. The upper 64 bits of the result are copied from the upper 300 /// double-precision value of the first operand. 301 /// 302 /// \headerfile <x86intrin.h> 303 /// 304 /// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction. 305 /// 306 /// \param __a 307 /// A 128-bit vector of [2 x double] containing one of the operands. The 308 /// lower 64 bits of this operand are used in the comparison. 309 /// \param __b 310 /// A 128-bit vector of [2 x double] containing one of the operands. The 311 /// lower 64 bits of this operand are used in the comparison. 312 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 313 /// maximum value between both operands. The upper 64 bits are copied from 314 /// the upper 64 bits of the first source operand. 
315 static __inline__ __m128d __DEFAULT_FN_ATTRS 316 _mm_max_sd(__m128d __a, __m128d __b) 317 { 318 return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b); 319 } 320 321 /// Performs element-by-element comparison of the two 128-bit vectors of 322 /// [2 x double] and returns the vector containing the greater of each pair 323 /// of values. 324 /// 325 /// \headerfile <x86intrin.h> 326 /// 327 /// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction. 328 /// 329 /// \param __a 330 /// A 128-bit vector of [2 x double] containing one of the operands. 331 /// \param __b 332 /// A 128-bit vector of [2 x double] containing one of the operands. 333 /// \returns A 128-bit vector of [2 x double] containing the maximum values 334 /// between both operands. 335 static __inline__ __m128d __DEFAULT_FN_ATTRS 336 _mm_max_pd(__m128d __a, __m128d __b) 337 { 338 return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b); 339 } 340 341 /// Performs a bitwise AND of two 128-bit vectors of [2 x double]. 342 /// 343 /// \headerfile <x86intrin.h> 344 /// 345 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction. 346 /// 347 /// \param __a 348 /// A 128-bit vector of [2 x double] containing one of the source operands. 349 /// \param __b 350 /// A 128-bit vector of [2 x double] containing one of the source operands. 351 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the 352 /// values between both operands. 353 static __inline__ __m128d __DEFAULT_FN_ATTRS 354 _mm_and_pd(__m128d __a, __m128d __b) 355 { 356 return (__m128d)((__v2du)__a & (__v2du)__b); 357 } 358 359 /// Performs a bitwise AND of two 128-bit vectors of [2 x double], using 360 /// the one's complement of the values contained in the first source operand. 361 /// 362 /// \headerfile <x86intrin.h> 363 /// 364 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction. 
365 /// 366 /// \param __a 367 /// A 128-bit vector of [2 x double] containing the left source operand. The 368 /// one's complement of this value is used in the bitwise AND. 369 /// \param __b 370 /// A 128-bit vector of [2 x double] containing the right source operand. 371 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the 372 /// values in the second operand and the one's complement of the first 373 /// operand. 374 static __inline__ __m128d __DEFAULT_FN_ATTRS 375 _mm_andnot_pd(__m128d __a, __m128d __b) 376 { 377 return (__m128d)(~(__v2du)__a & (__v2du)__b); 378 } 379 380 /// Performs a bitwise OR of two 128-bit vectors of [2 x double]. 381 /// 382 /// \headerfile <x86intrin.h> 383 /// 384 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction. 385 /// 386 /// \param __a 387 /// A 128-bit vector of [2 x double] containing one of the source operands. 388 /// \param __b 389 /// A 128-bit vector of [2 x double] containing one of the source operands. 390 /// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the 391 /// values between both operands. 392 static __inline__ __m128d __DEFAULT_FN_ATTRS 393 _mm_or_pd(__m128d __a, __m128d __b) 394 { 395 return (__m128d)((__v2du)__a | (__v2du)__b); 396 } 397 398 /// Performs a bitwise XOR of two 128-bit vectors of [2 x double]. 399 /// 400 /// \headerfile <x86intrin.h> 401 /// 402 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction. 403 /// 404 /// \param __a 405 /// A 128-bit vector of [2 x double] containing one of the source operands. 406 /// \param __b 407 /// A 128-bit vector of [2 x double] containing one of the source operands. 408 /// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the 409 /// values between both operands. 
410 static __inline__ __m128d __DEFAULT_FN_ATTRS 411 _mm_xor_pd(__m128d __a, __m128d __b) 412 { 413 return (__m128d)((__v2du)__a ^ (__v2du)__b); 414 } 415 416 /// Compares each of the corresponding double-precision values of the 417 /// 128-bit vectors of [2 x double] for equality. Each comparison yields 0x0 418 /// for false, 0xFFFFFFFFFFFFFFFF for true. 419 /// 420 /// \headerfile <x86intrin.h> 421 /// 422 /// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction. 423 /// 424 /// \param __a 425 /// A 128-bit vector of [2 x double]. 426 /// \param __b 427 /// A 128-bit vector of [2 x double]. 428 /// \returns A 128-bit vector containing the comparison results. 429 static __inline__ __m128d __DEFAULT_FN_ATTRS 430 _mm_cmpeq_pd(__m128d __a, __m128d __b) 431 { 432 return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b); 433 } 434 435 /// Compares each of the corresponding double-precision values of the 436 /// 128-bit vectors of [2 x double] to determine if the values in the first 437 /// operand are less than those in the second operand. Each comparison 438 /// yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 439 /// 440 /// \headerfile <x86intrin.h> 441 /// 442 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction. 443 /// 444 /// \param __a 445 /// A 128-bit vector of [2 x double]. 446 /// \param __b 447 /// A 128-bit vector of [2 x double]. 448 /// \returns A 128-bit vector containing the comparison results. 449 static __inline__ __m128d __DEFAULT_FN_ATTRS 450 _mm_cmplt_pd(__m128d __a, __m128d __b) 451 { 452 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b); 453 } 454 455 /// Compares each of the corresponding double-precision values of the 456 /// 128-bit vectors of [2 x double] to determine if the values in the first 457 /// operand are less than or equal to those in the second operand. 458 /// 459 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 
460 /// 461 /// \headerfile <x86intrin.h> 462 /// 463 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction. 464 /// 465 /// \param __a 466 /// A 128-bit vector of [2 x double]. 467 /// \param __b 468 /// A 128-bit vector of [2 x double]. 469 /// \returns A 128-bit vector containing the comparison results. 470 static __inline__ __m128d __DEFAULT_FN_ATTRS 471 _mm_cmple_pd(__m128d __a, __m128d __b) 472 { 473 return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b); 474 } 475 476 /// Compares each of the corresponding double-precision values of the 477 /// 128-bit vectors of [2 x double] to determine if the values in the first 478 /// operand are greater than those in the second operand. 479 /// 480 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 481 /// 482 /// \headerfile <x86intrin.h> 483 /// 484 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction. 485 /// 486 /// \param __a 487 /// A 128-bit vector of [2 x double]. 488 /// \param __b 489 /// A 128-bit vector of [2 x double]. 490 /// \returns A 128-bit vector containing the comparison results. 491 static __inline__ __m128d __DEFAULT_FN_ATTRS 492 _mm_cmpgt_pd(__m128d __a, __m128d __b) 493 { 494 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a); 495 } 496 497 /// Compares each of the corresponding double-precision values of the 498 /// 128-bit vectors of [2 x double] to determine if the values in the first 499 /// operand are greater than or equal to those in the second operand. 500 /// 501 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 502 /// 503 /// \headerfile <x86intrin.h> 504 /// 505 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction. 506 /// 507 /// \param __a 508 /// A 128-bit vector of [2 x double]. 509 /// \param __b 510 /// A 128-bit vector of [2 x double]. 511 /// \returns A 128-bit vector containing the comparison results. 
512 static __inline__ __m128d __DEFAULT_FN_ATTRS 513 _mm_cmpge_pd(__m128d __a, __m128d __b) 514 { 515 return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a); 516 } 517 518 /// Compares each of the corresponding double-precision values of the 519 /// 128-bit vectors of [2 x double] to determine if the values in the first 520 /// operand are ordered with respect to those in the second operand. 521 /// 522 /// A pair of double-precision values are "ordered" with respect to each 523 /// other if neither value is a NaN. Each comparison yields 0x0 for false, 524 /// 0xFFFFFFFFFFFFFFFF for true. 525 /// 526 /// \headerfile <x86intrin.h> 527 /// 528 /// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction. 529 /// 530 /// \param __a 531 /// A 128-bit vector of [2 x double]. 532 /// \param __b 533 /// A 128-bit vector of [2 x double]. 534 /// \returns A 128-bit vector containing the comparison results. 535 static __inline__ __m128d __DEFAULT_FN_ATTRS 536 _mm_cmpord_pd(__m128d __a, __m128d __b) 537 { 538 return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b); 539 } 540 541 /// Compares each of the corresponding double-precision values of the 542 /// 128-bit vectors of [2 x double] to determine if the values in the first 543 /// operand are unordered with respect to those in the second operand. 544 /// 545 /// A pair of double-precision values are "unordered" with respect to each 546 /// other if one or both values are NaN. Each comparison yields 0x0 for 547 /// false, 0xFFFFFFFFFFFFFFFF for true. 548 /// 549 /// \headerfile <x86intrin.h> 550 /// 551 /// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c> 552 /// instruction. 553 /// 554 /// \param __a 555 /// A 128-bit vector of [2 x double]. 556 /// \param __b 557 /// A 128-bit vector of [2 x double]. 558 /// \returns A 128-bit vector containing the comparison results. 
559 static __inline__ __m128d __DEFAULT_FN_ATTRS 560 _mm_cmpunord_pd(__m128d __a, __m128d __b) 561 { 562 return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b); 563 } 564 565 /// Compares each of the corresponding double-precision values of the 566 /// 128-bit vectors of [2 x double] to determine if the values in the first 567 /// operand are unequal to those in the second operand. 568 /// 569 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 570 /// 571 /// \headerfile <x86intrin.h> 572 /// 573 /// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction. 574 /// 575 /// \param __a 576 /// A 128-bit vector of [2 x double]. 577 /// \param __b 578 /// A 128-bit vector of [2 x double]. 579 /// \returns A 128-bit vector containing the comparison results. 580 static __inline__ __m128d __DEFAULT_FN_ATTRS 581 _mm_cmpneq_pd(__m128d __a, __m128d __b) 582 { 583 return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b); 584 } 585 586 /// Compares each of the corresponding double-precision values of the 587 /// 128-bit vectors of [2 x double] to determine if the values in the first 588 /// operand are not less than those in the second operand. 589 /// 590 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 591 /// 592 /// \headerfile <x86intrin.h> 593 /// 594 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction. 595 /// 596 /// \param __a 597 /// A 128-bit vector of [2 x double]. 598 /// \param __b 599 /// A 128-bit vector of [2 x double]. 600 /// \returns A 128-bit vector containing the comparison results. 
601 static __inline__ __m128d __DEFAULT_FN_ATTRS 602 _mm_cmpnlt_pd(__m128d __a, __m128d __b) 603 { 604 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b); 605 } 606 607 /// Compares each of the corresponding double-precision values of the 608 /// 128-bit vectors of [2 x double] to determine if the values in the first 609 /// operand are not less than or equal to those in the second operand. 610 /// 611 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 612 /// 613 /// \headerfile <x86intrin.h> 614 /// 615 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction. 616 /// 617 /// \param __a 618 /// A 128-bit vector of [2 x double]. 619 /// \param __b 620 /// A 128-bit vector of [2 x double]. 621 /// \returns A 128-bit vector containing the comparison results. 622 static __inline__ __m128d __DEFAULT_FN_ATTRS 623 _mm_cmpnle_pd(__m128d __a, __m128d __b) 624 { 625 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b); 626 } 627 628 /// Compares each of the corresponding double-precision values of the 629 /// 128-bit vectors of [2 x double] to determine if the values in the first 630 /// operand are not greater than those in the second operand. 631 /// 632 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 633 /// 634 /// \headerfile <x86intrin.h> 635 /// 636 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction. 637 /// 638 /// \param __a 639 /// A 128-bit vector of [2 x double]. 640 /// \param __b 641 /// A 128-bit vector of [2 x double]. 642 /// \returns A 128-bit vector containing the comparison results. 
643 static __inline__ __m128d __DEFAULT_FN_ATTRS 644 _mm_cmpngt_pd(__m128d __a, __m128d __b) 645 { 646 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a); 647 } 648 649 /// Compares each of the corresponding double-precision values of the 650 /// 128-bit vectors of [2 x double] to determine if the values in the first 651 /// operand are not greater than or equal to those in the second operand. 652 /// 653 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 654 /// 655 /// \headerfile <x86intrin.h> 656 /// 657 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction. 658 /// 659 /// \param __a 660 /// A 128-bit vector of [2 x double]. 661 /// \param __b 662 /// A 128-bit vector of [2 x double]. 663 /// \returns A 128-bit vector containing the comparison results. 664 static __inline__ __m128d __DEFAULT_FN_ATTRS 665 _mm_cmpnge_pd(__m128d __a, __m128d __b) 666 { 667 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a); 668 } 669 670 /// Compares the lower double-precision floating-point values in each of 671 /// the two 128-bit floating-point vectors of [2 x double] for equality. 672 /// 673 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 674 /// 675 /// \headerfile <x86intrin.h> 676 /// 677 /// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction. 678 /// 679 /// \param __a 680 /// A 128-bit vector of [2 x double]. The lower double-precision value is 681 /// compared to the lower double-precision value of \a __b. 682 /// \param __b 683 /// A 128-bit vector of [2 x double]. The lower double-precision value is 684 /// compared to the lower double-precision value of \a __a. 685 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 686 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 
687 static __inline__ __m128d __DEFAULT_FN_ATTRS 688 _mm_cmpeq_sd(__m128d __a, __m128d __b) 689 { 690 return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b); 691 } 692 693 /// Compares the lower double-precision floating-point values in each of 694 /// the two 128-bit floating-point vectors of [2 x double] to determine if 695 /// the value in the first parameter is less than the corresponding value in 696 /// the second parameter. 697 /// 698 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 699 /// 700 /// \headerfile <x86intrin.h> 701 /// 702 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction. 703 /// 704 /// \param __a 705 /// A 128-bit vector of [2 x double]. The lower double-precision value is 706 /// compared to the lower double-precision value of \a __b. 707 /// \param __b 708 /// A 128-bit vector of [2 x double]. The lower double-precision value is 709 /// compared to the lower double-precision value of \a __a. 710 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 711 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 712 static __inline__ __m128d __DEFAULT_FN_ATTRS 713 _mm_cmplt_sd(__m128d __a, __m128d __b) 714 { 715 return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b); 716 } 717 718 /// Compares the lower double-precision floating-point values in each of 719 /// the two 128-bit floating-point vectors of [2 x double] to determine if 720 /// the value in the first parameter is less than or equal to the 721 /// corresponding value in the second parameter. 722 /// 723 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 724 /// 725 /// \headerfile <x86intrin.h> 726 /// 727 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction. 728 /// 729 /// \param __a 730 /// A 128-bit vector of [2 x double]. The lower double-precision value is 731 /// compared to the lower double-precision value of \a __b. 
732 /// \param __b 733 /// A 128-bit vector of [2 x double]. The lower double-precision value is 734 /// compared to the lower double-precision value of \a __a. 735 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 736 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 737 static __inline__ __m128d __DEFAULT_FN_ATTRS 738 _mm_cmple_sd(__m128d __a, __m128d __b) 739 { 740 return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b); 741 } 742 743 /// Compares the lower double-precision floating-point values in each of 744 /// the two 128-bit floating-point vectors of [2 x double] to determine if 745 /// the value in the first parameter is greater than the corresponding value 746 /// in the second parameter. 747 /// 748 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 749 /// 750 /// \headerfile <x86intrin.h> 751 /// 752 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction. 753 /// 754 /// \param __a 755 /// A 128-bit vector of [2 x double]. The lower double-precision value is 756 /// compared to the lower double-precision value of \a __b. 757 /// \param __b 758 /// A 128-bit vector of [2 x double]. The lower double-precision value is 759 /// compared to the lower double-precision value of \a __a. 760 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 761 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 762 static __inline__ __m128d __DEFAULT_FN_ATTRS 763 _mm_cmpgt_sd(__m128d __a, __m128d __b) 764 { 765 __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a); 766 return __extension__ (__m128d) { __c[0], __a[1] }; 767 } 768 769 /// Compares the lower double-precision floating-point values in each of 770 /// the two 128-bit floating-point vectors of [2 x double] to determine if 771 /// the value in the first parameter is greater than or equal to the 772 /// corresponding value in the second parameter. 
773 /// 774 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 775 /// 776 /// \headerfile <x86intrin.h> 777 /// 778 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction. 779 /// 780 /// \param __a 781 /// A 128-bit vector of [2 x double]. The lower double-precision value is 782 /// compared to the lower double-precision value of \a __b. 783 /// \param __b 784 /// A 128-bit vector of [2 x double]. The lower double-precision value is 785 /// compared to the lower double-precision value of \a __a. 786 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 787 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 788 static __inline__ __m128d __DEFAULT_FN_ATTRS 789 _mm_cmpge_sd(__m128d __a, __m128d __b) 790 { 791 __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a); 792 return __extension__ (__m128d) { __c[0], __a[1] }; 793 } 794 795 /// Compares the lower double-precision floating-point values in each of 796 /// the two 128-bit floating-point vectors of [2 x double] to determine if 797 /// the value in the first parameter is "ordered" with respect to the 798 /// corresponding value in the second parameter. 799 /// 800 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair 801 /// of double-precision values are "ordered" with respect to each other if 802 /// neither value is a NaN. 803 /// 804 /// \headerfile <x86intrin.h> 805 /// 806 /// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction. 807 /// 808 /// \param __a 809 /// A 128-bit vector of [2 x double]. The lower double-precision value is 810 /// compared to the lower double-precision value of \a __b. 811 /// \param __b 812 /// A 128-bit vector of [2 x double]. The lower double-precision value is 813 /// compared to the lower double-precision value of \a __a. 814 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 815 /// results. 
The upper 64 bits are copied from the upper 64 bits of \a __a. 816 static __inline__ __m128d __DEFAULT_FN_ATTRS 817 _mm_cmpord_sd(__m128d __a, __m128d __b) 818 { 819 return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b); 820 } 821 822 /// Compares the lower double-precision floating-point values in each of 823 /// the two 128-bit floating-point vectors of [2 x double] to determine if 824 /// the value in the first parameter is "unordered" with respect to the 825 /// corresponding value in the second parameter. 826 /// 827 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair 828 /// of double-precision values are "unordered" with respect to each other if 829 /// one or both values are NaN. 830 /// 831 /// \headerfile <x86intrin.h> 832 /// 833 /// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c> 834 /// instruction. 835 /// 836 /// \param __a 837 /// A 128-bit vector of [2 x double]. The lower double-precision value is 838 /// compared to the lower double-precision value of \a __b. 839 /// \param __b 840 /// A 128-bit vector of [2 x double]. The lower double-precision value is 841 /// compared to the lower double-precision value of \a __a. 842 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 843 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 844 static __inline__ __m128d __DEFAULT_FN_ATTRS 845 _mm_cmpunord_sd(__m128d __a, __m128d __b) 846 { 847 return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b); 848 } 849 850 /// Compares the lower double-precision floating-point values in each of 851 /// the two 128-bit floating-point vectors of [2 x double] to determine if 852 /// the value in the first parameter is unequal to the corresponding value in 853 /// the second parameter. 854 /// 855 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 
856 /// 857 /// \headerfile <x86intrin.h> 858 /// 859 /// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction. 860 /// 861 /// \param __a 862 /// A 128-bit vector of [2 x double]. The lower double-precision value is 863 /// compared to the lower double-precision value of \a __b. 864 /// \param __b 865 /// A 128-bit vector of [2 x double]. The lower double-precision value is 866 /// compared to the lower double-precision value of \a __a. 867 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 868 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 869 static __inline__ __m128d __DEFAULT_FN_ATTRS 870 _mm_cmpneq_sd(__m128d __a, __m128d __b) 871 { 872 return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b); 873 } 874 875 /// Compares the lower double-precision floating-point values in each of 876 /// the two 128-bit floating-point vectors of [2 x double] to determine if 877 /// the value in the first parameter is not less than the corresponding 878 /// value in the second parameter. 879 /// 880 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 881 /// 882 /// \headerfile <x86intrin.h> 883 /// 884 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction. 885 /// 886 /// \param __a 887 /// A 128-bit vector of [2 x double]. The lower double-precision value is 888 /// compared to the lower double-precision value of \a __b. 889 /// \param __b 890 /// A 128-bit vector of [2 x double]. The lower double-precision value is 891 /// compared to the lower double-precision value of \a __a. 892 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 893 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 
894 static __inline__ __m128d __DEFAULT_FN_ATTRS 895 _mm_cmpnlt_sd(__m128d __a, __m128d __b) 896 { 897 return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b); 898 } 899 900 /// Compares the lower double-precision floating-point values in each of 901 /// the two 128-bit floating-point vectors of [2 x double] to determine if 902 /// the value in the first parameter is not less than or equal to the 903 /// corresponding value in the second parameter. 904 /// 905 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 906 /// 907 /// \headerfile <x86intrin.h> 908 /// 909 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction. 910 /// 911 /// \param __a 912 /// A 128-bit vector of [2 x double]. The lower double-precision value is 913 /// compared to the lower double-precision value of \a __b. 914 /// \param __b 915 /// A 128-bit vector of [2 x double]. The lower double-precision value is 916 /// compared to the lower double-precision value of \a __a. 917 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 918 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 919 static __inline__ __m128d __DEFAULT_FN_ATTRS 920 _mm_cmpnle_sd(__m128d __a, __m128d __b) 921 { 922 return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b); 923 } 924 925 /// Compares the lower double-precision floating-point values in each of 926 /// the two 128-bit floating-point vectors of [2 x double] to determine if 927 /// the value in the first parameter is not greater than the corresponding 928 /// value in the second parameter. 929 /// 930 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 931 /// 932 /// \headerfile <x86intrin.h> 933 /// 934 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction. 935 /// 936 /// \param __a 937 /// A 128-bit vector of [2 x double]. 
The lower double-precision value is 938 /// compared to the lower double-precision value of \a __b. 939 /// \param __b 940 /// A 128-bit vector of [2 x double]. The lower double-precision value is 941 /// compared to the lower double-precision value of \a __a. 942 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 943 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 944 static __inline__ __m128d __DEFAULT_FN_ATTRS 945 _mm_cmpngt_sd(__m128d __a, __m128d __b) 946 { 947 __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a); 948 return __extension__ (__m128d) { __c[0], __a[1] }; 949 } 950 951 /// Compares the lower double-precision floating-point values in each of 952 /// the two 128-bit floating-point vectors of [2 x double] to determine if 953 /// the value in the first parameter is not greater than or equal to the 954 /// corresponding value in the second parameter. 955 /// 956 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 957 /// 958 /// \headerfile <x86intrin.h> 959 /// 960 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction. 961 /// 962 /// \param __a 963 /// A 128-bit vector of [2 x double]. The lower double-precision value is 964 /// compared to the lower double-precision value of \a __b. 965 /// \param __b 966 /// A 128-bit vector of [2 x double]. The lower double-precision value is 967 /// compared to the lower double-precision value of \a __a. 968 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 969 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 
970 static __inline__ __m128d __DEFAULT_FN_ATTRS 971 _mm_cmpnge_sd(__m128d __a, __m128d __b) 972 { 973 __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a); 974 return __extension__ (__m128d) { __c[0], __a[1] }; 975 } 976 977 /// Compares the lower double-precision floating-point values in each of 978 /// the two 128-bit floating-point vectors of [2 x double] for equality. 979 /// 980 /// The comparison yields 0 for false, 1 for true. If either of the two 981 /// lower double-precision values is NaN, 0 is returned. 982 /// 983 /// \headerfile <x86intrin.h> 984 /// 985 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 986 /// 987 /// \param __a 988 /// A 128-bit vector of [2 x double]. The lower double-precision value is 989 /// compared to the lower double-precision value of \a __b. 990 /// \param __b 991 /// A 128-bit vector of [2 x double]. The lower double-precision value is 992 /// compared to the lower double-precision value of \a __a. 993 /// \returns An integer containing the comparison results. If either of the two 994 /// lower double-precision values is NaN, 0 is returned. 995 static __inline__ int __DEFAULT_FN_ATTRS 996 _mm_comieq_sd(__m128d __a, __m128d __b) 997 { 998 return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b); 999 } 1000 1001 /// Compares the lower double-precision floating-point values in each of 1002 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1003 /// the value in the first parameter is less than the corresponding value in 1004 /// the second parameter. 1005 /// 1006 /// The comparison yields 0 for false, 1 for true. If either of the two 1007 /// lower double-precision values is NaN, 0 is returned. 1008 /// 1009 /// \headerfile <x86intrin.h> 1010 /// 1011 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1012 /// 1013 /// \param __a 1014 /// A 128-bit vector of [2 x double]. 
The lower double-precision value is 1015 /// compared to the lower double-precision value of \a __b. 1016 /// \param __b 1017 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1018 /// compared to the lower double-precision value of \a __a. 1019 /// \returns An integer containing the comparison results. If either of the two 1020 /// lower double-precision values is NaN, 0 is returned. 1021 static __inline__ int __DEFAULT_FN_ATTRS 1022 _mm_comilt_sd(__m128d __a, __m128d __b) 1023 { 1024 return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b); 1025 } 1026 1027 /// Compares the lower double-precision floating-point values in each of 1028 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1029 /// the value in the first parameter is less than or equal to the 1030 /// corresponding value in the second parameter. 1031 /// 1032 /// The comparison yields 0 for false, 1 for true. If either of the two 1033 /// lower double-precision values is NaN, 0 is returned. 1034 /// 1035 /// \headerfile <x86intrin.h> 1036 /// 1037 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1038 /// 1039 /// \param __a 1040 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1041 /// compared to the lower double-precision value of \a __b. 1042 /// \param __b 1043 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1044 /// compared to the lower double-precision value of \a __a. 1045 /// \returns An integer containing the comparison results. If either of the two 1046 /// lower double-precision values is NaN, 0 is returned. 
1047 static __inline__ int __DEFAULT_FN_ATTRS 1048 _mm_comile_sd(__m128d __a, __m128d __b) 1049 { 1050 return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b); 1051 } 1052 1053 /// Compares the lower double-precision floating-point values in each of 1054 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1055 /// the value in the first parameter is greater than the corresponding value 1056 /// in the second parameter. 1057 /// 1058 /// The comparison yields 0 for false, 1 for true. If either of the two 1059 /// lower double-precision values is NaN, 0 is returned. 1060 /// 1061 /// \headerfile <x86intrin.h> 1062 /// 1063 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1064 /// 1065 /// \param __a 1066 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1067 /// compared to the lower double-precision value of \a __b. 1068 /// \param __b 1069 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1070 /// compared to the lower double-precision value of \a __a. 1071 /// \returns An integer containing the comparison results. If either of the two 1072 /// lower double-precision values is NaN, 0 is returned. 1073 static __inline__ int __DEFAULT_FN_ATTRS 1074 _mm_comigt_sd(__m128d __a, __m128d __b) 1075 { 1076 return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b); 1077 } 1078 1079 /// Compares the lower double-precision floating-point values in each of 1080 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1081 /// the value in the first parameter is greater than or equal to the 1082 /// corresponding value in the second parameter. 1083 /// 1084 /// The comparison yields 0 for false, 1 for true. If either of the two 1085 /// lower double-precision values is NaN, 0 is returned. 1086 /// 1087 /// \headerfile <x86intrin.h> 1088 /// 1089 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 
1090 /// 1091 /// \param __a 1092 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1093 /// compared to the lower double-precision value of \a __b. 1094 /// \param __b 1095 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1096 /// compared to the lower double-precision value of \a __a. 1097 /// \returns An integer containing the comparison results. If either of the two 1098 /// lower double-precision values is NaN, 0 is returned. 1099 static __inline__ int __DEFAULT_FN_ATTRS 1100 _mm_comige_sd(__m128d __a, __m128d __b) 1101 { 1102 return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b); 1103 } 1104 1105 /// Compares the lower double-precision floating-point values in each of 1106 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1107 /// the value in the first parameter is unequal to the corresponding value in 1108 /// the second parameter. 1109 /// 1110 /// The comparison yields 0 for false, 1 for true. If either of the two 1111 /// lower double-precision values is NaN, 1 is returned. 1112 /// 1113 /// \headerfile <x86intrin.h> 1114 /// 1115 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1116 /// 1117 /// \param __a 1118 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1119 /// compared to the lower double-precision value of \a __b. 1120 /// \param __b 1121 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1122 /// compared to the lower double-precision value of \a __a. 1123 /// \returns An integer containing the comparison results. If either of the two 1124 /// lower double-precision values is NaN, 1 is returned. 
1125 static __inline__ int __DEFAULT_FN_ATTRS 1126 _mm_comineq_sd(__m128d __a, __m128d __b) 1127 { 1128 return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b); 1129 } 1130 1131 /// Compares the lower double-precision floating-point values in each of 1132 /// the two 128-bit floating-point vectors of [2 x double] for equality. The 1133 /// comparison yields 0 for false, 1 for true. 1134 /// 1135 /// If either of the two lower double-precision values is NaN, 0 is returned. 1136 /// 1137 /// \headerfile <x86intrin.h> 1138 /// 1139 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1140 /// 1141 /// \param __a 1142 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1143 /// compared to the lower double-precision value of \a __b. 1144 /// \param __b 1145 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1146 /// compared to the lower double-precision value of \a __a. 1147 /// \returns An integer containing the comparison results. If either of the two 1148 /// lower double-precision values is NaN, 0 is returned. 1149 static __inline__ int __DEFAULT_FN_ATTRS 1150 _mm_ucomieq_sd(__m128d __a, __m128d __b) 1151 { 1152 return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b); 1153 } 1154 1155 /// Compares the lower double-precision floating-point values in each of 1156 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1157 /// the value in the first parameter is less than the corresponding value in 1158 /// the second parameter. 1159 /// 1160 /// The comparison yields 0 for false, 1 for true. If either of the two lower 1161 /// double-precision values is NaN, 0 is returned. 1162 /// 1163 /// \headerfile <x86intrin.h> 1164 /// 1165 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1166 /// 1167 /// \param __a 1168 /// A 128-bit vector of [2 x double]. 
The lower double-precision value is 1169 /// compared to the lower double-precision value of \a __b. 1170 /// \param __b 1171 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1172 /// compared to the lower double-precision value of \a __a. 1173 /// \returns An integer containing the comparison results. If either of the two 1174 /// lower double-precision values is NaN, 0 is returned. 1175 static __inline__ int __DEFAULT_FN_ATTRS 1176 _mm_ucomilt_sd(__m128d __a, __m128d __b) 1177 { 1178 return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b); 1179 } 1180 1181 /// Compares the lower double-precision floating-point values in each of 1182 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1183 /// the value in the first parameter is less than or equal to the 1184 /// corresponding value in the second parameter. 1185 /// 1186 /// The comparison yields 0 for false, 1 for true. If either of the two lower 1187 /// double-precision values is NaN, 0 is returned. 1188 /// 1189 /// \headerfile <x86intrin.h> 1190 /// 1191 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1192 /// 1193 /// \param __a 1194 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1195 /// compared to the lower double-precision value of \a __b. 1196 /// \param __b 1197 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1198 /// compared to the lower double-precision value of \a __a. 1199 /// \returns An integer containing the comparison results. If either of the two 1200 /// lower double-precision values is NaN, 0 is returned. 
1201 static __inline__ int __DEFAULT_FN_ATTRS 1202 _mm_ucomile_sd(__m128d __a, __m128d __b) 1203 { 1204 return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b); 1205 } 1206 1207 /// Compares the lower double-precision floating-point values in each of 1208 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1209 /// the value in the first parameter is greater than the corresponding value 1210 /// in the second parameter. 1211 /// 1212 /// The comparison yields 0 for false, 1 for true. If either of the two lower 1213 /// double-precision values is NaN, 0 is returned. 1214 /// 1215 /// \headerfile <x86intrin.h> 1216 /// 1217 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1218 /// 1219 /// \param __a 1220 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1221 /// compared to the lower double-precision value of \a __b. 1222 /// \param __b 1223 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1224 /// compared to the lower double-precision value of \a __a. 1225 /// \returns An integer containing the comparison results. If either of the two 1226 /// lower double-precision values is NaN, 0 is returned. 1227 static __inline__ int __DEFAULT_FN_ATTRS 1228 _mm_ucomigt_sd(__m128d __a, __m128d __b) 1229 { 1230 return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b); 1231 } 1232 1233 /// Compares the lower double-precision floating-point values in each of 1234 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1235 /// the value in the first parameter is greater than or equal to the 1236 /// corresponding value in the second parameter. 1237 /// 1238 /// The comparison yields 0 for false, 1 for true. If either of the two 1239 /// lower double-precision values is NaN, 0 is returned. 1240 /// 1241 /// \headerfile <x86intrin.h> 1242 /// 1243 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 
1244 /// 1245 /// \param __a 1246 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1247 /// compared to the lower double-precision value of \a __b. 1248 /// \param __b 1249 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1250 /// compared to the lower double-precision value of \a __a. 1251 /// \returns An integer containing the comparison results. If either of the two 1252 /// lower double-precision values is NaN, 0 is returned. 1253 static __inline__ int __DEFAULT_FN_ATTRS 1254 _mm_ucomige_sd(__m128d __a, __m128d __b) 1255 { 1256 return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b); 1257 } 1258 1259 /// Compares the lower double-precision floating-point values in each of 1260 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1261 /// the value in the first parameter is unequal to the corresponding value in 1262 /// the second parameter. 1263 /// 1264 /// The comparison yields 0 for false, 1 for true. If either of the two lower 1265 /// double-precision values is NaN, 1 is returned. 1266 /// 1267 /// \headerfile <x86intrin.h> 1268 /// 1269 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1270 /// 1271 /// \param __a 1272 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1273 /// compared to the lower double-precision value of \a __b. 1274 /// \param __b 1275 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1276 /// compared to the lower double-precision value of \a __a. 1277 /// \returns An integer containing the comparison result. If either of the two 1278 /// lower double-precision values is NaN, 1 is returned. 
1279 static __inline__ int __DEFAULT_FN_ATTRS 1280 _mm_ucomineq_sd(__m128d __a, __m128d __b) 1281 { 1282 return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b); 1283 } 1284 1285 /// Converts the two double-precision floating-point elements of a 1286 /// 128-bit vector of [2 x double] into two single-precision floating-point 1287 /// values, returned in the lower 64 bits of a 128-bit vector of [4 x float]. 1288 /// The upper 64 bits of the result vector are set to zero. 1289 /// 1290 /// \headerfile <x86intrin.h> 1291 /// 1292 /// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction. 1293 /// 1294 /// \param __a 1295 /// A 128-bit vector of [2 x double]. 1296 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the 1297 /// converted values. The upper 64 bits are set to zero. 1298 static __inline__ __m128 __DEFAULT_FN_ATTRS 1299 _mm_cvtpd_ps(__m128d __a) 1300 { 1301 return __builtin_ia32_cvtpd2ps((__v2df)__a); 1302 } 1303 1304 /// Converts the lower two single-precision floating-point elements of a 1305 /// 128-bit vector of [4 x float] into two double-precision floating-point 1306 /// values, returned in a 128-bit vector of [2 x double]. The upper two 1307 /// elements of the input vector are unused. 1308 /// 1309 /// \headerfile <x86intrin.h> 1310 /// 1311 /// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction. 1312 /// 1313 /// \param __a 1314 /// A 128-bit vector of [4 x float]. The lower two single-precision 1315 /// floating-point elements are converted to double-precision values. The 1316 /// upper two elements are unused. 1317 /// \returns A 128-bit vector of [2 x double] containing the converted values. 
1318 static __inline__ __m128d __DEFAULT_FN_ATTRS 1319 _mm_cvtps_pd(__m128 __a) 1320 { 1321 return (__m128d) __builtin_convertvector( 1322 __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df); 1323 } 1324 1325 /// Converts the lower two integer elements of a 128-bit vector of 1326 /// [4 x i32] into two double-precision floating-point values, returned in a 1327 /// 128-bit vector of [2 x double]. 1328 /// 1329 /// The upper two elements of the input vector are unused. 1330 /// 1331 /// \headerfile <x86intrin.h> 1332 /// 1333 /// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction. 1334 /// 1335 /// \param __a 1336 /// A 128-bit integer vector of [4 x i32]. The lower two integer elements are 1337 /// converted to double-precision values. 1338 /// 1339 /// The upper two elements are unused. 1340 /// \returns A 128-bit vector of [2 x double] containing the converted values. 1341 static __inline__ __m128d __DEFAULT_FN_ATTRS 1342 _mm_cvtepi32_pd(__m128i __a) 1343 { 1344 return (__m128d) __builtin_convertvector( 1345 __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df); 1346 } 1347 1348 /// Converts the two double-precision floating-point elements of a 1349 /// 128-bit vector of [2 x double] into two signed 32-bit integer values, 1350 /// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper 1351 /// 64 bits of the result vector are set to zero. 1352 /// 1353 /// \headerfile <x86intrin.h> 1354 /// 1355 /// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction. 1356 /// 1357 /// \param __a 1358 /// A 128-bit vector of [2 x double]. 1359 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the 1360 /// converted values. The upper 64 bits are set to zero. 
1361 static __inline__ __m128i __DEFAULT_FN_ATTRS 1362 _mm_cvtpd_epi32(__m128d __a) 1363 { 1364 return __builtin_ia32_cvtpd2dq((__v2df)__a); 1365 } 1366 1367 /// Converts the low-order element of a 128-bit vector of [2 x double] 1368 /// into a 32-bit signed integer value. 1369 /// 1370 /// \headerfile <x86intrin.h> 1371 /// 1372 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction. 1373 /// 1374 /// \param __a 1375 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 1376 /// conversion. 1377 /// \returns A 32-bit signed integer containing the converted value. 1378 static __inline__ int __DEFAULT_FN_ATTRS 1379 _mm_cvtsd_si32(__m128d __a) 1380 { 1381 return __builtin_ia32_cvtsd2si((__v2df)__a); 1382 } 1383 1384 /// Converts the lower double-precision floating-point element of a 1385 /// 128-bit vector of [2 x double], in the second parameter, into a 1386 /// single-precision floating-point value, returned in the lower 32 bits of a 1387 /// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are 1388 /// copied from the upper 96 bits of the first parameter. 1389 /// 1390 /// \headerfile <x86intrin.h> 1391 /// 1392 /// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction. 1393 /// 1394 /// \param __a 1395 /// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are 1396 /// copied to the upper 96 bits of the result. 1397 /// \param __b 1398 /// A 128-bit vector of [2 x double]. The lower double-precision 1399 /// floating-point element is used in the conversion. 1400 /// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the 1401 /// converted value from the second parameter. The upper 96 bits are copied 1402 /// from the upper 96 bits of the first parameter. 
1403 static __inline__ __m128 __DEFAULT_FN_ATTRS 1404 _mm_cvtsd_ss(__m128 __a, __m128d __b) 1405 { 1406 return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b); 1407 } 1408 1409 /// Converts a 32-bit signed integer value, in the second parameter, into 1410 /// a double-precision floating-point value, returned in the lower 64 bits of 1411 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector 1412 /// are copied from the upper 64 bits of the first parameter. 1413 /// 1414 /// \headerfile <x86intrin.h> 1415 /// 1416 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction. 1417 /// 1418 /// \param __a 1419 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are 1420 /// copied to the upper 64 bits of the result. 1421 /// \param __b 1422 /// A 32-bit signed integer containing the value to be converted. 1423 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the 1424 /// converted value from the second parameter. The upper 64 bits are copied 1425 /// from the upper 64 bits of the first parameter. 1426 static __inline__ __m128d __DEFAULT_FN_ATTRS 1427 _mm_cvtsi32_sd(__m128d __a, int __b) 1428 { 1429 __a[0] = __b; 1430 return __a; 1431 } 1432 1433 /// Converts the lower single-precision floating-point element of a 1434 /// 128-bit vector of [4 x float], in the second parameter, into a 1435 /// double-precision floating-point value, returned in the lower 64 bits of 1436 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector 1437 /// are copied from the upper 64 bits of the first parameter. 1438 /// 1439 /// \headerfile <x86intrin.h> 1440 /// 1441 /// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction. 1442 /// 1443 /// \param __a 1444 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are 1445 /// copied to the upper 64 bits of the result. 1446 /// \param __b 1447 /// A 128-bit vector of [4 x float]. 
The lower single-precision 1448 /// floating-point element is used in the conversion. 1449 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the 1450 /// converted value from the second parameter. The upper 64 bits are copied 1451 /// from the upper 64 bits of the first parameter. 1452 static __inline__ __m128d __DEFAULT_FN_ATTRS 1453 _mm_cvtss_sd(__m128d __a, __m128 __b) 1454 { 1455 __a[0] = __b[0]; 1456 return __a; 1457 } 1458 1459 /// Converts the two double-precision floating-point elements of a 1460 /// 128-bit vector of [2 x double] into two signed 32-bit integer values, 1461 /// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. 1462 /// 1463 /// If the result of either conversion is inexact, the result is truncated 1464 /// (rounded towards zero) regardless of the current MXCSR setting. The upper 1465 /// 64 bits of the result vector are set to zero. 1466 /// 1467 /// \headerfile <x86intrin.h> 1468 /// 1469 /// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c> 1470 /// instruction. 1471 /// 1472 /// \param __a 1473 /// A 128-bit vector of [2 x double]. 1474 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the 1475 /// converted values. The upper 64 bits are set to zero. 1476 static __inline__ __m128i __DEFAULT_FN_ATTRS 1477 _mm_cvttpd_epi32(__m128d __a) 1478 { 1479 return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a); 1480 } 1481 1482 /// Converts the low-order element of a [2 x double] vector into a 32-bit 1483 /// signed integer value, truncating the result when it is inexact. 1484 /// 1485 /// \headerfile <x86intrin.h> 1486 /// 1487 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c> 1488 /// instruction. 1489 /// 1490 /// \param __a 1491 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 1492 /// conversion. 1493 /// \returns A 32-bit signed integer containing the converted value. 
1494 static __inline__ int __DEFAULT_FN_ATTRS 1495 _mm_cvttsd_si32(__m128d __a) 1496 { 1497 return __builtin_ia32_cvttsd2si((__v2df)__a); 1498 } 1499 1500 /// Converts the two double-precision floating-point elements of a 1501 /// 128-bit vector of [2 x double] into two signed 32-bit integer values, 1502 /// returned in a 64-bit vector of [2 x i32]. 1503 /// 1504 /// \headerfile <x86intrin.h> 1505 /// 1506 /// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction. 1507 /// 1508 /// \param __a 1509 /// A 128-bit vector of [2 x double]. 1510 /// \returns A 64-bit vector of [2 x i32] containing the converted values. 1511 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 1512 _mm_cvtpd_pi32(__m128d __a) 1513 { 1514 return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a); 1515 } 1516 1517 /// Converts the two double-precision floating-point elements of a 1518 /// 128-bit vector of [2 x double] into two signed 32-bit integer values, 1519 /// returned in a 64-bit vector of [2 x i32]. 1520 /// 1521 /// If the result of either conversion is inexact, the result is truncated 1522 /// (rounded towards zero) regardless of the current MXCSR setting. 1523 /// 1524 /// \headerfile <x86intrin.h> 1525 /// 1526 /// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction. 1527 /// 1528 /// \param __a 1529 /// A 128-bit vector of [2 x double]. 1530 /// \returns A 64-bit vector of [2 x i32] containing the converted values. 1531 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 1532 _mm_cvttpd_pi32(__m128d __a) 1533 { 1534 return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a); 1535 } 1536 1537 /// Converts the two signed 32-bit integer elements of a 64-bit vector of 1538 /// [2 x i32] into two double-precision floating-point values, returned in a 1539 /// 128-bit vector of [2 x double]. 1540 /// 1541 /// \headerfile <x86intrin.h> 1542 /// 1543 /// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction. 1544 /// 1545 /// \param __a 1546 /// A 64-bit vector of [2 x i32]. 
1547 /// \returns A 128-bit vector of [2 x double] containing the converted values. 1548 static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX 1549 _mm_cvtpi32_pd(__m64 __a) 1550 { 1551 return __builtin_ia32_cvtpi2pd((__v2si)__a); 1552 } 1553 1554 /// Returns the low-order element of a 128-bit vector of [2 x double] as 1555 /// a double-precision floating-point value. 1556 /// 1557 /// \headerfile <x86intrin.h> 1558 /// 1559 /// This intrinsic has no corresponding instruction. 1560 /// 1561 /// \param __a 1562 /// A 128-bit vector of [2 x double]. The lower 64 bits are returned. 1563 /// \returns A double-precision floating-point value copied from the lower 64 1564 /// bits of \a __a. 1565 static __inline__ double __DEFAULT_FN_ATTRS 1566 _mm_cvtsd_f64(__m128d __a) 1567 { 1568 return __a[0]; 1569 } 1570 1571 /// Loads a 128-bit floating-point vector of [2 x double] from an aligned 1572 /// memory location. 1573 /// 1574 /// \headerfile <x86intrin.h> 1575 /// 1576 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction. 1577 /// 1578 /// \param __dp 1579 /// A pointer to a 128-bit memory location. The address of the memory 1580 /// location has to be 16-byte aligned. 1581 /// \returns A 128-bit vector of [2 x double] containing the loaded values. 1582 static __inline__ __m128d __DEFAULT_FN_ATTRS 1583 _mm_load_pd(double const *__dp) 1584 { 1585 return *(const __m128d*)__dp; 1586 } 1587 1588 /// Loads a double-precision floating-point value from a specified memory 1589 /// location and duplicates it to both vector elements of a 128-bit vector of 1590 /// [2 x double]. 1591 /// 1592 /// \headerfile <x86intrin.h> 1593 /// 1594 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction. 1595 /// 1596 /// \param __dp 1597 /// A pointer to a memory location containing a double-precision value. 1598 /// \returns A 128-bit vector of [2 x double] containing the loaded and 1599 /// duplicated values. 
1600 static __inline__ __m128d __DEFAULT_FN_ATTRS 1601 _mm_load1_pd(double const *__dp) 1602 { 1603 struct __mm_load1_pd_struct { 1604 double __u; 1605 } __attribute__((__packed__, __may_alias__)); 1606 double __u = ((const struct __mm_load1_pd_struct*)__dp)->__u; 1607 return __extension__ (__m128d){ __u, __u }; 1608 } 1609 1610 #define _mm_load_pd1(dp) _mm_load1_pd(dp) 1611 1612 /// Loads two double-precision values, in reverse order, from an aligned 1613 /// memory location into a 128-bit vector of [2 x double]. 1614 /// 1615 /// \headerfile <x86intrin.h> 1616 /// 1617 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction + 1618 /// needed shuffling instructions. In AVX mode, the shuffling may be combined 1619 /// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction. 1620 /// 1621 /// \param __dp 1622 /// A 16-byte aligned pointer to an array of double-precision values to be 1623 /// loaded in reverse order. 1624 /// \returns A 128-bit vector of [2 x double] containing the reversed loaded 1625 /// values. 1626 static __inline__ __m128d __DEFAULT_FN_ATTRS 1627 _mm_loadr_pd(double const *__dp) 1628 { 1629 __m128d __u = *(const __m128d*)__dp; 1630 return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0); 1631 } 1632 1633 /// Loads a 128-bit floating-point vector of [2 x double] from an 1634 /// unaligned memory location. 1635 /// 1636 /// \headerfile <x86intrin.h> 1637 /// 1638 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction. 1639 /// 1640 /// \param __dp 1641 /// A pointer to a 128-bit memory location. The address of the memory 1642 /// location does not have to be aligned. 1643 /// \returns A 128-bit vector of [2 x double] containing the loaded values. 
1644 static __inline__ __m128d __DEFAULT_FN_ATTRS 1645 _mm_loadu_pd(double const *__dp) 1646 { 1647 struct __loadu_pd { 1648 __m128d_u __v; 1649 } __attribute__((__packed__, __may_alias__)); 1650 return ((const struct __loadu_pd*)__dp)->__v; 1651 } 1652 1653 /// Loads a 64-bit integer value to the low element of a 128-bit integer 1654 /// vector and clears the upper element. 1655 /// 1656 /// \headerfile <x86intrin.h> 1657 /// 1658 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 1659 /// 1660 /// \param __a 1661 /// A pointer to a 64-bit memory location. The address of the memory 1662 /// location does not have to be aligned. 1663 /// \returns A 128-bit vector of [2 x i64] containing the loaded value. 1664 static __inline__ __m128i __DEFAULT_FN_ATTRS 1665 _mm_loadu_si64(void const *__a) 1666 { 1667 struct __loadu_si64 { 1668 long long __v; 1669 } __attribute__((__packed__, __may_alias__)); 1670 long long __u = ((const struct __loadu_si64*)__a)->__v; 1671 return __extension__ (__m128i)(__v2di){__u, 0LL}; 1672 } 1673 1674 /// Loads a 32-bit integer value to the low element of a 128-bit integer 1675 /// vector and clears the upper element. 1676 /// 1677 /// \headerfile <x86intrin.h> 1678 /// 1679 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. 1680 /// 1681 /// \param __a 1682 /// A pointer to a 32-bit memory location. The address of the memory 1683 /// location does not have to be aligned. 1684 /// \returns A 128-bit vector of [4 x i32] containing the loaded value. 1685 static __inline__ __m128i __DEFAULT_FN_ATTRS 1686 _mm_loadu_si32(void const *__a) 1687 { 1688 struct __loadu_si32 { 1689 int __v; 1690 } __attribute__((__packed__, __may_alias__)); 1691 int __u = ((const struct __loadu_si32*)__a)->__v; 1692 return __extension__ (__m128i)(__v4si){__u, 0, 0, 0}; 1693 } 1694 1695 /// Loads a 16-bit integer value to the low element of a 128-bit integer 1696 /// vector and clears the upper element. 
1697 /// 1698 /// \headerfile <x86intrin.h> 1699 /// 1700 /// This intrinsic does not correspond to a specific instruction. 1701 /// 1702 /// \param __a 1703 /// A pointer to a 16-bit memory location. The address of the memory 1704 /// location does not have to be aligned. 1705 /// \returns A 128-bit vector of [8 x i16] containing the loaded value. 1706 static __inline__ __m128i __DEFAULT_FN_ATTRS 1707 _mm_loadu_si16(void const *__a) 1708 { 1709 struct __loadu_si16 { 1710 short __v; 1711 } __attribute__((__packed__, __may_alias__)); 1712 short __u = ((const struct __loadu_si16*)__a)->__v; 1713 return __extension__ (__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0}; 1714 } 1715 1716 /// Loads a 64-bit double-precision value to the low element of a 1717 /// 128-bit integer vector and clears the upper element. 1718 /// 1719 /// \headerfile <x86intrin.h> 1720 /// 1721 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction. 1722 /// 1723 /// \param __dp 1724 /// A pointer to a memory location containing a double-precision value. 1725 /// The address of the memory location does not have to be aligned. 1726 /// \returns A 128-bit vector of [2 x double] containing the loaded value. 1727 static __inline__ __m128d __DEFAULT_FN_ATTRS 1728 _mm_load_sd(double const *__dp) 1729 { 1730 struct __mm_load_sd_struct { 1731 double __u; 1732 } __attribute__((__packed__, __may_alias__)); 1733 double __u = ((const struct __mm_load_sd_struct*)__dp)->__u; 1734 return __extension__ (__m128d){ __u, 0 }; 1735 } 1736 1737 /// Loads a double-precision value into the high-order bits of a 128-bit 1738 /// vector of [2 x double]. The low-order bits are copied from the low-order 1739 /// bits of the first operand. 1740 /// 1741 /// \headerfile <x86intrin.h> 1742 /// 1743 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction. 1744 /// 1745 /// \param __a 1746 /// A 128-bit vector of [2 x double]. \n 1747 /// Bits [63:0] are written to bits [63:0] of the result. 
1748 /// \param __dp 1749 /// A pointer to a 64-bit memory location containing a double-precision 1750 /// floating-point value that is loaded. The loaded value is written to bits 1751 /// [127:64] of the result. The address of the memory location does not have 1752 /// to be aligned. 1753 /// \returns A 128-bit vector of [2 x double] containing the moved values. 1754 static __inline__ __m128d __DEFAULT_FN_ATTRS 1755 _mm_loadh_pd(__m128d __a, double const *__dp) 1756 { 1757 struct __mm_loadh_pd_struct { 1758 double __u; 1759 } __attribute__((__packed__, __may_alias__)); 1760 double __u = ((const struct __mm_loadh_pd_struct*)__dp)->__u; 1761 return __extension__ (__m128d){ __a[0], __u }; 1762 } 1763 1764 /// Loads a double-precision value into the low-order bits of a 128-bit 1765 /// vector of [2 x double]. The high-order bits are copied from the 1766 /// high-order bits of the first operand. 1767 /// 1768 /// \headerfile <x86intrin.h> 1769 /// 1770 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction. 1771 /// 1772 /// \param __a 1773 /// A 128-bit vector of [2 x double]. \n 1774 /// Bits [127:64] are written to bits [127:64] of the result. 1775 /// \param __dp 1776 /// A pointer to a 64-bit memory location containing a double-precision 1777 /// floating-point value that is loaded. The loaded value is written to bits 1778 /// [63:0] of the result. The address of the memory location does not have to 1779 /// be aligned. 1780 /// \returns A 128-bit vector of [2 x double] containing the moved values. 1781 static __inline__ __m128d __DEFAULT_FN_ATTRS 1782 _mm_loadl_pd(__m128d __a, double const *__dp) 1783 { 1784 struct __mm_loadl_pd_struct { 1785 double __u; 1786 } __attribute__((__packed__, __may_alias__)); 1787 double __u = ((const struct __mm_loadl_pd_struct*)__dp)->__u; 1788 return __extension__ (__m128d){ __u, __a[1] }; 1789 } 1790 1791 /// Constructs a 128-bit floating-point vector of [2 x double] with 1792 /// unspecified content. 
This could be used as an argument to another 1793 /// intrinsic function where the argument is required but the value is not 1794 /// actually used. 1795 /// 1796 /// \headerfile <x86intrin.h> 1797 /// 1798 /// This intrinsic has no corresponding instruction. 1799 /// 1800 /// \returns A 128-bit floating-point vector of [2 x double] with unspecified 1801 /// content. 1802 static __inline__ __m128d __DEFAULT_FN_ATTRS 1803 _mm_undefined_pd(void) 1804 { 1805 return (__m128d)__builtin_ia32_undef128(); 1806 } 1807 1808 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower 1809 /// 64 bits of the vector are initialized with the specified double-precision 1810 /// floating-point value. The upper 64 bits are set to zero. 1811 /// 1812 /// \headerfile <x86intrin.h> 1813 /// 1814 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 1815 /// 1816 /// \param __w 1817 /// A double-precision floating-point value used to initialize the lower 64 1818 /// bits of the result. 1819 /// \returns An initialized 128-bit floating-point vector of [2 x double]. The 1820 /// lower 64 bits contain the value of the parameter. The upper 64 bits are 1821 /// set to zero. 1822 static __inline__ __m128d __DEFAULT_FN_ATTRS 1823 _mm_set_sd(double __w) 1824 { 1825 return __extension__ (__m128d){ __w, 0 }; 1826 } 1827 1828 /// Constructs a 128-bit floating-point vector of [2 x double], with each 1829 /// of the two double-precision floating-point vector elements set to the 1830 /// specified double-precision floating-point value. 1831 /// 1832 /// \headerfile <x86intrin.h> 1833 /// 1834 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction. 1835 /// 1836 /// \param __w 1837 /// A double-precision floating-point value used to initialize each vector 1838 /// element of the result. 1839 /// \returns An initialized 128-bit floating-point vector of [2 x double]. 
1840 static __inline__ __m128d __DEFAULT_FN_ATTRS 1841 _mm_set1_pd(double __w) 1842 { 1843 return __extension__ (__m128d){ __w, __w }; 1844 } 1845 1846 /// Constructs a 128-bit floating-point vector of [2 x double], with each 1847 /// of the two double-precision floating-point vector elements set to the 1848 /// specified double-precision floating-point value. 1849 /// 1850 /// \headerfile <x86intrin.h> 1851 /// 1852 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction. 1853 /// 1854 /// \param __w 1855 /// A double-precision floating-point value used to initialize each vector 1856 /// element of the result. 1857 /// \returns An initialized 128-bit floating-point vector of [2 x double]. 1858 static __inline__ __m128d __DEFAULT_FN_ATTRS 1859 _mm_set_pd1(double __w) 1860 { 1861 return _mm_set1_pd(__w); 1862 } 1863 1864 /// Constructs a 128-bit floating-point vector of [2 x double] 1865 /// initialized with the specified double-precision floating-point values. 1866 /// 1867 /// \headerfile <x86intrin.h> 1868 /// 1869 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction. 1870 /// 1871 /// \param __w 1872 /// A double-precision floating-point value used to initialize the upper 64 1873 /// bits of the result. 1874 /// \param __x 1875 /// A double-precision floating-point value used to initialize the lower 64 1876 /// bits of the result. 1877 /// \returns An initialized 128-bit floating-point vector of [2 x double]. 1878 static __inline__ __m128d __DEFAULT_FN_ATTRS 1879 _mm_set_pd(double __w, double __x) 1880 { 1881 return __extension__ (__m128d){ __x, __w }; 1882 } 1883 1884 /// Constructs a 128-bit floating-point vector of [2 x double], 1885 /// initialized in reverse order with the specified double-precision 1886 /// floating-point values. 1887 /// 1888 /// \headerfile <x86intrin.h> 1889 /// 1890 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction. 
1891 /// 1892 /// \param __w 1893 /// A double-precision floating-point value used to initialize the lower 64 1894 /// bits of the result. 1895 /// \param __x 1896 /// A double-precision floating-point value used to initialize the upper 64 1897 /// bits of the result. 1898 /// \returns An initialized 128-bit floating-point vector of [2 x double]. 1899 static __inline__ __m128d __DEFAULT_FN_ATTRS 1900 _mm_setr_pd(double __w, double __x) 1901 { 1902 return __extension__ (__m128d){ __w, __x }; 1903 } 1904 1905 /// Constructs a 128-bit floating-point vector of [2 x double] 1906 /// initialized to zero. 1907 /// 1908 /// \headerfile <x86intrin.h> 1909 /// 1910 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction. 1911 /// 1912 /// \returns An initialized 128-bit floating-point vector of [2 x double] with 1913 /// all elements set to zero. 1914 static __inline__ __m128d __DEFAULT_FN_ATTRS 1915 _mm_setzero_pd(void) 1916 { 1917 return __extension__ (__m128d){ 0, 0 }; 1918 } 1919 1920 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower 1921 /// 64 bits are set to the lower 64 bits of the second parameter. The upper 1922 /// 64 bits are set to the upper 64 bits of the first parameter. 1923 /// 1924 /// \headerfile <x86intrin.h> 1925 /// 1926 /// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction. 1927 /// 1928 /// \param __a 1929 /// A 128-bit vector of [2 x double]. The upper 64 bits are written to the 1930 /// upper 64 bits of the result. 1931 /// \param __b 1932 /// A 128-bit vector of [2 x double]. The lower 64 bits are written to the 1933 /// lower 64 bits of the result. 1934 /// \returns A 128-bit vector of [2 x double] containing the moved values. 1935 static __inline__ __m128d __DEFAULT_FN_ATTRS 1936 _mm_move_sd(__m128d __a, __m128d __b) 1937 { 1938 __a[0] = __b[0]; 1939 return __a; 1940 } 1941 1942 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a 1943 /// memory location. 
1944 /// 1945 /// \headerfile <x86intrin.h> 1946 /// 1947 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction. 1948 /// 1949 /// \param __dp 1950 /// A pointer to a 64-bit memory location. 1951 /// \param __a 1952 /// A 128-bit vector of [2 x double] containing the value to be stored. 1953 static __inline__ void __DEFAULT_FN_ATTRS 1954 _mm_store_sd(double *__dp, __m128d __a) 1955 { 1956 struct __mm_store_sd_struct { 1957 double __u; 1958 } __attribute__((__packed__, __may_alias__)); 1959 ((struct __mm_store_sd_struct*)__dp)->__u = __a[0]; 1960 } 1961 1962 /// Moves packed double-precision values from a 128-bit vector of 1963 /// [2 x double] to a memory location. 1964 /// 1965 /// \headerfile <x86intrin.h> 1966 /// 1967 /// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction. 1968 /// 1969 /// \param __dp 1970 /// A pointer to an aligned memory location that can store two 1971 /// double-precision values. 1972 /// \param __a 1973 /// A packed 128-bit vector of [2 x double] containing the values to be 1974 /// moved. 1975 static __inline__ void __DEFAULT_FN_ATTRS 1976 _mm_store_pd(double *__dp, __m128d __a) 1977 { 1978 *(__m128d*)__dp = __a; 1979 } 1980 1981 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to 1982 /// the upper and lower 64 bits of a memory location. 1983 /// 1984 /// \headerfile <x86intrin.h> 1985 /// 1986 /// This intrinsic corresponds to the 1987 /// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction. 1988 /// 1989 /// \param __dp 1990 /// A pointer to a memory location that can store two double-precision 1991 /// values. 1992 /// \param __a 1993 /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each 1994 /// of the values in \a __dp. 
1995 static __inline__ void __DEFAULT_FN_ATTRS 1996 _mm_store1_pd(double *__dp, __m128d __a) 1997 { 1998 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0); 1999 _mm_store_pd(__dp, __a); 2000 } 2001 2002 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to 2003 /// the upper and lower 64 bits of a memory location. 2004 /// 2005 /// \headerfile <x86intrin.h> 2006 /// 2007 /// This intrinsic corresponds to the 2008 /// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction. 2009 /// 2010 /// \param __dp 2011 /// A pointer to a memory location that can store two double-precision 2012 /// values. 2013 /// \param __a 2014 /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each 2015 /// of the values in \a __dp. 2016 static __inline__ void __DEFAULT_FN_ATTRS 2017 _mm_store_pd1(double *__dp, __m128d __a) 2018 { 2019 _mm_store1_pd(__dp, __a); 2020 } 2021 2022 /// Stores a 128-bit vector of [2 x double] into an unaligned memory 2023 /// location. 2024 /// 2025 /// \headerfile <x86intrin.h> 2026 /// 2027 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction. 2028 /// 2029 /// \param __dp 2030 /// A pointer to a 128-bit memory location. The address of the memory 2031 /// location does not have to be aligned. 2032 /// \param __a 2033 /// A 128-bit vector of [2 x double] containing the values to be stored. 2034 static __inline__ void __DEFAULT_FN_ATTRS 2035 _mm_storeu_pd(double *__dp, __m128d __a) 2036 { 2037 struct __storeu_pd { 2038 __m128d_u __v; 2039 } __attribute__((__packed__, __may_alias__)); 2040 ((struct __storeu_pd*)__dp)->__v = __a; 2041 } 2042 2043 /// Stores two double-precision values, in reverse order, from a 128-bit 2044 /// vector of [2 x double] to a 16-byte aligned memory location. 2045 /// 2046 /// \headerfile <x86intrin.h> 2047 /// 2048 /// This intrinsic corresponds to a shuffling instruction followed by a 2049 /// <c> VMOVAPD / MOVAPD </c> instruction. 
2050 /// 2051 /// \param __dp 2052 /// A pointer to a 16-byte aligned memory location that can store two 2053 /// double-precision values. 2054 /// \param __a 2055 /// A 128-bit vector of [2 x double] containing the values to be reversed and 2056 /// stored. 2057 static __inline__ void __DEFAULT_FN_ATTRS 2058 _mm_storer_pd(double *__dp, __m128d __a) 2059 { 2060 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0); 2061 *(__m128d *)__dp = __a; 2062 } 2063 2064 /// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a 2065 /// memory location. 2066 /// 2067 /// \headerfile <x86intrin.h> 2068 /// 2069 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction. 2070 /// 2071 /// \param __dp 2072 /// A pointer to a 64-bit memory location. 2073 /// \param __a 2074 /// A 128-bit vector of [2 x double] containing the value to be stored. 2075 static __inline__ void __DEFAULT_FN_ATTRS 2076 _mm_storeh_pd(double *__dp, __m128d __a) 2077 { 2078 struct __mm_storeh_pd_struct { 2079 double __u; 2080 } __attribute__((__packed__, __may_alias__)); 2081 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1]; 2082 } 2083 2084 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a 2085 /// memory location. 2086 /// 2087 /// \headerfile <x86intrin.h> 2088 /// 2089 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction. 2090 /// 2091 /// \param __dp 2092 /// A pointer to a 64-bit memory location. 2093 /// \param __a 2094 /// A 128-bit vector of [2 x double] containing the value to be stored. 
2095 static __inline__ void __DEFAULT_FN_ATTRS 2096 _mm_storel_pd(double *__dp, __m128d __a) 2097 { 2098 struct __mm_storeh_pd_struct { 2099 double __u; 2100 } __attribute__((__packed__, __may_alias__)); 2101 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0]; 2102 } 2103 2104 /// Adds the corresponding elements of two 128-bit vectors of [16 x i8], 2105 /// saving the lower 8 bits of each sum in the corresponding element of a 2106 /// 128-bit result vector of [16 x i8]. 2107 /// 2108 /// The integer elements of both parameters can be either signed or unsigned. 2109 /// 2110 /// \headerfile <x86intrin.h> 2111 /// 2112 /// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction. 2113 /// 2114 /// \param __a 2115 /// A 128-bit vector of [16 x i8]. 2116 /// \param __b 2117 /// A 128-bit vector of [16 x i8]. 2118 /// \returns A 128-bit vector of [16 x i8] containing the sums of both 2119 /// parameters. 2120 static __inline__ __m128i __DEFAULT_FN_ATTRS 2121 _mm_add_epi8(__m128i __a, __m128i __b) 2122 { 2123 return (__m128i)((__v16qu)__a + (__v16qu)__b); 2124 } 2125 2126 /// Adds the corresponding elements of two 128-bit vectors of [8 x i16], 2127 /// saving the lower 16 bits of each sum in the corresponding element of a 2128 /// 128-bit result vector of [8 x i16]. 2129 /// 2130 /// The integer elements of both parameters can be either signed or unsigned. 2131 /// 2132 /// \headerfile <x86intrin.h> 2133 /// 2134 /// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction. 2135 /// 2136 /// \param __a 2137 /// A 128-bit vector of [8 x i16]. 2138 /// \param __b 2139 /// A 128-bit vector of [8 x i16]. 2140 /// \returns A 128-bit vector of [8 x i16] containing the sums of both 2141 /// parameters. 
2142 static __inline__ __m128i __DEFAULT_FN_ATTRS 2143 _mm_add_epi16(__m128i __a, __m128i __b) 2144 { 2145 return (__m128i)((__v8hu)__a + (__v8hu)__b); 2146 } 2147 2148 /// Adds the corresponding elements of two 128-bit vectors of [4 x i32], 2149 /// saving the lower 32 bits of each sum in the corresponding element of a 2150 /// 128-bit result vector of [4 x i32]. 2151 /// 2152 /// The integer elements of both parameters can be either signed or unsigned. 2153 /// 2154 /// \headerfile <x86intrin.h> 2155 /// 2156 /// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction. 2157 /// 2158 /// \param __a 2159 /// A 128-bit vector of [4 x i32]. 2160 /// \param __b 2161 /// A 128-bit vector of [4 x i32]. 2162 /// \returns A 128-bit vector of [4 x i32] containing the sums of both 2163 /// parameters. 2164 static __inline__ __m128i __DEFAULT_FN_ATTRS 2165 _mm_add_epi32(__m128i __a, __m128i __b) 2166 { 2167 return (__m128i)((__v4su)__a + (__v4su)__b); 2168 } 2169 2170 /// Adds two signed or unsigned 64-bit integer values, returning the 2171 /// lower 64 bits of the sum. 2172 /// 2173 /// \headerfile <x86intrin.h> 2174 /// 2175 /// This intrinsic corresponds to the <c> PADDQ </c> instruction. 2176 /// 2177 /// \param __a 2178 /// A 64-bit integer. 2179 /// \param __b 2180 /// A 64-bit integer. 2181 /// \returns A 64-bit integer containing the sum of both parameters. 2182 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2183 _mm_add_si64(__m64 __a, __m64 __b) 2184 { 2185 return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b); 2186 } 2187 2188 /// Adds the corresponding elements of two 128-bit vectors of [2 x i64], 2189 /// saving the lower 64 bits of each sum in the corresponding element of a 2190 /// 128-bit result vector of [2 x i64]. 2191 /// 2192 /// The integer elements of both parameters can be either signed or unsigned. 2193 /// 2194 /// \headerfile <x86intrin.h> 2195 /// 2196 /// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction. 
2197 /// 2198 /// \param __a 2199 /// A 128-bit vector of [2 x i64]. 2200 /// \param __b 2201 /// A 128-bit vector of [2 x i64]. 2202 /// \returns A 128-bit vector of [2 x i64] containing the sums of both 2203 /// parameters. 2204 static __inline__ __m128i __DEFAULT_FN_ATTRS 2205 _mm_add_epi64(__m128i __a, __m128i __b) 2206 { 2207 return (__m128i)((__v2du)__a + (__v2du)__b); 2208 } 2209 2210 /// Adds, with saturation, the corresponding elements of two 128-bit 2211 /// signed [16 x i8] vectors, saving each sum in the corresponding element of 2212 /// a 128-bit result vector of [16 x i8]. Positive sums greater than 0x7F are 2213 /// saturated to 0x7F. Negative sums less than 0x80 are saturated to 0x80. 2214 /// 2215 /// \headerfile <x86intrin.h> 2216 /// 2217 /// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction. 2218 /// 2219 /// \param __a 2220 /// A 128-bit signed [16 x i8] vector. 2221 /// \param __b 2222 /// A 128-bit signed [16 x i8] vector. 2223 /// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of 2224 /// both parameters. 2225 static __inline__ __m128i __DEFAULT_FN_ATTRS 2226 _mm_adds_epi8(__m128i __a, __m128i __b) 2227 { 2228 return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b); 2229 } 2230 2231 /// Adds, with saturation, the corresponding elements of two 128-bit 2232 /// signed [8 x i16] vectors, saving each sum in the corresponding element of 2233 /// a 128-bit result vector of [8 x i16]. Positive sums greater than 0x7FFF 2234 /// are saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to 2235 /// 0x8000. 2236 /// 2237 /// \headerfile <x86intrin.h> 2238 /// 2239 /// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction. 2240 /// 2241 /// \param __a 2242 /// A 128-bit signed [8 x i16] vector. 2243 /// \param __b 2244 /// A 128-bit signed [8 x i16] vector. 
2245 /// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of 2246 /// both parameters. 2247 static __inline__ __m128i __DEFAULT_FN_ATTRS 2248 _mm_adds_epi16(__m128i __a, __m128i __b) 2249 { 2250 return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b); 2251 } 2252 2253 /// Adds, with saturation, the corresponding elements of two 128-bit 2254 /// unsigned [16 x i8] vectors, saving each sum in the corresponding element 2255 /// of a 128-bit result vector of [16 x i8]. Positive sums greater than 0xFF 2256 /// are saturated to 0xFF. Negative sums are saturated to 0x00. 2257 /// 2258 /// \headerfile <x86intrin.h> 2259 /// 2260 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction. 2261 /// 2262 /// \param __a 2263 /// A 128-bit unsigned [16 x i8] vector. 2264 /// \param __b 2265 /// A 128-bit unsigned [16 x i8] vector. 2266 /// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums 2267 /// of both parameters. 2268 static __inline__ __m128i __DEFAULT_FN_ATTRS 2269 _mm_adds_epu8(__m128i __a, __m128i __b) 2270 { 2271 return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b); 2272 } 2273 2274 /// Adds, with saturation, the corresponding elements of two 128-bit 2275 /// unsigned [8 x i16] vectors, saving each sum in the corresponding element 2276 /// of a 128-bit result vector of [8 x i16]. Positive sums greater than 2277 /// 0xFFFF are saturated to 0xFFFF. Negative sums are saturated to 0x0000. 2278 /// 2279 /// \headerfile <x86intrin.h> 2280 /// 2281 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction. 2282 /// 2283 /// \param __a 2284 /// A 128-bit unsigned [8 x i16] vector. 2285 /// \param __b 2286 /// A 128-bit unsigned [8 x i16] vector. 2287 /// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums 2288 /// of both parameters. 
2289 static __inline__ __m128i __DEFAULT_FN_ATTRS 2290 _mm_adds_epu16(__m128i __a, __m128i __b) 2291 { 2292 return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b); 2293 } 2294 2295 /// Computes the rounded averages of corresponding elements of two 2296 /// 128-bit unsigned [16 x i8] vectors, saving each result in the 2297 /// corresponding element of a 128-bit result vector of [16 x i8]. 2298 /// 2299 /// \headerfile <x86intrin.h> 2300 /// 2301 /// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction. 2302 /// 2303 /// \param __a 2304 /// A 128-bit unsigned [16 x i8] vector. 2305 /// \param __b 2306 /// A 128-bit unsigned [16 x i8] vector. 2307 /// \returns A 128-bit unsigned [16 x i8] vector containing the rounded 2308 /// averages of both parameters. 2309 static __inline__ __m128i __DEFAULT_FN_ATTRS 2310 _mm_avg_epu8(__m128i __a, __m128i __b) 2311 { 2312 return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b); 2313 } 2314 2315 /// Computes the rounded averages of corresponding elements of two 2316 /// 128-bit unsigned [8 x i16] vectors, saving each result in the 2317 /// corresponding element of a 128-bit result vector of [8 x i16]. 2318 /// 2319 /// \headerfile <x86intrin.h> 2320 /// 2321 /// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction. 2322 /// 2323 /// \param __a 2324 /// A 128-bit unsigned [8 x i16] vector. 2325 /// \param __b 2326 /// A 128-bit unsigned [8 x i16] vector. 2327 /// \returns A 128-bit unsigned [8 x i16] vector containing the rounded 2328 /// averages of both parameters. 
2329 static __inline__ __m128i __DEFAULT_FN_ATTRS 2330 _mm_avg_epu16(__m128i __a, __m128i __b) 2331 { 2332 return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b); 2333 } 2334 2335 /// Multiplies the corresponding elements of two 128-bit signed [8 x i16] 2336 /// vectors, producing eight intermediate 32-bit signed integer products, and 2337 /// adds the consecutive pairs of 32-bit products to form a 128-bit signed 2338 /// [4 x i32] vector. 2339 /// 2340 /// For example, bits [15:0] of both parameters are multiplied producing a 2341 /// 32-bit product, bits [31:16] of both parameters are multiplied producing 2342 /// a 32-bit product, and the sum of those two products becomes bits [31:0] 2343 /// of the result. 2344 /// 2345 /// \headerfile <x86intrin.h> 2346 /// 2347 /// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction. 2348 /// 2349 /// \param __a 2350 /// A 128-bit signed [8 x i16] vector. 2351 /// \param __b 2352 /// A 128-bit signed [8 x i16] vector. 2353 /// \returns A 128-bit signed [4 x i32] vector containing the sums of products 2354 /// of both parameters. 2355 static __inline__ __m128i __DEFAULT_FN_ATTRS 2356 _mm_madd_epi16(__m128i __a, __m128i __b) 2357 { 2358 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b); 2359 } 2360 2361 /// Compares corresponding elements of two 128-bit signed [8 x i16] 2362 /// vectors, saving the greater value from each comparison in the 2363 /// corresponding element of a 128-bit result vector of [8 x i16]. 2364 /// 2365 /// \headerfile <x86intrin.h> 2366 /// 2367 /// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction. 2368 /// 2369 /// \param __a 2370 /// A 128-bit signed [8 x i16] vector. 2371 /// \param __b 2372 /// A 128-bit signed [8 x i16] vector. 2373 /// \returns A 128-bit signed [8 x i16] vector containing the greater value of 2374 /// each comparison. 
2375 static __inline__ __m128i __DEFAULT_FN_ATTRS 2376 _mm_max_epi16(__m128i __a, __m128i __b) 2377 { 2378 return (__m128i)__builtin_elementwise_max((__v8hi)__a, (__v8hi)__b); 2379 } 2380 2381 /// Compares corresponding elements of two 128-bit unsigned [16 x i8] 2382 /// vectors, saving the greater value from each comparison in the 2383 /// corresponding element of a 128-bit result vector of [16 x i8]. 2384 /// 2385 /// \headerfile <x86intrin.h> 2386 /// 2387 /// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction. 2388 /// 2389 /// \param __a 2390 /// A 128-bit unsigned [16 x i8] vector. 2391 /// \param __b 2392 /// A 128-bit unsigned [16 x i8] vector. 2393 /// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of 2394 /// each comparison. 2395 static __inline__ __m128i __DEFAULT_FN_ATTRS 2396 _mm_max_epu8(__m128i __a, __m128i __b) 2397 { 2398 return (__m128i)__builtin_elementwise_max((__v16qu)__a, (__v16qu)__b); 2399 } 2400 2401 /// Compares corresponding elements of two 128-bit signed [8 x i16] 2402 /// vectors, saving the smaller value from each comparison in the 2403 /// corresponding element of a 128-bit result vector of [8 x i16]. 2404 /// 2405 /// \headerfile <x86intrin.h> 2406 /// 2407 /// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction. 2408 /// 2409 /// \param __a 2410 /// A 128-bit signed [8 x i16] vector. 2411 /// \param __b 2412 /// A 128-bit signed [8 x i16] vector. 2413 /// \returns A 128-bit signed [8 x i16] vector containing the smaller value of 2414 /// each comparison. 2415 static __inline__ __m128i __DEFAULT_FN_ATTRS 2416 _mm_min_epi16(__m128i __a, __m128i __b) 2417 { 2418 return (__m128i)__builtin_elementwise_min((__v8hi)__a, (__v8hi)__b); 2419 } 2420 2421 /// Compares corresponding elements of two 128-bit unsigned [16 x i8] 2422 /// vectors, saving the smaller value from each comparison in the 2423 /// corresponding element of a 128-bit result vector of [16 x i8]. 
2424 /// 2425 /// \headerfile <x86intrin.h> 2426 /// 2427 /// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction. 2428 /// 2429 /// \param __a 2430 /// A 128-bit unsigned [16 x i8] vector. 2431 /// \param __b 2432 /// A 128-bit unsigned [16 x i8] vector. 2433 /// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of 2434 /// each comparison. 2435 static __inline__ __m128i __DEFAULT_FN_ATTRS 2436 _mm_min_epu8(__m128i __a, __m128i __b) 2437 { 2438 return (__m128i)__builtin_elementwise_min((__v16qu)__a, (__v16qu)__b); 2439 } 2440 2441 /// Multiplies the corresponding elements of two signed [8 x i16] 2442 /// vectors, saving the upper 16 bits of each 32-bit product in the 2443 /// corresponding element of a 128-bit signed [8 x i16] result vector. 2444 /// 2445 /// \headerfile <x86intrin.h> 2446 /// 2447 /// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction. 2448 /// 2449 /// \param __a 2450 /// A 128-bit signed [8 x i16] vector. 2451 /// \param __b 2452 /// A 128-bit signed [8 x i16] vector. 2453 /// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of 2454 /// each of the eight 32-bit products. 2455 static __inline__ __m128i __DEFAULT_FN_ATTRS 2456 _mm_mulhi_epi16(__m128i __a, __m128i __b) 2457 { 2458 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b); 2459 } 2460 2461 /// Multiplies the corresponding elements of two unsigned [8 x i16] 2462 /// vectors, saving the upper 16 bits of each 32-bit product in the 2463 /// corresponding element of a 128-bit unsigned [8 x i16] result vector. 2464 /// 2465 /// \headerfile <x86intrin.h> 2466 /// 2467 /// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction. 2468 /// 2469 /// \param __a 2470 /// A 128-bit unsigned [8 x i16] vector. 2471 /// \param __b 2472 /// A 128-bit unsigned [8 x i16] vector. 
2473 /// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits 2474 /// of each of the eight 32-bit products. 2475 static __inline__ __m128i __DEFAULT_FN_ATTRS 2476 _mm_mulhi_epu16(__m128i __a, __m128i __b) 2477 { 2478 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b); 2479 } 2480 2481 /// Multiplies the corresponding elements of two signed [8 x i16] 2482 /// vectors, saving the lower 16 bits of each 32-bit product in the 2483 /// corresponding element of a 128-bit signed [8 x i16] result vector. 2484 /// 2485 /// \headerfile <x86intrin.h> 2486 /// 2487 /// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction. 2488 /// 2489 /// \param __a 2490 /// A 128-bit signed [8 x i16] vector. 2491 /// \param __b 2492 /// A 128-bit signed [8 x i16] vector. 2493 /// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of 2494 /// each of the eight 32-bit products. 2495 static __inline__ __m128i __DEFAULT_FN_ATTRS 2496 _mm_mullo_epi16(__m128i __a, __m128i __b) 2497 { 2498 return (__m128i)((__v8hu)__a * (__v8hu)__b); 2499 } 2500 2501 /// Multiplies 32-bit unsigned integer values contained in the lower bits 2502 /// of the two 64-bit integer vectors and returns the 64-bit unsigned 2503 /// product. 2504 /// 2505 /// \headerfile <x86intrin.h> 2506 /// 2507 /// This intrinsic corresponds to the <c> PMULUDQ </c> instruction. 2508 /// 2509 /// \param __a 2510 /// A 64-bit integer containing one of the source operands. 2511 /// \param __b 2512 /// A 64-bit integer containing one of the source operands. 2513 /// \returns A 64-bit integer vector containing the product of both operands. 
2514 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2515 _mm_mul_su32(__m64 __a, __m64 __b) 2516 { 2517 return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b); 2518 } 2519 2520 /// Multiplies 32-bit unsigned integer values contained in the lower 2521 /// bits of the corresponding elements of two [2 x i64] vectors, and returns 2522 /// the 64-bit products in the corresponding elements of a [2 x i64] vector. 2523 /// 2524 /// \headerfile <x86intrin.h> 2525 /// 2526 /// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction. 2527 /// 2528 /// \param __a 2529 /// A [2 x i64] vector containing one of the source operands. 2530 /// \param __b 2531 /// A [2 x i64] vector containing one of the source operands. 2532 /// \returns A [2 x i64] vector containing the product of both operands. 2533 static __inline__ __m128i __DEFAULT_FN_ATTRS 2534 _mm_mul_epu32(__m128i __a, __m128i __b) 2535 { 2536 return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b); 2537 } 2538 2539 /// Computes the absolute differences of corresponding 8-bit integer 2540 /// values in two 128-bit vectors. Sums the first 8 absolute differences, and 2541 /// separately sums the second 8 absolute differences. Packs these two 2542 /// unsigned 16-bit integer sums into the upper and lower elements of a 2543 /// [2 x i64] vector. 2544 /// 2545 /// \headerfile <x86intrin.h> 2546 /// 2547 /// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction. 2548 /// 2549 /// \param __a 2550 /// A 128-bit integer vector containing one of the source operands. 2551 /// \param __b 2552 /// A 128-bit integer vector containing one of the source operands. 2553 /// \returns A [2 x i64] vector containing the sums of the sets of absolute 2554 /// differences between both operands. 
2555 static __inline__ __m128i __DEFAULT_FN_ATTRS 2556 _mm_sad_epu8(__m128i __a, __m128i __b) 2557 { 2558 return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b); 2559 } 2560 2561 /// Subtracts the corresponding 8-bit integer values in the operands. 2562 /// 2563 /// \headerfile <x86intrin.h> 2564 /// 2565 /// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction. 2566 /// 2567 /// \param __a 2568 /// A 128-bit integer vector containing the minuends. 2569 /// \param __b 2570 /// A 128-bit integer vector containing the subtrahends. 2571 /// \returns A 128-bit integer vector containing the differences of the values 2572 /// in the operands. 2573 static __inline__ __m128i __DEFAULT_FN_ATTRS 2574 _mm_sub_epi8(__m128i __a, __m128i __b) 2575 { 2576 return (__m128i)((__v16qu)__a - (__v16qu)__b); 2577 } 2578 2579 /// Subtracts the corresponding 16-bit integer values in the operands. 2580 /// 2581 /// \headerfile <x86intrin.h> 2582 /// 2583 /// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction. 2584 /// 2585 /// \param __a 2586 /// A 128-bit integer vector containing the minuends. 2587 /// \param __b 2588 /// A 128-bit integer vector containing the subtrahends. 2589 /// \returns A 128-bit integer vector containing the differences of the values 2590 /// in the operands. 2591 static __inline__ __m128i __DEFAULT_FN_ATTRS 2592 _mm_sub_epi16(__m128i __a, __m128i __b) 2593 { 2594 return (__m128i)((__v8hu)__a - (__v8hu)__b); 2595 } 2596 2597 /// Subtracts the corresponding 32-bit integer values in the operands. 2598 /// 2599 /// \headerfile <x86intrin.h> 2600 /// 2601 /// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction. 2602 /// 2603 /// \param __a 2604 /// A 128-bit integer vector containing the minuends. 2605 /// \param __b 2606 /// A 128-bit integer vector containing the subtrahends. 2607 /// \returns A 128-bit integer vector containing the differences of the values 2608 /// in the operands. 
2609 static __inline__ __m128i __DEFAULT_FN_ATTRS 2610 _mm_sub_epi32(__m128i __a, __m128i __b) 2611 { 2612 return (__m128i)((__v4su)__a - (__v4su)__b); 2613 } 2614 2615 /// Subtracts signed or unsigned 64-bit integer values and writes the 2616 /// difference to the corresponding bits in the destination. 2617 /// 2618 /// \headerfile <x86intrin.h> 2619 /// 2620 /// This intrinsic corresponds to the <c> PSUBQ </c> instruction. 2621 /// 2622 /// \param __a 2623 /// A 64-bit integer vector containing the minuend. 2624 /// \param __b 2625 /// A 64-bit integer vector containing the subtrahend. 2626 /// \returns A 64-bit integer vector containing the difference of the values in 2627 /// the operands. 2628 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2629 _mm_sub_si64(__m64 __a, __m64 __b) 2630 { 2631 return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b); 2632 } 2633 2634 /// Subtracts the corresponding elements of two [2 x i64] vectors. 2635 /// 2636 /// \headerfile <x86intrin.h> 2637 /// 2638 /// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction. 2639 /// 2640 /// \param __a 2641 /// A 128-bit integer vector containing the minuends. 2642 /// \param __b 2643 /// A 128-bit integer vector containing the subtrahends. 2644 /// \returns A 128-bit integer vector containing the differences of the values 2645 /// in the operands. 2646 static __inline__ __m128i __DEFAULT_FN_ATTRS 2647 _mm_sub_epi64(__m128i __a, __m128i __b) 2648 { 2649 return (__m128i)((__v2du)__a - (__v2du)__b); 2650 } 2651 2652 /// Subtracts corresponding 8-bit signed integer values in the input and 2653 /// returns the differences in the corresponding bytes in the destination. 2654 /// Differences greater than 0x7F are saturated to 0x7F, and differences less 2655 /// than 0x80 are saturated to 0x80. 2656 /// 2657 /// \headerfile <x86intrin.h> 2658 /// 2659 /// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction. 
2660 /// 2661 /// \param __a 2662 /// A 128-bit integer vector containing the minuends. 2663 /// \param __b 2664 /// A 128-bit integer vector containing the subtrahends. 2665 /// \returns A 128-bit integer vector containing the differences of the values 2666 /// in the operands. 2667 static __inline__ __m128i __DEFAULT_FN_ATTRS 2668 _mm_subs_epi8(__m128i __a, __m128i __b) 2669 { 2670 return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b); 2671 } 2672 2673 /// Subtracts corresponding 16-bit signed integer values in the input and 2674 /// returns the differences in the corresponding bytes in the destination. 2675 /// Differences greater than 0x7FFF are saturated to 0x7FFF, and values less 2676 /// than 0x8000 are saturated to 0x8000. 2677 /// 2678 /// \headerfile <x86intrin.h> 2679 /// 2680 /// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction. 2681 /// 2682 /// \param __a 2683 /// A 128-bit integer vector containing the minuends. 2684 /// \param __b 2685 /// A 128-bit integer vector containing the subtrahends. 2686 /// \returns A 128-bit integer vector containing the differences of the values 2687 /// in the operands. 2688 static __inline__ __m128i __DEFAULT_FN_ATTRS 2689 _mm_subs_epi16(__m128i __a, __m128i __b) 2690 { 2691 return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b); 2692 } 2693 2694 /// Subtracts corresponding 8-bit unsigned integer values in the input 2695 /// and returns the differences in the corresponding bytes in the 2696 /// destination. Differences less than 0x00 are saturated to 0x00. 2697 /// 2698 /// \headerfile <x86intrin.h> 2699 /// 2700 /// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction. 2701 /// 2702 /// \param __a 2703 /// A 128-bit integer vector containing the minuends. 2704 /// \param __b 2705 /// A 128-bit integer vector containing the subtrahends. 
2706 /// \returns A 128-bit integer vector containing the unsigned integer 2707 /// differences of the values in the operands. 2708 static __inline__ __m128i __DEFAULT_FN_ATTRS 2709 _mm_subs_epu8(__m128i __a, __m128i __b) 2710 { 2711 return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b); 2712 } 2713 2714 /// Subtracts corresponding 16-bit unsigned integer values in the input 2715 /// and returns the differences in the corresponding bytes in the 2716 /// destination. Differences less than 0x0000 are saturated to 0x0000. 2717 /// 2718 /// \headerfile <x86intrin.h> 2719 /// 2720 /// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction. 2721 /// 2722 /// \param __a 2723 /// A 128-bit integer vector containing the minuends. 2724 /// \param __b 2725 /// A 128-bit integer vector containing the subtrahends. 2726 /// \returns A 128-bit integer vector containing the unsigned integer 2727 /// differences of the values in the operands. 2728 static __inline__ __m128i __DEFAULT_FN_ATTRS 2729 _mm_subs_epu16(__m128i __a, __m128i __b) 2730 { 2731 return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b); 2732 } 2733 2734 /// Performs a bitwise AND of two 128-bit integer vectors. 2735 /// 2736 /// \headerfile <x86intrin.h> 2737 /// 2738 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction. 2739 /// 2740 /// \param __a 2741 /// A 128-bit integer vector containing one of the source operands. 2742 /// \param __b 2743 /// A 128-bit integer vector containing one of the source operands. 2744 /// \returns A 128-bit integer vector containing the bitwise AND of the values 2745 /// in both operands. 2746 static __inline__ __m128i __DEFAULT_FN_ATTRS 2747 _mm_and_si128(__m128i __a, __m128i __b) 2748 { 2749 return (__m128i)((__v2du)__a & (__v2du)__b); 2750 } 2751 2752 /// Performs a bitwise AND of two 128-bit integer vectors, using the 2753 /// one's complement of the values contained in the first source operand. 
2754 /// 2755 /// \headerfile <x86intrin.h> 2756 /// 2757 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction. 2758 /// 2759 /// \param __a 2760 /// A 128-bit vector containing the left source operand. The one's complement 2761 /// of this value is used in the bitwise AND. 2762 /// \param __b 2763 /// A 128-bit vector containing the right source operand. 2764 /// \returns A 128-bit integer vector containing the bitwise AND of the one's 2765 /// complement of the first operand and the values in the second operand. 2766 static __inline__ __m128i __DEFAULT_FN_ATTRS 2767 _mm_andnot_si128(__m128i __a, __m128i __b) 2768 { 2769 return (__m128i)(~(__v2du)__a & (__v2du)__b); 2770 } 2771 /// Performs a bitwise OR of two 128-bit integer vectors. 2772 /// 2773 /// \headerfile <x86intrin.h> 2774 /// 2775 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction. 2776 /// 2777 /// \param __a 2778 /// A 128-bit integer vector containing one of the source operands. 2779 /// \param __b 2780 /// A 128-bit integer vector containing one of the source operands. 2781 /// \returns A 128-bit integer vector containing the bitwise OR of the values 2782 /// in both operands. 2783 static __inline__ __m128i __DEFAULT_FN_ATTRS 2784 _mm_or_si128(__m128i __a, __m128i __b) 2785 { 2786 return (__m128i)((__v2du)__a | (__v2du)__b); 2787 } 2788 2789 /// Performs a bitwise exclusive OR of two 128-bit integer vectors. 2790 /// 2791 /// \headerfile <x86intrin.h> 2792 /// 2793 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction. 2794 /// 2795 /// \param __a 2796 /// A 128-bit integer vector containing one of the source operands. 2797 /// \param __b 2798 /// A 128-bit integer vector containing one of the source operands. 2799 /// \returns A 128-bit integer vector containing the bitwise exclusive OR of the 2800 /// values in both operands. 
2801 static __inline__ __m128i __DEFAULT_FN_ATTRS 2802 _mm_xor_si128(__m128i __a, __m128i __b) 2803 { 2804 return (__m128i)((__v2du)__a ^ (__v2du)__b); 2805 } 2806 2807 /// Left-shifts the 128-bit integer vector operand by the specified 2808 /// number of bytes. Low-order bits are cleared. 2809 /// 2810 /// \headerfile <x86intrin.h> 2811 /// 2812 /// \code 2813 /// __m128i _mm_slli_si128(__m128i a, const int imm); 2814 /// \endcode 2815 /// 2816 /// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction. 2817 /// 2818 /// \param a 2819 /// A 128-bit integer vector containing the source operand. 2820 /// \param imm 2821 /// An immediate value specifying the number of bytes to left-shift operand 2822 /// \a a. 2823 /// \returns A 128-bit integer vector containing the left-shifted value. 2824 #define _mm_slli_si128(a, imm) \ 2825 ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))) 2826 2827 #define _mm_bslli_si128(a, imm) \ 2828 ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))) 2829 2830 /// Left-shifts each 16-bit value in the 128-bit integer vector operand 2831 /// by the specified number of bits. Low-order bits are cleared. 2832 /// 2833 /// \headerfile <x86intrin.h> 2834 /// 2835 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction. 2836 /// 2837 /// \param __a 2838 /// A 128-bit integer vector containing the source operand. 2839 /// \param __count 2840 /// An integer value specifying the number of bits to left-shift each value 2841 /// in operand \a __a. 2842 /// \returns A 128-bit integer vector containing the left-shifted values. 2843 static __inline__ __m128i __DEFAULT_FN_ATTRS 2844 _mm_slli_epi16(__m128i __a, int __count) 2845 { 2846 return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count); 2847 } 2848 2849 /// Left-shifts each 16-bit value in the 128-bit integer vector operand 2850 /// by the specified number of bits. Low-order bits are cleared. 
2851 /// 2852 /// \headerfile <x86intrin.h> 2853 /// 2854 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction. 2855 /// 2856 /// \param __a 2857 /// A 128-bit integer vector containing the source operand. 2858 /// \param __count 2859 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2860 /// to left-shift each value in operand \a __a. 2861 /// \returns A 128-bit integer vector containing the left-shifted values. 2862 static __inline__ __m128i __DEFAULT_FN_ATTRS 2863 _mm_sll_epi16(__m128i __a, __m128i __count) 2864 { 2865 return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count); 2866 } 2867 2868 /// Left-shifts each 32-bit value in the 128-bit integer vector operand 2869 /// by the specified number of bits. Low-order bits are cleared. 2870 /// 2871 /// \headerfile <x86intrin.h> 2872 /// 2873 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction. 2874 /// 2875 /// \param __a 2876 /// A 128-bit integer vector containing the source operand. 2877 /// \param __count 2878 /// An integer value specifying the number of bits to left-shift each value 2879 /// in operand \a __a. 2880 /// \returns A 128-bit integer vector containing the left-shifted values. 2881 static __inline__ __m128i __DEFAULT_FN_ATTRS 2882 _mm_slli_epi32(__m128i __a, int __count) 2883 { 2884 return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count); 2885 } 2886 2887 /// Left-shifts each 32-bit value in the 128-bit integer vector operand 2888 /// by the specified number of bits. Low-order bits are cleared. 2889 /// 2890 /// \headerfile <x86intrin.h> 2891 /// 2892 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction. 2893 /// 2894 /// \param __a 2895 /// A 128-bit integer vector containing the source operand. 2896 /// \param __count 2897 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2898 /// to left-shift each value in operand \a __a. 
2899 /// \returns A 128-bit integer vector containing the left-shifted values. 2900 static __inline__ __m128i __DEFAULT_FN_ATTRS 2901 _mm_sll_epi32(__m128i __a, __m128i __count) 2902 { 2903 return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count); 2904 } 2905 2906 /// Left-shifts each 64-bit value in the 128-bit integer vector operand 2907 /// by the specified number of bits. Low-order bits are cleared. 2908 /// 2909 /// \headerfile <x86intrin.h> 2910 /// 2911 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction. 2912 /// 2913 /// \param __a 2914 /// A 128-bit integer vector containing the source operand. 2915 /// \param __count 2916 /// An integer value specifying the number of bits to left-shift each value 2917 /// in operand \a __a. 2918 /// \returns A 128-bit integer vector containing the left-shifted values. 2919 static __inline__ __m128i __DEFAULT_FN_ATTRS 2920 _mm_slli_epi64(__m128i __a, int __count) 2921 { 2922 return __builtin_ia32_psllqi128((__v2di)__a, __count); 2923 } 2924 2925 /// Left-shifts each 64-bit value in the 128-bit integer vector operand 2926 /// by the specified number of bits. Low-order bits are cleared. 2927 /// 2928 /// \headerfile <x86intrin.h> 2929 /// 2930 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction. 2931 /// 2932 /// \param __a 2933 /// A 128-bit integer vector containing the source operand. 2934 /// \param __count 2935 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2936 /// to left-shift each value in operand \a __a. 2937 /// \returns A 128-bit integer vector containing the left-shifted values. 2938 static __inline__ __m128i __DEFAULT_FN_ATTRS 2939 _mm_sll_epi64(__m128i __a, __m128i __count) 2940 { 2941 return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count); 2942 } 2943 2944 /// Right-shifts each 16-bit value in the 128-bit integer vector operand 2945 /// by the specified number of bits. 
High-order bits are filled with the sign 2946 /// bit of the initial value. 2947 /// 2948 /// \headerfile <x86intrin.h> 2949 /// 2950 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction. 2951 /// 2952 /// \param __a 2953 /// A 128-bit integer vector containing the source operand. 2954 /// \param __count 2955 /// An integer value specifying the number of bits to right-shift each value 2956 /// in operand \a __a. 2957 /// \returns A 128-bit integer vector containing the right-shifted values. 2958 static __inline__ __m128i __DEFAULT_FN_ATTRS 2959 _mm_srai_epi16(__m128i __a, int __count) 2960 { 2961 return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count); 2962 } 2963 2964 /// Right-shifts each 16-bit value in the 128-bit integer vector operand 2965 /// by the specified number of bits. High-order bits are filled with the sign 2966 /// bit of the initial value. 2967 /// 2968 /// \headerfile <x86intrin.h> 2969 /// 2970 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction. 2971 /// 2972 /// \param __a 2973 /// A 128-bit integer vector containing the source operand. 2974 /// \param __count 2975 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2976 /// to right-shift each value in operand \a __a. 2977 /// \returns A 128-bit integer vector containing the right-shifted values. 2978 static __inline__ __m128i __DEFAULT_FN_ATTRS 2979 _mm_sra_epi16(__m128i __a, __m128i __count) 2980 { 2981 return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count); 2982 } 2983 2984 /// Right-shifts each 32-bit value in the 128-bit integer vector operand 2985 /// by the specified number of bits. High-order bits are filled with the sign 2986 /// bit of the initial value. 2987 /// 2988 /// \headerfile <x86intrin.h> 2989 /// 2990 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction. 2991 /// 2992 /// \param __a 2993 /// A 128-bit integer vector containing the source operand. 
2994 /// \param __count 2995 /// An integer value specifying the number of bits to right-shift each value 2996 /// in operand \a __a. 2997 /// \returns A 128-bit integer vector containing the right-shifted values. 2998 static __inline__ __m128i __DEFAULT_FN_ATTRS 2999 _mm_srai_epi32(__m128i __a, int __count) 3000 { 3001 return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count); 3002 } 3003 3004 /// Right-shifts each 32-bit value in the 128-bit integer vector operand 3005 /// by the specified number of bits. High-order bits are filled with the sign 3006 /// bit of the initial value. 3007 /// 3008 /// \headerfile <x86intrin.h> 3009 /// 3010 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction. 3011 /// 3012 /// \param __a 3013 /// A 128-bit integer vector containing the source operand. 3014 /// \param __count 3015 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 3016 /// to right-shift each value in operand \a __a. 3017 /// \returns A 128-bit integer vector containing the right-shifted values. 3018 static __inline__ __m128i __DEFAULT_FN_ATTRS 3019 _mm_sra_epi32(__m128i __a, __m128i __count) 3020 { 3021 return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count); 3022 } 3023 3024 /// Right-shifts the 128-bit integer vector operand by the specified 3025 /// number of bytes. High-order bits are cleared. 3026 /// 3027 /// \headerfile <x86intrin.h> 3028 /// 3029 /// \code 3030 /// __m128i _mm_srli_si128(__m128i a, const int imm); 3031 /// \endcode 3032 /// 3033 /// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction. 3034 /// 3035 /// \param a 3036 /// A 128-bit integer vector containing the source operand. 3037 /// \param imm 3038 /// An immediate value specifying the number of bytes to right-shift operand 3039 /// \a a. 3040 /// \returns A 128-bit integer vector containing the right-shifted value. 
3041 #define _mm_srli_si128(a, imm) \ 3042 ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))) 3043 3044 #define _mm_bsrli_si128(a, imm) \ 3045 ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))) 3046 3047 /// Right-shifts each of 16-bit values in the 128-bit integer vector 3048 /// operand by the specified number of bits. High-order bits are cleared. 3049 /// 3050 /// \headerfile <x86intrin.h> 3051 /// 3052 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction. 3053 /// 3054 /// \param __a 3055 /// A 128-bit integer vector containing the source operand. 3056 /// \param __count 3057 /// An integer value specifying the number of bits to right-shift each value 3058 /// in operand \a __a. 3059 /// \returns A 128-bit integer vector containing the right-shifted values. 3060 static __inline__ __m128i __DEFAULT_FN_ATTRS 3061 _mm_srli_epi16(__m128i __a, int __count) 3062 { 3063 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count); 3064 } 3065 3066 /// Right-shifts each of 16-bit values in the 128-bit integer vector 3067 /// operand by the specified number of bits. High-order bits are cleared. 3068 /// 3069 /// \headerfile <x86intrin.h> 3070 /// 3071 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction. 3072 /// 3073 /// \param __a 3074 /// A 128-bit integer vector containing the source operand. 3075 /// \param __count 3076 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 3077 /// to right-shift each value in operand \a __a. 3078 /// \returns A 128-bit integer vector containing the right-shifted values. 3079 static __inline__ __m128i __DEFAULT_FN_ATTRS 3080 _mm_srl_epi16(__m128i __a, __m128i __count) 3081 { 3082 return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count); 3083 } 3084 3085 /// Right-shifts each of 32-bit values in the 128-bit integer vector 3086 /// operand by the specified number of bits. 
High-order bits are cleared. 3087 /// 3088 /// \headerfile <x86intrin.h> 3089 /// 3090 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction. 3091 /// 3092 /// \param __a 3093 /// A 128-bit integer vector containing the source operand. 3094 /// \param __count 3095 /// An integer value specifying the number of bits to right-shift each value 3096 /// in operand \a __a. 3097 /// \returns A 128-bit integer vector containing the right-shifted values. 3098 static __inline__ __m128i __DEFAULT_FN_ATTRS 3099 _mm_srli_epi32(__m128i __a, int __count) 3100 { 3101 return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count); 3102 } 3103 3104 /// Right-shifts each of 32-bit values in the 128-bit integer vector 3105 /// operand by the specified number of bits. High-order bits are cleared. 3106 /// 3107 /// \headerfile <x86intrin.h> 3108 /// 3109 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction. 3110 /// 3111 /// \param __a 3112 /// A 128-bit integer vector containing the source operand. 3113 /// \param __count 3114 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 3115 /// to right-shift each value in operand \a __a. 3116 /// \returns A 128-bit integer vector containing the right-shifted values. 3117 static __inline__ __m128i __DEFAULT_FN_ATTRS 3118 _mm_srl_epi32(__m128i __a, __m128i __count) 3119 { 3120 return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count); 3121 } 3122 3123 /// Right-shifts each of 64-bit values in the 128-bit integer vector 3124 /// operand by the specified number of bits. High-order bits are cleared. 3125 /// 3126 /// \headerfile <x86intrin.h> 3127 /// 3128 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction. 3129 /// 3130 /// \param __a 3131 /// A 128-bit integer vector containing the source operand. 3132 /// \param __count 3133 /// An integer value specifying the number of bits to right-shift each value 3134 /// in operand \a __a. 
3135 /// \returns A 128-bit integer vector containing the right-shifted values. 3136 static __inline__ __m128i __DEFAULT_FN_ATTRS 3137 _mm_srli_epi64(__m128i __a, int __count) 3138 { 3139 return __builtin_ia32_psrlqi128((__v2di)__a, __count); 3140 } 3141 3142 /// Right-shifts each of 64-bit values in the 128-bit integer vector 3143 /// operand by the specified number of bits. High-order bits are cleared. 3144 /// 3145 /// \headerfile <x86intrin.h> 3146 /// 3147 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction. 3148 /// 3149 /// \param __a 3150 /// A 128-bit integer vector containing the source operand. 3151 /// \param __count 3152 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 3153 /// to right-shift each value in operand \a __a. 3154 /// \returns A 128-bit integer vector containing the right-shifted values. 3155 static __inline__ __m128i __DEFAULT_FN_ATTRS 3156 _mm_srl_epi64(__m128i __a, __m128i __count) 3157 { 3158 return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count); 3159 } 3160 3161 /// Compares each of the corresponding 8-bit values of the 128-bit 3162 /// integer vectors for equality. Each comparison yields 0x0 for false, 0xFF 3163 /// for true. 3164 /// 3165 /// \headerfile <x86intrin.h> 3166 /// 3167 /// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction. 3168 /// 3169 /// \param __a 3170 /// A 128-bit integer vector. 3171 /// \param __b 3172 /// A 128-bit integer vector. 3173 /// \returns A 128-bit integer vector containing the comparison results. 3174 static __inline__ __m128i __DEFAULT_FN_ATTRS 3175 _mm_cmpeq_epi8(__m128i __a, __m128i __b) 3176 { 3177 return (__m128i)((__v16qi)__a == (__v16qi)__b); 3178 } 3179 3180 /// Compares each of the corresponding 16-bit values of the 128-bit 3181 /// integer vectors for equality. Each comparison yields 0x0 for false, 3182 /// 0xFFFF for true. 
3183 /// 3184 /// \headerfile <x86intrin.h> 3185 /// 3186 /// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction. 3187 /// 3188 /// \param __a 3189 /// A 128-bit integer vector. 3190 /// \param __b 3191 /// A 128-bit integer vector. 3192 /// \returns A 128-bit integer vector containing the comparison results. 3193 static __inline__ __m128i __DEFAULT_FN_ATTRS 3194 _mm_cmpeq_epi16(__m128i __a, __m128i __b) 3195 { 3196 return (__m128i)((__v8hi)__a == (__v8hi)__b); 3197 } 3198 3199 /// Compares each of the corresponding 32-bit values of the 128-bit 3200 /// integer vectors for equality. Each comparison yields 0x0 for false, 3201 /// 0xFFFFFFFF for true. 3202 /// 3203 /// \headerfile <x86intrin.h> 3204 /// 3205 /// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction. 3206 /// 3207 /// \param __a 3208 /// A 128-bit integer vector. 3209 /// \param __b 3210 /// A 128-bit integer vector. 3211 /// \returns A 128-bit integer vector containing the comparison results. 3212 static __inline__ __m128i __DEFAULT_FN_ATTRS 3213 _mm_cmpeq_epi32(__m128i __a, __m128i __b) 3214 { 3215 return (__m128i)((__v4si)__a == (__v4si)__b); 3216 } 3217 3218 /// Compares each of the corresponding signed 8-bit values of the 128-bit 3219 /// integer vectors to determine if the values in the first operand are 3220 /// greater than those in the second operand. Each comparison yields 0x0 for 3221 /// false, 0xFF for true. 3222 /// 3223 /// \headerfile <x86intrin.h> 3224 /// 3225 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction. 3226 /// 3227 /// \param __a 3228 /// A 128-bit integer vector. 3229 /// \param __b 3230 /// A 128-bit integer vector. 3231 /// \returns A 128-bit integer vector containing the comparison results. 
3232 static __inline__ __m128i __DEFAULT_FN_ATTRS 3233 _mm_cmpgt_epi8(__m128i __a, __m128i __b) 3234 { 3235 /* This function always performs a signed comparison, but __v16qi is a char 3236 which may be signed or unsigned, so use __v16qs. */ 3237 return (__m128i)((__v16qs)__a > (__v16qs)__b); 3238 } 3239 3240 /// Compares each of the corresponding signed 16-bit values of the 3241 /// 128-bit integer vectors to determine if the values in the first operand 3242 /// are greater than those in the second operand. 3243 /// 3244 /// Each comparison yields 0x0 for false, 0xFFFF for true. 3245 /// 3246 /// \headerfile <x86intrin.h> 3247 /// 3248 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction. 3249 /// 3250 /// \param __a 3251 /// A 128-bit integer vector. 3252 /// \param __b 3253 /// A 128-bit integer vector. 3254 /// \returns A 128-bit integer vector containing the comparison results. 3255 static __inline__ __m128i __DEFAULT_FN_ATTRS 3256 _mm_cmpgt_epi16(__m128i __a, __m128i __b) 3257 { 3258 return (__m128i)((__v8hi)__a > (__v8hi)__b); 3259 } 3260 3261 /// Compares each of the corresponding signed 32-bit values of the 3262 /// 128-bit integer vectors to determine if the values in the first operand 3263 /// are greater than those in the second operand. 3264 /// 3265 /// Each comparison yields 0x0 for false, 0xFFFFFFFF for true. 3266 /// 3267 /// \headerfile <x86intrin.h> 3268 /// 3269 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction. 3270 /// 3271 /// \param __a 3272 /// A 128-bit integer vector. 3273 /// \param __b 3274 /// A 128-bit integer vector. 3275 /// \returns A 128-bit integer vector containing the comparison results. 
3276 static __inline__ __m128i __DEFAULT_FN_ATTRS 3277 _mm_cmpgt_epi32(__m128i __a, __m128i __b) 3278 { 3279 return (__m128i)((__v4si)__a > (__v4si)__b); 3280 } 3281 3282 /// Compares each of the corresponding signed 8-bit values of the 128-bit 3283 /// integer vectors to determine if the values in the first operand are less 3284 /// than those in the second operand. 3285 /// 3286 /// Each comparison yields 0x0 for false, 0xFF for true. 3287 /// 3288 /// \headerfile <x86intrin.h> 3289 /// 3290 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction. 3291 /// 3292 /// \param __a 3293 /// A 128-bit integer vector. 3294 /// \param __b 3295 /// A 128-bit integer vector. 3296 /// \returns A 128-bit integer vector containing the comparison results. 3297 static __inline__ __m128i __DEFAULT_FN_ATTRS 3298 _mm_cmplt_epi8(__m128i __a, __m128i __b) 3299 { 3300 return _mm_cmpgt_epi8(__b, __a); 3301 } 3302 3303 /// Compares each of the corresponding signed 16-bit values of the 3304 /// 128-bit integer vectors to determine if the values in the first operand 3305 /// are less than those in the second operand. 3306 /// 3307 /// Each comparison yields 0x0 for false, 0xFFFF for true. 3308 /// 3309 /// \headerfile <x86intrin.h> 3310 /// 3311 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction. 3312 /// 3313 /// \param __a 3314 /// A 128-bit integer vector. 3315 /// \param __b 3316 /// A 128-bit integer vector. 3317 /// \returns A 128-bit integer vector containing the comparison results. 3318 static __inline__ __m128i __DEFAULT_FN_ATTRS 3319 _mm_cmplt_epi16(__m128i __a, __m128i __b) 3320 { 3321 return _mm_cmpgt_epi16(__b, __a); 3322 } 3323 3324 /// Compares each of the corresponding signed 32-bit values of the 3325 /// 128-bit integer vectors to determine if the values in the first operand 3326 /// are less than those in the second operand. 3327 /// 3328 /// Each comparison yields 0x0 for false, 0xFFFFFFFF for true. 
3329 /// 3330 /// \headerfile <x86intrin.h> 3331 /// 3332 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction. 3333 /// 3334 /// \param __a 3335 /// A 128-bit integer vector. 3336 /// \param __b 3337 /// A 128-bit integer vector. 3338 /// \returns A 128-bit integer vector containing the comparison results. 3339 static __inline__ __m128i __DEFAULT_FN_ATTRS 3340 _mm_cmplt_epi32(__m128i __a, __m128i __b) 3341 { 3342 return _mm_cmpgt_epi32(__b, __a); 3343 } 3344 3345 #ifdef __x86_64__ 3346 /// Converts a 64-bit signed integer value from the second operand into a 3347 /// double-precision value and returns it in the lower element of a [2 x 3348 /// double] vector; the upper element of the returned vector is copied from 3349 /// the upper element of the first operand. 3350 /// 3351 /// \headerfile <x86intrin.h> 3352 /// 3353 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction. 3354 /// 3355 /// \param __a 3356 /// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are 3357 /// copied to the upper 64 bits of the destination. 3358 /// \param __b 3359 /// A 64-bit signed integer operand containing the value to be converted. 3360 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 3361 /// converted value of the second operand. The upper 64 bits are copied from 3362 /// the upper 64 bits of the first operand. 3363 static __inline__ __m128d __DEFAULT_FN_ATTRS 3364 _mm_cvtsi64_sd(__m128d __a, long long __b) 3365 { 3366 __a[0] = __b; 3367 return __a; 3368 } 3369 3370 /// Converts the first (lower) element of a vector of [2 x double] into a 3371 /// 64-bit signed integer value, according to the current rounding mode. 3372 /// 3373 /// \headerfile <x86intrin.h> 3374 /// 3375 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction. 3376 /// 3377 /// \param __a 3378 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 3379 /// conversion. 
3380 /// \returns A 64-bit signed integer containing the converted value. 3381 static __inline__ long long __DEFAULT_FN_ATTRS 3382 _mm_cvtsd_si64(__m128d __a) 3383 { 3384 return __builtin_ia32_cvtsd2si64((__v2df)__a); 3385 } 3386 3387 /// Converts the first (lower) element of a vector of [2 x double] into a 3388 /// 64-bit signed integer value, truncating the result when it is inexact. 3389 /// 3390 /// \headerfile <x86intrin.h> 3391 /// 3392 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c> 3393 /// instruction. 3394 /// 3395 /// \param __a 3396 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 3397 /// conversion. 3398 /// \returns A 64-bit signed integer containing the converted value. 3399 static __inline__ long long __DEFAULT_FN_ATTRS 3400 _mm_cvttsd_si64(__m128d __a) 3401 { 3402 return __builtin_ia32_cvttsd2si64((__v2df)__a); 3403 } 3404 #endif 3405 3406 /// Converts a vector of [4 x i32] into a vector of [4 x float]. 3407 /// 3408 /// \headerfile <x86intrin.h> 3409 /// 3410 /// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction. 3411 /// 3412 /// \param __a 3413 /// A 128-bit integer vector. 3414 /// \returns A 128-bit vector of [4 x float] containing the converted values. 3415 static __inline__ __m128 __DEFAULT_FN_ATTRS 3416 _mm_cvtepi32_ps(__m128i __a) 3417 { 3418 return (__m128)__builtin_convertvector((__v4si)__a, __v4sf); 3419 } 3420 3421 /// Converts a vector of [4 x float] into a vector of [4 x i32]. 3422 /// 3423 /// \headerfile <x86intrin.h> 3424 /// 3425 /// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction. 3426 /// 3427 /// \param __a 3428 /// A 128-bit vector of [4 x float]. 3429 /// \returns A 128-bit integer vector of [4 x i32] containing the converted 3430 /// values. 
3431 static __inline__ __m128i __DEFAULT_FN_ATTRS 3432 _mm_cvtps_epi32(__m128 __a) 3433 { 3434 return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a); 3435 } 3436 3437 /// Converts a vector of [4 x float] into a vector of [4 x i32], 3438 /// truncating the result when it is inexact. 3439 /// 3440 /// \headerfile <x86intrin.h> 3441 /// 3442 /// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c> 3443 /// instruction. 3444 /// 3445 /// \param __a 3446 /// A 128-bit vector of [4 x float]. 3447 /// \returns A 128-bit vector of [4 x i32] containing the converted values. 3448 static __inline__ __m128i __DEFAULT_FN_ATTRS 3449 _mm_cvttps_epi32(__m128 __a) 3450 { 3451 return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a); 3452 } 3453 3454 /// Returns a vector of [4 x i32] where the lowest element is the input 3455 /// operand and the remaining elements are zero. 3456 /// 3457 /// \headerfile <x86intrin.h> 3458 /// 3459 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. 3460 /// 3461 /// \param __a 3462 /// A 32-bit signed integer operand. 3463 /// \returns A 128-bit vector of [4 x i32]. 3464 static __inline__ __m128i __DEFAULT_FN_ATTRS 3465 _mm_cvtsi32_si128(int __a) 3466 { 3467 return __extension__ (__m128i)(__v4si){ __a, 0, 0, 0 }; 3468 } 3469 3470 #ifdef __x86_64__ 3471 /// Returns a vector of [2 x i64] where the lower element is the input 3472 /// operand and the upper element is zero. 3473 /// 3474 /// \headerfile <x86intrin.h> 3475 /// 3476 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 3477 /// 3478 /// \param __a 3479 /// A 64-bit signed integer operand containing the value to be converted. 3480 /// \returns A 128-bit vector of [2 x i64] containing the converted value. 
3481 static __inline__ __m128i __DEFAULT_FN_ATTRS 3482 _mm_cvtsi64_si128(long long __a) 3483 { 3484 return __extension__ (__m128i)(__v2di){ __a, 0 }; 3485 } 3486 #endif 3487 3488 /// Moves the least significant 32 bits of a vector of [4 x i32] to a 3489 /// 32-bit signed integer value. 3490 /// 3491 /// \headerfile <x86intrin.h> 3492 /// 3493 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. 3494 /// 3495 /// \param __a 3496 /// A vector of [4 x i32]. The least significant 32 bits are moved to the 3497 /// destination. 3498 /// \returns A 32-bit signed integer containing the moved value. 3499 static __inline__ int __DEFAULT_FN_ATTRS 3500 _mm_cvtsi128_si32(__m128i __a) 3501 { 3502 __v4si __b = (__v4si)__a; 3503 return __b[0]; 3504 } 3505 3506 #ifdef __x86_64__ 3507 /// Moves the least significant 64 bits of a vector of [2 x i64] to a 3508 /// 64-bit signed integer value. 3509 /// 3510 /// \headerfile <x86intrin.h> 3511 /// 3512 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 3513 /// 3514 /// \param __a 3515 /// A vector of [2 x i64]. The least significant 64 bits are moved to the 3516 /// destination. 3517 /// \returns A 64-bit signed integer containing the moved value. 3518 static __inline__ long long __DEFAULT_FN_ATTRS 3519 _mm_cvtsi128_si64(__m128i __a) 3520 { 3521 return __a[0]; 3522 } 3523 #endif 3524 3525 /// Moves packed integer values from an aligned 128-bit memory location 3526 /// to elements in a 128-bit integer vector. 3527 /// 3528 /// \headerfile <x86intrin.h> 3529 /// 3530 /// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction. 3531 /// 3532 /// \param __p 3533 /// An aligned pointer to a memory location containing integer values. 3534 /// \returns A 128-bit integer vector containing the moved values. 
3535 static __inline__ __m128i __DEFAULT_FN_ATTRS 3536 _mm_load_si128(__m128i const *__p) 3537 { 3538 return *__p; 3539 } 3540 3541 /// Moves packed integer values from an unaligned 128-bit memory location 3542 /// to elements in a 128-bit integer vector. 3543 /// 3544 /// \headerfile <x86intrin.h> 3545 /// 3546 /// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction. 3547 /// 3548 /// \param __p 3549 /// A pointer to a memory location containing integer values. 3550 /// \returns A 128-bit integer vector containing the moved values. 3551 static __inline__ __m128i __DEFAULT_FN_ATTRS 3552 _mm_loadu_si128(__m128i_u const *__p) 3553 { 3554 struct __loadu_si128 { 3555 __m128i_u __v; 3556 } __attribute__((__packed__, __may_alias__)); 3557 return ((const struct __loadu_si128*)__p)->__v; 3558 } 3559 3560 /// Returns a vector of [2 x i64] where the lower element is taken from 3561 /// the lower element of the operand, and the upper element is zero. 3562 /// 3563 /// \headerfile <x86intrin.h> 3564 /// 3565 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 3566 /// 3567 /// \param __p 3568 /// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of 3569 /// the destination. 3570 /// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the 3571 /// moved value. The higher order bits are cleared. 3572 static __inline__ __m128i __DEFAULT_FN_ATTRS 3573 _mm_loadl_epi64(__m128i_u const *__p) 3574 { 3575 struct __mm_loadl_epi64_struct { 3576 long long __u; 3577 } __attribute__((__packed__, __may_alias__)); 3578 return __extension__ (__m128i) { ((const struct __mm_loadl_epi64_struct*)__p)->__u, 0}; 3579 } 3580 3581 /// Generates a 128-bit vector of [4 x i32] with unspecified content. 3582 /// This could be used as an argument to another intrinsic function where the 3583 /// argument is required but the value is not actually used. 
3584 /// 3585 /// \headerfile <x86intrin.h> 3586 /// 3587 /// This intrinsic has no corresponding instruction. 3588 /// 3589 /// \returns A 128-bit vector of [4 x i32] with unspecified content. 3590 static __inline__ __m128i __DEFAULT_FN_ATTRS 3591 _mm_undefined_si128(void) 3592 { 3593 return (__m128i)__builtin_ia32_undef128(); 3594 } 3595 3596 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with 3597 /// the specified 64-bit integer values. 3598 /// 3599 /// \headerfile <x86intrin.h> 3600 /// 3601 /// This intrinsic is a utility function and does not correspond to a specific 3602 /// instruction. 3603 /// 3604 /// \param __q1 3605 /// A 64-bit integer value used to initialize the upper 64 bits of the 3606 /// destination vector of [2 x i64]. 3607 /// \param __q0 3608 /// A 64-bit integer value used to initialize the lower 64 bits of the 3609 /// destination vector of [2 x i64]. 3610 /// \returns An initialized 128-bit vector of [2 x i64] containing the values 3611 /// provided in the operands. 3612 static __inline__ __m128i __DEFAULT_FN_ATTRS 3613 _mm_set_epi64x(long long __q1, long long __q0) 3614 { 3615 return __extension__ (__m128i)(__v2di){ __q0, __q1 }; 3616 } 3617 3618 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with 3619 /// the specified 64-bit integer values. 3620 /// 3621 /// \headerfile <x86intrin.h> 3622 /// 3623 /// This intrinsic is a utility function and does not correspond to a specific 3624 /// instruction. 3625 /// 3626 /// \param __q1 3627 /// A 64-bit integer value used to initialize the upper 64 bits of the 3628 /// destination vector of [2 x i64]. 3629 /// \param __q0 3630 /// A 64-bit integer value used to initialize the lower 64 bits of the 3631 /// destination vector of [2 x i64]. 3632 /// \returns An initialized 128-bit vector of [2 x i64] containing the values 3633 /// provided in the operands. 
3634 static __inline__ __m128i __DEFAULT_FN_ATTRS 3635 _mm_set_epi64(__m64 __q1, __m64 __q0) 3636 { 3637 return _mm_set_epi64x((long long)__q1, (long long)__q0); 3638 } 3639 3640 /// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with 3641 /// the specified 32-bit integer values. 3642 /// 3643 /// \headerfile <x86intrin.h> 3644 /// 3645 /// This intrinsic is a utility function and does not correspond to a specific 3646 /// instruction. 3647 /// 3648 /// \param __i3 3649 /// A 32-bit integer value used to initialize bits [127:96] of the 3650 /// destination vector. 3651 /// \param __i2 3652 /// A 32-bit integer value used to initialize bits [95:64] of the destination 3653 /// vector. 3654 /// \param __i1 3655 /// A 32-bit integer value used to initialize bits [63:32] of the destination 3656 /// vector. 3657 /// \param __i0 3658 /// A 32-bit integer value used to initialize bits [31:0] of the destination 3659 /// vector. 3660 /// \returns An initialized 128-bit vector of [4 x i32] containing the values 3661 /// provided in the operands. 3662 static __inline__ __m128i __DEFAULT_FN_ATTRS 3663 _mm_set_epi32(int __i3, int __i2, int __i1, int __i0) 3664 { 3665 return __extension__ (__m128i)(__v4si){ __i0, __i1, __i2, __i3}; 3666 } 3667 3668 /// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with 3669 /// the specified 16-bit integer values. 3670 /// 3671 /// \headerfile <x86intrin.h> 3672 /// 3673 /// This intrinsic is a utility function and does not correspond to a specific 3674 /// instruction. 3675 /// 3676 /// \param __w7 3677 /// A 16-bit integer value used to initialize bits [127:112] of the 3678 /// destination vector. 3679 /// \param __w6 3680 /// A 16-bit integer value used to initialize bits [111:96] of the 3681 /// destination vector. 3682 /// \param __w5 3683 /// A 16-bit integer value used to initialize bits [95:80] of the destination 3684 /// vector. 
3685 /// \param __w4 3686 /// A 16-bit integer value used to initialize bits [79:64] of the destination 3687 /// vector. 3688 /// \param __w3 3689 /// A 16-bit integer value used to initialize bits [63:48] of the destination 3690 /// vector. 3691 /// \param __w2 3692 /// A 16-bit integer value used to initialize bits [47:32] of the destination 3693 /// vector. 3694 /// \param __w1 3695 /// A 16-bit integer value used to initialize bits [31:16] of the destination 3696 /// vector. 3697 /// \param __w0 3698 /// A 16-bit integer value used to initialize bits [15:0] of the destination 3699 /// vector. 3700 /// \returns An initialized 128-bit vector of [8 x i16] containing the values 3701 /// provided in the operands. 3702 static __inline__ __m128i __DEFAULT_FN_ATTRS 3703 _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0) 3704 { 3705 return __extension__ (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 }; 3706 } 3707 3708 /// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with 3709 /// the specified 8-bit integer values. 3710 /// 3711 /// \headerfile <x86intrin.h> 3712 /// 3713 /// This intrinsic is a utility function and does not correspond to a specific 3714 /// instruction. 3715 /// 3716 /// \param __b15 3717 /// Initializes bits [127:120] of the destination vector. 3718 /// \param __b14 3719 /// Initializes bits [119:112] of the destination vector. 3720 /// \param __b13 3721 /// Initializes bits [111:104] of the destination vector. 3722 /// \param __b12 3723 /// Initializes bits [103:96] of the destination vector. 3724 /// \param __b11 3725 /// Initializes bits [95:88] of the destination vector. 3726 /// \param __b10 3727 /// Initializes bits [87:80] of the destination vector. 3728 /// \param __b9 3729 /// Initializes bits [79:72] of the destination vector. 3730 /// \param __b8 3731 /// Initializes bits [71:64] of the destination vector. 
3732 /// \param __b7 3733 /// Initializes bits [63:56] of the destination vector. 3734 /// \param __b6 3735 /// Initializes bits [55:48] of the destination vector. 3736 /// \param __b5 3737 /// Initializes bits [47:40] of the destination vector. 3738 /// \param __b4 3739 /// Initializes bits [39:32] of the destination vector. 3740 /// \param __b3 3741 /// Initializes bits [31:24] of the destination vector. 3742 /// \param __b2 3743 /// Initializes bits [23:16] of the destination vector. 3744 /// \param __b1 3745 /// Initializes bits [15:8] of the destination vector. 3746 /// \param __b0 3747 /// Initializes bits [7:0] of the destination vector. 3748 /// \returns An initialized 128-bit vector of [16 x i8] containing the values 3749 /// provided in the operands. 3750 static __inline__ __m128i __DEFAULT_FN_ATTRS 3751 _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0) 3752 { 3753 return __extension__ (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 }; 3754 } 3755 3756 /// Initializes both values in a 128-bit integer vector with the 3757 /// specified 64-bit integer value. 3758 /// 3759 /// \headerfile <x86intrin.h> 3760 /// 3761 /// This intrinsic is a utility function and does not correspond to a specific 3762 /// instruction. 3763 /// 3764 /// \param __q 3765 /// Integer value used to initialize the elements of the destination integer 3766 /// vector. 3767 /// \returns An initialized 128-bit integer vector of [2 x i64] with both 3768 /// elements containing the value provided in the operand. 3769 static __inline__ __m128i __DEFAULT_FN_ATTRS 3770 _mm_set1_epi64x(long long __q) 3771 { 3772 return _mm_set_epi64x(__q, __q); 3773 } 3774 3775 /// Initializes both values in a 128-bit vector of [2 x i64] with the 3776 /// specified 64-bit value. 
3777 /// 3778 /// \headerfile <x86intrin.h> 3779 /// 3780 /// This intrinsic is a utility function and does not correspond to a specific 3781 /// instruction. 3782 /// 3783 /// \param __q 3784 /// A 64-bit value used to initialize the elements of the destination integer 3785 /// vector. 3786 /// \returns An initialized 128-bit vector of [2 x i64] with all elements 3787 /// containing the value provided in the operand. 3788 static __inline__ __m128i __DEFAULT_FN_ATTRS 3789 _mm_set1_epi64(__m64 __q) 3790 { 3791 return _mm_set_epi64(__q, __q); 3792 } 3793 3794 /// Initializes all values in a 128-bit vector of [4 x i32] with the 3795 /// specified 32-bit value. 3796 /// 3797 /// \headerfile <x86intrin.h> 3798 /// 3799 /// This intrinsic is a utility function and does not correspond to a specific 3800 /// instruction. 3801 /// 3802 /// \param __i 3803 /// A 32-bit value used to initialize the elements of the destination integer 3804 /// vector. 3805 /// \returns An initialized 128-bit vector of [4 x i32] with all elements 3806 /// containing the value provided in the operand. 3807 static __inline__ __m128i __DEFAULT_FN_ATTRS 3808 _mm_set1_epi32(int __i) 3809 { 3810 return _mm_set_epi32(__i, __i, __i, __i); 3811 } 3812 3813 /// Initializes all values in a 128-bit vector of [8 x i16] with the 3814 /// specified 16-bit value. 3815 /// 3816 /// \headerfile <x86intrin.h> 3817 /// 3818 /// This intrinsic is a utility function and does not correspond to a specific 3819 /// instruction. 3820 /// 3821 /// \param __w 3822 /// A 16-bit value used to initialize the elements of the destination integer 3823 /// vector. 3824 /// \returns An initialized 128-bit vector of [8 x i16] with all elements 3825 /// containing the value provided in the operand. 
3826 static __inline__ __m128i __DEFAULT_FN_ATTRS 3827 _mm_set1_epi16(short __w) 3828 { 3829 return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w); 3830 } 3831 3832 /// Initializes all values in a 128-bit vector of [16 x i8] with the 3833 /// specified 8-bit value. 3834 /// 3835 /// \headerfile <x86intrin.h> 3836 /// 3837 /// This intrinsic is a utility function and does not correspond to a specific 3838 /// instruction. 3839 /// 3840 /// \param __b 3841 /// An 8-bit value used to initialize the elements of the destination integer 3842 /// vector. 3843 /// \returns An initialized 128-bit vector of [16 x i8] with all elements 3844 /// containing the value provided in the operand. 3845 static __inline__ __m128i __DEFAULT_FN_ATTRS 3846 _mm_set1_epi8(char __b) 3847 { 3848 return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b); 3849 } 3850 3851 /// Constructs a 128-bit integer vector, initialized in reverse order 3852 /// with the specified 64-bit integral values. 3853 /// 3854 /// \headerfile <x86intrin.h> 3855 /// 3856 /// This intrinsic does not correspond to a specific instruction. 3857 /// 3858 /// \param __q0 3859 /// A 64-bit integral value used to initialize the lower 64 bits of the 3860 /// result. 3861 /// \param __q1 3862 /// A 64-bit integral value used to initialize the upper 64 bits of the 3863 /// result. 3864 /// \returns An initialized 128-bit integer vector. 3865 static __inline__ __m128i __DEFAULT_FN_ATTRS 3866 _mm_setr_epi64(__m64 __q0, __m64 __q1) 3867 { 3868 return _mm_set_epi64(__q1, __q0); 3869 } 3870 3871 /// Constructs a 128-bit integer vector, initialized in reverse order 3872 /// with the specified 32-bit integral values. 3873 /// 3874 /// \headerfile <x86intrin.h> 3875 /// 3876 /// This intrinsic is a utility function and does not correspond to a specific 3877 /// instruction. 3878 /// 3879 /// \param __i0 3880 /// A 32-bit integral value used to initialize bits [31:0] of the result. 
3881 /// \param __i1 3882 /// A 32-bit integral value used to initialize bits [63:32] of the result. 3883 /// \param __i2 3884 /// A 32-bit integral value used to initialize bits [95:64] of the result. 3885 /// \param __i3 3886 /// A 32-bit integral value used to initialize bits [127:96] of the result. 3887 /// \returns An initialized 128-bit integer vector. 3888 static __inline__ __m128i __DEFAULT_FN_ATTRS 3889 _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3) 3890 { 3891 return _mm_set_epi32(__i3, __i2, __i1, __i0); 3892 } 3893 3894 /// Constructs a 128-bit integer vector, initialized in reverse order 3895 /// with the specified 16-bit integral values. 3896 /// 3897 /// \headerfile <x86intrin.h> 3898 /// 3899 /// This intrinsic is a utility function and does not correspond to a specific 3900 /// instruction. 3901 /// 3902 /// \param __w0 3903 /// A 16-bit integral value used to initialize bits [15:0] of the result. 3904 /// \param __w1 3905 /// A 16-bit integral value used to initialize bits [31:16] of the result. 3906 /// \param __w2 3907 /// A 16-bit integral value used to initialize bits [47:32] of the result. 3908 /// \param __w3 3909 /// A 16-bit integral value used to initialize bits [63:48] of the result. 3910 /// \param __w4 3911 /// A 16-bit integral value used to initialize bits [79:64] of the result. 3912 /// \param __w5 3913 /// A 16-bit integral value used to initialize bits [95:80] of the result. 3914 /// \param __w6 3915 /// A 16-bit integral value used to initialize bits [111:96] of the result. 3916 /// \param __w7 3917 /// A 16-bit integral value used to initialize bits [127:112] of the result. 3918 /// \returns An initialized 128-bit integer vector. 
3919 static __inline__ __m128i __DEFAULT_FN_ATTRS 3920 _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7) 3921 { 3922 return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0); 3923 } 3924 3925 /// Constructs a 128-bit integer vector, initialized in reverse order 3926 /// with the specified 8-bit integral values. 3927 /// 3928 /// \headerfile <x86intrin.h> 3929 /// 3930 /// This intrinsic is a utility function and does not correspond to a specific 3931 /// instruction. 3932 /// 3933 /// \param __b0 3934 /// An 8-bit integral value used to initialize bits [7:0] of the result. 3935 /// \param __b1 3936 /// An 8-bit integral value used to initialize bits [15:8] of the result. 3937 /// \param __b2 3938 /// An 8-bit integral value used to initialize bits [23:16] of the result. 3939 /// \param __b3 3940 /// An 8-bit integral value used to initialize bits [31:24] of the result. 3941 /// \param __b4 3942 /// An 8-bit integral value used to initialize bits [39:32] of the result. 3943 /// \param __b5 3944 /// An 8-bit integral value used to initialize bits [47:40] of the result. 3945 /// \param __b6 3946 /// An 8-bit integral value used to initialize bits [55:48] of the result. 3947 /// \param __b7 3948 /// An 8-bit integral value used to initialize bits [63:56] of the result. 3949 /// \param __b8 3950 /// An 8-bit integral value used to initialize bits [71:64] of the result. 3951 /// \param __b9 3952 /// An 8-bit integral value used to initialize bits [79:72] of the result. 3953 /// \param __b10 3954 /// An 8-bit integral value used to initialize bits [87:80] of the result. 3955 /// \param __b11 3956 /// An 8-bit integral value used to initialize bits [95:88] of the result. 3957 /// \param __b12 3958 /// An 8-bit integral value used to initialize bits [103:96] of the result. 3959 /// \param __b13 3960 /// An 8-bit integral value used to initialize bits [111:104] of the result. 
3961 /// \param __b14 3962 /// An 8-bit integral value used to initialize bits [119:112] of the result. 3963 /// \param __b15 3964 /// An 8-bit integral value used to initialize bits [127:120] of the result. 3965 /// \returns An initialized 128-bit integer vector. 3966 static __inline__ __m128i __DEFAULT_FN_ATTRS 3967 _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15) 3968 { 3969 return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8, __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0); 3970 } 3971 3972 /// Creates a 128-bit integer vector initialized to zero. 3973 /// 3974 /// \headerfile <x86intrin.h> 3975 /// 3976 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction. 3977 /// 3978 /// \returns An initialized 128-bit integer vector with all elements set to 3979 /// zero. 3980 static __inline__ __m128i __DEFAULT_FN_ATTRS 3981 _mm_setzero_si128(void) 3982 { 3983 return __extension__ (__m128i)(__v2di){ 0LL, 0LL }; 3984 } 3985 3986 /// Stores a 128-bit integer vector to a memory location aligned on a 3987 /// 128-bit boundary. 3988 /// 3989 /// \headerfile <x86intrin.h> 3990 /// 3991 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction. 3992 /// 3993 /// \param __p 3994 /// A pointer to an aligned memory location that will receive the integer 3995 /// values. 3996 /// \param __b 3997 /// A 128-bit integer vector containing the values to be moved. 3998 static __inline__ void __DEFAULT_FN_ATTRS 3999 _mm_store_si128(__m128i *__p, __m128i __b) 4000 { 4001 *__p = __b; 4002 } 4003 4004 /// Stores a 128-bit integer vector to an unaligned memory location. 4005 /// 4006 /// \headerfile <x86intrin.h> 4007 /// 4008 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction. 
4009 /// 4010 /// \param __p 4011 /// A pointer to a memory location that will receive the integer values. 4012 /// \param __b 4013 /// A 128-bit integer vector containing the values to be moved. 4014 static __inline__ void __DEFAULT_FN_ATTRS 4015 _mm_storeu_si128(__m128i_u *__p, __m128i __b) 4016 { 4017 struct __storeu_si128 { 4018 __m128i_u __v; 4019 } __attribute__((__packed__, __may_alias__)); 4020 ((struct __storeu_si128*)__p)->__v = __b; 4021 } 4022 4023 /// Stores a 64-bit integer value from the low element of a 128-bit integer 4024 /// vector. 4025 /// 4026 /// \headerfile <x86intrin.h> 4027 /// 4028 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 4029 /// 4030 /// \param __p 4031 /// A pointer to a 64-bit memory location. The address of the memory 4032 /// location does not have to be aligned. 4033 /// \param __b 4034 /// A 128-bit integer vector containing the value to be stored. 4035 static __inline__ void __DEFAULT_FN_ATTRS 4036 _mm_storeu_si64(void *__p, __m128i __b) 4037 { 4038 struct __storeu_si64 { 4039 long long __v; 4040 } __attribute__((__packed__, __may_alias__)); 4041 ((struct __storeu_si64*)__p)->__v = ((__v2di)__b)[0]; 4042 } 4043 4044 /// Stores a 32-bit integer value from the low element of a 128-bit integer 4045 /// vector. 4046 /// 4047 /// \headerfile <x86intrin.h> 4048 /// 4049 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. 4050 /// 4051 /// \param __p 4052 /// A pointer to a 32-bit memory location. The address of the memory 4053 /// location does not have to be aligned. 4054 /// \param __b 4055 /// A 128-bit integer vector containing the value to be stored. 
4056 static __inline__ void __DEFAULT_FN_ATTRS 4057 _mm_storeu_si32(void *__p, __m128i __b) 4058 { 4059 struct __storeu_si32 { 4060 int __v; 4061 } __attribute__((__packed__, __may_alias__)); 4062 ((struct __storeu_si32*)__p)->__v = ((__v4si)__b)[0]; 4063 } 4064 4065 /// Stores a 16-bit integer value from the low element of a 128-bit integer 4066 /// vector. 4067 /// 4068 /// \headerfile <x86intrin.h> 4069 /// 4070 /// This intrinsic does not correspond to a specific instruction. 4071 /// 4072 /// \param __p 4073 /// A pointer to a 16-bit memory location. The address of the memory 4074 /// location does not have to be aligned. 4075 /// \param __b 4076 /// A 128-bit integer vector containing the value to be stored. 4077 static __inline__ void __DEFAULT_FN_ATTRS 4078 _mm_storeu_si16(void *__p, __m128i __b) 4079 { 4080 struct __storeu_si16 { 4081 short __v; 4082 } __attribute__((__packed__, __may_alias__)); 4083 ((struct __storeu_si16*)__p)->__v = ((__v8hi)__b)[0]; 4084 } 4085 4086 /// Moves bytes selected by the mask from the first operand to the 4087 /// specified unaligned memory location. When a mask bit is 1, the 4088 /// corresponding byte is written, otherwise it is not written. 4089 /// 4090 /// To minimize caching, the data is flagged as non-temporal (unlikely to be 4091 /// used again soon). Exception and trap behavior for elements not selected 4092 /// for storage to memory are implementation dependent. 4093 /// 4094 /// \headerfile <x86intrin.h> 4095 /// 4096 /// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c> 4097 /// instruction. 4098 /// 4099 /// \param __d 4100 /// A 128-bit integer vector containing the values to be moved. 4101 /// \param __n 4102 /// A 128-bit integer vector containing the mask. The most significant bit of 4103 /// each byte represents the mask bits. 4104 /// \param __p 4105 /// A pointer to an unaligned 128-bit memory location where the specified 4106 /// values are moved. 
4107 static __inline__ void __DEFAULT_FN_ATTRS 4108 _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p) 4109 { 4110 __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p); 4111 } 4112 4113 /// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to 4114 /// a memory location. 4115 /// 4116 /// \headerfile <x86intrin.h> 4117 /// 4118 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction. 4119 /// 4120 /// \param __p 4121 /// A pointer to a 64-bit memory location that will receive the lower 64 bits 4122 /// of the integer vector parameter. 4123 /// \param __a 4124 /// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the 4125 /// value to be stored. 4126 static __inline__ void __DEFAULT_FN_ATTRS 4127 _mm_storel_epi64(__m128i_u *__p, __m128i __a) 4128 { 4129 struct __mm_storel_epi64_struct { 4130 long long __u; 4131 } __attribute__((__packed__, __may_alias__)); 4132 ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0]; 4133 } 4134 4135 /// Stores a 128-bit floating point vector of [2 x double] to a 128-bit 4136 /// aligned memory location. 4137 /// 4138 /// To minimize caching, the data is flagged as non-temporal (unlikely to be 4139 /// used again soon). 4140 /// 4141 /// \headerfile <x86intrin.h> 4142 /// 4143 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction. 4144 /// 4145 /// \param __p 4146 /// A pointer to the 128-bit aligned memory location used to store the value. 4147 /// \param __a 4148 /// A vector of [2 x double] containing the 64-bit values to be stored. 4149 static __inline__ void __DEFAULT_FN_ATTRS 4150 _mm_stream_pd(double *__p, __m128d __a) 4151 { 4152 __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p); 4153 } 4154 4155 /// Stores a 128-bit integer vector to a 128-bit aligned memory location. 4156 /// 4157 /// To minimize caching, the data is flagged as non-temporal (unlikely to be 4158 /// used again soon). 
4159 /// 4160 /// \headerfile <x86intrin.h> 4161 /// 4162 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction. 4163 /// 4164 /// \param __p 4165 /// A pointer to the 128-bit aligned memory location used to store the value. 4166 /// \param __a 4167 /// A 128-bit integer vector containing the values to be stored. 4168 static __inline__ void __DEFAULT_FN_ATTRS 4169 _mm_stream_si128(__m128i *__p, __m128i __a) 4170 { 4171 __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p); 4172 } 4173 4174 /// Stores a 32-bit integer value in the specified memory location. 4175 /// 4176 /// To minimize caching, the data is flagged as non-temporal (unlikely to be 4177 /// used again soon). 4178 /// 4179 /// \headerfile <x86intrin.h> 4180 /// 4181 /// This intrinsic corresponds to the <c> MOVNTI </c> instruction. 4182 /// 4183 /// \param __p 4184 /// A pointer to the 32-bit memory location used to store the value. 4185 /// \param __a 4186 /// A 32-bit integer containing the value to be stored. 4187 static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("sse2"))) 4188 _mm_stream_si32(int *__p, int __a) 4189 { 4190 __builtin_ia32_movnti(__p, __a); 4191 } 4192 4193 #ifdef __x86_64__ 4194 /// Stores a 64-bit integer value in the specified memory location. 4195 /// 4196 /// To minimize caching, the data is flagged as non-temporal (unlikely to be 4197 /// used again soon). 4198 /// 4199 /// \headerfile <x86intrin.h> 4200 /// 4201 /// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction. 4202 /// 4203 /// \param __p 4204 /// A pointer to the 64-bit memory location used to store the value. 4205 /// \param __a 4206 /// A 64-bit integer containing the value to be stored. 
4207 static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("sse2"))) 4208 _mm_stream_si64(long long *__p, long long __a) 4209 { 4210 __builtin_ia32_movnti64(__p, __a); 4211 } 4212 #endif 4213 4214 #if defined(__cplusplus) 4215 extern "C" { 4216 #endif 4217 4218 /// The cache line containing \a __p is flushed and invalidated from all 4219 /// caches in the coherency domain. 4220 /// 4221 /// \headerfile <x86intrin.h> 4222 /// 4223 /// This intrinsic corresponds to the <c> CLFLUSH </c> instruction. 4224 /// 4225 /// \param __p 4226 /// A pointer to the memory location used to identify the cache line to be 4227 /// flushed. 4228 void _mm_clflush(void const * __p); 4229 4230 /// Forces strong memory ordering (serialization) between load 4231 /// instructions preceding this instruction and load instructions following 4232 /// this instruction, ensuring the system completes all previous loads before 4233 /// executing subsequent loads. 4234 /// 4235 /// \headerfile <x86intrin.h> 4236 /// 4237 /// This intrinsic corresponds to the <c> LFENCE </c> instruction. 4238 /// 4239 void _mm_lfence(void); 4240 4241 /// Forces strong memory ordering (serialization) between load and store 4242 /// instructions preceding this instruction and load and store instructions 4243 /// following this instruction, ensuring that the system completes all 4244 /// previous memory accesses before executing subsequent memory accesses. 4245 /// 4246 /// \headerfile <x86intrin.h> 4247 /// 4248 /// This intrinsic corresponds to the <c> MFENCE </c> instruction. 4249 /// 4250 void _mm_mfence(void); 4251 4252 #if defined(__cplusplus) 4253 } // extern "C" 4254 #endif 4255 4256 /// Converts 16-bit signed integers from both 128-bit integer vector 4257 /// operands into 8-bit signed integers, and packs the results into the 4258 /// destination. Positive values greater than 0x7F are saturated to 0x7F. 4259 /// Negative values less than 0x80 are saturated to 0x80. 
4260 /// 4261 /// \headerfile <x86intrin.h> 4262 /// 4263 /// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction. 4264 /// 4265 /// \param __a 4266 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as 4267 /// a signed integer and is converted to a 8-bit signed integer with 4268 /// saturation. Values greater than 0x7F are saturated to 0x7F. Values less 4269 /// than 0x80 are saturated to 0x80. The converted [8 x i8] values are 4270 /// written to the lower 64 bits of the result. 4271 /// \param __b 4272 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as 4273 /// a signed integer and is converted to a 8-bit signed integer with 4274 /// saturation. Values greater than 0x7F are saturated to 0x7F. Values less 4275 /// than 0x80 are saturated to 0x80. The converted [8 x i8] values are 4276 /// written to the higher 64 bits of the result. 4277 /// \returns A 128-bit vector of [16 x i8] containing the converted values. 4278 static __inline__ __m128i __DEFAULT_FN_ATTRS 4279 _mm_packs_epi16(__m128i __a, __m128i __b) 4280 { 4281 return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b); 4282 } 4283 4284 /// Converts 32-bit signed integers from both 128-bit integer vector 4285 /// operands into 16-bit signed integers, and packs the results into the 4286 /// destination. Positive values greater than 0x7FFF are saturated to 0x7FFF. 4287 /// Negative values less than 0x8000 are saturated to 0x8000. 4288 /// 4289 /// \headerfile <x86intrin.h> 4290 /// 4291 /// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction. 4292 /// 4293 /// \param __a 4294 /// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as 4295 /// a signed integer and is converted to a 16-bit signed integer with 4296 /// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values 4297 /// less than 0x8000 are saturated to 0x8000. 
The converted [4 x i16] values 4298 /// are written to the lower 64 bits of the result. 4299 /// \param __b 4300 /// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as 4301 /// a signed integer and is converted to a 16-bit signed integer with 4302 /// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values 4303 /// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values 4304 /// are written to the higher 64 bits of the result. 4305 /// \returns A 128-bit vector of [8 x i16] containing the converted values. 4306 static __inline__ __m128i __DEFAULT_FN_ATTRS 4307 _mm_packs_epi32(__m128i __a, __m128i __b) 4308 { 4309 return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b); 4310 } 4311 4312 /// Converts 16-bit signed integers from both 128-bit integer vector 4313 /// operands into 8-bit unsigned integers, and packs the results into the 4314 /// destination. Values greater than 0xFF are saturated to 0xFF. Values less 4315 /// than 0x00 are saturated to 0x00. 4316 /// 4317 /// \headerfile <x86intrin.h> 4318 /// 4319 /// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction. 4320 /// 4321 /// \param __a 4322 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as 4323 /// a signed integer and is converted to an 8-bit unsigned integer with 4324 /// saturation. Values greater than 0xFF are saturated to 0xFF. Values less 4325 /// than 0x00 are saturated to 0x00. The converted [8 x i8] values are 4326 /// written to the lower 64 bits of the result. 4327 /// \param __b 4328 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as 4329 /// a signed integer and is converted to an 8-bit unsigned integer with 4330 /// saturation. Values greater than 0xFF are saturated to 0xFF. Values less 4331 /// than 0x00 are saturated to 0x00. The converted [8 x i8] values are 4332 /// written to the higher 64 bits of the result. 
4333 /// \returns A 128-bit vector of [16 x i8] containing the converted values. 4334 static __inline__ __m128i __DEFAULT_FN_ATTRS 4335 _mm_packus_epi16(__m128i __a, __m128i __b) 4336 { 4337 return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b); 4338 } 4339 4340 /// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using 4341 /// the immediate-value parameter as a selector. 4342 /// 4343 /// \headerfile <x86intrin.h> 4344 /// 4345 /// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction. 4346 /// 4347 /// \param __a 4348 /// A 128-bit integer vector. 4349 /// \param __imm 4350 /// An immediate value. Bits [2:0] selects values from \a __a to be assigned 4351 /// to bits[15:0] of the result. \n 4352 /// 000: assign values from bits [15:0] of \a __a. \n 4353 /// 001: assign values from bits [31:16] of \a __a. \n 4354 /// 010: assign values from bits [47:32] of \a __a. \n 4355 /// 011: assign values from bits [63:48] of \a __a. \n 4356 /// 100: assign values from bits [79:64] of \a __a. \n 4357 /// 101: assign values from bits [95:80] of \a __a. \n 4358 /// 110: assign values from bits [111:96] of \a __a. \n 4359 /// 111: assign values from bits [127:112] of \a __a. 4360 /// \returns An integer, whose lower 16 bits are selected from the 128-bit 4361 /// integer vector parameter and the remaining bits are assigned zeros. 4362 #define _mm_extract_epi16(a, imm) \ 4363 ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \ 4364 (int)(imm))) 4365 4366 /// Constructs a 128-bit integer vector by first making a copy of the 4367 /// 128-bit integer vector parameter, and then inserting the lower 16 bits 4368 /// of an integer parameter into an offset specified by the immediate-value 4369 /// parameter. 4370 /// 4371 /// \headerfile <x86intrin.h> 4372 /// 4373 /// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction. 4374 /// 4375 /// \param __a 4376 /// A 128-bit integer vector of [8 x i16]. 
This vector is copied to the 4377 /// result and then one of the eight elements in the result is replaced by 4378 /// the lower 16 bits of \a __b. 4379 /// \param __b 4380 /// An integer. The lower 16 bits of this parameter are written to the 4381 /// result beginning at an offset specified by \a __imm. 4382 /// \param __imm 4383 /// An immediate value specifying the bit offset in the result at which the 4384 /// lower 16 bits of \a __b are written. 4385 /// \returns A 128-bit integer vector containing the constructed values. 4386 #define _mm_insert_epi16(a, b, imm) \ 4387 ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \ 4388 (int)(imm))) 4389 4390 /// Copies the values of the most significant bits from each 8-bit 4391 /// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask 4392 /// value, zero-extends the value, and writes it to the destination. 4393 /// 4394 /// \headerfile <x86intrin.h> 4395 /// 4396 /// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction. 4397 /// 4398 /// \param __a 4399 /// A 128-bit integer vector containing the values with bits to be extracted. 4400 /// \returns The most significant bits from each 8-bit element in \a __a, 4401 /// written to bits [15:0]. The other bits are assigned zeros. 4402 static __inline__ int __DEFAULT_FN_ATTRS 4403 _mm_movemask_epi8(__m128i __a) 4404 { 4405 return __builtin_ia32_pmovmskb128((__v16qi)__a); 4406 } 4407 4408 /// Constructs a 128-bit integer vector by shuffling four 32-bit 4409 /// elements of a 128-bit integer vector parameter, using the immediate-value 4410 /// parameter as a specifier. 4411 /// 4412 /// \headerfile <x86intrin.h> 4413 /// 4414 /// \code 4415 /// __m128i _mm_shuffle_epi32(__m128i a, const int imm); 4416 /// \endcode 4417 /// 4418 /// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction. 4419 /// 4420 /// \param a 4421 /// A 128-bit integer vector containing the values to be copied. 
/// \param imm
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from a. The destinations within the 128-bit destination are assigned
///    values as follows: \n
///    Bits [1:0] are used to assign values to bits [31:0] of the result. \n
///    Bits [3:2] are used to assign values to bits [63:32] of the result. \n
///    Bits [5:4] are used to assign values to bits [95:64] of the result. \n
///    Bits [7:6] are used to assign values to bits [127:96] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [31:0] of \a a. \n
///    01: assign values from bits [63:32] of \a a. \n
///    10: assign values from bits [95:64] of \a a. \n
///    11: assign values from bits [127:96] of \a a.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shuffle_epi32(a, imm) \
  ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))

/// Constructs a 128-bit integer vector by shuffling four lower 16-bit
///    elements of a 128-bit integer vector of [8 x i16], using the immediate
///    value parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
///    [127:64] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
///    Bits[1:0] are used to assign values to bits [15:0] of the result. \n
///    Bits[3:2] are used to assign values to bits [31:16] of the result. \n
///    Bits[5:4] are used to assign values to bits [47:32] of the result. \n
///    Bits[7:6] are used to assign values to bits [63:48] of the result.
///    \n
///    Bit value assignments: \n
///    00: assign values from bits [15:0] of \a a. \n
///    01: assign values from bits [31:16] of \a a. \n
///    10: assign values from bits [47:32] of \a a. \n
///    11: assign values from bits [63:48] of \a a. \n
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflelo_epi16(a, imm) \
  ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))

/// Constructs a 128-bit integer vector by shuffling four upper 16-bit
///    elements of a 128-bit integer vector of [8 x i16], using the immediate
///    value parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
///    [63:0] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
///    Bits[1:0] are used to assign values to bits [79:64] of the result. \n
///    Bits[3:2] are used to assign values to bits [95:80] of the result. \n
///    Bits[5:4] are used to assign values to bits [111:96] of the result. \n
///    Bits[7:6] are used to assign values to bits [127:112] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [79:64] of \a a. \n
///    01: assign values from bits [95:80] of \a a. \n
///    10: assign values from bits [111:96] of \a a. \n
///    11: assign values from bits [127:112] of \a a. \n
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflehi_epi16(a, imm) \
  ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))

/// Unpacks the high-order (index 8-15) values from two 128-bit vectors
///    of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
///   instruction.
///
/// \param __a
///    A 128-bit vector of [16 x i8].
///    Bits [71:64] are written to bits [7:0] of the result. \n
///    Bits [79:72] are written to bits [23:16] of the result. \n
///    Bits [87:80] are written to bits [39:32] of the result. \n
///    Bits [95:88] are written to bits [55:48] of the result. \n
///    Bits [103:96] are written to bits [71:64] of the result. \n
///    Bits [111:104] are written to bits [87:80] of the result. \n
///    Bits [119:112] are written to bits [103:96] of the result. \n
///    Bits [127:120] are written to bits [119:112] of the result.
/// \param __b
///    A 128-bit vector of [16 x i8]. \n
///    Bits [71:64] are written to bits [15:8] of the result. \n
///    Bits [79:72] are written to bits [31:24] of the result. \n
///    Bits [87:80] are written to bits [47:40] of the result. \n
///    Bits [95:88] are written to bits [63:56] of the result. \n
///    Bits [103:96] are written to bits [79:72] of the result. \n
///    Bits [111:104] are written to bits [95:88] of the result. \n
///    Bits [119:112] are written to bits [111:104] of the result. \n
///    Bits [127:120] are written to bits [127:120] of the result.
/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4528 static __inline__ __m128i __DEFAULT_FN_ATTRS 4529 _mm_unpackhi_epi8(__m128i __a, __m128i __b) 4530 { 4531 return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); 4532 } 4533 4534 /// Unpacks the high-order (index 4-7) values from two 128-bit vectors of 4535 /// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16]. 4536 /// 4537 /// \headerfile <x86intrin.h> 4538 /// 4539 /// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c> 4540 /// instruction. 4541 /// 4542 /// \param __a 4543 /// A 128-bit vector of [8 x i16]. 4544 /// Bits [79:64] are written to bits [15:0] of the result. \n 4545 /// Bits [95:80] are written to bits [47:32] of the result. \n 4546 /// Bits [111:96] are written to bits [79:64] of the result. \n 4547 /// Bits [127:112] are written to bits [111:96] of the result. 4548 /// \param __b 4549 /// A 128-bit vector of [8 x i16]. 4550 /// Bits [79:64] are written to bits [31:16] of the result. \n 4551 /// Bits [95:80] are written to bits [63:48] of the result. \n 4552 /// Bits [111:96] are written to bits [95:80] of the result. \n 4553 /// Bits [127:112] are written to bits [127:112] of the result. 4554 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values. 4555 static __inline__ __m128i __DEFAULT_FN_ATTRS 4556 _mm_unpackhi_epi16(__m128i __a, __m128i __b) 4557 { 4558 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7); 4559 } 4560 4561 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of 4562 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32]. 4563 /// 4564 /// \headerfile <x86intrin.h> 4565 /// 4566 /// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c> 4567 /// instruction. 4568 /// 4569 /// \param __a 4570 /// A 128-bit vector of [4 x i32]. 
\n 4571 /// Bits [95:64] are written to bits [31:0] of the destination. \n 4572 /// Bits [127:96] are written to bits [95:64] of the destination. 4573 /// \param __b 4574 /// A 128-bit vector of [4 x i32]. \n 4575 /// Bits [95:64] are written to bits [64:32] of the destination. \n 4576 /// Bits [127:96] are written to bits [127:96] of the destination. 4577 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values. 4578 static __inline__ __m128i __DEFAULT_FN_ATTRS 4579 _mm_unpackhi_epi32(__m128i __a, __m128i __b) 4580 { 4581 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3); 4582 } 4583 4584 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of 4585 /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64]. 4586 /// 4587 /// \headerfile <x86intrin.h> 4588 /// 4589 /// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c> 4590 /// instruction. 4591 /// 4592 /// \param __a 4593 /// A 128-bit vector of [2 x i64]. \n 4594 /// Bits [127:64] are written to bits [63:0] of the destination. 4595 /// \param __b 4596 /// A 128-bit vector of [2 x i64]. \n 4597 /// Bits [127:64] are written to bits [127:64] of the destination. 4598 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values. 4599 static __inline__ __m128i __DEFAULT_FN_ATTRS 4600 _mm_unpackhi_epi64(__m128i __a, __m128i __b) 4601 { 4602 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1); 4603 } 4604 4605 /// Unpacks the low-order (index 0-7) values from two 128-bit vectors of 4606 /// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8]. 4607 /// 4608 /// \headerfile <x86intrin.h> 4609 /// 4610 /// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c> 4611 /// instruction. 4612 /// 4613 /// \param __a 4614 /// A 128-bit vector of [16 x i8]. \n 4615 /// Bits [7:0] are written to bits [7:0] of the result. 
\n 4616 /// Bits [15:8] are written to bits [23:16] of the result. \n 4617 /// Bits [23:16] are written to bits [39:32] of the result. \n 4618 /// Bits [31:24] are written to bits [55:48] of the result. \n 4619 /// Bits [39:32] are written to bits [71:64] of the result. \n 4620 /// Bits [47:40] are written to bits [87:80] of the result. \n 4621 /// Bits [55:48] are written to bits [103:96] of the result. \n 4622 /// Bits [63:56] are written to bits [119:112] of the result. 4623 /// \param __b 4624 /// A 128-bit vector of [16 x i8]. 4625 /// Bits [7:0] are written to bits [15:8] of the result. \n 4626 /// Bits [15:8] are written to bits [31:24] of the result. \n 4627 /// Bits [23:16] are written to bits [47:40] of the result. \n 4628 /// Bits [31:24] are written to bits [63:56] of the result. \n 4629 /// Bits [39:32] are written to bits [79:72] of the result. \n 4630 /// Bits [47:40] are written to bits [95:88] of the result. \n 4631 /// Bits [55:48] are written to bits [111:104] of the result. \n 4632 /// Bits [63:56] are written to bits [127:120] of the result. 4633 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values. 4634 static __inline__ __m128i __DEFAULT_FN_ATTRS 4635 _mm_unpacklo_epi8(__m128i __a, __m128i __b) 4636 { 4637 return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7); 4638 } 4639 4640 /// Unpacks the low-order (index 0-3) values from each of the two 128-bit 4641 /// vectors of [8 x i16] and interleaves them into a 128-bit vector of 4642 /// [8 x i16]. 4643 /// 4644 /// \headerfile <x86intrin.h> 4645 /// 4646 /// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c> 4647 /// instruction. 4648 /// 4649 /// \param __a 4650 /// A 128-bit vector of [8 x i16]. 4651 /// Bits [15:0] are written to bits [15:0] of the result. \n 4652 /// Bits [31:16] are written to bits [47:32] of the result. 
\n 4653 /// Bits [47:32] are written to bits [79:64] of the result. \n 4654 /// Bits [63:48] are written to bits [111:96] of the result. 4655 /// \param __b 4656 /// A 128-bit vector of [8 x i16]. 4657 /// Bits [15:0] are written to bits [31:16] of the result. \n 4658 /// Bits [31:16] are written to bits [63:48] of the result. \n 4659 /// Bits [47:32] are written to bits [95:80] of the result. \n 4660 /// Bits [63:48] are written to bits [127:112] of the result. 4661 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values. 4662 static __inline__ __m128i __DEFAULT_FN_ATTRS 4663 _mm_unpacklo_epi16(__m128i __a, __m128i __b) 4664 { 4665 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3); 4666 } 4667 4668 /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of 4669 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32]. 4670 /// 4671 /// \headerfile <x86intrin.h> 4672 /// 4673 /// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c> 4674 /// instruction. 4675 /// 4676 /// \param __a 4677 /// A 128-bit vector of [4 x i32]. \n 4678 /// Bits [31:0] are written to bits [31:0] of the destination. \n 4679 /// Bits [63:32] are written to bits [95:64] of the destination. 4680 /// \param __b 4681 /// A 128-bit vector of [4 x i32]. \n 4682 /// Bits [31:0] are written to bits [64:32] of the destination. \n 4683 /// Bits [63:32] are written to bits [127:96] of the destination. 4684 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values. 4685 static __inline__ __m128i __DEFAULT_FN_ATTRS 4686 _mm_unpacklo_epi32(__m128i __a, __m128i __b) 4687 { 4688 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1); 4689 } 4690 4691 /// Unpacks the low-order 64-bit elements from two 128-bit vectors of 4692 /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64]. 
4693 /// 4694 /// \headerfile <x86intrin.h> 4695 /// 4696 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c> 4697 /// instruction. 4698 /// 4699 /// \param __a 4700 /// A 128-bit vector of [2 x i64]. \n 4701 /// Bits [63:0] are written to bits [63:0] of the destination. \n 4702 /// \param __b 4703 /// A 128-bit vector of [2 x i64]. \n 4704 /// Bits [63:0] are written to bits [127:64] of the destination. \n 4705 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values. 4706 static __inline__ __m128i __DEFAULT_FN_ATTRS 4707 _mm_unpacklo_epi64(__m128i __a, __m128i __b) 4708 { 4709 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0); 4710 } 4711 4712 /// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit 4713 /// integer. 4714 /// 4715 /// \headerfile <x86intrin.h> 4716 /// 4717 /// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction. 4718 /// 4719 /// \param __a 4720 /// A 128-bit integer vector operand. The lower 64 bits are moved to the 4721 /// destination. 4722 /// \returns A 64-bit integer containing the lower 64 bits of the parameter. 4723 static __inline__ __m64 __DEFAULT_FN_ATTRS 4724 _mm_movepi64_pi64(__m128i __a) 4725 { 4726 return (__m64)__a[0]; 4727 } 4728 4729 /// Moves the 64-bit operand to a 128-bit integer vector, zeroing the 4730 /// upper bits. 4731 /// 4732 /// \headerfile <x86intrin.h> 4733 /// 4734 /// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction. 4735 /// 4736 /// \param __a 4737 /// A 64-bit value. 4738 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from 4739 /// the operand. The upper 64 bits are assigned zeros. 4740 static __inline__ __m128i __DEFAULT_FN_ATTRS 4741 _mm_movpi64_epi64(__m64 __a) 4742 { 4743 return __extension__ (__m128i)(__v2di){ (long long)__a, 0 }; 4744 } 4745 4746 /// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit 4747 /// integer vector, zeroing the upper bits. 
4748 /// 4749 /// \headerfile <x86intrin.h> 4750 /// 4751 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 4752 /// 4753 /// \param __a 4754 /// A 128-bit integer vector operand. The lower 64 bits are moved to the 4755 /// destination. 4756 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from 4757 /// the operand. The upper 64 bits are assigned zeros. 4758 static __inline__ __m128i __DEFAULT_FN_ATTRS 4759 _mm_move_epi64(__m128i __a) 4760 { 4761 return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2); 4762 } 4763 4764 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of 4765 /// [2 x double] and interleaves them into a 128-bit vector of [2 x 4766 /// double]. 4767 /// 4768 /// \headerfile <x86intrin.h> 4769 /// 4770 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction. 4771 /// 4772 /// \param __a 4773 /// A 128-bit vector of [2 x double]. \n 4774 /// Bits [127:64] are written to bits [63:0] of the destination. 4775 /// \param __b 4776 /// A 128-bit vector of [2 x double]. \n 4777 /// Bits [127:64] are written to bits [127:64] of the destination. 4778 /// \returns A 128-bit vector of [2 x double] containing the interleaved values. 4779 static __inline__ __m128d __DEFAULT_FN_ATTRS 4780 _mm_unpackhi_pd(__m128d __a, __m128d __b) 4781 { 4782 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1); 4783 } 4784 4785 /// Unpacks the low-order 64-bit elements from two 128-bit vectors 4786 /// of [2 x double] and interleaves them into a 128-bit vector of [2 x 4787 /// double]. 4788 /// 4789 /// \headerfile <x86intrin.h> 4790 /// 4791 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction. 4792 /// 4793 /// \param __a 4794 /// A 128-bit vector of [2 x double]. \n 4795 /// Bits [63:0] are written to bits [63:0] of the destination. 4796 /// \param __b 4797 /// A 128-bit vector of [2 x double]. 
\n 4798 /// Bits [63:0] are written to bits [127:64] of the destination. 4799 /// \returns A 128-bit vector of [2 x double] containing the interleaved values. 4800 static __inline__ __m128d __DEFAULT_FN_ATTRS 4801 _mm_unpacklo_pd(__m128d __a, __m128d __b) 4802 { 4803 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0); 4804 } 4805 4806 /// Extracts the sign bits of the double-precision values in the 128-bit 4807 /// vector of [2 x double], zero-extends the value, and writes it to the 4808 /// low-order bits of the destination. 4809 /// 4810 /// \headerfile <x86intrin.h> 4811 /// 4812 /// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction. 4813 /// 4814 /// \param __a 4815 /// A 128-bit vector of [2 x double] containing the values with sign bits to 4816 /// be extracted. 4817 /// \returns The sign bits from each of the double-precision elements in \a __a, 4818 /// written to bits [1:0]. The remaining bits are assigned values of zero. 4819 static __inline__ int __DEFAULT_FN_ATTRS 4820 _mm_movemask_pd(__m128d __a) 4821 { 4822 return __builtin_ia32_movmskpd((__v2df)__a); 4823 } 4824 4825 4826 /// Constructs a 128-bit floating-point vector of [2 x double] from two 4827 /// 128-bit vector parameters of [2 x double], using the immediate-value 4828 /// parameter as a specifier. 4829 /// 4830 /// \headerfile <x86intrin.h> 4831 /// 4832 /// \code 4833 /// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i); 4834 /// \endcode 4835 /// 4836 /// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction. 4837 /// 4838 /// \param a 4839 /// A 128-bit vector of [2 x double]. 4840 /// \param b 4841 /// A 128-bit vector of [2 x double]. 4842 /// \param i 4843 /// An 8-bit immediate value. The least significant two bits specify which 4844 /// elements to copy from \a a and \a b: \n 4845 /// Bit[0] = 0: lower element of \a a copied to lower element of result. 
\n 4846 /// Bit[0] = 1: upper element of \a a copied to lower element of result. \n 4847 /// Bit[1] = 0: lower element of \a b copied to upper element of result. \n 4848 /// Bit[1] = 1: upper element of \a b copied to upper element of result. \n 4849 /// \returns A 128-bit vector of [2 x double] containing the shuffled values. 4850 #define _mm_shuffle_pd(a, b, i) \ 4851 ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \ 4852 (int)(i))) 4853 4854 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit 4855 /// floating-point vector of [4 x float]. 4856 /// 4857 /// \headerfile <x86intrin.h> 4858 /// 4859 /// This intrinsic has no corresponding instruction. 4860 /// 4861 /// \param __a 4862 /// A 128-bit floating-point vector of [2 x double]. 4863 /// \returns A 128-bit floating-point vector of [4 x float] containing the same 4864 /// bitwise pattern as the parameter. 4865 static __inline__ __m128 __DEFAULT_FN_ATTRS 4866 _mm_castpd_ps(__m128d __a) 4867 { 4868 return (__m128)__a; 4869 } 4870 4871 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit 4872 /// integer vector. 4873 /// 4874 /// \headerfile <x86intrin.h> 4875 /// 4876 /// This intrinsic has no corresponding instruction. 4877 /// 4878 /// \param __a 4879 /// A 128-bit floating-point vector of [2 x double]. 4880 /// \returns A 128-bit integer vector containing the same bitwise pattern as the 4881 /// parameter. 4882 static __inline__ __m128i __DEFAULT_FN_ATTRS 4883 _mm_castpd_si128(__m128d __a) 4884 { 4885 return (__m128i)__a; 4886 } 4887 4888 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit 4889 /// floating-point vector of [2 x double]. 4890 /// 4891 /// \headerfile <x86intrin.h> 4892 /// 4893 /// This intrinsic has no corresponding instruction. 4894 /// 4895 /// \param __a 4896 /// A 128-bit floating-point vector of [4 x float]. 
4897 /// \returns A 128-bit floating-point vector of [2 x double] containing the same 4898 /// bitwise pattern as the parameter. 4899 static __inline__ __m128d __DEFAULT_FN_ATTRS 4900 _mm_castps_pd(__m128 __a) 4901 { 4902 return (__m128d)__a; 4903 } 4904 4905 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit 4906 /// integer vector. 4907 /// 4908 /// \headerfile <x86intrin.h> 4909 /// 4910 /// This intrinsic has no corresponding instruction. 4911 /// 4912 /// \param __a 4913 /// A 128-bit floating-point vector of [4 x float]. 4914 /// \returns A 128-bit integer vector containing the same bitwise pattern as the 4915 /// parameter. 4916 static __inline__ __m128i __DEFAULT_FN_ATTRS 4917 _mm_castps_si128(__m128 __a) 4918 { 4919 return (__m128i)__a; 4920 } 4921 4922 /// Casts a 128-bit integer vector into a 128-bit floating-point vector 4923 /// of [4 x float]. 4924 /// 4925 /// \headerfile <x86intrin.h> 4926 /// 4927 /// This intrinsic has no corresponding instruction. 4928 /// 4929 /// \param __a 4930 /// A 128-bit integer vector. 4931 /// \returns A 128-bit floating-point vector of [4 x float] containing the same 4932 /// bitwise pattern as the parameter. 4933 static __inline__ __m128 __DEFAULT_FN_ATTRS 4934 _mm_castsi128_ps(__m128i __a) 4935 { 4936 return (__m128)__a; 4937 } 4938 4939 /// Casts a 128-bit integer vector into a 128-bit floating-point vector 4940 /// of [2 x double]. 4941 /// 4942 /// \headerfile <x86intrin.h> 4943 /// 4944 /// This intrinsic has no corresponding instruction. 4945 /// 4946 /// \param __a 4947 /// A 128-bit integer vector. 4948 /// \returns A 128-bit floating-point vector of [2 x double] containing the same 4949 /// bitwise pattern as the parameter. 
4950 static __inline__ __m128d __DEFAULT_FN_ATTRS 4951 _mm_castsi128_pd(__m128i __a) 4952 { 4953 return (__m128d)__a; 4954 } 4955 4956 #if defined(__cplusplus) 4957 extern "C" { 4958 #endif 4959 4960 /// Indicates that a spin loop is being executed for the purposes of 4961 /// optimizing power consumption during the loop. 4962 /// 4963 /// \headerfile <x86intrin.h> 4964 /// 4965 /// This intrinsic corresponds to the <c> PAUSE </c> instruction. 4966 /// 4967 void _mm_pause(void); 4968 4969 #if defined(__cplusplus) 4970 } // extern "C" 4971 #endif 4972 #undef __DEFAULT_FN_ATTRS 4973 #undef __DEFAULT_FN_ATTRS_MMX 4974 4975 #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) 4976 4977 #define _MM_DENORMALS_ZERO_ON (0x0040U) 4978 #define _MM_DENORMALS_ZERO_OFF (0x0000U) 4979 4980 #define _MM_DENORMALS_ZERO_MASK (0x0040U) 4981 4982 #define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK) 4983 #define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x))) 4984 4985 #endif /* __EMMINTRIN_H */ 4986