/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __EMMINTRIN_H
#define __EMMINTRIN_H

#include <xmmintrin.h>

/* Public 16-byte vector types, declared with 16-byte alignment. */
typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));

/* Counterparts of the public types declared with 1-byte alignment. */
typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef long long __m128i_u __attribute__((__vector_size__(16), __aligned__(1)));

/* Type defines. */
/* Internal 16-byte vector types used for casts inside the intrinsics. */
typedef double __v2df __attribute__ ((__vector_size__ (16)));
typedef long long __v2di __attribute__ ((__vector_size__ (16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

/* Unsigned types */
typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16)));
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v16qs __attribute__((__vector_size__(16)));

/* Define the default attributes for the functions in this file. */
/* __DEFAULT_FN_ATTRS: always inlined, no debug info, target "sse2", minimum
 * vector width 128. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2"), __min_vector_width__(128)))
/* __DEFAULT_FN_ATTRS_MMX: same, but target "mmx,sse2" and minimum vector
 * width 64. */
#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse2"), __min_vector_width__(64)))

/// Adds lower double-precision values in both operands and returns the
///    sum in the lower 64 bits of the result.
The upper 64 bits of the result 42 /// are copied from the upper double-precision value of the first operand. 43 /// 44 /// \headerfile <x86intrin.h> 45 /// 46 /// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction. 47 /// 48 /// \param __a 49 /// A 128-bit vector of [2 x double] containing one of the source operands. 50 /// \param __b 51 /// A 128-bit vector of [2 x double] containing one of the source operands. 52 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 53 /// sum of the lower 64 bits of both operands. The upper 64 bits are copied 54 /// from the upper 64 bits of the first source operand. 55 static __inline__ __m128d __DEFAULT_FN_ATTRS 56 _mm_add_sd(__m128d __a, __m128d __b) 57 { 58 __a[0] += __b[0]; 59 return __a; 60 } 61 62 /// Adds two 128-bit vectors of [2 x double]. 63 /// 64 /// \headerfile <x86intrin.h> 65 /// 66 /// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction. 67 /// 68 /// \param __a 69 /// A 128-bit vector of [2 x double] containing one of the source operands. 70 /// \param __b 71 /// A 128-bit vector of [2 x double] containing one of the source operands. 72 /// \returns A 128-bit vector of [2 x double] containing the sums of both 73 /// operands. 74 static __inline__ __m128d __DEFAULT_FN_ATTRS 75 _mm_add_pd(__m128d __a, __m128d __b) 76 { 77 return (__m128d)((__v2df)__a + (__v2df)__b); 78 } 79 80 /// Subtracts the lower double-precision value of the second operand 81 /// from the lower double-precision value of the first operand and returns 82 /// the difference in the lower 64 bits of the result. The upper 64 bits of 83 /// the result are copied from the upper double-precision value of the first 84 /// operand. 85 /// 86 /// \headerfile <x86intrin.h> 87 /// 88 /// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction. 89 /// 90 /// \param __a 91 /// A 128-bit vector of [2 x double] containing the minuend. 
92 /// \param __b 93 /// A 128-bit vector of [2 x double] containing the subtrahend. 94 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 95 /// difference of the lower 64 bits of both operands. The upper 64 bits are 96 /// copied from the upper 64 bits of the first source operand. 97 static __inline__ __m128d __DEFAULT_FN_ATTRS 98 _mm_sub_sd(__m128d __a, __m128d __b) 99 { 100 __a[0] -= __b[0]; 101 return __a; 102 } 103 104 /// Subtracts two 128-bit vectors of [2 x double]. 105 /// 106 /// \headerfile <x86intrin.h> 107 /// 108 /// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction. 109 /// 110 /// \param __a 111 /// A 128-bit vector of [2 x double] containing the minuend. 112 /// \param __b 113 /// A 128-bit vector of [2 x double] containing the subtrahend. 114 /// \returns A 128-bit vector of [2 x double] containing the differences between 115 /// both operands. 116 static __inline__ __m128d __DEFAULT_FN_ATTRS 117 _mm_sub_pd(__m128d __a, __m128d __b) 118 { 119 return (__m128d)((__v2df)__a - (__v2df)__b); 120 } 121 122 /// Multiplies lower double-precision values in both operands and returns 123 /// the product in the lower 64 bits of the result. The upper 64 bits of the 124 /// result are copied from the upper double-precision value of the first 125 /// operand. 126 /// 127 /// \headerfile <x86intrin.h> 128 /// 129 /// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction. 130 /// 131 /// \param __a 132 /// A 128-bit vector of [2 x double] containing one of the source operands. 133 /// \param __b 134 /// A 128-bit vector of [2 x double] containing one of the source operands. 135 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 136 /// product of the lower 64 bits of both operands. The upper 64 bits are 137 /// copied from the upper 64 bits of the first source operand. 
138 static __inline__ __m128d __DEFAULT_FN_ATTRS 139 _mm_mul_sd(__m128d __a, __m128d __b) 140 { 141 __a[0] *= __b[0]; 142 return __a; 143 } 144 145 /// Multiplies two 128-bit vectors of [2 x double]. 146 /// 147 /// \headerfile <x86intrin.h> 148 /// 149 /// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction. 150 /// 151 /// \param __a 152 /// A 128-bit vector of [2 x double] containing one of the operands. 153 /// \param __b 154 /// A 128-bit vector of [2 x double] containing one of the operands. 155 /// \returns A 128-bit vector of [2 x double] containing the products of both 156 /// operands. 157 static __inline__ __m128d __DEFAULT_FN_ATTRS 158 _mm_mul_pd(__m128d __a, __m128d __b) 159 { 160 return (__m128d)((__v2df)__a * (__v2df)__b); 161 } 162 163 /// Divides the lower double-precision value of the first operand by the 164 /// lower double-precision value of the second operand and returns the 165 /// quotient in the lower 64 bits of the result. The upper 64 bits of the 166 /// result are copied from the upper double-precision value of the first 167 /// operand. 168 /// 169 /// \headerfile <x86intrin.h> 170 /// 171 /// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction. 172 /// 173 /// \param __a 174 /// A 128-bit vector of [2 x double] containing the dividend. 175 /// \param __b 176 /// A 128-bit vector of [2 x double] containing divisor. 177 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 178 /// quotient of the lower 64 bits of both operands. The upper 64 bits are 179 /// copied from the upper 64 bits of the first source operand. 180 static __inline__ __m128d __DEFAULT_FN_ATTRS 181 _mm_div_sd(__m128d __a, __m128d __b) 182 { 183 __a[0] /= __b[0]; 184 return __a; 185 } 186 187 /// Performs an element-by-element division of two 128-bit vectors of 188 /// [2 x double]. 189 /// 190 /// \headerfile <x86intrin.h> 191 /// 192 /// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction. 
193 /// 194 /// \param __a 195 /// A 128-bit vector of [2 x double] containing the dividend. 196 /// \param __b 197 /// A 128-bit vector of [2 x double] containing the divisor. 198 /// \returns A 128-bit vector of [2 x double] containing the quotients of both 199 /// operands. 200 static __inline__ __m128d __DEFAULT_FN_ATTRS 201 _mm_div_pd(__m128d __a, __m128d __b) 202 { 203 return (__m128d)((__v2df)__a / (__v2df)__b); 204 } 205 206 /// Calculates the square root of the lower double-precision value of 207 /// the second operand and returns it in the lower 64 bits of the result. 208 /// The upper 64 bits of the result are copied from the upper 209 /// double-precision value of the first operand. 210 /// 211 /// \headerfile <x86intrin.h> 212 /// 213 /// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction. 214 /// 215 /// \param __a 216 /// A 128-bit vector of [2 x double] containing one of the operands. The 217 /// upper 64 bits of this operand are copied to the upper 64 bits of the 218 /// result. 219 /// \param __b 220 /// A 128-bit vector of [2 x double] containing one of the operands. The 221 /// square root is calculated using the lower 64 bits of this operand. 222 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 223 /// square root of the lower 64 bits of operand \a __b, and whose upper 64 224 /// bits are copied from the upper 64 bits of operand \a __a. 225 static __inline__ __m128d __DEFAULT_FN_ATTRS 226 _mm_sqrt_sd(__m128d __a, __m128d __b) 227 { 228 __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b); 229 return __extension__ (__m128d) { __c[0], __a[1] }; 230 } 231 232 /// Calculates the square root of the each of two values stored in a 233 /// 128-bit vector of [2 x double]. 234 /// 235 /// \headerfile <x86intrin.h> 236 /// 237 /// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction. 238 /// 239 /// \param __a 240 /// A 128-bit vector of [2 x double]. 
241 /// \returns A 128-bit vector of [2 x double] containing the square roots of the 242 /// values in the operand. 243 static __inline__ __m128d __DEFAULT_FN_ATTRS 244 _mm_sqrt_pd(__m128d __a) 245 { 246 return __builtin_ia32_sqrtpd((__v2df)__a); 247 } 248 249 /// Compares lower 64-bit double-precision values of both operands, and 250 /// returns the lesser of the pair of values in the lower 64-bits of the 251 /// result. The upper 64 bits of the result are copied from the upper 252 /// double-precision value of the first operand. 253 /// 254 /// \headerfile <x86intrin.h> 255 /// 256 /// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction. 257 /// 258 /// \param __a 259 /// A 128-bit vector of [2 x double] containing one of the operands. The 260 /// lower 64 bits of this operand are used in the comparison. 261 /// \param __b 262 /// A 128-bit vector of [2 x double] containing one of the operands. The 263 /// lower 64 bits of this operand are used in the comparison. 264 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 265 /// minimum value between both operands. The upper 64 bits are copied from 266 /// the upper 64 bits of the first source operand. 267 static __inline__ __m128d __DEFAULT_FN_ATTRS 268 _mm_min_sd(__m128d __a, __m128d __b) 269 { 270 return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b); 271 } 272 273 /// Performs element-by-element comparison of the two 128-bit vectors of 274 /// [2 x double] and returns the vector containing the lesser of each pair of 275 /// values. 276 /// 277 /// \headerfile <x86intrin.h> 278 /// 279 /// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction. 280 /// 281 /// \param __a 282 /// A 128-bit vector of [2 x double] containing one of the operands. 283 /// \param __b 284 /// A 128-bit vector of [2 x double] containing one of the operands. 285 /// \returns A 128-bit vector of [2 x double] containing the minimum values 286 /// between both operands. 
287 static __inline__ __m128d __DEFAULT_FN_ATTRS 288 _mm_min_pd(__m128d __a, __m128d __b) 289 { 290 return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b); 291 } 292 293 /// Compares lower 64-bit double-precision values of both operands, and 294 /// returns the greater of the pair of values in the lower 64-bits of the 295 /// result. The upper 64 bits of the result are copied from the upper 296 /// double-precision value of the first operand. 297 /// 298 /// \headerfile <x86intrin.h> 299 /// 300 /// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction. 301 /// 302 /// \param __a 303 /// A 128-bit vector of [2 x double] containing one of the operands. The 304 /// lower 64 bits of this operand are used in the comparison. 305 /// \param __b 306 /// A 128-bit vector of [2 x double] containing one of the operands. The 307 /// lower 64 bits of this operand are used in the comparison. 308 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 309 /// maximum value between both operands. The upper 64 bits are copied from 310 /// the upper 64 bits of the first source operand. 311 static __inline__ __m128d __DEFAULT_FN_ATTRS 312 _mm_max_sd(__m128d __a, __m128d __b) 313 { 314 return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b); 315 } 316 317 /// Performs element-by-element comparison of the two 128-bit vectors of 318 /// [2 x double] and returns the vector containing the greater of each pair 319 /// of values. 320 /// 321 /// \headerfile <x86intrin.h> 322 /// 323 /// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction. 324 /// 325 /// \param __a 326 /// A 128-bit vector of [2 x double] containing one of the operands. 327 /// \param __b 328 /// A 128-bit vector of [2 x double] containing one of the operands. 329 /// \returns A 128-bit vector of [2 x double] containing the maximum values 330 /// between both operands. 
331 static __inline__ __m128d __DEFAULT_FN_ATTRS 332 _mm_max_pd(__m128d __a, __m128d __b) 333 { 334 return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b); 335 } 336 337 /// Performs a bitwise AND of two 128-bit vectors of [2 x double]. 338 /// 339 /// \headerfile <x86intrin.h> 340 /// 341 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction. 342 /// 343 /// \param __a 344 /// A 128-bit vector of [2 x double] containing one of the source operands. 345 /// \param __b 346 /// A 128-bit vector of [2 x double] containing one of the source operands. 347 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the 348 /// values between both operands. 349 static __inline__ __m128d __DEFAULT_FN_ATTRS 350 _mm_and_pd(__m128d __a, __m128d __b) 351 { 352 return (__m128d)((__v2du)__a & (__v2du)__b); 353 } 354 355 /// Performs a bitwise AND of two 128-bit vectors of [2 x double], using 356 /// the one's complement of the values contained in the first source operand. 357 /// 358 /// \headerfile <x86intrin.h> 359 /// 360 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction. 361 /// 362 /// \param __a 363 /// A 128-bit vector of [2 x double] containing the left source operand. The 364 /// one's complement of this value is used in the bitwise AND. 365 /// \param __b 366 /// A 128-bit vector of [2 x double] containing the right source operand. 367 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the 368 /// values in the second operand and the one's complement of the first 369 /// operand. 370 static __inline__ __m128d __DEFAULT_FN_ATTRS 371 _mm_andnot_pd(__m128d __a, __m128d __b) 372 { 373 return (__m128d)(~(__v2du)__a & (__v2du)__b); 374 } 375 376 /// Performs a bitwise OR of two 128-bit vectors of [2 x double]. 377 /// 378 /// \headerfile <x86intrin.h> 379 /// 380 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction. 
381 /// 382 /// \param __a 383 /// A 128-bit vector of [2 x double] containing one of the source operands. 384 /// \param __b 385 /// A 128-bit vector of [2 x double] containing one of the source operands. 386 /// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the 387 /// values between both operands. 388 static __inline__ __m128d __DEFAULT_FN_ATTRS 389 _mm_or_pd(__m128d __a, __m128d __b) 390 { 391 return (__m128d)((__v2du)__a | (__v2du)__b); 392 } 393 394 /// Performs a bitwise XOR of two 128-bit vectors of [2 x double]. 395 /// 396 /// \headerfile <x86intrin.h> 397 /// 398 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction. 399 /// 400 /// \param __a 401 /// A 128-bit vector of [2 x double] containing one of the source operands. 402 /// \param __b 403 /// A 128-bit vector of [2 x double] containing one of the source operands. 404 /// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the 405 /// values between both operands. 406 static __inline__ __m128d __DEFAULT_FN_ATTRS 407 _mm_xor_pd(__m128d __a, __m128d __b) 408 { 409 return (__m128d)((__v2du)__a ^ (__v2du)__b); 410 } 411 412 /// Compares each of the corresponding double-precision values of the 413 /// 128-bit vectors of [2 x double] for equality. Each comparison yields 0x0 414 /// for false, 0xFFFFFFFFFFFFFFFF for true. 415 /// 416 /// \headerfile <x86intrin.h> 417 /// 418 /// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction. 419 /// 420 /// \param __a 421 /// A 128-bit vector of [2 x double]. 422 /// \param __b 423 /// A 128-bit vector of [2 x double]. 424 /// \returns A 128-bit vector containing the comparison results. 
425 static __inline__ __m128d __DEFAULT_FN_ATTRS 426 _mm_cmpeq_pd(__m128d __a, __m128d __b) 427 { 428 return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b); 429 } 430 431 /// Compares each of the corresponding double-precision values of the 432 /// 128-bit vectors of [2 x double] to determine if the values in the first 433 /// operand are less than those in the second operand. Each comparison 434 /// yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 435 /// 436 /// \headerfile <x86intrin.h> 437 /// 438 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction. 439 /// 440 /// \param __a 441 /// A 128-bit vector of [2 x double]. 442 /// \param __b 443 /// A 128-bit vector of [2 x double]. 444 /// \returns A 128-bit vector containing the comparison results. 445 static __inline__ __m128d __DEFAULT_FN_ATTRS 446 _mm_cmplt_pd(__m128d __a, __m128d __b) 447 { 448 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b); 449 } 450 451 /// Compares each of the corresponding double-precision values of the 452 /// 128-bit vectors of [2 x double] to determine if the values in the first 453 /// operand are less than or equal to those in the second operand. 454 /// 455 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 456 /// 457 /// \headerfile <x86intrin.h> 458 /// 459 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction. 460 /// 461 /// \param __a 462 /// A 128-bit vector of [2 x double]. 463 /// \param __b 464 /// A 128-bit vector of [2 x double]. 465 /// \returns A 128-bit vector containing the comparison results. 
466 static __inline__ __m128d __DEFAULT_FN_ATTRS 467 _mm_cmple_pd(__m128d __a, __m128d __b) 468 { 469 return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b); 470 } 471 472 /// Compares each of the corresponding double-precision values of the 473 /// 128-bit vectors of [2 x double] to determine if the values in the first 474 /// operand are greater than those in the second operand. 475 /// 476 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 477 /// 478 /// \headerfile <x86intrin.h> 479 /// 480 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction. 481 /// 482 /// \param __a 483 /// A 128-bit vector of [2 x double]. 484 /// \param __b 485 /// A 128-bit vector of [2 x double]. 486 /// \returns A 128-bit vector containing the comparison results. 487 static __inline__ __m128d __DEFAULT_FN_ATTRS 488 _mm_cmpgt_pd(__m128d __a, __m128d __b) 489 { 490 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a); 491 } 492 493 /// Compares each of the corresponding double-precision values of the 494 /// 128-bit vectors of [2 x double] to determine if the values in the first 495 /// operand are greater than or equal to those in the second operand. 496 /// 497 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 498 /// 499 /// \headerfile <x86intrin.h> 500 /// 501 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction. 502 /// 503 /// \param __a 504 /// A 128-bit vector of [2 x double]. 505 /// \param __b 506 /// A 128-bit vector of [2 x double]. 507 /// \returns A 128-bit vector containing the comparison results. 
508 static __inline__ __m128d __DEFAULT_FN_ATTRS 509 _mm_cmpge_pd(__m128d __a, __m128d __b) 510 { 511 return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a); 512 } 513 514 /// Compares each of the corresponding double-precision values of the 515 /// 128-bit vectors of [2 x double] to determine if the values in the first 516 /// operand are ordered with respect to those in the second operand. 517 /// 518 /// A pair of double-precision values are "ordered" with respect to each 519 /// other if neither value is a NaN. Each comparison yields 0x0 for false, 520 /// 0xFFFFFFFFFFFFFFFF for true. 521 /// 522 /// \headerfile <x86intrin.h> 523 /// 524 /// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction. 525 /// 526 /// \param __a 527 /// A 128-bit vector of [2 x double]. 528 /// \param __b 529 /// A 128-bit vector of [2 x double]. 530 /// \returns A 128-bit vector containing the comparison results. 531 static __inline__ __m128d __DEFAULT_FN_ATTRS 532 _mm_cmpord_pd(__m128d __a, __m128d __b) 533 { 534 return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b); 535 } 536 537 /// Compares each of the corresponding double-precision values of the 538 /// 128-bit vectors of [2 x double] to determine if the values in the first 539 /// operand are unordered with respect to those in the second operand. 540 /// 541 /// A pair of double-precision values are "unordered" with respect to each 542 /// other if one or both values are NaN. Each comparison yields 0x0 for 543 /// false, 0xFFFFFFFFFFFFFFFF for true. 544 /// 545 /// \headerfile <x86intrin.h> 546 /// 547 /// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c> 548 /// instruction. 549 /// 550 /// \param __a 551 /// A 128-bit vector of [2 x double]. 552 /// \param __b 553 /// A 128-bit vector of [2 x double]. 554 /// \returns A 128-bit vector containing the comparison results. 
555 static __inline__ __m128d __DEFAULT_FN_ATTRS 556 _mm_cmpunord_pd(__m128d __a, __m128d __b) 557 { 558 return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b); 559 } 560 561 /// Compares each of the corresponding double-precision values of the 562 /// 128-bit vectors of [2 x double] to determine if the values in the first 563 /// operand are unequal to those in the second operand. 564 /// 565 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 566 /// 567 /// \headerfile <x86intrin.h> 568 /// 569 /// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction. 570 /// 571 /// \param __a 572 /// A 128-bit vector of [2 x double]. 573 /// \param __b 574 /// A 128-bit vector of [2 x double]. 575 /// \returns A 128-bit vector containing the comparison results. 576 static __inline__ __m128d __DEFAULT_FN_ATTRS 577 _mm_cmpneq_pd(__m128d __a, __m128d __b) 578 { 579 return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b); 580 } 581 582 /// Compares each of the corresponding double-precision values of the 583 /// 128-bit vectors of [2 x double] to determine if the values in the first 584 /// operand are not less than those in the second operand. 585 /// 586 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 587 /// 588 /// \headerfile <x86intrin.h> 589 /// 590 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction. 591 /// 592 /// \param __a 593 /// A 128-bit vector of [2 x double]. 594 /// \param __b 595 /// A 128-bit vector of [2 x double]. 596 /// \returns A 128-bit vector containing the comparison results. 
597 static __inline__ __m128d __DEFAULT_FN_ATTRS 598 _mm_cmpnlt_pd(__m128d __a, __m128d __b) 599 { 600 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b); 601 } 602 603 /// Compares each of the corresponding double-precision values of the 604 /// 128-bit vectors of [2 x double] to determine if the values in the first 605 /// operand are not less than or equal to those in the second operand. 606 /// 607 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 608 /// 609 /// \headerfile <x86intrin.h> 610 /// 611 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction. 612 /// 613 /// \param __a 614 /// A 128-bit vector of [2 x double]. 615 /// \param __b 616 /// A 128-bit vector of [2 x double]. 617 /// \returns A 128-bit vector containing the comparison results. 618 static __inline__ __m128d __DEFAULT_FN_ATTRS 619 _mm_cmpnle_pd(__m128d __a, __m128d __b) 620 { 621 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b); 622 } 623 624 /// Compares each of the corresponding double-precision values of the 625 /// 128-bit vectors of [2 x double] to determine if the values in the first 626 /// operand are not greater than those in the second operand. 627 /// 628 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 629 /// 630 /// \headerfile <x86intrin.h> 631 /// 632 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction. 633 /// 634 /// \param __a 635 /// A 128-bit vector of [2 x double]. 636 /// \param __b 637 /// A 128-bit vector of [2 x double]. 638 /// \returns A 128-bit vector containing the comparison results. 
639 static __inline__ __m128d __DEFAULT_FN_ATTRS 640 _mm_cmpngt_pd(__m128d __a, __m128d __b) 641 { 642 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a); 643 } 644 645 /// Compares each of the corresponding double-precision values of the 646 /// 128-bit vectors of [2 x double] to determine if the values in the first 647 /// operand are not greater than or equal to those in the second operand. 648 /// 649 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 650 /// 651 /// \headerfile <x86intrin.h> 652 /// 653 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction. 654 /// 655 /// \param __a 656 /// A 128-bit vector of [2 x double]. 657 /// \param __b 658 /// A 128-bit vector of [2 x double]. 659 /// \returns A 128-bit vector containing the comparison results. 660 static __inline__ __m128d __DEFAULT_FN_ATTRS 661 _mm_cmpnge_pd(__m128d __a, __m128d __b) 662 { 663 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a); 664 } 665 666 /// Compares the lower double-precision floating-point values in each of 667 /// the two 128-bit floating-point vectors of [2 x double] for equality. 668 /// 669 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 670 /// 671 /// \headerfile <x86intrin.h> 672 /// 673 /// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction. 674 /// 675 /// \param __a 676 /// A 128-bit vector of [2 x double]. The lower double-precision value is 677 /// compared to the lower double-precision value of \a __b. 678 /// \param __b 679 /// A 128-bit vector of [2 x double]. The lower double-precision value is 680 /// compared to the lower double-precision value of \a __a. 681 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 682 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 
683 static __inline__ __m128d __DEFAULT_FN_ATTRS 684 _mm_cmpeq_sd(__m128d __a, __m128d __b) 685 { 686 return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b); 687 } 688 689 /// Compares the lower double-precision floating-point values in each of 690 /// the two 128-bit floating-point vectors of [2 x double] to determine if 691 /// the value in the first parameter is less than the corresponding value in 692 /// the second parameter. 693 /// 694 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 695 /// 696 /// \headerfile <x86intrin.h> 697 /// 698 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction. 699 /// 700 /// \param __a 701 /// A 128-bit vector of [2 x double]. The lower double-precision value is 702 /// compared to the lower double-precision value of \a __b. 703 /// \param __b 704 /// A 128-bit vector of [2 x double]. The lower double-precision value is 705 /// compared to the lower double-precision value of \a __a. 706 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 707 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 708 static __inline__ __m128d __DEFAULT_FN_ATTRS 709 _mm_cmplt_sd(__m128d __a, __m128d __b) 710 { 711 return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b); 712 } 713 714 /// Compares the lower double-precision floating-point values in each of 715 /// the two 128-bit floating-point vectors of [2 x double] to determine if 716 /// the value in the first parameter is less than or equal to the 717 /// corresponding value in the second parameter. 718 /// 719 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 720 /// 721 /// \headerfile <x86intrin.h> 722 /// 723 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction. 724 /// 725 /// \param __a 726 /// A 128-bit vector of [2 x double]. The lower double-precision value is 727 /// compared to the lower double-precision value of \a __b. 
728 /// \param __b 729 /// A 128-bit vector of [2 x double]. The lower double-precision value is 730 /// compared to the lower double-precision value of \a __a. 731 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 732 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 733 static __inline__ __m128d __DEFAULT_FN_ATTRS 734 _mm_cmple_sd(__m128d __a, __m128d __b) 735 { 736 return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b); 737 } 738 739 /// Compares the lower double-precision floating-point values in each of 740 /// the two 128-bit floating-point vectors of [2 x double] to determine if 741 /// the value in the first parameter is greater than the corresponding value 742 /// in the second parameter. 743 /// 744 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 745 /// 746 /// \headerfile <x86intrin.h> 747 /// 748 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction. 749 /// 750 /// \param __a 751 /// A 128-bit vector of [2 x double]. The lower double-precision value is 752 /// compared to the lower double-precision value of \a __b. 753 /// \param __b 754 /// A 128-bit vector of [2 x double]. The lower double-precision value is 755 /// compared to the lower double-precision value of \a __a. 756 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 757 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 758 static __inline__ __m128d __DEFAULT_FN_ATTRS 759 _mm_cmpgt_sd(__m128d __a, __m128d __b) 760 { 761 __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a); 762 return __extension__ (__m128d) { __c[0], __a[1] }; 763 } 764 765 /// Compares the lower double-precision floating-point values in each of 766 /// the two 128-bit floating-point vectors of [2 x double] to determine if 767 /// the value in the first parameter is greater than or equal to the 768 /// corresponding value in the second parameter. 
769 /// 770 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 771 /// 772 /// \headerfile <x86intrin.h> 773 /// 774 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction. 775 /// 776 /// \param __a 777 /// A 128-bit vector of [2 x double]. The lower double-precision value is 778 /// compared to the lower double-precision value of \a __b. 779 /// \param __b 780 /// A 128-bit vector of [2 x double]. The lower double-precision value is 781 /// compared to the lower double-precision value of \a __a. 782 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 783 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 784 static __inline__ __m128d __DEFAULT_FN_ATTRS 785 _mm_cmpge_sd(__m128d __a, __m128d __b) 786 { 787 __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a); 788 return __extension__ (__m128d) { __c[0], __a[1] }; 789 } 790 791 /// Compares the lower double-precision floating-point values in each of 792 /// the two 128-bit floating-point vectors of [2 x double] to determine if 793 /// the value in the first parameter is "ordered" with respect to the 794 /// corresponding value in the second parameter. 795 /// 796 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair 797 /// of double-precision values are "ordered" with respect to each other if 798 /// neither value is a NaN. 799 /// 800 /// \headerfile <x86intrin.h> 801 /// 802 /// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction. 803 /// 804 /// \param __a 805 /// A 128-bit vector of [2 x double]. The lower double-precision value is 806 /// compared to the lower double-precision value of \a __b. 807 /// \param __b 808 /// A 128-bit vector of [2 x double]. The lower double-precision value is 809 /// compared to the lower double-precision value of \a __a. 810 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 811 /// results. 
The upper 64 bits are copied from the upper 64 bits of \a __a. 812 static __inline__ __m128d __DEFAULT_FN_ATTRS 813 _mm_cmpord_sd(__m128d __a, __m128d __b) 814 { 815 return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b); 816 } 817 818 /// Compares the lower double-precision floating-point values in each of 819 /// the two 128-bit floating-point vectors of [2 x double] to determine if 820 /// the value in the first parameter is "unordered" with respect to the 821 /// corresponding value in the second parameter. 822 /// 823 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair 824 /// of double-precision values are "unordered" with respect to each other if 825 /// one or both values are NaN. 826 /// 827 /// \headerfile <x86intrin.h> 828 /// 829 /// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c> 830 /// instruction. 831 /// 832 /// \param __a 833 /// A 128-bit vector of [2 x double]. The lower double-precision value is 834 /// compared to the lower double-precision value of \a __b. 835 /// \param __b 836 /// A 128-bit vector of [2 x double]. The lower double-precision value is 837 /// compared to the lower double-precision value of \a __a. 838 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 839 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 840 static __inline__ __m128d __DEFAULT_FN_ATTRS 841 _mm_cmpunord_sd(__m128d __a, __m128d __b) 842 { 843 return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b); 844 } 845 846 /// Compares the lower double-precision floating-point values in each of 847 /// the two 128-bit floating-point vectors of [2 x double] to determine if 848 /// the value in the first parameter is unequal to the corresponding value in 849 /// the second parameter. 850 /// 851 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 
852 /// 853 /// \headerfile <x86intrin.h> 854 /// 855 /// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction. 856 /// 857 /// \param __a 858 /// A 128-bit vector of [2 x double]. The lower double-precision value is 859 /// compared to the lower double-precision value of \a __b. 860 /// \param __b 861 /// A 128-bit vector of [2 x double]. The lower double-precision value is 862 /// compared to the lower double-precision value of \a __a. 863 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 864 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 865 static __inline__ __m128d __DEFAULT_FN_ATTRS 866 _mm_cmpneq_sd(__m128d __a, __m128d __b) 867 { 868 return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b); 869 } 870 871 /// Compares the lower double-precision floating-point values in each of 872 /// the two 128-bit floating-point vectors of [2 x double] to determine if 873 /// the value in the first parameter is not less than the corresponding 874 /// value in the second parameter. 875 /// 876 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 877 /// 878 /// \headerfile <x86intrin.h> 879 /// 880 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction. 881 /// 882 /// \param __a 883 /// A 128-bit vector of [2 x double]. The lower double-precision value is 884 /// compared to the lower double-precision value of \a __b. 885 /// \param __b 886 /// A 128-bit vector of [2 x double]. The lower double-precision value is 887 /// compared to the lower double-precision value of \a __a. 888 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 889 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 
890 static __inline__ __m128d __DEFAULT_FN_ATTRS 891 _mm_cmpnlt_sd(__m128d __a, __m128d __b) 892 { 893 return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b); 894 } 895 896 /// Compares the lower double-precision floating-point values in each of 897 /// the two 128-bit floating-point vectors of [2 x double] to determine if 898 /// the value in the first parameter is not less than or equal to the 899 /// corresponding value in the second parameter. 900 /// 901 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 902 /// 903 /// \headerfile <x86intrin.h> 904 /// 905 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction. 906 /// 907 /// \param __a 908 /// A 128-bit vector of [2 x double]. The lower double-precision value is 909 /// compared to the lower double-precision value of \a __b. 910 /// \param __b 911 /// A 128-bit vector of [2 x double]. The lower double-precision value is 912 /// compared to the lower double-precision value of \a __a. 913 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 914 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 915 static __inline__ __m128d __DEFAULT_FN_ATTRS 916 _mm_cmpnle_sd(__m128d __a, __m128d __b) 917 { 918 return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b); 919 } 920 921 /// Compares the lower double-precision floating-point values in each of 922 /// the two 128-bit floating-point vectors of [2 x double] to determine if 923 /// the value in the first parameter is not greater than the corresponding 924 /// value in the second parameter. 925 /// 926 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 927 /// 928 /// \headerfile <x86intrin.h> 929 /// 930 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction. 931 /// 932 /// \param __a 933 /// A 128-bit vector of [2 x double]. 
The lower double-precision value is 934 /// compared to the lower double-precision value of \a __b. 935 /// \param __b 936 /// A 128-bit vector of [2 x double]. The lower double-precision value is 937 /// compared to the lower double-precision value of \a __a. 938 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 939 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 940 static __inline__ __m128d __DEFAULT_FN_ATTRS 941 _mm_cmpngt_sd(__m128d __a, __m128d __b) 942 { 943 __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a); 944 return __extension__ (__m128d) { __c[0], __a[1] }; 945 } 946 947 /// Compares the lower double-precision floating-point values in each of 948 /// the two 128-bit floating-point vectors of [2 x double] to determine if 949 /// the value in the first parameter is not greater than or equal to the 950 /// corresponding value in the second parameter. 951 /// 952 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 953 /// 954 /// \headerfile <x86intrin.h> 955 /// 956 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction. 957 /// 958 /// \param __a 959 /// A 128-bit vector of [2 x double]. The lower double-precision value is 960 /// compared to the lower double-precision value of \a __b. 961 /// \param __b 962 /// A 128-bit vector of [2 x double]. The lower double-precision value is 963 /// compared to the lower double-precision value of \a __a. 964 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 965 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 
966 static __inline__ __m128d __DEFAULT_FN_ATTRS 967 _mm_cmpnge_sd(__m128d __a, __m128d __b) 968 { 969 __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a); 970 return __extension__ (__m128d) { __c[0], __a[1] }; 971 } 972 973 /// Compares the lower double-precision floating-point values in each of 974 /// the two 128-bit floating-point vectors of [2 x double] for equality. 975 /// 976 /// The comparison yields 0 for false, 1 for true. If either of the two 977 /// lower double-precision values is NaN, 0 is returned. 978 /// 979 /// \headerfile <x86intrin.h> 980 /// 981 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 982 /// 983 /// \param __a 984 /// A 128-bit vector of [2 x double]. The lower double-precision value is 985 /// compared to the lower double-precision value of \a __b. 986 /// \param __b 987 /// A 128-bit vector of [2 x double]. The lower double-precision value is 988 /// compared to the lower double-precision value of \a __a. 989 /// \returns An integer containing the comparison results. If either of the two 990 /// lower double-precision values is NaN, 0 is returned. 991 static __inline__ int __DEFAULT_FN_ATTRS 992 _mm_comieq_sd(__m128d __a, __m128d __b) 993 { 994 return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b); 995 } 996 997 /// Compares the lower double-precision floating-point values in each of 998 /// the two 128-bit floating-point vectors of [2 x double] to determine if 999 /// the value in the first parameter is less than the corresponding value in 1000 /// the second parameter. 1001 /// 1002 /// The comparison yields 0 for false, 1 for true. If either of the two 1003 /// lower double-precision values is NaN, 0 is returned. 1004 /// 1005 /// \headerfile <x86intrin.h> 1006 /// 1007 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1008 /// 1009 /// \param __a 1010 /// A 128-bit vector of [2 x double]. 
The lower double-precision value is 1011 /// compared to the lower double-precision value of \a __b. 1012 /// \param __b 1013 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1014 /// compared to the lower double-precision value of \a __a. 1015 /// \returns An integer containing the comparison results. If either of the two 1016 /// lower double-precision values is NaN, 0 is returned. 1017 static __inline__ int __DEFAULT_FN_ATTRS 1018 _mm_comilt_sd(__m128d __a, __m128d __b) 1019 { 1020 return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b); 1021 } 1022 1023 /// Compares the lower double-precision floating-point values in each of 1024 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1025 /// the value in the first parameter is less than or equal to the 1026 /// corresponding value in the second parameter. 1027 /// 1028 /// The comparison yields 0 for false, 1 for true. If either of the two 1029 /// lower double-precision values is NaN, 0 is returned. 1030 /// 1031 /// \headerfile <x86intrin.h> 1032 /// 1033 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1034 /// 1035 /// \param __a 1036 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1037 /// compared to the lower double-precision value of \a __b. 1038 /// \param __b 1039 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1040 /// compared to the lower double-precision value of \a __a. 1041 /// \returns An integer containing the comparison results. If either of the two 1042 /// lower double-precision values is NaN, 0 is returned. 
1043 static __inline__ int __DEFAULT_FN_ATTRS 1044 _mm_comile_sd(__m128d __a, __m128d __b) 1045 { 1046 return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b); 1047 } 1048 1049 /// Compares the lower double-precision floating-point values in each of 1050 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1051 /// the value in the first parameter is greater than the corresponding value 1052 /// in the second parameter. 1053 /// 1054 /// The comparison yields 0 for false, 1 for true. If either of the two 1055 /// lower double-precision values is NaN, 0 is returned. 1056 /// 1057 /// \headerfile <x86intrin.h> 1058 /// 1059 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1060 /// 1061 /// \param __a 1062 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1063 /// compared to the lower double-precision value of \a __b. 1064 /// \param __b 1065 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1066 /// compared to the lower double-precision value of \a __a. 1067 /// \returns An integer containing the comparison results. If either of the two 1068 /// lower double-precision values is NaN, 0 is returned. 1069 static __inline__ int __DEFAULT_FN_ATTRS 1070 _mm_comigt_sd(__m128d __a, __m128d __b) 1071 { 1072 return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b); 1073 } 1074 1075 /// Compares the lower double-precision floating-point values in each of 1076 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1077 /// the value in the first parameter is greater than or equal to the 1078 /// corresponding value in the second parameter. 1079 /// 1080 /// The comparison yields 0 for false, 1 for true. If either of the two 1081 /// lower double-precision values is NaN, 0 is returned. 1082 /// 1083 /// \headerfile <x86intrin.h> 1084 /// 1085 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 
1086 /// 1087 /// \param __a 1088 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1089 /// compared to the lower double-precision value of \a __b. 1090 /// \param __b 1091 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1092 /// compared to the lower double-precision value of \a __a. 1093 /// \returns An integer containing the comparison results. If either of the two 1094 /// lower double-precision values is NaN, 0 is returned. 1095 static __inline__ int __DEFAULT_FN_ATTRS 1096 _mm_comige_sd(__m128d __a, __m128d __b) 1097 { 1098 return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b); 1099 } 1100 1101 /// Compares the lower double-precision floating-point values in each of 1102 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1103 /// the value in the first parameter is unequal to the corresponding value in 1104 /// the second parameter. 1105 /// 1106 /// The comparison yields 0 for false, 1 for true. If either of the two 1107 /// lower double-precision values is NaN, 1 is returned. 1108 /// 1109 /// \headerfile <x86intrin.h> 1110 /// 1111 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1112 /// 1113 /// \param __a 1114 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1115 /// compared to the lower double-precision value of \a __b. 1116 /// \param __b 1117 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1118 /// compared to the lower double-precision value of \a __a. 1119 /// \returns An integer containing the comparison results. If either of the two 1120 /// lower double-precision values is NaN, 1 is returned. 
1121 static __inline__ int __DEFAULT_FN_ATTRS 1122 _mm_comineq_sd(__m128d __a, __m128d __b) 1123 { 1124 return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b); 1125 } 1126 1127 /// Compares the lower double-precision floating-point values in each of 1128 /// the two 128-bit floating-point vectors of [2 x double] for equality. The 1129 /// comparison yields 0 for false, 1 for true. 1130 /// 1131 /// If either of the two lower double-precision values is NaN, 0 is returned. 1132 /// 1133 /// \headerfile <x86intrin.h> 1134 /// 1135 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1136 /// 1137 /// \param __a 1138 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1139 /// compared to the lower double-precision value of \a __b. 1140 /// \param __b 1141 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1142 /// compared to the lower double-precision value of \a __a. 1143 /// \returns An integer containing the comparison results. If either of the two 1144 /// lower double-precision values is NaN, 0 is returned. 1145 static __inline__ int __DEFAULT_FN_ATTRS 1146 _mm_ucomieq_sd(__m128d __a, __m128d __b) 1147 { 1148 return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b); 1149 } 1150 1151 /// Compares the lower double-precision floating-point values in each of 1152 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1153 /// the value in the first parameter is less than the corresponding value in 1154 /// the second parameter. 1155 /// 1156 /// The comparison yields 0 for false, 1 for true. If either of the two lower 1157 /// double-precision values is NaN, 0 is returned. 1158 /// 1159 /// \headerfile <x86intrin.h> 1160 /// 1161 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1162 /// 1163 /// \param __a 1164 /// A 128-bit vector of [2 x double]. 
The lower double-precision value is 1165 /// compared to the lower double-precision value of \a __b. 1166 /// \param __b 1167 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1168 /// compared to the lower double-precision value of \a __a. 1169 /// \returns An integer containing the comparison results. If either of the two 1170 /// lower double-precision values is NaN, 0 is returned. 1171 static __inline__ int __DEFAULT_FN_ATTRS 1172 _mm_ucomilt_sd(__m128d __a, __m128d __b) 1173 { 1174 return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b); 1175 } 1176 1177 /// Compares the lower double-precision floating-point values in each of 1178 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1179 /// the value in the first parameter is less than or equal to the 1180 /// corresponding value in the second parameter. 1181 /// 1182 /// The comparison yields 0 for false, 1 for true. If either of the two lower 1183 /// double-precision values is NaN, 0 is returned. 1184 /// 1185 /// \headerfile <x86intrin.h> 1186 /// 1187 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1188 /// 1189 /// \param __a 1190 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1191 /// compared to the lower double-precision value of \a __b. 1192 /// \param __b 1193 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1194 /// compared to the lower double-precision value of \a __a. 1195 /// \returns An integer containing the comparison results. If either of the two 1196 /// lower double-precision values is NaN, 0 is returned. 
1197 static __inline__ int __DEFAULT_FN_ATTRS 1198 _mm_ucomile_sd(__m128d __a, __m128d __b) 1199 { 1200 return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b); 1201 } 1202 1203 /// Compares the lower double-precision floating-point values in each of 1204 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1205 /// the value in the first parameter is greater than the corresponding value 1206 /// in the second parameter. 1207 /// 1208 /// The comparison yields 0 for false, 1 for true. If either of the two lower 1209 /// double-precision values is NaN, 0 is returned. 1210 /// 1211 /// \headerfile <x86intrin.h> 1212 /// 1213 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1214 /// 1215 /// \param __a 1216 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1217 /// compared to the lower double-precision value of \a __b. 1218 /// \param __b 1219 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1220 /// compared to the lower double-precision value of \a __a. 1221 /// \returns An integer containing the comparison results. If either of the two 1222 /// lower double-precision values is NaN, 0 is returned. 1223 static __inline__ int __DEFAULT_FN_ATTRS 1224 _mm_ucomigt_sd(__m128d __a, __m128d __b) 1225 { 1226 return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b); 1227 } 1228 1229 /// Compares the lower double-precision floating-point values in each of 1230 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1231 /// the value in the first parameter is greater than or equal to the 1232 /// corresponding value in the second parameter. 1233 /// 1234 /// The comparison yields 0 for false, 1 for true. If either of the two 1235 /// lower double-precision values is NaN, 0 is returned. 1236 /// 1237 /// \headerfile <x86intrin.h> 1238 /// 1239 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 
1240 /// 1241 /// \param __a 1242 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1243 /// compared to the lower double-precision value of \a __b. 1244 /// \param __b 1245 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1246 /// compared to the lower double-precision value of \a __a. 1247 /// \returns An integer containing the comparison results. If either of the two 1248 /// lower double-precision values is NaN, 0 is returned. 1249 static __inline__ int __DEFAULT_FN_ATTRS 1250 _mm_ucomige_sd(__m128d __a, __m128d __b) 1251 { 1252 return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b); 1253 } 1254 1255 /// Compares the lower double-precision floating-point values in each of 1256 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1257 /// the value in the first parameter is unequal to the corresponding value in 1258 /// the second parameter. 1259 /// 1260 /// The comparison yields 0 for false, 1 for true. If either of the two lower 1261 /// double-precision values is NaN, 1 is returned. 1262 /// 1263 /// \headerfile <x86intrin.h> 1264 /// 1265 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1266 /// 1267 /// \param __a 1268 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1269 /// compared to the lower double-precision value of \a __b. 1270 /// \param __b 1271 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1272 /// compared to the lower double-precision value of \a __a. 1273 /// \returns An integer containing the comparison result. If either of the two 1274 /// lower double-precision values is NaN, 1 is returned. 
1275 static __inline__ int __DEFAULT_FN_ATTRS 1276 _mm_ucomineq_sd(__m128d __a, __m128d __b) 1277 { 1278 return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b); 1279 } 1280 1281 /// Converts the two double-precision floating-point elements of a 1282 /// 128-bit vector of [2 x double] into two single-precision floating-point 1283 /// values, returned in the lower 64 bits of a 128-bit vector of [4 x float]. 1284 /// The upper 64 bits of the result vector are set to zero. 1285 /// 1286 /// \headerfile <x86intrin.h> 1287 /// 1288 /// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction. 1289 /// 1290 /// \param __a 1291 /// A 128-bit vector of [2 x double]. 1292 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the 1293 /// converted values. The upper 64 bits are set to zero. 1294 static __inline__ __m128 __DEFAULT_FN_ATTRS 1295 _mm_cvtpd_ps(__m128d __a) 1296 { 1297 return __builtin_ia32_cvtpd2ps((__v2df)__a); 1298 } 1299 1300 /// Converts the lower two single-precision floating-point elements of a 1301 /// 128-bit vector of [4 x float] into two double-precision floating-point 1302 /// values, returned in a 128-bit vector of [2 x double]. The upper two 1303 /// elements of the input vector are unused. 1304 /// 1305 /// \headerfile <x86intrin.h> 1306 /// 1307 /// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction. 1308 /// 1309 /// \param __a 1310 /// A 128-bit vector of [4 x float]. The lower two single-precision 1311 /// floating-point elements are converted to double-precision values. The 1312 /// upper two elements are unused. 1313 /// \returns A 128-bit vector of [2 x double] containing the converted values. 
1314 static __inline__ __m128d __DEFAULT_FN_ATTRS 1315 _mm_cvtps_pd(__m128 __a) 1316 { 1317 return (__m128d) __builtin_convertvector( 1318 __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df); 1319 } 1320 1321 /// Converts the lower two integer elements of a 128-bit vector of 1322 /// [4 x i32] into two double-precision floating-point values, returned in a 1323 /// 128-bit vector of [2 x double]. 1324 /// 1325 /// The upper two elements of the input vector are unused. 1326 /// 1327 /// \headerfile <x86intrin.h> 1328 /// 1329 /// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction. 1330 /// 1331 /// \param __a 1332 /// A 128-bit integer vector of [4 x i32]. The lower two integer elements are 1333 /// converted to double-precision values. 1334 /// 1335 /// The upper two elements are unused. 1336 /// \returns A 128-bit vector of [2 x double] containing the converted values. 1337 static __inline__ __m128d __DEFAULT_FN_ATTRS 1338 _mm_cvtepi32_pd(__m128i __a) 1339 { 1340 return (__m128d) __builtin_convertvector( 1341 __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df); 1342 } 1343 1344 /// Converts the two double-precision floating-point elements of a 1345 /// 128-bit vector of [2 x double] into two signed 32-bit integer values, 1346 /// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper 1347 /// 64 bits of the result vector are set to zero. 1348 /// 1349 /// \headerfile <x86intrin.h> 1350 /// 1351 /// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction. 1352 /// 1353 /// \param __a 1354 /// A 128-bit vector of [2 x double]. 1355 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the 1356 /// converted values. The upper 64 bits are set to zero. 
1357 static __inline__ __m128i __DEFAULT_FN_ATTRS 1358 _mm_cvtpd_epi32(__m128d __a) 1359 { 1360 return __builtin_ia32_cvtpd2dq((__v2df)__a); 1361 } 1362 1363 /// Converts the low-order element of a 128-bit vector of [2 x double] 1364 /// into a 32-bit signed integer value. 1365 /// 1366 /// \headerfile <x86intrin.h> 1367 /// 1368 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction. 1369 /// 1370 /// \param __a 1371 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 1372 /// conversion. 1373 /// \returns A 32-bit signed integer containing the converted value. 1374 static __inline__ int __DEFAULT_FN_ATTRS 1375 _mm_cvtsd_si32(__m128d __a) 1376 { 1377 return __builtin_ia32_cvtsd2si((__v2df)__a); 1378 } 1379 1380 /// Converts the lower double-precision floating-point element of a 1381 /// 128-bit vector of [2 x double], in the second parameter, into a 1382 /// single-precision floating-point value, returned in the lower 32 bits of a 1383 /// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are 1384 /// copied from the upper 96 bits of the first parameter. 1385 /// 1386 /// \headerfile <x86intrin.h> 1387 /// 1388 /// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction. 1389 /// 1390 /// \param __a 1391 /// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are 1392 /// copied to the upper 96 bits of the result. 1393 /// \param __b 1394 /// A 128-bit vector of [2 x double]. The lower double-precision 1395 /// floating-point element is used in the conversion. 1396 /// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the 1397 /// converted value from the second parameter. The upper 96 bits are copied 1398 /// from the upper 96 bits of the first parameter. 
1399 static __inline__ __m128 __DEFAULT_FN_ATTRS 1400 _mm_cvtsd_ss(__m128 __a, __m128d __b) 1401 { 1402 return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b); 1403 } 1404 1405 /// Converts a 32-bit signed integer value, in the second parameter, into 1406 /// a double-precision floating-point value, returned in the lower 64 bits of 1407 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector 1408 /// are copied from the upper 64 bits of the first parameter. 1409 /// 1410 /// \headerfile <x86intrin.h> 1411 /// 1412 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction. 1413 /// 1414 /// \param __a 1415 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are 1416 /// copied to the upper 64 bits of the result. 1417 /// \param __b 1418 /// A 32-bit signed integer containing the value to be converted. 1419 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the 1420 /// converted value from the second parameter. The upper 64 bits are copied 1421 /// from the upper 64 bits of the first parameter. 1422 static __inline__ __m128d __DEFAULT_FN_ATTRS 1423 _mm_cvtsi32_sd(__m128d __a, int __b) 1424 { 1425 __a[0] = __b; 1426 return __a; 1427 } 1428 1429 /// Converts the lower single-precision floating-point element of a 1430 /// 128-bit vector of [4 x float], in the second parameter, into a 1431 /// double-precision floating-point value, returned in the lower 64 bits of 1432 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector 1433 /// are copied from the upper 64 bits of the first parameter. 1434 /// 1435 /// \headerfile <x86intrin.h> 1436 /// 1437 /// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction. 1438 /// 1439 /// \param __a 1440 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are 1441 /// copied to the upper 64 bits of the result. 1442 /// \param __b 1443 /// A 128-bit vector of [4 x float]. 
The lower single-precision 1444 /// floating-point element is used in the conversion. 1445 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the 1446 /// converted value from the second parameter. The upper 64 bits are copied 1447 /// from the upper 64 bits of the first parameter. 1448 static __inline__ __m128d __DEFAULT_FN_ATTRS 1449 _mm_cvtss_sd(__m128d __a, __m128 __b) 1450 { 1451 __a[0] = __b[0]; 1452 return __a; 1453 } 1454 1455 /// Converts the two double-precision floating-point elements of a 1456 /// 128-bit vector of [2 x double] into two signed 32-bit integer values, 1457 /// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. 1458 /// 1459 /// If the result of either conversion is inexact, the result is truncated 1460 /// (rounded towards zero) regardless of the current MXCSR setting. The upper 1461 /// 64 bits of the result vector are set to zero. 1462 /// 1463 /// \headerfile <x86intrin.h> 1464 /// 1465 /// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c> 1466 /// instruction. 1467 /// 1468 /// \param __a 1469 /// A 128-bit vector of [2 x double]. 1470 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the 1471 /// converted values. The upper 64 bits are set to zero. 1472 static __inline__ __m128i __DEFAULT_FN_ATTRS 1473 _mm_cvttpd_epi32(__m128d __a) 1474 { 1475 return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a); 1476 } 1477 1478 /// Converts the low-order element of a [2 x double] vector into a 32-bit 1479 /// signed integer value, truncating the result when it is inexact. 1480 /// 1481 /// \headerfile <x86intrin.h> 1482 /// 1483 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c> 1484 /// instruction. 1485 /// 1486 /// \param __a 1487 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 1488 /// conversion. 1489 /// \returns A 32-bit signed integer containing the converted value. 
1490 static __inline__ int __DEFAULT_FN_ATTRS 1491 _mm_cvttsd_si32(__m128d __a) 1492 { 1493 return __builtin_ia32_cvttsd2si((__v2df)__a); 1494 } 1495 1496 /// Converts the two double-precision floating-point elements of a 1497 /// 128-bit vector of [2 x double] into two signed 32-bit integer values, 1498 /// returned in a 64-bit vector of [2 x i32]. 1499 /// 1500 /// \headerfile <x86intrin.h> 1501 /// 1502 /// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction. 1503 /// 1504 /// \param __a 1505 /// A 128-bit vector of [2 x double]. 1506 /// \returns A 64-bit vector of [2 x i32] containing the converted values. 1507 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 1508 _mm_cvtpd_pi32(__m128d __a) 1509 { 1510 return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a); 1511 } 1512 1513 /// Converts the two double-precision floating-point elements of a 1514 /// 128-bit vector of [2 x double] into two signed 32-bit integer values, 1515 /// returned in a 64-bit vector of [2 x i32]. 1516 /// 1517 /// If the result of either conversion is inexact, the result is truncated 1518 /// (rounded towards zero) regardless of the current MXCSR setting. 1519 /// 1520 /// \headerfile <x86intrin.h> 1521 /// 1522 /// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction. 1523 /// 1524 /// \param __a 1525 /// A 128-bit vector of [2 x double]. 1526 /// \returns A 64-bit vector of [2 x i32] containing the converted values. 1527 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 1528 _mm_cvttpd_pi32(__m128d __a) 1529 { 1530 return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a); 1531 } 1532 1533 /// Converts the two signed 32-bit integer elements of a 64-bit vector of 1534 /// [2 x i32] into two double-precision floating-point values, returned in a 1535 /// 128-bit vector of [2 x double]. 1536 /// 1537 /// \headerfile <x86intrin.h> 1538 /// 1539 /// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction. 1540 /// 1541 /// \param __a 1542 /// A 64-bit vector of [2 x i32]. 
1543 /// \returns A 128-bit vector of [2 x double] containing the converted values. 1544 static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX 1545 _mm_cvtpi32_pd(__m64 __a) 1546 { 1547 return __builtin_ia32_cvtpi2pd((__v2si)__a); 1548 } 1549 1550 /// Returns the low-order element of a 128-bit vector of [2 x double] as 1551 /// a double-precision floating-point value. 1552 /// 1553 /// \headerfile <x86intrin.h> 1554 /// 1555 /// This intrinsic has no corresponding instruction. 1556 /// 1557 /// \param __a 1558 /// A 128-bit vector of [2 x double]. The lower 64 bits are returned. 1559 /// \returns A double-precision floating-point value copied from the lower 64 1560 /// bits of \a __a. 1561 static __inline__ double __DEFAULT_FN_ATTRS 1562 _mm_cvtsd_f64(__m128d __a) 1563 { 1564 return __a[0]; 1565 } 1566 1567 /// Loads a 128-bit floating-point vector of [2 x double] from an aligned 1568 /// memory location. 1569 /// 1570 /// \headerfile <x86intrin.h> 1571 /// 1572 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction. 1573 /// 1574 /// \param __dp 1575 /// A pointer to a 128-bit memory location. The address of the memory 1576 /// location has to be 16-byte aligned. 1577 /// \returns A 128-bit vector of [2 x double] containing the loaded values. 1578 static __inline__ __m128d __DEFAULT_FN_ATTRS 1579 _mm_load_pd(double const *__dp) 1580 { 1581 return *(__m128d*)__dp; 1582 } 1583 1584 /// Loads a double-precision floating-point value from a specified memory 1585 /// location and duplicates it to both vector elements of a 128-bit vector of 1586 /// [2 x double]. 1587 /// 1588 /// \headerfile <x86intrin.h> 1589 /// 1590 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction. 1591 /// 1592 /// \param __dp 1593 /// A pointer to a memory location containing a double-precision value. 1594 /// \returns A 128-bit vector of [2 x double] containing the loaded and 1595 /// duplicated values. 
1596 static __inline__ __m128d __DEFAULT_FN_ATTRS 1597 _mm_load1_pd(double const *__dp) 1598 { 1599 struct __mm_load1_pd_struct { 1600 double __u; 1601 } __attribute__((__packed__, __may_alias__)); 1602 double __u = ((struct __mm_load1_pd_struct*)__dp)->__u; 1603 return __extension__ (__m128d){ __u, __u }; 1604 } 1605 1606 #define _mm_load_pd1(dp) _mm_load1_pd(dp) 1607 1608 /// Loads two double-precision values, in reverse order, from an aligned 1609 /// memory location into a 128-bit vector of [2 x double]. 1610 /// 1611 /// \headerfile <x86intrin.h> 1612 /// 1613 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction + 1614 /// needed shuffling instructions. In AVX mode, the shuffling may be combined 1615 /// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction. 1616 /// 1617 /// \param __dp 1618 /// A 16-byte aligned pointer to an array of double-precision values to be 1619 /// loaded in reverse order. 1620 /// \returns A 128-bit vector of [2 x double] containing the reversed loaded 1621 /// values. 1622 static __inline__ __m128d __DEFAULT_FN_ATTRS 1623 _mm_loadr_pd(double const *__dp) 1624 { 1625 __m128d __u = *(__m128d*)__dp; 1626 return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0); 1627 } 1628 1629 /// Loads a 128-bit floating-point vector of [2 x double] from an 1630 /// unaligned memory location. 1631 /// 1632 /// \headerfile <x86intrin.h> 1633 /// 1634 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction. 1635 /// 1636 /// \param __dp 1637 /// A pointer to a 128-bit memory location. The address of the memory 1638 /// location does not have to be aligned. 1639 /// \returns A 128-bit vector of [2 x double] containing the loaded values. 
1640 static __inline__ __m128d __DEFAULT_FN_ATTRS 1641 _mm_loadu_pd(double const *__dp) 1642 { 1643 struct __loadu_pd { 1644 __m128d_u __v; 1645 } __attribute__((__packed__, __may_alias__)); 1646 return ((struct __loadu_pd*)__dp)->__v; 1647 } 1648 1649 /// Loads a 64-bit integer value to the low element of a 128-bit integer 1650 /// vector and clears the upper element. 1651 /// 1652 /// \headerfile <x86intrin.h> 1653 /// 1654 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 1655 /// 1656 /// \param __a 1657 /// A pointer to a 64-bit memory location. The address of the memory 1658 /// location does not have to be aligned. 1659 /// \returns A 128-bit vector of [2 x i64] containing the loaded value. 1660 static __inline__ __m128i __DEFAULT_FN_ATTRS 1661 _mm_loadu_si64(void const *__a) 1662 { 1663 struct __loadu_si64 { 1664 long long __v; 1665 } __attribute__((__packed__, __may_alias__)); 1666 long long __u = ((struct __loadu_si64*)__a)->__v; 1667 return __extension__ (__m128i)(__v2di){__u, 0LL}; 1668 } 1669 1670 /// Loads a 32-bit integer value to the low element of a 128-bit integer 1671 /// vector and clears the upper element. 1672 /// 1673 /// \headerfile <x86intrin.h> 1674 /// 1675 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. 1676 /// 1677 /// \param __a 1678 /// A pointer to a 32-bit memory location. The address of the memory 1679 /// location does not have to be aligned. 1680 /// \returns A 128-bit vector of [4 x i32] containing the loaded value. 1681 static __inline__ __m128i __DEFAULT_FN_ATTRS 1682 _mm_loadu_si32(void const *__a) 1683 { 1684 struct __loadu_si32 { 1685 int __v; 1686 } __attribute__((__packed__, __may_alias__)); 1687 int __u = ((struct __loadu_si32*)__a)->__v; 1688 return __extension__ (__m128i)(__v4si){__u, 0, 0, 0}; 1689 } 1690 1691 /// Loads a 16-bit integer value to the low element of a 128-bit integer 1692 /// vector and clears the upper element. 
1693 /// 1694 /// \headerfile <x86intrin.h> 1695 /// 1696 /// This intrinsic does not correspond to a specific instruction. 1697 /// 1698 /// \param __a 1699 /// A pointer to a 16-bit memory location. The address of the memory 1700 /// location does not have to be aligned. 1701 /// \returns A 128-bit vector of [8 x i16] containing the loaded value. 1702 static __inline__ __m128i __DEFAULT_FN_ATTRS 1703 _mm_loadu_si16(void const *__a) 1704 { 1705 struct __loadu_si16 { 1706 short __v; 1707 } __attribute__((__packed__, __may_alias__)); 1708 short __u = ((struct __loadu_si16*)__a)->__v; 1709 return __extension__ (__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0}; 1710 } 1711 1712 /// Loads a 64-bit double-precision value to the low element of a 1713 /// 128-bit integer vector and clears the upper element. 1714 /// 1715 /// \headerfile <x86intrin.h> 1716 /// 1717 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction. 1718 /// 1719 /// \param __dp 1720 /// A pointer to a memory location containing a double-precision value. 1721 /// The address of the memory location does not have to be aligned. 1722 /// \returns A 128-bit vector of [2 x double] containing the loaded value. 1723 static __inline__ __m128d __DEFAULT_FN_ATTRS 1724 _mm_load_sd(double const *__dp) 1725 { 1726 struct __mm_load_sd_struct { 1727 double __u; 1728 } __attribute__((__packed__, __may_alias__)); 1729 double __u = ((struct __mm_load_sd_struct*)__dp)->__u; 1730 return __extension__ (__m128d){ __u, 0 }; 1731 } 1732 1733 /// Loads a double-precision value into the high-order bits of a 128-bit 1734 /// vector of [2 x double]. The low-order bits are copied from the low-order 1735 /// bits of the first operand. 1736 /// 1737 /// \headerfile <x86intrin.h> 1738 /// 1739 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction. 1740 /// 1741 /// \param __a 1742 /// A 128-bit vector of [2 x double]. \n 1743 /// Bits [63:0] are written to bits [63:0] of the result. 
1744 /// \param __dp 1745 /// A pointer to a 64-bit memory location containing a double-precision 1746 /// floating-point value that is loaded. The loaded value is written to bits 1747 /// [127:64] of the result. The address of the memory location does not have 1748 /// to be aligned. 1749 /// \returns A 128-bit vector of [2 x double] containing the moved values. 1750 static __inline__ __m128d __DEFAULT_FN_ATTRS 1751 _mm_loadh_pd(__m128d __a, double const *__dp) 1752 { 1753 struct __mm_loadh_pd_struct { 1754 double __u; 1755 } __attribute__((__packed__, __may_alias__)); 1756 double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u; 1757 return __extension__ (__m128d){ __a[0], __u }; 1758 } 1759 1760 /// Loads a double-precision value into the low-order bits of a 128-bit 1761 /// vector of [2 x double]. The high-order bits are copied from the 1762 /// high-order bits of the first operand. 1763 /// 1764 /// \headerfile <x86intrin.h> 1765 /// 1766 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction. 1767 /// 1768 /// \param __a 1769 /// A 128-bit vector of [2 x double]. \n 1770 /// Bits [127:64] are written to bits [127:64] of the result. 1771 /// \param __dp 1772 /// A pointer to a 64-bit memory location containing a double-precision 1773 /// floating-point value that is loaded. The loaded value is written to bits 1774 /// [63:0] of the result. The address of the memory location does not have to 1775 /// be aligned. 1776 /// \returns A 128-bit vector of [2 x double] containing the moved values. 1777 static __inline__ __m128d __DEFAULT_FN_ATTRS 1778 _mm_loadl_pd(__m128d __a, double const *__dp) 1779 { 1780 struct __mm_loadl_pd_struct { 1781 double __u; 1782 } __attribute__((__packed__, __may_alias__)); 1783 double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u; 1784 return __extension__ (__m128d){ __u, __a[1] }; 1785 } 1786 1787 /// Constructs a 128-bit floating-point vector of [2 x double] with 1788 /// unspecified content. 
This could be used as an argument to another 1789 /// intrinsic function where the argument is required but the value is not 1790 /// actually used. 1791 /// 1792 /// \headerfile <x86intrin.h> 1793 /// 1794 /// This intrinsic has no corresponding instruction. 1795 /// 1796 /// \returns A 128-bit floating-point vector of [2 x double] with unspecified 1797 /// content. 1798 static __inline__ __m128d __DEFAULT_FN_ATTRS 1799 _mm_undefined_pd(void) 1800 { 1801 return (__m128d)__builtin_ia32_undef128(); 1802 } 1803 1804 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower 1805 /// 64 bits of the vector are initialized with the specified double-precision 1806 /// floating-point value. The upper 64 bits are set to zero. 1807 /// 1808 /// \headerfile <x86intrin.h> 1809 /// 1810 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 1811 /// 1812 /// \param __w 1813 /// A double-precision floating-point value used to initialize the lower 64 1814 /// bits of the result. 1815 /// \returns An initialized 128-bit floating-point vector of [2 x double]. The 1816 /// lower 64 bits contain the value of the parameter. The upper 64 bits are 1817 /// set to zero. 1818 static __inline__ __m128d __DEFAULT_FN_ATTRS 1819 _mm_set_sd(double __w) 1820 { 1821 return __extension__ (__m128d){ __w, 0 }; 1822 } 1823 1824 /// Constructs a 128-bit floating-point vector of [2 x double], with each 1825 /// of the two double-precision floating-point vector elements set to the 1826 /// specified double-precision floating-point value. 1827 /// 1828 /// \headerfile <x86intrin.h> 1829 /// 1830 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction. 1831 /// 1832 /// \param __w 1833 /// A double-precision floating-point value used to initialize each vector 1834 /// element of the result. 1835 /// \returns An initialized 128-bit floating-point vector of [2 x double]. 
1836 static __inline__ __m128d __DEFAULT_FN_ATTRS 1837 _mm_set1_pd(double __w) 1838 { 1839 return __extension__ (__m128d){ __w, __w }; 1840 } 1841 1842 /// Constructs a 128-bit floating-point vector of [2 x double], with each 1843 /// of the two double-precision floating-point vector elements set to the 1844 /// specified double-precision floating-point value. 1845 /// 1846 /// \headerfile <x86intrin.h> 1847 /// 1848 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction. 1849 /// 1850 /// \param __w 1851 /// A double-precision floating-point value used to initialize each vector 1852 /// element of the result. 1853 /// \returns An initialized 128-bit floating-point vector of [2 x double]. 1854 static __inline__ __m128d __DEFAULT_FN_ATTRS 1855 _mm_set_pd1(double __w) 1856 { 1857 return _mm_set1_pd(__w); 1858 } 1859 1860 /// Constructs a 128-bit floating-point vector of [2 x double] 1861 /// initialized with the specified double-precision floating-point values. 1862 /// 1863 /// \headerfile <x86intrin.h> 1864 /// 1865 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction. 1866 /// 1867 /// \param __w 1868 /// A double-precision floating-point value used to initialize the upper 64 1869 /// bits of the result. 1870 /// \param __x 1871 /// A double-precision floating-point value used to initialize the lower 64 1872 /// bits of the result. 1873 /// \returns An initialized 128-bit floating-point vector of [2 x double]. 1874 static __inline__ __m128d __DEFAULT_FN_ATTRS 1875 _mm_set_pd(double __w, double __x) 1876 { 1877 return __extension__ (__m128d){ __x, __w }; 1878 } 1879 1880 /// Constructs a 128-bit floating-point vector of [2 x double], 1881 /// initialized in reverse order with the specified double-precision 1882 /// floating-point values. 1883 /// 1884 /// \headerfile <x86intrin.h> 1885 /// 1886 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction. 
1887 /// 1888 /// \param __w 1889 /// A double-precision floating-point value used to initialize the lower 64 1890 /// bits of the result. 1891 /// \param __x 1892 /// A double-precision floating-point value used to initialize the upper 64 1893 /// bits of the result. 1894 /// \returns An initialized 128-bit floating-point vector of [2 x double]. 1895 static __inline__ __m128d __DEFAULT_FN_ATTRS 1896 _mm_setr_pd(double __w, double __x) 1897 { 1898 return __extension__ (__m128d){ __w, __x }; 1899 } 1900 1901 /// Constructs a 128-bit floating-point vector of [2 x double] 1902 /// initialized to zero. 1903 /// 1904 /// \headerfile <x86intrin.h> 1905 /// 1906 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction. 1907 /// 1908 /// \returns An initialized 128-bit floating-point vector of [2 x double] with 1909 /// all elements set to zero. 1910 static __inline__ __m128d __DEFAULT_FN_ATTRS 1911 _mm_setzero_pd(void) 1912 { 1913 return __extension__ (__m128d){ 0, 0 }; 1914 } 1915 1916 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower 1917 /// 64 bits are set to the lower 64 bits of the second parameter. The upper 1918 /// 64 bits are set to the upper 64 bits of the first parameter. 1919 /// 1920 /// \headerfile <x86intrin.h> 1921 /// 1922 /// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction. 1923 /// 1924 /// \param __a 1925 /// A 128-bit vector of [2 x double]. The upper 64 bits are written to the 1926 /// upper 64 bits of the result. 1927 /// \param __b 1928 /// A 128-bit vector of [2 x double]. The lower 64 bits are written to the 1929 /// lower 64 bits of the result. 1930 /// \returns A 128-bit vector of [2 x double] containing the moved values. 1931 static __inline__ __m128d __DEFAULT_FN_ATTRS 1932 _mm_move_sd(__m128d __a, __m128d __b) 1933 { 1934 __a[0] = __b[0]; 1935 return __a; 1936 } 1937 1938 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a 1939 /// memory location. 
1940 /// 1941 /// \headerfile <x86intrin.h> 1942 /// 1943 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction. 1944 /// 1945 /// \param __dp 1946 /// A pointer to a 64-bit memory location. 1947 /// \param __a 1948 /// A 128-bit vector of [2 x double] containing the value to be stored. 1949 static __inline__ void __DEFAULT_FN_ATTRS 1950 _mm_store_sd(double *__dp, __m128d __a) 1951 { 1952 struct __mm_store_sd_struct { 1953 double __u; 1954 } __attribute__((__packed__, __may_alias__)); 1955 ((struct __mm_store_sd_struct*)__dp)->__u = __a[0]; 1956 } 1957 1958 /// Moves packed double-precision values from a 128-bit vector of 1959 /// [2 x double] to a memory location. 1960 /// 1961 /// \headerfile <x86intrin.h> 1962 /// 1963 /// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction. 1964 /// 1965 /// \param __dp 1966 /// A pointer to an aligned memory location that can store two 1967 /// double-precision values. 1968 /// \param __a 1969 /// A packed 128-bit vector of [2 x double] containing the values to be 1970 /// moved. 1971 static __inline__ void __DEFAULT_FN_ATTRS 1972 _mm_store_pd(double *__dp, __m128d __a) 1973 { 1974 *(__m128d*)__dp = __a; 1975 } 1976 1977 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to 1978 /// the upper and lower 64 bits of a memory location. 1979 /// 1980 /// \headerfile <x86intrin.h> 1981 /// 1982 /// This intrinsic corresponds to the 1983 /// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction. 1984 /// 1985 /// \param __dp 1986 /// A pointer to a memory location that can store two double-precision 1987 /// values. 1988 /// \param __a 1989 /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each 1990 /// of the values in \a __dp. 
1991 static __inline__ void __DEFAULT_FN_ATTRS 1992 _mm_store1_pd(double *__dp, __m128d __a) 1993 { 1994 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0); 1995 _mm_store_pd(__dp, __a); 1996 } 1997 1998 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to 1999 /// the upper and lower 64 bits of a memory location. 2000 /// 2001 /// \headerfile <x86intrin.h> 2002 /// 2003 /// This intrinsic corresponds to the 2004 /// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction. 2005 /// 2006 /// \param __dp 2007 /// A pointer to a memory location that can store two double-precision 2008 /// values. 2009 /// \param __a 2010 /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each 2011 /// of the values in \a __dp. 2012 static __inline__ void __DEFAULT_FN_ATTRS 2013 _mm_store_pd1(double *__dp, __m128d __a) 2014 { 2015 _mm_store1_pd(__dp, __a); 2016 } 2017 2018 /// Stores a 128-bit vector of [2 x double] into an unaligned memory 2019 /// location. 2020 /// 2021 /// \headerfile <x86intrin.h> 2022 /// 2023 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction. 2024 /// 2025 /// \param __dp 2026 /// A pointer to a 128-bit memory location. The address of the memory 2027 /// location does not have to be aligned. 2028 /// \param __a 2029 /// A 128-bit vector of [2 x double] containing the values to be stored. 2030 static __inline__ void __DEFAULT_FN_ATTRS 2031 _mm_storeu_pd(double *__dp, __m128d __a) 2032 { 2033 struct __storeu_pd { 2034 __m128d_u __v; 2035 } __attribute__((__packed__, __may_alias__)); 2036 ((struct __storeu_pd*)__dp)->__v = __a; 2037 } 2038 2039 /// Stores two double-precision values, in reverse order, from a 128-bit 2040 /// vector of [2 x double] to a 16-byte aligned memory location. 2041 /// 2042 /// \headerfile <x86intrin.h> 2043 /// 2044 /// This intrinsic corresponds to a shuffling instruction followed by a 2045 /// <c> VMOVAPD / MOVAPD </c> instruction. 
2046 /// 2047 /// \param __dp 2048 /// A pointer to a 16-byte aligned memory location that can store two 2049 /// double-precision values. 2050 /// \param __a 2051 /// A 128-bit vector of [2 x double] containing the values to be reversed and 2052 /// stored. 2053 static __inline__ void __DEFAULT_FN_ATTRS 2054 _mm_storer_pd(double *__dp, __m128d __a) 2055 { 2056 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0); 2057 *(__m128d *)__dp = __a; 2058 } 2059 2060 /// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a 2061 /// memory location. 2062 /// 2063 /// \headerfile <x86intrin.h> 2064 /// 2065 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction. 2066 /// 2067 /// \param __dp 2068 /// A pointer to a 64-bit memory location. 2069 /// \param __a 2070 /// A 128-bit vector of [2 x double] containing the value to be stored. 2071 static __inline__ void __DEFAULT_FN_ATTRS 2072 _mm_storeh_pd(double *__dp, __m128d __a) 2073 { 2074 struct __mm_storeh_pd_struct { 2075 double __u; 2076 } __attribute__((__packed__, __may_alias__)); 2077 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1]; 2078 } 2079 2080 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a 2081 /// memory location. 2082 /// 2083 /// \headerfile <x86intrin.h> 2084 /// 2085 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction. 2086 /// 2087 /// \param __dp 2088 /// A pointer to a 64-bit memory location. 2089 /// \param __a 2090 /// A 128-bit vector of [2 x double] containing the value to be stored. 
2091 static __inline__ void __DEFAULT_FN_ATTRS 2092 _mm_storel_pd(double *__dp, __m128d __a) 2093 { 2094 struct __mm_storeh_pd_struct { 2095 double __u; 2096 } __attribute__((__packed__, __may_alias__)); 2097 ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0]; 2098 } 2099 2100 /// Adds the corresponding elements of two 128-bit vectors of [16 x i8], 2101 /// saving the lower 8 bits of each sum in the corresponding element of a 2102 /// 128-bit result vector of [16 x i8]. 2103 /// 2104 /// The integer elements of both parameters can be either signed or unsigned. 2105 /// 2106 /// \headerfile <x86intrin.h> 2107 /// 2108 /// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction. 2109 /// 2110 /// \param __a 2111 /// A 128-bit vector of [16 x i8]. 2112 /// \param __b 2113 /// A 128-bit vector of [16 x i8]. 2114 /// \returns A 128-bit vector of [16 x i8] containing the sums of both 2115 /// parameters. 2116 static __inline__ __m128i __DEFAULT_FN_ATTRS 2117 _mm_add_epi8(__m128i __a, __m128i __b) 2118 { 2119 return (__m128i)((__v16qu)__a + (__v16qu)__b); 2120 } 2121 2122 /// Adds the corresponding elements of two 128-bit vectors of [8 x i16], 2123 /// saving the lower 16 bits of each sum in the corresponding element of a 2124 /// 128-bit result vector of [8 x i16]. 2125 /// 2126 /// The integer elements of both parameters can be either signed or unsigned. 2127 /// 2128 /// \headerfile <x86intrin.h> 2129 /// 2130 /// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction. 2131 /// 2132 /// \param __a 2133 /// A 128-bit vector of [8 x i16]. 2134 /// \param __b 2135 /// A 128-bit vector of [8 x i16]. 2136 /// \returns A 128-bit vector of [8 x i16] containing the sums of both 2137 /// parameters. 
2138 static __inline__ __m128i __DEFAULT_FN_ATTRS 2139 _mm_add_epi16(__m128i __a, __m128i __b) 2140 { 2141 return (__m128i)((__v8hu)__a + (__v8hu)__b); 2142 } 2143 2144 /// Adds the corresponding elements of two 128-bit vectors of [4 x i32], 2145 /// saving the lower 32 bits of each sum in the corresponding element of a 2146 /// 128-bit result vector of [4 x i32]. 2147 /// 2148 /// The integer elements of both parameters can be either signed or unsigned. 2149 /// 2150 /// \headerfile <x86intrin.h> 2151 /// 2152 /// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction. 2153 /// 2154 /// \param __a 2155 /// A 128-bit vector of [4 x i32]. 2156 /// \param __b 2157 /// A 128-bit vector of [4 x i32]. 2158 /// \returns A 128-bit vector of [4 x i32] containing the sums of both 2159 /// parameters. 2160 static __inline__ __m128i __DEFAULT_FN_ATTRS 2161 _mm_add_epi32(__m128i __a, __m128i __b) 2162 { 2163 return (__m128i)((__v4su)__a + (__v4su)__b); 2164 } 2165 2166 /// Adds two signed or unsigned 64-bit integer values, returning the 2167 /// lower 64 bits of the sum. 2168 /// 2169 /// \headerfile <x86intrin.h> 2170 /// 2171 /// This intrinsic corresponds to the <c> PADDQ </c> instruction. 2172 /// 2173 /// \param __a 2174 /// A 64-bit integer. 2175 /// \param __b 2176 /// A 64-bit integer. 2177 /// \returns A 64-bit integer containing the sum of both parameters. 2178 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2179 _mm_add_si64(__m64 __a, __m64 __b) 2180 { 2181 return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b); 2182 } 2183 2184 /// Adds the corresponding elements of two 128-bit vectors of [2 x i64], 2185 /// saving the lower 64 bits of each sum in the corresponding element of a 2186 /// 128-bit result vector of [2 x i64]. 2187 /// 2188 /// The integer elements of both parameters can be either signed or unsigned. 2189 /// 2190 /// \headerfile <x86intrin.h> 2191 /// 2192 /// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction. 
2193 /// 2194 /// \param __a 2195 /// A 128-bit vector of [2 x i64]. 2196 /// \param __b 2197 /// A 128-bit vector of [2 x i64]. 2198 /// \returns A 128-bit vector of [2 x i64] containing the sums of both 2199 /// parameters. 2200 static __inline__ __m128i __DEFAULT_FN_ATTRS 2201 _mm_add_epi64(__m128i __a, __m128i __b) 2202 { 2203 return (__m128i)((__v2du)__a + (__v2du)__b); 2204 } 2205 2206 /// Adds, with saturation, the corresponding elements of two 128-bit 2207 /// signed [16 x i8] vectors, saving each sum in the corresponding element of 2208 /// a 128-bit result vector of [16 x i8]. Positive sums greater than 0x7F are 2209 /// saturated to 0x7F. Negative sums less than 0x80 are saturated to 0x80. 2210 /// 2211 /// \headerfile <x86intrin.h> 2212 /// 2213 /// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction. 2214 /// 2215 /// \param __a 2216 /// A 128-bit signed [16 x i8] vector. 2217 /// \param __b 2218 /// A 128-bit signed [16 x i8] vector. 2219 /// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of 2220 /// both parameters. 2221 static __inline__ __m128i __DEFAULT_FN_ATTRS 2222 _mm_adds_epi8(__m128i __a, __m128i __b) 2223 { 2224 return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b); 2225 } 2226 2227 /// Adds, with saturation, the corresponding elements of two 128-bit 2228 /// signed [8 x i16] vectors, saving each sum in the corresponding element of 2229 /// a 128-bit result vector of [8 x i16]. Positive sums greater than 0x7FFF 2230 /// are saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to 2231 /// 0x8000. 2232 /// 2233 /// \headerfile <x86intrin.h> 2234 /// 2235 /// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction. 2236 /// 2237 /// \param __a 2238 /// A 128-bit signed [8 x i16] vector. 2239 /// \param __b 2240 /// A 128-bit signed [8 x i16] vector. 
2241 /// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of 2242 /// both parameters. 2243 static __inline__ __m128i __DEFAULT_FN_ATTRS 2244 _mm_adds_epi16(__m128i __a, __m128i __b) 2245 { 2246 return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b); 2247 } 2248 2249 /// Adds, with saturation, the corresponding elements of two 128-bit 2250 /// unsigned [16 x i8] vectors, saving each sum in the corresponding element 2251 /// of a 128-bit result vector of [16 x i8]. Positive sums greater than 0xFF 2252 /// are saturated to 0xFF. Negative sums are saturated to 0x00. 2253 /// 2254 /// \headerfile <x86intrin.h> 2255 /// 2256 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction. 2257 /// 2258 /// \param __a 2259 /// A 128-bit unsigned [16 x i8] vector. 2260 /// \param __b 2261 /// A 128-bit unsigned [16 x i8] vector. 2262 /// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums 2263 /// of both parameters. 2264 static __inline__ __m128i __DEFAULT_FN_ATTRS 2265 _mm_adds_epu8(__m128i __a, __m128i __b) 2266 { 2267 return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b); 2268 } 2269 2270 /// Adds, with saturation, the corresponding elements of two 128-bit 2271 /// unsigned [8 x i16] vectors, saving each sum in the corresponding element 2272 /// of a 128-bit result vector of [8 x i16]. Positive sums greater than 2273 /// 0xFFFF are saturated to 0xFFFF. Negative sums are saturated to 0x0000. 2274 /// 2275 /// \headerfile <x86intrin.h> 2276 /// 2277 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction. 2278 /// 2279 /// \param __a 2280 /// A 128-bit unsigned [8 x i16] vector. 2281 /// \param __b 2282 /// A 128-bit unsigned [8 x i16] vector. 2283 /// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums 2284 /// of both parameters. 
2285 static __inline__ __m128i __DEFAULT_FN_ATTRS 2286 _mm_adds_epu16(__m128i __a, __m128i __b) 2287 { 2288 return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b); 2289 } 2290 2291 /// Computes the rounded avarages of corresponding elements of two 2292 /// 128-bit unsigned [16 x i8] vectors, saving each result in the 2293 /// corresponding element of a 128-bit result vector of [16 x i8]. 2294 /// 2295 /// \headerfile <x86intrin.h> 2296 /// 2297 /// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction. 2298 /// 2299 /// \param __a 2300 /// A 128-bit unsigned [16 x i8] vector. 2301 /// \param __b 2302 /// A 128-bit unsigned [16 x i8] vector. 2303 /// \returns A 128-bit unsigned [16 x i8] vector containing the rounded 2304 /// averages of both parameters. 2305 static __inline__ __m128i __DEFAULT_FN_ATTRS 2306 _mm_avg_epu8(__m128i __a, __m128i __b) 2307 { 2308 return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b); 2309 } 2310 2311 /// Computes the rounded avarages of corresponding elements of two 2312 /// 128-bit unsigned [8 x i16] vectors, saving each result in the 2313 /// corresponding element of a 128-bit result vector of [8 x i16]. 2314 /// 2315 /// \headerfile <x86intrin.h> 2316 /// 2317 /// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction. 2318 /// 2319 /// \param __a 2320 /// A 128-bit unsigned [8 x i16] vector. 2321 /// \param __b 2322 /// A 128-bit unsigned [8 x i16] vector. 2323 /// \returns A 128-bit unsigned [8 x i16] vector containing the rounded 2324 /// averages of both parameters. 
2325 static __inline__ __m128i __DEFAULT_FN_ATTRS 2326 _mm_avg_epu16(__m128i __a, __m128i __b) 2327 { 2328 return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b); 2329 } 2330 2331 /// Multiplies the corresponding elements of two 128-bit signed [8 x i16] 2332 /// vectors, producing eight intermediate 32-bit signed integer products, and 2333 /// adds the consecutive pairs of 32-bit products to form a 128-bit signed 2334 /// [4 x i32] vector. 2335 /// 2336 /// For example, bits [15:0] of both parameters are multiplied producing a 2337 /// 32-bit product, bits [31:16] of both parameters are multiplied producing 2338 /// a 32-bit product, and the sum of those two products becomes bits [31:0] 2339 /// of the result. 2340 /// 2341 /// \headerfile <x86intrin.h> 2342 /// 2343 /// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction. 2344 /// 2345 /// \param __a 2346 /// A 128-bit signed [8 x i16] vector. 2347 /// \param __b 2348 /// A 128-bit signed [8 x i16] vector. 2349 /// \returns A 128-bit signed [4 x i32] vector containing the sums of products 2350 /// of both parameters. 2351 static __inline__ __m128i __DEFAULT_FN_ATTRS 2352 _mm_madd_epi16(__m128i __a, __m128i __b) 2353 { 2354 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b); 2355 } 2356 2357 /// Compares corresponding elements of two 128-bit signed [8 x i16] 2358 /// vectors, saving the greater value from each comparison in the 2359 /// corresponding element of a 128-bit result vector of [8 x i16]. 2360 /// 2361 /// \headerfile <x86intrin.h> 2362 /// 2363 /// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction. 2364 /// 2365 /// \param __a 2366 /// A 128-bit signed [8 x i16] vector. 2367 /// \param __b 2368 /// A 128-bit signed [8 x i16] vector. 2369 /// \returns A 128-bit signed [8 x i16] vector containing the greater value of 2370 /// each comparison. 
2371 static __inline__ __m128i __DEFAULT_FN_ATTRS 2372 _mm_max_epi16(__m128i __a, __m128i __b) 2373 { 2374 return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b); 2375 } 2376 2377 /// Compares corresponding elements of two 128-bit unsigned [16 x i8] 2378 /// vectors, saving the greater value from each comparison in the 2379 /// corresponding element of a 128-bit result vector of [16 x i8]. 2380 /// 2381 /// \headerfile <x86intrin.h> 2382 /// 2383 /// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction. 2384 /// 2385 /// \param __a 2386 /// A 128-bit unsigned [16 x i8] vector. 2387 /// \param __b 2388 /// A 128-bit unsigned [16 x i8] vector. 2389 /// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of 2390 /// each comparison. 2391 static __inline__ __m128i __DEFAULT_FN_ATTRS 2392 _mm_max_epu8(__m128i __a, __m128i __b) 2393 { 2394 return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b); 2395 } 2396 2397 /// Compares corresponding elements of two 128-bit signed [8 x i16] 2398 /// vectors, saving the smaller value from each comparison in the 2399 /// corresponding element of a 128-bit result vector of [8 x i16]. 2400 /// 2401 /// \headerfile <x86intrin.h> 2402 /// 2403 /// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction. 2404 /// 2405 /// \param __a 2406 /// A 128-bit signed [8 x i16] vector. 2407 /// \param __b 2408 /// A 128-bit signed [8 x i16] vector. 2409 /// \returns A 128-bit signed [8 x i16] vector containing the smaller value of 2410 /// each comparison. 2411 static __inline__ __m128i __DEFAULT_FN_ATTRS 2412 _mm_min_epi16(__m128i __a, __m128i __b) 2413 { 2414 return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b); 2415 } 2416 2417 /// Compares corresponding elements of two 128-bit unsigned [16 x i8] 2418 /// vectors, saving the smaller value from each comparison in the 2419 /// corresponding element of a 128-bit result vector of [16 x i8]. 
2420 /// 2421 /// \headerfile <x86intrin.h> 2422 /// 2423 /// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction. 2424 /// 2425 /// \param __a 2426 /// A 128-bit unsigned [16 x i8] vector. 2427 /// \param __b 2428 /// A 128-bit unsigned [16 x i8] vector. 2429 /// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of 2430 /// each comparison. 2431 static __inline__ __m128i __DEFAULT_FN_ATTRS 2432 _mm_min_epu8(__m128i __a, __m128i __b) 2433 { 2434 return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b); 2435 } 2436 2437 /// Multiplies the corresponding elements of two signed [8 x i16] 2438 /// vectors, saving the upper 16 bits of each 32-bit product in the 2439 /// corresponding element of a 128-bit signed [8 x i16] result vector. 2440 /// 2441 /// \headerfile <x86intrin.h> 2442 /// 2443 /// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction. 2444 /// 2445 /// \param __a 2446 /// A 128-bit signed [8 x i16] vector. 2447 /// \param __b 2448 /// A 128-bit signed [8 x i16] vector. 2449 /// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of 2450 /// each of the eight 32-bit products. 2451 static __inline__ __m128i __DEFAULT_FN_ATTRS 2452 _mm_mulhi_epi16(__m128i __a, __m128i __b) 2453 { 2454 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b); 2455 } 2456 2457 /// Multiplies the corresponding elements of two unsigned [8 x i16] 2458 /// vectors, saving the upper 16 bits of each 32-bit product in the 2459 /// corresponding element of a 128-bit unsigned [8 x i16] result vector. 2460 /// 2461 /// \headerfile <x86intrin.h> 2462 /// 2463 /// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction. 2464 /// 2465 /// \param __a 2466 /// A 128-bit unsigned [8 x i16] vector. 2467 /// \param __b 2468 /// A 128-bit unsigned [8 x i16] vector. 
2469 /// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits 2470 /// of each of the eight 32-bit products. 2471 static __inline__ __m128i __DEFAULT_FN_ATTRS 2472 _mm_mulhi_epu16(__m128i __a, __m128i __b) 2473 { 2474 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b); 2475 } 2476 2477 /// Multiplies the corresponding elements of two signed [8 x i16] 2478 /// vectors, saving the lower 16 bits of each 32-bit product in the 2479 /// corresponding element of a 128-bit signed [8 x i16] result vector. 2480 /// 2481 /// \headerfile <x86intrin.h> 2482 /// 2483 /// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction. 2484 /// 2485 /// \param __a 2486 /// A 128-bit signed [8 x i16] vector. 2487 /// \param __b 2488 /// A 128-bit signed [8 x i16] vector. 2489 /// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of 2490 /// each of the eight 32-bit products. 2491 static __inline__ __m128i __DEFAULT_FN_ATTRS 2492 _mm_mullo_epi16(__m128i __a, __m128i __b) 2493 { 2494 return (__m128i)((__v8hu)__a * (__v8hu)__b); 2495 } 2496 2497 /// Multiplies 32-bit unsigned integer values contained in the lower bits 2498 /// of the two 64-bit integer vectors and returns the 64-bit unsigned 2499 /// product. 2500 /// 2501 /// \headerfile <x86intrin.h> 2502 /// 2503 /// This intrinsic corresponds to the <c> PMULUDQ </c> instruction. 2504 /// 2505 /// \param __a 2506 /// A 64-bit integer containing one of the source operands. 2507 /// \param __b 2508 /// A 64-bit integer containing one of the source operands. 2509 /// \returns A 64-bit integer vector containing the product of both operands. 
2510 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2511 _mm_mul_su32(__m64 __a, __m64 __b) 2512 { 2513 return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b); 2514 } 2515 2516 /// Multiplies 32-bit unsigned integer values contained in the lower 2517 /// bits of the corresponding elements of two [2 x i64] vectors, and returns 2518 /// the 64-bit products in the corresponding elements of a [2 x i64] vector. 2519 /// 2520 /// \headerfile <x86intrin.h> 2521 /// 2522 /// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction. 2523 /// 2524 /// \param __a 2525 /// A [2 x i64] vector containing one of the source operands. 2526 /// \param __b 2527 /// A [2 x i64] vector containing one of the source operands. 2528 /// \returns A [2 x i64] vector containing the product of both operands. 2529 static __inline__ __m128i __DEFAULT_FN_ATTRS 2530 _mm_mul_epu32(__m128i __a, __m128i __b) 2531 { 2532 return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b); 2533 } 2534 2535 /// Computes the absolute differences of corresponding 8-bit integer 2536 /// values in two 128-bit vectors. Sums the first 8 absolute differences, and 2537 /// separately sums the second 8 absolute differences. Packs these two 2538 /// unsigned 16-bit integer sums into the upper and lower elements of a 2539 /// [2 x i64] vector. 2540 /// 2541 /// \headerfile <x86intrin.h> 2542 /// 2543 /// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction. 2544 /// 2545 /// \param __a 2546 /// A 128-bit integer vector containing one of the source operands. 2547 /// \param __b 2548 /// A 128-bit integer vector containing one of the source operands. 2549 /// \returns A [2 x i64] vector containing the sums of the sets of absolute 2550 /// differences between both operands. 
2551 static __inline__ __m128i __DEFAULT_FN_ATTRS 2552 _mm_sad_epu8(__m128i __a, __m128i __b) 2553 { 2554 return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b); 2555 } 2556 2557 /// Subtracts the corresponding 8-bit integer values in the operands. 2558 /// 2559 /// \headerfile <x86intrin.h> 2560 /// 2561 /// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction. 2562 /// 2563 /// \param __a 2564 /// A 128-bit integer vector containing the minuends. 2565 /// \param __b 2566 /// A 128-bit integer vector containing the subtrahends. 2567 /// \returns A 128-bit integer vector containing the differences of the values 2568 /// in the operands. 2569 static __inline__ __m128i __DEFAULT_FN_ATTRS 2570 _mm_sub_epi8(__m128i __a, __m128i __b) 2571 { 2572 return (__m128i)((__v16qu)__a - (__v16qu)__b); 2573 } 2574 2575 /// Subtracts the corresponding 16-bit integer values in the operands. 2576 /// 2577 /// \headerfile <x86intrin.h> 2578 /// 2579 /// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction. 2580 /// 2581 /// \param __a 2582 /// A 128-bit integer vector containing the minuends. 2583 /// \param __b 2584 /// A 128-bit integer vector containing the subtrahends. 2585 /// \returns A 128-bit integer vector containing the differences of the values 2586 /// in the operands. 2587 static __inline__ __m128i __DEFAULT_FN_ATTRS 2588 _mm_sub_epi16(__m128i __a, __m128i __b) 2589 { 2590 return (__m128i)((__v8hu)__a - (__v8hu)__b); 2591 } 2592 2593 /// Subtracts the corresponding 32-bit integer values in the operands. 2594 /// 2595 /// \headerfile <x86intrin.h> 2596 /// 2597 /// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction. 2598 /// 2599 /// \param __a 2600 /// A 128-bit integer vector containing the minuends. 2601 /// \param __b 2602 /// A 128-bit integer vector containing the subtrahends. 2603 /// \returns A 128-bit integer vector containing the differences of the values 2604 /// in the operands. 
2605 static __inline__ __m128i __DEFAULT_FN_ATTRS 2606 _mm_sub_epi32(__m128i __a, __m128i __b) 2607 { 2608 return (__m128i)((__v4su)__a - (__v4su)__b); 2609 } 2610 2611 /// Subtracts signed or unsigned 64-bit integer values and writes the 2612 /// difference to the corresponding bits in the destination. 2613 /// 2614 /// \headerfile <x86intrin.h> 2615 /// 2616 /// This intrinsic corresponds to the <c> PSUBQ </c> instruction. 2617 /// 2618 /// \param __a 2619 /// A 64-bit integer vector containing the minuend. 2620 /// \param __b 2621 /// A 64-bit integer vector containing the subtrahend. 2622 /// \returns A 64-bit integer vector containing the difference of the values in 2623 /// the operands. 2624 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX 2625 _mm_sub_si64(__m64 __a, __m64 __b) 2626 { 2627 return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b); 2628 } 2629 2630 /// Subtracts the corresponding elements of two [2 x i64] vectors. 2631 /// 2632 /// \headerfile <x86intrin.h> 2633 /// 2634 /// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction. 2635 /// 2636 /// \param __a 2637 /// A 128-bit integer vector containing the minuends. 2638 /// \param __b 2639 /// A 128-bit integer vector containing the subtrahends. 2640 /// \returns A 128-bit integer vector containing the differences of the values 2641 /// in the operands. 2642 static __inline__ __m128i __DEFAULT_FN_ATTRS 2643 _mm_sub_epi64(__m128i __a, __m128i __b) 2644 { 2645 return (__m128i)((__v2du)__a - (__v2du)__b); 2646 } 2647 2648 /// Subtracts corresponding 8-bit signed integer values in the input and 2649 /// returns the differences in the corresponding bytes in the destination. 2650 /// Differences greater than 0x7F are saturated to 0x7F, and differences less 2651 /// than 0x80 are saturated to 0x80. 2652 /// 2653 /// \headerfile <x86intrin.h> 2654 /// 2655 /// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction. 
2656 /// 2657 /// \param __a 2658 /// A 128-bit integer vector containing the minuends. 2659 /// \param __b 2660 /// A 128-bit integer vector containing the subtrahends. 2661 /// \returns A 128-bit integer vector containing the differences of the values 2662 /// in the operands. 2663 static __inline__ __m128i __DEFAULT_FN_ATTRS 2664 _mm_subs_epi8(__m128i __a, __m128i __b) 2665 { 2666 return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b); 2667 } 2668 2669 /// Subtracts corresponding 16-bit signed integer values in the input and 2670 /// returns the differences in the corresponding bytes in the destination. 2671 /// Differences greater than 0x7FFF are saturated to 0x7FFF, and values less 2672 /// than 0x8000 are saturated to 0x8000. 2673 /// 2674 /// \headerfile <x86intrin.h> 2675 /// 2676 /// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction. 2677 /// 2678 /// \param __a 2679 /// A 128-bit integer vector containing the minuends. 2680 /// \param __b 2681 /// A 128-bit integer vector containing the subtrahends. 2682 /// \returns A 128-bit integer vector containing the differences of the values 2683 /// in the operands. 2684 static __inline__ __m128i __DEFAULT_FN_ATTRS 2685 _mm_subs_epi16(__m128i __a, __m128i __b) 2686 { 2687 return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b); 2688 } 2689 2690 /// Subtracts corresponding 8-bit unsigned integer values in the input 2691 /// and returns the differences in the corresponding bytes in the 2692 /// destination. Differences less than 0x00 are saturated to 0x00. 2693 /// 2694 /// \headerfile <x86intrin.h> 2695 /// 2696 /// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction. 2697 /// 2698 /// \param __a 2699 /// A 128-bit integer vector containing the minuends. 2700 /// \param __b 2701 /// A 128-bit integer vector containing the subtrahends. 
2702 /// \returns A 128-bit integer vector containing the unsigned integer 2703 /// differences of the values in the operands. 2704 static __inline__ __m128i __DEFAULT_FN_ATTRS 2705 _mm_subs_epu8(__m128i __a, __m128i __b) 2706 { 2707 return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b); 2708 } 2709 2710 /// Subtracts corresponding 16-bit unsigned integer values in the input 2711 /// and returns the differences in the corresponding bytes in the 2712 /// destination. Differences less than 0x0000 are saturated to 0x0000. 2713 /// 2714 /// \headerfile <x86intrin.h> 2715 /// 2716 /// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction. 2717 /// 2718 /// \param __a 2719 /// A 128-bit integer vector containing the minuends. 2720 /// \param __b 2721 /// A 128-bit integer vector containing the subtrahends. 2722 /// \returns A 128-bit integer vector containing the unsigned integer 2723 /// differences of the values in the operands. 2724 static __inline__ __m128i __DEFAULT_FN_ATTRS 2725 _mm_subs_epu16(__m128i __a, __m128i __b) 2726 { 2727 return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b); 2728 } 2729 2730 /// Performs a bitwise AND of two 128-bit integer vectors. 2731 /// 2732 /// \headerfile <x86intrin.h> 2733 /// 2734 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction. 2735 /// 2736 /// \param __a 2737 /// A 128-bit integer vector containing one of the source operands. 2738 /// \param __b 2739 /// A 128-bit integer vector containing one of the source operands. 2740 /// \returns A 128-bit integer vector containing the bitwise AND of the values 2741 /// in both operands. 2742 static __inline__ __m128i __DEFAULT_FN_ATTRS 2743 _mm_and_si128(__m128i __a, __m128i __b) 2744 { 2745 return (__m128i)((__v2du)__a & (__v2du)__b); 2746 } 2747 2748 /// Performs a bitwise AND of two 128-bit integer vectors, using the 2749 /// one's complement of the values contained in the first source operand. 
2750 /// 2751 /// \headerfile <x86intrin.h> 2752 /// 2753 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction. 2754 /// 2755 /// \param __a 2756 /// A 128-bit vector containing the left source operand. The one's complement 2757 /// of this value is used in the bitwise AND. 2758 /// \param __b 2759 /// A 128-bit vector containing the right source operand. 2760 /// \returns A 128-bit integer vector containing the bitwise AND of the one's 2761 /// complement of the first operand and the values in the second operand. 2762 static __inline__ __m128i __DEFAULT_FN_ATTRS 2763 _mm_andnot_si128(__m128i __a, __m128i __b) 2764 { 2765 return (__m128i)(~(__v2du)__a & (__v2du)__b); 2766 } 2767 /// Performs a bitwise OR of two 128-bit integer vectors. 2768 /// 2769 /// \headerfile <x86intrin.h> 2770 /// 2771 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction. 2772 /// 2773 /// \param __a 2774 /// A 128-bit integer vector containing one of the source operands. 2775 /// \param __b 2776 /// A 128-bit integer vector containing one of the source operands. 2777 /// \returns A 128-bit integer vector containing the bitwise OR of the values 2778 /// in both operands. 2779 static __inline__ __m128i __DEFAULT_FN_ATTRS 2780 _mm_or_si128(__m128i __a, __m128i __b) 2781 { 2782 return (__m128i)((__v2du)__a | (__v2du)__b); 2783 } 2784 2785 /// Performs a bitwise exclusive OR of two 128-bit integer vectors. 2786 /// 2787 /// \headerfile <x86intrin.h> 2788 /// 2789 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction. 2790 /// 2791 /// \param __a 2792 /// A 128-bit integer vector containing one of the source operands. 2793 /// \param __b 2794 /// A 128-bit integer vector containing one of the source operands. 2795 /// \returns A 128-bit integer vector containing the bitwise exclusive OR of the 2796 /// values in both operands. 
2797 static __inline__ __m128i __DEFAULT_FN_ATTRS 2798 _mm_xor_si128(__m128i __a, __m128i __b) 2799 { 2800 return (__m128i)((__v2du)__a ^ (__v2du)__b); 2801 } 2802 2803 /// Left-shifts the 128-bit integer vector operand by the specified 2804 /// number of bytes. Low-order bits are cleared. 2805 /// 2806 /// \headerfile <x86intrin.h> 2807 /// 2808 /// \code 2809 /// __m128i _mm_slli_si128(__m128i a, const int imm); 2810 /// \endcode 2811 /// 2812 /// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction. 2813 /// 2814 /// \param a 2815 /// A 128-bit integer vector containing the source operand. 2816 /// \param imm 2817 /// An immediate value specifying the number of bytes to left-shift operand 2818 /// \a a. 2819 /// \returns A 128-bit integer vector containing the left-shifted value. 2820 #define _mm_slli_si128(a, imm) \ 2821 (__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)) 2822 2823 #define _mm_bslli_si128(a, imm) \ 2824 (__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)) 2825 2826 /// Left-shifts each 16-bit value in the 128-bit integer vector operand 2827 /// by the specified number of bits. Low-order bits are cleared. 2828 /// 2829 /// \headerfile <x86intrin.h> 2830 /// 2831 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction. 2832 /// 2833 /// \param __a 2834 /// A 128-bit integer vector containing the source operand. 2835 /// \param __count 2836 /// An integer value specifying the number of bits to left-shift each value 2837 /// in operand \a __a. 2838 /// \returns A 128-bit integer vector containing the left-shifted values. 2839 static __inline__ __m128i __DEFAULT_FN_ATTRS 2840 _mm_slli_epi16(__m128i __a, int __count) 2841 { 2842 return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count); 2843 } 2844 2845 /// Left-shifts each 16-bit value in the 128-bit integer vector operand 2846 /// by the specified number of bits. Low-order bits are cleared. 
2847 /// 2848 /// \headerfile <x86intrin.h> 2849 /// 2850 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction. 2851 /// 2852 /// \param __a 2853 /// A 128-bit integer vector containing the source operand. 2854 /// \param __count 2855 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2856 /// to left-shift each value in operand \a __a. 2857 /// \returns A 128-bit integer vector containing the left-shifted values. 2858 static __inline__ __m128i __DEFAULT_FN_ATTRS 2859 _mm_sll_epi16(__m128i __a, __m128i __count) 2860 { 2861 return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count); 2862 } 2863 2864 /// Left-shifts each 32-bit value in the 128-bit integer vector operand 2865 /// by the specified number of bits. Low-order bits are cleared. 2866 /// 2867 /// \headerfile <x86intrin.h> 2868 /// 2869 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction. 2870 /// 2871 /// \param __a 2872 /// A 128-bit integer vector containing the source operand. 2873 /// \param __count 2874 /// An integer value specifying the number of bits to left-shift each value 2875 /// in operand \a __a. 2876 /// \returns A 128-bit integer vector containing the left-shifted values. 2877 static __inline__ __m128i __DEFAULT_FN_ATTRS 2878 _mm_slli_epi32(__m128i __a, int __count) 2879 { 2880 return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count); 2881 } 2882 2883 /// Left-shifts each 32-bit value in the 128-bit integer vector operand 2884 /// by the specified number of bits. Low-order bits are cleared. 2885 /// 2886 /// \headerfile <x86intrin.h> 2887 /// 2888 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction. 2889 /// 2890 /// \param __a 2891 /// A 128-bit integer vector containing the source operand. 2892 /// \param __count 2893 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2894 /// to left-shift each value in operand \a __a. 
2895 /// \returns A 128-bit integer vector containing the left-shifted values. 2896 static __inline__ __m128i __DEFAULT_FN_ATTRS 2897 _mm_sll_epi32(__m128i __a, __m128i __count) 2898 { 2899 return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count); 2900 } 2901 2902 /// Left-shifts each 64-bit value in the 128-bit integer vector operand 2903 /// by the specified number of bits. Low-order bits are cleared. 2904 /// 2905 /// \headerfile <x86intrin.h> 2906 /// 2907 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction. 2908 /// 2909 /// \param __a 2910 /// A 128-bit integer vector containing the source operand. 2911 /// \param __count 2912 /// An integer value specifying the number of bits to left-shift each value 2913 /// in operand \a __a. 2914 /// \returns A 128-bit integer vector containing the left-shifted values. 2915 static __inline__ __m128i __DEFAULT_FN_ATTRS 2916 _mm_slli_epi64(__m128i __a, int __count) 2917 { 2918 return __builtin_ia32_psllqi128((__v2di)__a, __count); 2919 } 2920 2921 /// Left-shifts each 64-bit value in the 128-bit integer vector operand 2922 /// by the specified number of bits. Low-order bits are cleared. 2923 /// 2924 /// \headerfile <x86intrin.h> 2925 /// 2926 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction. 2927 /// 2928 /// \param __a 2929 /// A 128-bit integer vector containing the source operand. 2930 /// \param __count 2931 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2932 /// to left-shift each value in operand \a __a. 2933 /// \returns A 128-bit integer vector containing the left-shifted values. 2934 static __inline__ __m128i __DEFAULT_FN_ATTRS 2935 _mm_sll_epi64(__m128i __a, __m128i __count) 2936 { 2937 return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count); 2938 } 2939 2940 /// Right-shifts each 16-bit value in the 128-bit integer vector operand 2941 /// by the specified number of bits. 
High-order bits are filled with the sign 2942 /// bit of the initial value. 2943 /// 2944 /// \headerfile <x86intrin.h> 2945 /// 2946 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction. 2947 /// 2948 /// \param __a 2949 /// A 128-bit integer vector containing the source operand. 2950 /// \param __count 2951 /// An integer value specifying the number of bits to right-shift each value 2952 /// in operand \a __a. 2953 /// \returns A 128-bit integer vector containing the right-shifted values. 2954 static __inline__ __m128i __DEFAULT_FN_ATTRS 2955 _mm_srai_epi16(__m128i __a, int __count) 2956 { 2957 return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count); 2958 } 2959 2960 /// Right-shifts each 16-bit value in the 128-bit integer vector operand 2961 /// by the specified number of bits. High-order bits are filled with the sign 2962 /// bit of the initial value. 2963 /// 2964 /// \headerfile <x86intrin.h> 2965 /// 2966 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction. 2967 /// 2968 /// \param __a 2969 /// A 128-bit integer vector containing the source operand. 2970 /// \param __count 2971 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2972 /// to right-shift each value in operand \a __a. 2973 /// \returns A 128-bit integer vector containing the right-shifted values. 2974 static __inline__ __m128i __DEFAULT_FN_ATTRS 2975 _mm_sra_epi16(__m128i __a, __m128i __count) 2976 { 2977 return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count); 2978 } 2979 2980 /// Right-shifts each 32-bit value in the 128-bit integer vector operand 2981 /// by the specified number of bits. High-order bits are filled with the sign 2982 /// bit of the initial value. 2983 /// 2984 /// \headerfile <x86intrin.h> 2985 /// 2986 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction. 2987 /// 2988 /// \param __a 2989 /// A 128-bit integer vector containing the source operand. 
2990 /// \param __count 2991 /// An integer value specifying the number of bits to right-shift each value 2992 /// in operand \a __a. 2993 /// \returns A 128-bit integer vector containing the right-shifted values. 2994 static __inline__ __m128i __DEFAULT_FN_ATTRS 2995 _mm_srai_epi32(__m128i __a, int __count) 2996 { 2997 return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count); 2998 } 2999 3000 /// Right-shifts each 32-bit value in the 128-bit integer vector operand 3001 /// by the specified number of bits. High-order bits are filled with the sign 3002 /// bit of the initial value. 3003 /// 3004 /// \headerfile <x86intrin.h> 3005 /// 3006 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction. 3007 /// 3008 /// \param __a 3009 /// A 128-bit integer vector containing the source operand. 3010 /// \param __count 3011 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 3012 /// to right-shift each value in operand \a __a. 3013 /// \returns A 128-bit integer vector containing the right-shifted values. 3014 static __inline__ __m128i __DEFAULT_FN_ATTRS 3015 _mm_sra_epi32(__m128i __a, __m128i __count) 3016 { 3017 return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count); 3018 } 3019 3020 /// Right-shifts the 128-bit integer vector operand by the specified 3021 /// number of bytes. High-order bits are cleared. 3022 /// 3023 /// \headerfile <x86intrin.h> 3024 /// 3025 /// \code 3026 /// __m128i _mm_srli_si128(__m128i a, const int imm); 3027 /// \endcode 3028 /// 3029 /// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction. 3030 /// 3031 /// \param a 3032 /// A 128-bit integer vector containing the source operand. 3033 /// \param imm 3034 /// An immediate value specifying the number of bytes to right-shift operand 3035 /// \a a. 3036 /// \returns A 128-bit integer vector containing the right-shifted value. 
3037 #define _mm_srli_si128(a, imm) \ 3038 (__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)) 3039 3040 #define _mm_bsrli_si128(a, imm) \ 3041 (__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)) 3042 3043 /// Right-shifts each of 16-bit values in the 128-bit integer vector 3044 /// operand by the specified number of bits. High-order bits are cleared. 3045 /// 3046 /// \headerfile <x86intrin.h> 3047 /// 3048 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction. 3049 /// 3050 /// \param __a 3051 /// A 128-bit integer vector containing the source operand. 3052 /// \param __count 3053 /// An integer value specifying the number of bits to right-shift each value 3054 /// in operand \a __a. 3055 /// \returns A 128-bit integer vector containing the right-shifted values. 3056 static __inline__ __m128i __DEFAULT_FN_ATTRS 3057 _mm_srli_epi16(__m128i __a, int __count) 3058 { 3059 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count); 3060 } 3061 3062 /// Right-shifts each of 16-bit values in the 128-bit integer vector 3063 /// operand by the specified number of bits. High-order bits are cleared. 3064 /// 3065 /// \headerfile <x86intrin.h> 3066 /// 3067 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction. 3068 /// 3069 /// \param __a 3070 /// A 128-bit integer vector containing the source operand. 3071 /// \param __count 3072 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 3073 /// to right-shift each value in operand \a __a. 3074 /// \returns A 128-bit integer vector containing the right-shifted values. 3075 static __inline__ __m128i __DEFAULT_FN_ATTRS 3076 _mm_srl_epi16(__m128i __a, __m128i __count) 3077 { 3078 return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count); 3079 } 3080 3081 /// Right-shifts each of 32-bit values in the 128-bit integer vector 3082 /// operand by the specified number of bits. High-order bits are cleared. 
3083 /// 3084 /// \headerfile <x86intrin.h> 3085 /// 3086 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction. 3087 /// 3088 /// \param __a 3089 /// A 128-bit integer vector containing the source operand. 3090 /// \param __count 3091 /// An integer value specifying the number of bits to right-shift each value 3092 /// in operand \a __a. 3093 /// \returns A 128-bit integer vector containing the right-shifted values. 3094 static __inline__ __m128i __DEFAULT_FN_ATTRS 3095 _mm_srli_epi32(__m128i __a, int __count) 3096 { 3097 return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count); 3098 } 3099 3100 /// Right-shifts each of 32-bit values in the 128-bit integer vector 3101 /// operand by the specified number of bits. High-order bits are cleared. 3102 /// 3103 /// \headerfile <x86intrin.h> 3104 /// 3105 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction. 3106 /// 3107 /// \param __a 3108 /// A 128-bit integer vector containing the source operand. 3109 /// \param __count 3110 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 3111 /// to right-shift each value in operand \a __a. 3112 /// \returns A 128-bit integer vector containing the right-shifted values. 3113 static __inline__ __m128i __DEFAULT_FN_ATTRS 3114 _mm_srl_epi32(__m128i __a, __m128i __count) 3115 { 3116 return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count); 3117 } 3118 3119 /// Right-shifts each of 64-bit values in the 128-bit integer vector 3120 /// operand by the specified number of bits. High-order bits are cleared. 3121 /// 3122 /// \headerfile <x86intrin.h> 3123 /// 3124 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction. 3125 /// 3126 /// \param __a 3127 /// A 128-bit integer vector containing the source operand. 3128 /// \param __count 3129 /// An integer value specifying the number of bits to right-shift each value 3130 /// in operand \a __a. 
3131 /// \returns A 128-bit integer vector containing the right-shifted values. 3132 static __inline__ __m128i __DEFAULT_FN_ATTRS 3133 _mm_srli_epi64(__m128i __a, int __count) 3134 { 3135 return __builtin_ia32_psrlqi128((__v2di)__a, __count); 3136 } 3137 3138 /// Right-shifts each of 64-bit values in the 128-bit integer vector 3139 /// operand by the specified number of bits. High-order bits are cleared. 3140 /// 3141 /// \headerfile <x86intrin.h> 3142 /// 3143 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction. 3144 /// 3145 /// \param __a 3146 /// A 128-bit integer vector containing the source operand. 3147 /// \param __count 3148 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 3149 /// to right-shift each value in operand \a __a. 3150 /// \returns A 128-bit integer vector containing the right-shifted values. 3151 static __inline__ __m128i __DEFAULT_FN_ATTRS 3152 _mm_srl_epi64(__m128i __a, __m128i __count) 3153 { 3154 return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count); 3155 } 3156 3157 /// Compares each of the corresponding 8-bit values of the 128-bit 3158 /// integer vectors for equality. Each comparison yields 0x0 for false, 0xFF 3159 /// for true. 3160 /// 3161 /// \headerfile <x86intrin.h> 3162 /// 3163 /// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction. 3164 /// 3165 /// \param __a 3166 /// A 128-bit integer vector. 3167 /// \param __b 3168 /// A 128-bit integer vector. 3169 /// \returns A 128-bit integer vector containing the comparison results. 3170 static __inline__ __m128i __DEFAULT_FN_ATTRS 3171 _mm_cmpeq_epi8(__m128i __a, __m128i __b) 3172 { 3173 return (__m128i)((__v16qi)__a == (__v16qi)__b); 3174 } 3175 3176 /// Compares each of the corresponding 16-bit values of the 128-bit 3177 /// integer vectors for equality. Each comparison yields 0x0 for false, 3178 /// 0xFFFF for true. 
3179 /// 3180 /// \headerfile <x86intrin.h> 3181 /// 3182 /// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction. 3183 /// 3184 /// \param __a 3185 /// A 128-bit integer vector. 3186 /// \param __b 3187 /// A 128-bit integer vector. 3188 /// \returns A 128-bit integer vector containing the comparison results. 3189 static __inline__ __m128i __DEFAULT_FN_ATTRS 3190 _mm_cmpeq_epi16(__m128i __a, __m128i __b) 3191 { 3192 return (__m128i)((__v8hi)__a == (__v8hi)__b); 3193 } 3194 3195 /// Compares each of the corresponding 32-bit values of the 128-bit 3196 /// integer vectors for equality. Each comparison yields 0x0 for false, 3197 /// 0xFFFFFFFF for true. 3198 /// 3199 /// \headerfile <x86intrin.h> 3200 /// 3201 /// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction. 3202 /// 3203 /// \param __a 3204 /// A 128-bit integer vector. 3205 /// \param __b 3206 /// A 128-bit integer vector. 3207 /// \returns A 128-bit integer vector containing the comparison results. 3208 static __inline__ __m128i __DEFAULT_FN_ATTRS 3209 _mm_cmpeq_epi32(__m128i __a, __m128i __b) 3210 { 3211 return (__m128i)((__v4si)__a == (__v4si)__b); 3212 } 3213 3214 /// Compares each of the corresponding signed 8-bit values of the 128-bit 3215 /// integer vectors to determine if the values in the first operand are 3216 /// greater than those in the second operand. Each comparison yields 0x0 for 3217 /// false, 0xFF for true. 3218 /// 3219 /// \headerfile <x86intrin.h> 3220 /// 3221 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction. 3222 /// 3223 /// \param __a 3224 /// A 128-bit integer vector. 3225 /// \param __b 3226 /// A 128-bit integer vector. 3227 /// \returns A 128-bit integer vector containing the comparison results. 
3228 static __inline__ __m128i __DEFAULT_FN_ATTRS 3229 _mm_cmpgt_epi8(__m128i __a, __m128i __b) 3230 { 3231 /* This function always performs a signed comparison, but __v16qi is a char 3232 which may be signed or unsigned, so use __v16qs. */ 3233 return (__m128i)((__v16qs)__a > (__v16qs)__b); 3234 } 3235 3236 /// Compares each of the corresponding signed 16-bit values of the 3237 /// 128-bit integer vectors to determine if the values in the first operand 3238 /// are greater than those in the second operand. 3239 /// 3240 /// Each comparison yields 0x0 for false, 0xFFFF for true. 3241 /// 3242 /// \headerfile <x86intrin.h> 3243 /// 3244 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction. 3245 /// 3246 /// \param __a 3247 /// A 128-bit integer vector. 3248 /// \param __b 3249 /// A 128-bit integer vector. 3250 /// \returns A 128-bit integer vector containing the comparison results. 3251 static __inline__ __m128i __DEFAULT_FN_ATTRS 3252 _mm_cmpgt_epi16(__m128i __a, __m128i __b) 3253 { 3254 return (__m128i)((__v8hi)__a > (__v8hi)__b); 3255 } 3256 3257 /// Compares each of the corresponding signed 32-bit values of the 3258 /// 128-bit integer vectors to determine if the values in the first operand 3259 /// are greater than those in the second operand. 3260 /// 3261 /// Each comparison yields 0x0 for false, 0xFFFFFFFF for true. 3262 /// 3263 /// \headerfile <x86intrin.h> 3264 /// 3265 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction. 3266 /// 3267 /// \param __a 3268 /// A 128-bit integer vector. 3269 /// \param __b 3270 /// A 128-bit integer vector. 3271 /// \returns A 128-bit integer vector containing the comparison results. 
3272 static __inline__ __m128i __DEFAULT_FN_ATTRS 3273 _mm_cmpgt_epi32(__m128i __a, __m128i __b) 3274 { 3275 return (__m128i)((__v4si)__a > (__v4si)__b); 3276 } 3277 3278 /// Compares each of the corresponding signed 8-bit values of the 128-bit 3279 /// integer vectors to determine if the values in the first operand are less 3280 /// than those in the second operand. 3281 /// 3282 /// Each comparison yields 0x0 for false, 0xFF for true. 3283 /// 3284 /// \headerfile <x86intrin.h> 3285 /// 3286 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction. 3287 /// 3288 /// \param __a 3289 /// A 128-bit integer vector. 3290 /// \param __b 3291 /// A 128-bit integer vector. 3292 /// \returns A 128-bit integer vector containing the comparison results. 3293 static __inline__ __m128i __DEFAULT_FN_ATTRS 3294 _mm_cmplt_epi8(__m128i __a, __m128i __b) 3295 { 3296 return _mm_cmpgt_epi8(__b, __a); 3297 } 3298 3299 /// Compares each of the corresponding signed 16-bit values of the 3300 /// 128-bit integer vectors to determine if the values in the first operand 3301 /// are less than those in the second operand. 3302 /// 3303 /// Each comparison yields 0x0 for false, 0xFFFF for true. 3304 /// 3305 /// \headerfile <x86intrin.h> 3306 /// 3307 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction. 3308 /// 3309 /// \param __a 3310 /// A 128-bit integer vector. 3311 /// \param __b 3312 /// A 128-bit integer vector. 3313 /// \returns A 128-bit integer vector containing the comparison results. 3314 static __inline__ __m128i __DEFAULT_FN_ATTRS 3315 _mm_cmplt_epi16(__m128i __a, __m128i __b) 3316 { 3317 return _mm_cmpgt_epi16(__b, __a); 3318 } 3319 3320 /// Compares each of the corresponding signed 32-bit values of the 3321 /// 128-bit integer vectors to determine if the values in the first operand 3322 /// are less than those in the second operand. 3323 /// 3324 /// Each comparison yields 0x0 for false, 0xFFFFFFFF for true. 
3325 /// 3326 /// \headerfile <x86intrin.h> 3327 /// 3328 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction. 3329 /// 3330 /// \param __a 3331 /// A 128-bit integer vector. 3332 /// \param __b 3333 /// A 128-bit integer vector. 3334 /// \returns A 128-bit integer vector containing the comparison results. 3335 static __inline__ __m128i __DEFAULT_FN_ATTRS 3336 _mm_cmplt_epi32(__m128i __a, __m128i __b) 3337 { 3338 return _mm_cmpgt_epi32(__b, __a); 3339 } 3340 3341 #ifdef __x86_64__ 3342 /// Converts a 64-bit signed integer value from the second operand into a 3343 /// double-precision value and returns it in the lower element of a [2 x 3344 /// double] vector; the upper element of the returned vector is copied from 3345 /// the upper element of the first operand. 3346 /// 3347 /// \headerfile <x86intrin.h> 3348 /// 3349 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction. 3350 /// 3351 /// \param __a 3352 /// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are 3353 /// copied to the upper 64 bits of the destination. 3354 /// \param __b 3355 /// A 64-bit signed integer operand containing the value to be converted. 3356 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 3357 /// converted value of the second operand. The upper 64 bits are copied from 3358 /// the upper 64 bits of the first operand. 3359 static __inline__ __m128d __DEFAULT_FN_ATTRS 3360 _mm_cvtsi64_sd(__m128d __a, long long __b) 3361 { 3362 __a[0] = __b; 3363 return __a; 3364 } 3365 3366 /// Converts the first (lower) element of a vector of [2 x double] into a 3367 /// 64-bit signed integer value, according to the current rounding mode. 3368 /// 3369 /// \headerfile <x86intrin.h> 3370 /// 3371 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction. 3372 /// 3373 /// \param __a 3374 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 3375 /// conversion. 
3376 /// \returns A 64-bit signed integer containing the converted value. 3377 static __inline__ long long __DEFAULT_FN_ATTRS 3378 _mm_cvtsd_si64(__m128d __a) 3379 { 3380 return __builtin_ia32_cvtsd2si64((__v2df)__a); 3381 } 3382 3383 /// Converts the first (lower) element of a vector of [2 x double] into a 3384 /// 64-bit signed integer value, truncating the result when it is inexact. 3385 /// 3386 /// \headerfile <x86intrin.h> 3387 /// 3388 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c> 3389 /// instruction. 3390 /// 3391 /// \param __a 3392 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 3393 /// conversion. 3394 /// \returns A 64-bit signed integer containing the converted value. 3395 static __inline__ long long __DEFAULT_FN_ATTRS 3396 _mm_cvttsd_si64(__m128d __a) 3397 { 3398 return __builtin_ia32_cvttsd2si64((__v2df)__a); 3399 } 3400 #endif 3401 3402 /// Converts a vector of [4 x i32] into a vector of [4 x float]. 3403 /// 3404 /// \headerfile <x86intrin.h> 3405 /// 3406 /// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction. 3407 /// 3408 /// \param __a 3409 /// A 128-bit integer vector. 3410 /// \returns A 128-bit vector of [4 x float] containing the converted values. 3411 static __inline__ __m128 __DEFAULT_FN_ATTRS 3412 _mm_cvtepi32_ps(__m128i __a) 3413 { 3414 return (__m128)__builtin_convertvector((__v4si)__a, __v4sf); 3415 } 3416 3417 /// Converts a vector of [4 x float] into a vector of [4 x i32]. 3418 /// 3419 /// \headerfile <x86intrin.h> 3420 /// 3421 /// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction. 3422 /// 3423 /// \param __a 3424 /// A 128-bit vector of [4 x float]. 3425 /// \returns A 128-bit integer vector of [4 x i32] containing the converted 3426 /// values. 
3427 static __inline__ __m128i __DEFAULT_FN_ATTRS 3428 _mm_cvtps_epi32(__m128 __a) 3429 { 3430 return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a); 3431 } 3432 3433 /// Converts a vector of [4 x float] into a vector of [4 x i32], 3434 /// truncating the result when it is inexact. 3435 /// 3436 /// \headerfile <x86intrin.h> 3437 /// 3438 /// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c> 3439 /// instruction. 3440 /// 3441 /// \param __a 3442 /// A 128-bit vector of [4 x float]. 3443 /// \returns A 128-bit vector of [4 x i32] containing the converted values. 3444 static __inline__ __m128i __DEFAULT_FN_ATTRS 3445 _mm_cvttps_epi32(__m128 __a) 3446 { 3447 return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a); 3448 } 3449 3450 /// Returns a vector of [4 x i32] where the lowest element is the input 3451 /// operand and the remaining elements are zero. 3452 /// 3453 /// \headerfile <x86intrin.h> 3454 /// 3455 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. 3456 /// 3457 /// \param __a 3458 /// A 32-bit signed integer operand. 3459 /// \returns A 128-bit vector of [4 x i32]. 3460 static __inline__ __m128i __DEFAULT_FN_ATTRS 3461 _mm_cvtsi32_si128(int __a) 3462 { 3463 return __extension__ (__m128i)(__v4si){ __a, 0, 0, 0 }; 3464 } 3465 3466 #ifdef __x86_64__ 3467 /// Returns a vector of [2 x i64] where the lower element is the input 3468 /// operand and the upper element is zero. 3469 /// 3470 /// \headerfile <x86intrin.h> 3471 /// 3472 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 3473 /// 3474 /// \param __a 3475 /// A 64-bit signed integer operand containing the value to be converted. 3476 /// \returns A 128-bit vector of [2 x i64] containing the converted value. 
3477 static __inline__ __m128i __DEFAULT_FN_ATTRS 3478 _mm_cvtsi64_si128(long long __a) 3479 { 3480 return __extension__ (__m128i)(__v2di){ __a, 0 }; 3481 } 3482 #endif 3483 3484 /// Moves the least significant 32 bits of a vector of [4 x i32] to a 3485 /// 32-bit signed integer value. 3486 /// 3487 /// \headerfile <x86intrin.h> 3488 /// 3489 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. 3490 /// 3491 /// \param __a 3492 /// A vector of [4 x i32]. The least significant 32 bits are moved to the 3493 /// destination. 3494 /// \returns A 32-bit signed integer containing the moved value. 3495 static __inline__ int __DEFAULT_FN_ATTRS 3496 _mm_cvtsi128_si32(__m128i __a) 3497 { 3498 __v4si __b = (__v4si)__a; 3499 return __b[0]; 3500 } 3501 3502 #ifdef __x86_64__ 3503 /// Moves the least significant 64 bits of a vector of [2 x i64] to a 3504 /// 64-bit signed integer value. 3505 /// 3506 /// \headerfile <x86intrin.h> 3507 /// 3508 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 3509 /// 3510 /// \param __a 3511 /// A vector of [2 x i64]. The least significant 64 bits are moved to the 3512 /// destination. 3513 /// \returns A 64-bit signed integer containing the moved value. 3514 static __inline__ long long __DEFAULT_FN_ATTRS 3515 _mm_cvtsi128_si64(__m128i __a) 3516 { 3517 return __a[0]; 3518 } 3519 #endif 3520 3521 /// Moves packed integer values from an aligned 128-bit memory location 3522 /// to elements in a 128-bit integer vector. 3523 /// 3524 /// \headerfile <x86intrin.h> 3525 /// 3526 /// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction. 3527 /// 3528 /// \param __p 3529 /// An aligned pointer to a memory location containing integer values. 3530 /// \returns A 128-bit integer vector containing the moved values. 
3531 static __inline__ __m128i __DEFAULT_FN_ATTRS 3532 _mm_load_si128(__m128i const *__p) 3533 { 3534 return *__p; 3535 } 3536 3537 /// Moves packed integer values from an unaligned 128-bit memory location 3538 /// to elements in a 128-bit integer vector. 3539 /// 3540 /// \headerfile <x86intrin.h> 3541 /// 3542 /// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction. 3543 /// 3544 /// \param __p 3545 /// A pointer to a memory location containing integer values. 3546 /// \returns A 128-bit integer vector containing the moved values. 3547 static __inline__ __m128i __DEFAULT_FN_ATTRS 3548 _mm_loadu_si128(__m128i_u const *__p) 3549 { 3550 struct __loadu_si128 { 3551 __m128i_u __v; 3552 } __attribute__((__packed__, __may_alias__)); 3553 return ((struct __loadu_si128*)__p)->__v; 3554 } 3555 3556 /// Returns a vector of [2 x i64] where the lower element is taken from 3557 /// the lower element of the operand, and the upper element is zero. 3558 /// 3559 /// \headerfile <x86intrin.h> 3560 /// 3561 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 3562 /// 3563 /// \param __p 3564 /// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of 3565 /// the destination. 3566 /// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the 3567 /// moved value. The higher order bits are cleared. 3568 static __inline__ __m128i __DEFAULT_FN_ATTRS 3569 _mm_loadl_epi64(__m128i_u const *__p) 3570 { 3571 struct __mm_loadl_epi64_struct { 3572 long long __u; 3573 } __attribute__((__packed__, __may_alias__)); 3574 return __extension__ (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0}; 3575 } 3576 3577 /// Generates a 128-bit vector of [4 x i32] with unspecified content. 3578 /// This could be used as an argument to another intrinsic function where the 3579 /// argument is required but the value is not actually used. 
3580 /// 3581 /// \headerfile <x86intrin.h> 3582 /// 3583 /// This intrinsic has no corresponding instruction. 3584 /// 3585 /// \returns A 128-bit vector of [4 x i32] with unspecified content. 3586 static __inline__ __m128i __DEFAULT_FN_ATTRS 3587 _mm_undefined_si128(void) 3588 { 3589 return (__m128i)__builtin_ia32_undef128(); 3590 } 3591 3592 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with 3593 /// the specified 64-bit integer values. 3594 /// 3595 /// \headerfile <x86intrin.h> 3596 /// 3597 /// This intrinsic is a utility function and does not correspond to a specific 3598 /// instruction. 3599 /// 3600 /// \param __q1 3601 /// A 64-bit integer value used to initialize the upper 64 bits of the 3602 /// destination vector of [2 x i64]. 3603 /// \param __q0 3604 /// A 64-bit integer value used to initialize the lower 64 bits of the 3605 /// destination vector of [2 x i64]. 3606 /// \returns An initialized 128-bit vector of [2 x i64] containing the values 3607 /// provided in the operands. 3608 static __inline__ __m128i __DEFAULT_FN_ATTRS 3609 _mm_set_epi64x(long long __q1, long long __q0) 3610 { 3611 return __extension__ (__m128i)(__v2di){ __q0, __q1 }; 3612 } 3613 3614 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with 3615 /// the specified 64-bit integer values. 3616 /// 3617 /// \headerfile <x86intrin.h> 3618 /// 3619 /// This intrinsic is a utility function and does not correspond to a specific 3620 /// instruction. 3621 /// 3622 /// \param __q1 3623 /// A 64-bit integer value used to initialize the upper 64 bits of the 3624 /// destination vector of [2 x i64]. 3625 /// \param __q0 3626 /// A 64-bit integer value used to initialize the lower 64 bits of the 3627 /// destination vector of [2 x i64]. 3628 /// \returns An initialized 128-bit vector of [2 x i64] containing the values 3629 /// provided in the operands. 
3630 static __inline__ __m128i __DEFAULT_FN_ATTRS 3631 _mm_set_epi64(__m64 __q1, __m64 __q0) 3632 { 3633 return _mm_set_epi64x((long long)__q1, (long long)__q0); 3634 } 3635 3636 /// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with 3637 /// the specified 32-bit integer values. 3638 /// 3639 /// \headerfile <x86intrin.h> 3640 /// 3641 /// This intrinsic is a utility function and does not correspond to a specific 3642 /// instruction. 3643 /// 3644 /// \param __i3 3645 /// A 32-bit integer value used to initialize bits [127:96] of the 3646 /// destination vector. 3647 /// \param __i2 3648 /// A 32-bit integer value used to initialize bits [95:64] of the destination 3649 /// vector. 3650 /// \param __i1 3651 /// A 32-bit integer value used to initialize bits [63:32] of the destination 3652 /// vector. 3653 /// \param __i0 3654 /// A 32-bit integer value used to initialize bits [31:0] of the destination 3655 /// vector. 3656 /// \returns An initialized 128-bit vector of [4 x i32] containing the values 3657 /// provided in the operands. 3658 static __inline__ __m128i __DEFAULT_FN_ATTRS 3659 _mm_set_epi32(int __i3, int __i2, int __i1, int __i0) 3660 { 3661 return __extension__ (__m128i)(__v4si){ __i0, __i1, __i2, __i3}; 3662 } 3663 3664 /// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with 3665 /// the specified 16-bit integer values. 3666 /// 3667 /// \headerfile <x86intrin.h> 3668 /// 3669 /// This intrinsic is a utility function and does not correspond to a specific 3670 /// instruction. 3671 /// 3672 /// \param __w7 3673 /// A 16-bit integer value used to initialize bits [127:112] of the 3674 /// destination vector. 3675 /// \param __w6 3676 /// A 16-bit integer value used to initialize bits [111:96] of the 3677 /// destination vector. 3678 /// \param __w5 3679 /// A 16-bit integer value used to initialize bits [95:80] of the destination 3680 /// vector. 
3681 /// \param __w4 3682 /// A 16-bit integer value used to initialize bits [79:64] of the destination 3683 /// vector. 3684 /// \param __w3 3685 /// A 16-bit integer value used to initialize bits [63:48] of the destination 3686 /// vector. 3687 /// \param __w2 3688 /// A 16-bit integer value used to initialize bits [47:32] of the destination 3689 /// vector. 3690 /// \param __w1 3691 /// A 16-bit integer value used to initialize bits [31:16] of the destination 3692 /// vector. 3693 /// \param __w0 3694 /// A 16-bit integer value used to initialize bits [15:0] of the destination 3695 /// vector. 3696 /// \returns An initialized 128-bit vector of [8 x i16] containing the values 3697 /// provided in the operands. 3698 static __inline__ __m128i __DEFAULT_FN_ATTRS 3699 _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0) 3700 { 3701 return __extension__ (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 }; 3702 } 3703 3704 /// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with 3705 /// the specified 8-bit integer values. 3706 /// 3707 /// \headerfile <x86intrin.h> 3708 /// 3709 /// This intrinsic is a utility function and does not correspond to a specific 3710 /// instruction. 3711 /// 3712 /// \param __b15 3713 /// Initializes bits [127:120] of the destination vector. 3714 /// \param __b14 3715 /// Initializes bits [119:112] of the destination vector. 3716 /// \param __b13 3717 /// Initializes bits [111:104] of the destination vector. 3718 /// \param __b12 3719 /// Initializes bits [103:96] of the destination vector. 3720 /// \param __b11 3721 /// Initializes bits [95:88] of the destination vector. 3722 /// \param __b10 3723 /// Initializes bits [87:80] of the destination vector. 3724 /// \param __b9 3725 /// Initializes bits [79:72] of the destination vector. 3726 /// \param __b8 3727 /// Initializes bits [71:64] of the destination vector. 
3728 /// \param __b7 3729 /// Initializes bits [63:56] of the destination vector. 3730 /// \param __b6 3731 /// Initializes bits [55:48] of the destination vector. 3732 /// \param __b5 3733 /// Initializes bits [47:40] of the destination vector. 3734 /// \param __b4 3735 /// Initializes bits [39:32] of the destination vector. 3736 /// \param __b3 3737 /// Initializes bits [31:24] of the destination vector. 3738 /// \param __b2 3739 /// Initializes bits [23:16] of the destination vector. 3740 /// \param __b1 3741 /// Initializes bits [15:8] of the destination vector. 3742 /// \param __b0 3743 /// Initializes bits [7:0] of the destination vector. 3744 /// \returns An initialized 128-bit vector of [16 x i8] containing the values 3745 /// provided in the operands. 3746 static __inline__ __m128i __DEFAULT_FN_ATTRS 3747 _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0) 3748 { 3749 return __extension__ (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 }; 3750 } 3751 3752 /// Initializes both values in a 128-bit integer vector with the 3753 /// specified 64-bit integer value. 3754 /// 3755 /// \headerfile <x86intrin.h> 3756 /// 3757 /// This intrinsic is a utility function and does not correspond to a specific 3758 /// instruction. 3759 /// 3760 /// \param __q 3761 /// Integer value used to initialize the elements of the destination integer 3762 /// vector. 3763 /// \returns An initialized 128-bit integer vector of [2 x i64] with both 3764 /// elements containing the value provided in the operand. 3765 static __inline__ __m128i __DEFAULT_FN_ATTRS 3766 _mm_set1_epi64x(long long __q) 3767 { 3768 return _mm_set_epi64x(__q, __q); 3769 } 3770 3771 /// Initializes both values in a 128-bit vector of [2 x i64] with the 3772 /// specified 64-bit value. 
3773 /// 3774 /// \headerfile <x86intrin.h> 3775 /// 3776 /// This intrinsic is a utility function and does not correspond to a specific 3777 /// instruction. 3778 /// 3779 /// \param __q 3780 /// A 64-bit value used to initialize the elements of the destination integer 3781 /// vector. 3782 /// \returns An initialized 128-bit vector of [2 x i64] with all elements 3783 /// containing the value provided in the operand. 3784 static __inline__ __m128i __DEFAULT_FN_ATTRS 3785 _mm_set1_epi64(__m64 __q) 3786 { 3787 return _mm_set_epi64(__q, __q); 3788 } 3789 3790 /// Initializes all values in a 128-bit vector of [4 x i32] with the 3791 /// specified 32-bit value. 3792 /// 3793 /// \headerfile <x86intrin.h> 3794 /// 3795 /// This intrinsic is a utility function and does not correspond to a specific 3796 /// instruction. 3797 /// 3798 /// \param __i 3799 /// A 32-bit value used to initialize the elements of the destination integer 3800 /// vector. 3801 /// \returns An initialized 128-bit vector of [4 x i32] with all elements 3802 /// containing the value provided in the operand. 3803 static __inline__ __m128i __DEFAULT_FN_ATTRS 3804 _mm_set1_epi32(int __i) 3805 { 3806 return _mm_set_epi32(__i, __i, __i, __i); 3807 } 3808 3809 /// Initializes all values in a 128-bit vector of [8 x i16] with the 3810 /// specified 16-bit value. 3811 /// 3812 /// \headerfile <x86intrin.h> 3813 /// 3814 /// This intrinsic is a utility function and does not correspond to a specific 3815 /// instruction. 3816 /// 3817 /// \param __w 3818 /// A 16-bit value used to initialize the elements of the destination integer 3819 /// vector. 3820 /// \returns An initialized 128-bit vector of [8 x i16] with all elements 3821 /// containing the value provided in the operand. 
3822 static __inline__ __m128i __DEFAULT_FN_ATTRS 3823 _mm_set1_epi16(short __w) 3824 { 3825 return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w); 3826 } 3827 3828 /// Initializes all values in a 128-bit vector of [16 x i8] with the 3829 /// specified 8-bit value. 3830 /// 3831 /// \headerfile <x86intrin.h> 3832 /// 3833 /// This intrinsic is a utility function and does not correspond to a specific 3834 /// instruction. 3835 /// 3836 /// \param __b 3837 /// An 8-bit value used to initialize the elements of the destination integer 3838 /// vector. 3839 /// \returns An initialized 128-bit vector of [16 x i8] with all elements 3840 /// containing the value provided in the operand. 3841 static __inline__ __m128i __DEFAULT_FN_ATTRS 3842 _mm_set1_epi8(char __b) 3843 { 3844 return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b); 3845 } 3846 3847 /// Constructs a 128-bit integer vector, initialized in reverse order 3848 /// with the specified 64-bit integral values. 3849 /// 3850 /// \headerfile <x86intrin.h> 3851 /// 3852 /// This intrinsic does not correspond to a specific instruction. 3853 /// 3854 /// \param __q0 3855 /// A 64-bit integral value used to initialize the lower 64 bits of the 3856 /// result. 3857 /// \param __q1 3858 /// A 64-bit integral value used to initialize the upper 64 bits of the 3859 /// result. 3860 /// \returns An initialized 128-bit integer vector. 3861 static __inline__ __m128i __DEFAULT_FN_ATTRS 3862 _mm_setr_epi64(__m64 __q0, __m64 __q1) 3863 { 3864 return _mm_set_epi64(__q1, __q0); 3865 } 3866 3867 /// Constructs a 128-bit integer vector, initialized in reverse order 3868 /// with the specified 32-bit integral values. 3869 /// 3870 /// \headerfile <x86intrin.h> 3871 /// 3872 /// This intrinsic is a utility function and does not correspond to a specific 3873 /// instruction. 3874 /// 3875 /// \param __i0 3876 /// A 32-bit integral value used to initialize bits [31:0] of the result. 
3877 /// \param __i1 3878 /// A 32-bit integral value used to initialize bits [63:32] of the result. 3879 /// \param __i2 3880 /// A 32-bit integral value used to initialize bits [95:64] of the result. 3881 /// \param __i3 3882 /// A 32-bit integral value used to initialize bits [127:96] of the result. 3883 /// \returns An initialized 128-bit integer vector. 3884 static __inline__ __m128i __DEFAULT_FN_ATTRS 3885 _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3) 3886 { 3887 return _mm_set_epi32(__i3, __i2, __i1, __i0); 3888 } 3889 3890 /// Constructs a 128-bit integer vector, initialized in reverse order 3891 /// with the specified 16-bit integral values. 3892 /// 3893 /// \headerfile <x86intrin.h> 3894 /// 3895 /// This intrinsic is a utility function and does not correspond to a specific 3896 /// instruction. 3897 /// 3898 /// \param __w0 3899 /// A 16-bit integral value used to initialize bits [15:0] of the result. 3900 /// \param __w1 3901 /// A 16-bit integral value used to initialize bits [31:16] of the result. 3902 /// \param __w2 3903 /// A 16-bit integral value used to initialize bits [47:32] of the result. 3904 /// \param __w3 3905 /// A 16-bit integral value used to initialize bits [63:48] of the result. 3906 /// \param __w4 3907 /// A 16-bit integral value used to initialize bits [79:64] of the result. 3908 /// \param __w5 3909 /// A 16-bit integral value used to initialize bits [95:80] of the result. 3910 /// \param __w6 3911 /// A 16-bit integral value used to initialize bits [111:96] of the result. 3912 /// \param __w7 3913 /// A 16-bit integral value used to initialize bits [127:112] of the result. 3914 /// \returns An initialized 128-bit integer vector. 
3915 static __inline__ __m128i __DEFAULT_FN_ATTRS 3916 _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7) 3917 { 3918 return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0); 3919 } 3920 3921 /// Constructs a 128-bit integer vector, initialized in reverse order 3922 /// with the specified 8-bit integral values. 3923 /// 3924 /// \headerfile <x86intrin.h> 3925 /// 3926 /// This intrinsic is a utility function and does not correspond to a specific 3927 /// instruction. 3928 /// 3929 /// \param __b0 3930 /// An 8-bit integral value used to initialize bits [7:0] of the result. 3931 /// \param __b1 3932 /// An 8-bit integral value used to initialize bits [15:8] of the result. 3933 /// \param __b2 3934 /// An 8-bit integral value used to initialize bits [23:16] of the result. 3935 /// \param __b3 3936 /// An 8-bit integral value used to initialize bits [31:24] of the result. 3937 /// \param __b4 3938 /// An 8-bit integral value used to initialize bits [39:32] of the result. 3939 /// \param __b5 3940 /// An 8-bit integral value used to initialize bits [47:40] of the result. 3941 /// \param __b6 3942 /// An 8-bit integral value used to initialize bits [55:48] of the result. 3943 /// \param __b7 3944 /// An 8-bit integral value used to initialize bits [63:56] of the result. 3945 /// \param __b8 3946 /// An 8-bit integral value used to initialize bits [71:64] of the result. 3947 /// \param __b9 3948 /// An 8-bit integral value used to initialize bits [79:72] of the result. 3949 /// \param __b10 3950 /// An 8-bit integral value used to initialize bits [87:80] of the result. 3951 /// \param __b11 3952 /// An 8-bit integral value used to initialize bits [95:88] of the result. 3953 /// \param __b12 3954 /// An 8-bit integral value used to initialize bits [103:96] of the result. 3955 /// \param __b13 3956 /// An 8-bit integral value used to initialize bits [111:104] of the result. 
3957 /// \param __b14 3958 /// An 8-bit integral value used to initialize bits [119:112] of the result. 3959 /// \param __b15 3960 /// An 8-bit integral value used to initialize bits [127:120] of the result. 3961 /// \returns An initialized 128-bit integer vector. 3962 static __inline__ __m128i __DEFAULT_FN_ATTRS 3963 _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15) 3964 { 3965 return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8, __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0); 3966 } 3967 3968 /// Creates a 128-bit integer vector initialized to zero. 3969 /// 3970 /// \headerfile <x86intrin.h> 3971 /// 3972 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction. 3973 /// 3974 /// \returns An initialized 128-bit integer vector with all elements set to 3975 /// zero. 3976 static __inline__ __m128i __DEFAULT_FN_ATTRS 3977 _mm_setzero_si128(void) 3978 { 3979 return __extension__ (__m128i)(__v2di){ 0LL, 0LL }; 3980 } 3981 3982 /// Stores a 128-bit integer vector to a memory location aligned on a 3983 /// 128-bit boundary. 3984 /// 3985 /// \headerfile <x86intrin.h> 3986 /// 3987 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction. 3988 /// 3989 /// \param __p 3990 /// A pointer to an aligned memory location that will receive the integer 3991 /// values. 3992 /// \param __b 3993 /// A 128-bit integer vector containing the values to be moved. 3994 static __inline__ void __DEFAULT_FN_ATTRS 3995 _mm_store_si128(__m128i *__p, __m128i __b) 3996 { 3997 *__p = __b; 3998 } 3999 4000 /// Stores a 128-bit integer vector to an unaligned memory location. 4001 /// 4002 /// \headerfile <x86intrin.h> 4003 /// 4004 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction. 
4005 /// 4006 /// \param __p 4007 /// A pointer to a memory location that will receive the integer values. 4008 /// \param __b 4009 /// A 128-bit integer vector containing the values to be moved. 4010 static __inline__ void __DEFAULT_FN_ATTRS 4011 _mm_storeu_si128(__m128i_u *__p, __m128i __b) 4012 { 4013 struct __storeu_si128 { 4014 __m128i_u __v; 4015 } __attribute__((__packed__, __may_alias__)); 4016 ((struct __storeu_si128*)__p)->__v = __b; 4017 } 4018 4019 /// Stores a 64-bit integer value from the low element of a 128-bit integer 4020 /// vector. 4021 /// 4022 /// \headerfile <x86intrin.h> 4023 /// 4024 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 4025 /// 4026 /// \param __p 4027 /// A pointer to a 64-bit memory location. The address of the memory 4028 /// location does not have to be algned. 4029 /// \param __b 4030 /// A 128-bit integer vector containing the value to be stored. 4031 static __inline__ void __DEFAULT_FN_ATTRS 4032 _mm_storeu_si64(void *__p, __m128i __b) 4033 { 4034 struct __storeu_si64 { 4035 long long __v; 4036 } __attribute__((__packed__, __may_alias__)); 4037 ((struct __storeu_si64*)__p)->__v = ((__v2di)__b)[0]; 4038 } 4039 4040 /// Stores a 32-bit integer value from the low element of a 128-bit integer 4041 /// vector. 4042 /// 4043 /// \headerfile <x86intrin.h> 4044 /// 4045 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. 4046 /// 4047 /// \param __p 4048 /// A pointer to a 32-bit memory location. The address of the memory 4049 /// location does not have to be aligned. 4050 /// \param __b 4051 /// A 128-bit integer vector containing the value to be stored. 
4052 static __inline__ void __DEFAULT_FN_ATTRS 4053 _mm_storeu_si32(void *__p, __m128i __b) 4054 { 4055 struct __storeu_si32 { 4056 int __v; 4057 } __attribute__((__packed__, __may_alias__)); 4058 ((struct __storeu_si32*)__p)->__v = ((__v4si)__b)[0]; 4059 } 4060 4061 /// Stores a 16-bit integer value from the low element of a 128-bit integer 4062 /// vector. 4063 /// 4064 /// \headerfile <x86intrin.h> 4065 /// 4066 /// This intrinsic does not correspond to a specific instruction. 4067 /// 4068 /// \param __p 4069 /// A pointer to a 16-bit memory location. The address of the memory 4070 /// location does not have to be aligned. 4071 /// \param __b 4072 /// A 128-bit integer vector containing the value to be stored. 4073 static __inline__ void __DEFAULT_FN_ATTRS 4074 _mm_storeu_si16(void *__p, __m128i __b) 4075 { 4076 struct __storeu_si16 { 4077 short __v; 4078 } __attribute__((__packed__, __may_alias__)); 4079 ((struct __storeu_si16*)__p)->__v = ((__v8hi)__b)[0]; 4080 } 4081 4082 /// Moves bytes selected by the mask from the first operand to the 4083 /// specified unaligned memory location. When a mask bit is 1, the 4084 /// corresponding byte is written, otherwise it is not written. 4085 /// 4086 /// To minimize caching, the data is flagged as non-temporal (unlikely to be 4087 /// used again soon). Exception and trap behavior for elements not selected 4088 /// for storage to memory are implementation dependent. 4089 /// 4090 /// \headerfile <x86intrin.h> 4091 /// 4092 /// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c> 4093 /// instruction. 4094 /// 4095 /// \param __d 4096 /// A 128-bit integer vector containing the values to be moved. 4097 /// \param __n 4098 /// A 128-bit integer vector containing the mask. The most significant bit of 4099 /// each byte represents the mask bits. 4100 /// \param __p 4101 /// A pointer to an unaligned 128-bit memory location where the specified 4102 /// values are moved. 
4103 static __inline__ void __DEFAULT_FN_ATTRS 4104 _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p) 4105 { 4106 __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p); 4107 } 4108 4109 /// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to 4110 /// a memory location. 4111 /// 4112 /// \headerfile <x86intrin.h> 4113 /// 4114 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction. 4115 /// 4116 /// \param __p 4117 /// A pointer to a 64-bit memory location that will receive the lower 64 bits 4118 /// of the integer vector parameter. 4119 /// \param __a 4120 /// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the 4121 /// value to be stored. 4122 static __inline__ void __DEFAULT_FN_ATTRS 4123 _mm_storel_epi64(__m128i_u *__p, __m128i __a) 4124 { 4125 struct __mm_storel_epi64_struct { 4126 long long __u; 4127 } __attribute__((__packed__, __may_alias__)); 4128 ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0]; 4129 } 4130 4131 /// Stores a 128-bit floating point vector of [2 x double] to a 128-bit 4132 /// aligned memory location. 4133 /// 4134 /// To minimize caching, the data is flagged as non-temporal (unlikely to be 4135 /// used again soon). 4136 /// 4137 /// \headerfile <x86intrin.h> 4138 /// 4139 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction. 4140 /// 4141 /// \param __p 4142 /// A pointer to the 128-bit aligned memory location used to store the value. 4143 /// \param __a 4144 /// A vector of [2 x double] containing the 64-bit values to be stored. 4145 static __inline__ void __DEFAULT_FN_ATTRS 4146 _mm_stream_pd(double *__p, __m128d __a) 4147 { 4148 __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p); 4149 } 4150 4151 /// Stores a 128-bit integer vector to a 128-bit aligned memory location. 4152 /// 4153 /// To minimize caching, the data is flagged as non-temporal (unlikely to be 4154 /// used again soon). 
4155 /// 4156 /// \headerfile <x86intrin.h> 4157 /// 4158 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction. 4159 /// 4160 /// \param __p 4161 /// A pointer to the 128-bit aligned memory location used to store the value. 4162 /// \param __a 4163 /// A 128-bit integer vector containing the values to be stored. 4164 static __inline__ void __DEFAULT_FN_ATTRS 4165 _mm_stream_si128(__m128i *__p, __m128i __a) 4166 { 4167 __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p); 4168 } 4169 4170 /// Stores a 32-bit integer value in the specified memory location. 4171 /// 4172 /// To minimize caching, the data is flagged as non-temporal (unlikely to be 4173 /// used again soon). 4174 /// 4175 /// \headerfile <x86intrin.h> 4176 /// 4177 /// This intrinsic corresponds to the <c> MOVNTI </c> instruction. 4178 /// 4179 /// \param __p 4180 /// A pointer to the 32-bit memory location used to store the value. 4181 /// \param __a 4182 /// A 32-bit integer containing the value to be stored. 4183 static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("sse2"))) 4184 _mm_stream_si32(int *__p, int __a) 4185 { 4186 __builtin_ia32_movnti(__p, __a); 4187 } 4188 4189 #ifdef __x86_64__ 4190 /// Stores a 64-bit integer value in the specified memory location. 4191 /// 4192 /// To minimize caching, the data is flagged as non-temporal (unlikely to be 4193 /// used again soon). 4194 /// 4195 /// \headerfile <x86intrin.h> 4196 /// 4197 /// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction. 4198 /// 4199 /// \param __p 4200 /// A pointer to the 64-bit memory location used to store the value. 4201 /// \param __a 4202 /// A 64-bit integer containing the value to be stored. 
4203 static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("sse2"))) 4204 _mm_stream_si64(long long *__p, long long __a) 4205 { 4206 __builtin_ia32_movnti64(__p, __a); 4207 } 4208 #endif 4209 4210 #if defined(__cplusplus) 4211 extern "C" { 4212 #endif 4213 4214 /// The cache line containing \a __p is flushed and invalidated from all 4215 /// caches in the coherency domain. 4216 /// 4217 /// \headerfile <x86intrin.h> 4218 /// 4219 /// This intrinsic corresponds to the <c> CLFLUSH </c> instruction. 4220 /// 4221 /// \param __p 4222 /// A pointer to the memory location used to identify the cache line to be 4223 /// flushed. 4224 void _mm_clflush(void const * __p); 4225 4226 /// Forces strong memory ordering (serialization) between load 4227 /// instructions preceding this instruction and load instructions following 4228 /// this instruction, ensuring the system completes all previous loads before 4229 /// executing subsequent loads. 4230 /// 4231 /// \headerfile <x86intrin.h> 4232 /// 4233 /// This intrinsic corresponds to the <c> LFENCE </c> instruction. 4234 /// 4235 void _mm_lfence(void); 4236 4237 /// Forces strong memory ordering (serialization) between load and store 4238 /// instructions preceding this instruction and load and store instructions 4239 /// following this instruction, ensuring that the system completes all 4240 /// previous memory accesses before executing subsequent memory accesses. 4241 /// 4242 /// \headerfile <x86intrin.h> 4243 /// 4244 /// This intrinsic corresponds to the <c> MFENCE </c> instruction. 4245 /// 4246 void _mm_mfence(void); 4247 4248 #if defined(__cplusplus) 4249 } // extern "C" 4250 #endif 4251 4252 /// Converts 16-bit signed integers from both 128-bit integer vector 4253 /// operands into 8-bit signed integers, and packs the results into the 4254 /// destination. Positive values greater than 0x7F are saturated to 0x7F. 4255 /// Negative values less than 0x80 are saturated to 0x80. 
4256 /// 4257 /// \headerfile <x86intrin.h> 4258 /// 4259 /// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction. 4260 /// 4261 /// \param __a 4262 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as 4263 /// a signed integer and is converted to a 8-bit signed integer with 4264 /// saturation. Values greater than 0x7F are saturated to 0x7F. Values less 4265 /// than 0x80 are saturated to 0x80. The converted [8 x i8] values are 4266 /// written to the lower 64 bits of the result. 4267 /// \param __b 4268 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as 4269 /// a signed integer and is converted to a 8-bit signed integer with 4270 /// saturation. Values greater than 0x7F are saturated to 0x7F. Values less 4271 /// than 0x80 are saturated to 0x80. The converted [8 x i8] values are 4272 /// written to the higher 64 bits of the result. 4273 /// \returns A 128-bit vector of [16 x i8] containing the converted values. 4274 static __inline__ __m128i __DEFAULT_FN_ATTRS 4275 _mm_packs_epi16(__m128i __a, __m128i __b) 4276 { 4277 return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b); 4278 } 4279 4280 /// Converts 32-bit signed integers from both 128-bit integer vector 4281 /// operands into 16-bit signed integers, and packs the results into the 4282 /// destination. Positive values greater than 0x7FFF are saturated to 0x7FFF. 4283 /// Negative values less than 0x8000 are saturated to 0x8000. 4284 /// 4285 /// \headerfile <x86intrin.h> 4286 /// 4287 /// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction. 4288 /// 4289 /// \param __a 4290 /// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as 4291 /// a signed integer and is converted to a 16-bit signed integer with 4292 /// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values 4293 /// less than 0x8000 are saturated to 0x8000. 
The converted [4 x i16] values 4294 /// are written to the lower 64 bits of the result. 4295 /// \param __b 4296 /// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as 4297 /// a signed integer and is converted to a 16-bit signed integer with 4298 /// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values 4299 /// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values 4300 /// are written to the higher 64 bits of the result. 4301 /// \returns A 128-bit vector of [8 x i16] containing the converted values. 4302 static __inline__ __m128i __DEFAULT_FN_ATTRS 4303 _mm_packs_epi32(__m128i __a, __m128i __b) 4304 { 4305 return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b); 4306 } 4307 4308 /// Converts 16-bit signed integers from both 128-bit integer vector 4309 /// operands into 8-bit unsigned integers, and packs the results into the 4310 /// destination. Values greater than 0xFF are saturated to 0xFF. Values less 4311 /// than 0x00 are saturated to 0x00. 4312 /// 4313 /// \headerfile <x86intrin.h> 4314 /// 4315 /// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction. 4316 /// 4317 /// \param __a 4318 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as 4319 /// a signed integer and is converted to an 8-bit unsigned integer with 4320 /// saturation. Values greater than 0xFF are saturated to 0xFF. Values less 4321 /// than 0x00 are saturated to 0x00. The converted [8 x i8] values are 4322 /// written to the lower 64 bits of the result. 4323 /// \param __b 4324 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as 4325 /// a signed integer and is converted to an 8-bit unsigned integer with 4326 /// saturation. Values greater than 0xFF are saturated to 0xFF. Values less 4327 /// than 0x00 are saturated to 0x00. The converted [8 x i8] values are 4328 /// written to the higher 64 bits of the result. 
4329 /// \returns A 128-bit vector of [16 x i8] containing the converted values. 4330 static __inline__ __m128i __DEFAULT_FN_ATTRS 4331 _mm_packus_epi16(__m128i __a, __m128i __b) 4332 { 4333 return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b); 4334 } 4335 4336 /// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using 4337 /// the immediate-value parameter as a selector. 4338 /// 4339 /// \headerfile <x86intrin.h> 4340 /// 4341 /// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction. 4342 /// 4343 /// \param __a 4344 /// A 128-bit integer vector. 4345 /// \param __imm 4346 /// An immediate value. Bits [2:0] selects values from \a __a to be assigned 4347 /// to bits[15:0] of the result. \n 4348 /// 000: assign values from bits [15:0] of \a __a. \n 4349 /// 001: assign values from bits [31:16] of \a __a. \n 4350 /// 010: assign values from bits [47:32] of \a __a. \n 4351 /// 011: assign values from bits [63:48] of \a __a. \n 4352 /// 100: assign values from bits [79:64] of \a __a. \n 4353 /// 101: assign values from bits [95:80] of \a __a. \n 4354 /// 110: assign values from bits [111:96] of \a __a. \n 4355 /// 111: assign values from bits [127:112] of \a __a. 4356 /// \returns An integer, whose lower 16 bits are selected from the 128-bit 4357 /// integer vector parameter and the remaining bits are assigned zeros. 4358 #define _mm_extract_epi16(a, imm) \ 4359 (int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \ 4360 (int)(imm)) 4361 4362 /// Constructs a 128-bit integer vector by first making a copy of the 4363 /// 128-bit integer vector parameter, and then inserting the lower 16 bits 4364 /// of an integer parameter into an offset specified by the immediate-value 4365 /// parameter. 4366 /// 4367 /// \headerfile <x86intrin.h> 4368 /// 4369 /// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction. 4370 /// 4371 /// \param __a 4372 /// A 128-bit integer vector of [8 x i16]. 
This vector is copied to the 4373 /// result and then one of the eight elements in the result is replaced by 4374 /// the lower 16 bits of \a __b. 4375 /// \param __b 4376 /// An integer. The lower 16 bits of this parameter are written to the 4377 /// result beginning at an offset specified by \a __imm. 4378 /// \param __imm 4379 /// An immediate value specifying the bit offset in the result at which the 4380 /// lower 16 bits of \a __b are written. 4381 /// \returns A 128-bit integer vector containing the constructed values. 4382 #define _mm_insert_epi16(a, b, imm) \ 4383 (__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \ 4384 (int)(imm)) 4385 4386 /// Copies the values of the most significant bits from each 8-bit 4387 /// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask 4388 /// value, zero-extends the value, and writes it to the destination. 4389 /// 4390 /// \headerfile <x86intrin.h> 4391 /// 4392 /// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction. 4393 /// 4394 /// \param __a 4395 /// A 128-bit integer vector containing the values with bits to be extracted. 4396 /// \returns The most significant bits from each 8-bit element in \a __a, 4397 /// written to bits [15:0]. The other bits are assigned zeros. 4398 static __inline__ int __DEFAULT_FN_ATTRS 4399 _mm_movemask_epi8(__m128i __a) 4400 { 4401 return __builtin_ia32_pmovmskb128((__v16qi)__a); 4402 } 4403 4404 /// Constructs a 128-bit integer vector by shuffling four 32-bit 4405 /// elements of a 128-bit integer vector parameter, using the immediate-value 4406 /// parameter as a specifier. 4407 /// 4408 /// \headerfile <x86intrin.h> 4409 /// 4410 /// \code 4411 /// __m128i _mm_shuffle_epi32(__m128i a, const int imm); 4412 /// \endcode 4413 /// 4414 /// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction. 4415 /// 4416 /// \param a 4417 /// A 128-bit integer vector containing the values to be copied. 
/// \param imm
///    An immediate value containing an 8-bit value specifying which elements to
///    copy from \a a. The destinations within the 128-bit destination are
///    assigned values as follows: \n
///    Bits [1:0] are used to assign values to bits [31:0] of the result. \n
///    Bits [3:2] are used to assign values to bits [63:32] of the result. \n
///    Bits [5:4] are used to assign values to bits [95:64] of the result. \n
///    Bits [7:6] are used to assign values to bits [127:96] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [31:0] of \a a. \n
///    01: assign values from bits [63:32] of \a a. \n
///    10: assign values from bits [95:64] of \a a. \n
///    11: assign values from bits [127:96] of \a a.
/// \returns A 128-bit integer vector containing the shuffled values.
/* The whole expansion is parenthesized so that operators applied to the
   macro call (for example a subscript) bind to the (__m128i) result rather
   than to the builtin call inside the cast. */
#define _mm_shuffle_epi32(a, imm) \
  ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))

/// Constructs a 128-bit integer vector by shuffling four lower 16-bit
///    elements of a 128-bit integer vector of [8 x i16], using the immediate
///    value parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
///    [127:64] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
///    Bits[1:0] are used to assign values to bits [15:0] of the result. \n
///    Bits[3:2] are used to assign values to bits [31:16] of the result. \n
///    Bits[5:4] are used to assign values to bits [47:32] of the result. \n
///    Bits[7:6] are used to assign values to bits [63:48] of the result.
///    \n
///    Bit value assignments: \n
///    00: assign values from bits [15:0] of \a a. \n
///    01: assign values from bits [31:16] of \a a. \n
///    10: assign values from bits [47:32] of \a a. \n
///    11: assign values from bits [63:48] of \a a. \n
/// \returns A 128-bit integer vector containing the shuffled values.
/* The whole expansion is parenthesized so that operators applied to the
   macro call (for example a subscript) bind to the (__m128i) result rather
   than to the builtin call inside the cast. */
#define _mm_shufflelo_epi16(a, imm) \
  ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))

/// Constructs a 128-bit integer vector by shuffling four upper 16-bit
///    elements of a 128-bit integer vector of [8 x i16], using the immediate
///    value parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
///    [63:0] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
///    Bits[1:0] are used to assign values to bits [79:64] of the result. \n
///    Bits[3:2] are used to assign values to bits [95:80] of the result. \n
///    Bits[5:4] are used to assign values to bits [111:96] of the result. \n
///    Bits[7:6] are used to assign values to bits [127:112] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [79:64] of \a a. \n
///    01: assign values from bits [95:80] of \a a. \n
///    10: assign values from bits [111:96] of \a a. \n
///    11: assign values from bits [127:112] of \a a. \n
/// \returns A 128-bit integer vector containing the shuffled values.
/* The whole expansion is parenthesized so that operators applied to the
   macro call (for example a subscript) bind to the (__m128i) result rather
   than to the builtin call inside the cast. */
#define _mm_shufflehi_epi16(a, imm) \
  ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))

/// Unpacks the high-order (index 8-15) values from two 128-bit vectors
///    of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
///   instruction.
///
/// \param __a
///    A 128-bit vector of [16 x i8]. \n
///    Bits [71:64] are written to bits [7:0] of the result. \n
///    Bits [79:72] are written to bits [23:16] of the result. \n
///    Bits [87:80] are written to bits [39:32] of the result. \n
///    Bits [95:88] are written to bits [55:48] of the result. \n
///    Bits [103:96] are written to bits [71:64] of the result. \n
///    Bits [111:104] are written to bits [87:80] of the result. \n
///    Bits [119:112] are written to bits [103:96] of the result. \n
///    Bits [127:120] are written to bits [119:112] of the result.
/// \param __b
///    A 128-bit vector of [16 x i8]. \n
///    Bits [71:64] are written to bits [15:8] of the result. \n
///    Bits [79:72] are written to bits [31:24] of the result. \n
///    Bits [87:80] are written to bits [47:40] of the result. \n
///    Bits [95:88] are written to bits [63:56] of the result. \n
///    Bits [103:96] are written to bits [79:72] of the result. \n
///    Bits [111:104] are written to bits [95:88] of the result. \n
///    Bits [119:112] are written to bits [111:104] of the result. \n
///    Bits [127:120] are written to bits [127:120] of the result.
/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
4524 static __inline__ __m128i __DEFAULT_FN_ATTRS 4525 _mm_unpackhi_epi8(__m128i __a, __m128i __b) 4526 { 4527 return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); 4528 } 4529 4530 /// Unpacks the high-order (index 4-7) values from two 128-bit vectors of 4531 /// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16]. 4532 /// 4533 /// \headerfile <x86intrin.h> 4534 /// 4535 /// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c> 4536 /// instruction. 4537 /// 4538 /// \param __a 4539 /// A 128-bit vector of [8 x i16]. 4540 /// Bits [79:64] are written to bits [15:0] of the result. \n 4541 /// Bits [95:80] are written to bits [47:32] of the result. \n 4542 /// Bits [111:96] are written to bits [79:64] of the result. \n 4543 /// Bits [127:112] are written to bits [111:96] of the result. 4544 /// \param __b 4545 /// A 128-bit vector of [8 x i16]. 4546 /// Bits [79:64] are written to bits [31:16] of the result. \n 4547 /// Bits [95:80] are written to bits [63:48] of the result. \n 4548 /// Bits [111:96] are written to bits [95:80] of the result. \n 4549 /// Bits [127:112] are written to bits [127:112] of the result. 4550 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values. 4551 static __inline__ __m128i __DEFAULT_FN_ATTRS 4552 _mm_unpackhi_epi16(__m128i __a, __m128i __b) 4553 { 4554 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7); 4555 } 4556 4557 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of 4558 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32]. 4559 /// 4560 /// \headerfile <x86intrin.h> 4561 /// 4562 /// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c> 4563 /// instruction. 4564 /// 4565 /// \param __a 4566 /// A 128-bit vector of [4 x i32]. 
\n 4567 /// Bits [95:64] are written to bits [31:0] of the destination. \n 4568 /// Bits [127:96] are written to bits [95:64] of the destination. 4569 /// \param __b 4570 /// A 128-bit vector of [4 x i32]. \n 4571 /// Bits [95:64] are written to bits [64:32] of the destination. \n 4572 /// Bits [127:96] are written to bits [127:96] of the destination. 4573 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values. 4574 static __inline__ __m128i __DEFAULT_FN_ATTRS 4575 _mm_unpackhi_epi32(__m128i __a, __m128i __b) 4576 { 4577 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3); 4578 } 4579 4580 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of 4581 /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64]. 4582 /// 4583 /// \headerfile <x86intrin.h> 4584 /// 4585 /// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c> 4586 /// instruction. 4587 /// 4588 /// \param __a 4589 /// A 128-bit vector of [2 x i64]. \n 4590 /// Bits [127:64] are written to bits [63:0] of the destination. 4591 /// \param __b 4592 /// A 128-bit vector of [2 x i64]. \n 4593 /// Bits [127:64] are written to bits [127:64] of the destination. 4594 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values. 4595 static __inline__ __m128i __DEFAULT_FN_ATTRS 4596 _mm_unpackhi_epi64(__m128i __a, __m128i __b) 4597 { 4598 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1); 4599 } 4600 4601 /// Unpacks the low-order (index 0-7) values from two 128-bit vectors of 4602 /// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8]. 4603 /// 4604 /// \headerfile <x86intrin.h> 4605 /// 4606 /// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c> 4607 /// instruction. 4608 /// 4609 /// \param __a 4610 /// A 128-bit vector of [16 x i8]. \n 4611 /// Bits [7:0] are written to bits [7:0] of the result. 
\n 4612 /// Bits [15:8] are written to bits [23:16] of the result. \n 4613 /// Bits [23:16] are written to bits [39:32] of the result. \n 4614 /// Bits [31:24] are written to bits [55:48] of the result. \n 4615 /// Bits [39:32] are written to bits [71:64] of the result. \n 4616 /// Bits [47:40] are written to bits [87:80] of the result. \n 4617 /// Bits [55:48] are written to bits [103:96] of the result. \n 4618 /// Bits [63:56] are written to bits [119:112] of the result. 4619 /// \param __b 4620 /// A 128-bit vector of [16 x i8]. 4621 /// Bits [7:0] are written to bits [15:8] of the result. \n 4622 /// Bits [15:8] are written to bits [31:24] of the result. \n 4623 /// Bits [23:16] are written to bits [47:40] of the result. \n 4624 /// Bits [31:24] are written to bits [63:56] of the result. \n 4625 /// Bits [39:32] are written to bits [79:72] of the result. \n 4626 /// Bits [47:40] are written to bits [95:88] of the result. \n 4627 /// Bits [55:48] are written to bits [111:104] of the result. \n 4628 /// Bits [63:56] are written to bits [127:120] of the result. 4629 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values. 4630 static __inline__ __m128i __DEFAULT_FN_ATTRS 4631 _mm_unpacklo_epi8(__m128i __a, __m128i __b) 4632 { 4633 return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7); 4634 } 4635 4636 /// Unpacks the low-order (index 0-3) values from each of the two 128-bit 4637 /// vectors of [8 x i16] and interleaves them into a 128-bit vector of 4638 /// [8 x i16]. 4639 /// 4640 /// \headerfile <x86intrin.h> 4641 /// 4642 /// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c> 4643 /// instruction. 4644 /// 4645 /// \param __a 4646 /// A 128-bit vector of [8 x i16]. 4647 /// Bits [15:0] are written to bits [15:0] of the result. \n 4648 /// Bits [31:16] are written to bits [47:32] of the result. 
\n 4649 /// Bits [47:32] are written to bits [79:64] of the result. \n 4650 /// Bits [63:48] are written to bits [111:96] of the result. 4651 /// \param __b 4652 /// A 128-bit vector of [8 x i16]. 4653 /// Bits [15:0] are written to bits [31:16] of the result. \n 4654 /// Bits [31:16] are written to bits [63:48] of the result. \n 4655 /// Bits [47:32] are written to bits [95:80] of the result. \n 4656 /// Bits [63:48] are written to bits [127:112] of the result. 4657 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values. 4658 static __inline__ __m128i __DEFAULT_FN_ATTRS 4659 _mm_unpacklo_epi16(__m128i __a, __m128i __b) 4660 { 4661 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3); 4662 } 4663 4664 /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of 4665 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32]. 4666 /// 4667 /// \headerfile <x86intrin.h> 4668 /// 4669 /// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c> 4670 /// instruction. 4671 /// 4672 /// \param __a 4673 /// A 128-bit vector of [4 x i32]. \n 4674 /// Bits [31:0] are written to bits [31:0] of the destination. \n 4675 /// Bits [63:32] are written to bits [95:64] of the destination. 4676 /// \param __b 4677 /// A 128-bit vector of [4 x i32]. \n 4678 /// Bits [31:0] are written to bits [64:32] of the destination. \n 4679 /// Bits [63:32] are written to bits [127:96] of the destination. 4680 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values. 4681 static __inline__ __m128i __DEFAULT_FN_ATTRS 4682 _mm_unpacklo_epi32(__m128i __a, __m128i __b) 4683 { 4684 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1); 4685 } 4686 4687 /// Unpacks the low-order 64-bit elements from two 128-bit vectors of 4688 /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64]. 
4689 /// 4690 /// \headerfile <x86intrin.h> 4691 /// 4692 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c> 4693 /// instruction. 4694 /// 4695 /// \param __a 4696 /// A 128-bit vector of [2 x i64]. \n 4697 /// Bits [63:0] are written to bits [63:0] of the destination. \n 4698 /// \param __b 4699 /// A 128-bit vector of [2 x i64]. \n 4700 /// Bits [63:0] are written to bits [127:64] of the destination. \n 4701 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values. 4702 static __inline__ __m128i __DEFAULT_FN_ATTRS 4703 _mm_unpacklo_epi64(__m128i __a, __m128i __b) 4704 { 4705 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0); 4706 } 4707 4708 /// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit 4709 /// integer. 4710 /// 4711 /// \headerfile <x86intrin.h> 4712 /// 4713 /// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction. 4714 /// 4715 /// \param __a 4716 /// A 128-bit integer vector operand. The lower 64 bits are moved to the 4717 /// destination. 4718 /// \returns A 64-bit integer containing the lower 64 bits of the parameter. 4719 static __inline__ __m64 __DEFAULT_FN_ATTRS 4720 _mm_movepi64_pi64(__m128i __a) 4721 { 4722 return (__m64)__a[0]; 4723 } 4724 4725 /// Moves the 64-bit operand to a 128-bit integer vector, zeroing the 4726 /// upper bits. 4727 /// 4728 /// \headerfile <x86intrin.h> 4729 /// 4730 /// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction. 4731 /// 4732 /// \param __a 4733 /// A 64-bit value. 4734 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from 4735 /// the operand. The upper 64 bits are assigned zeros. 4736 static __inline__ __m128i __DEFAULT_FN_ATTRS 4737 _mm_movpi64_epi64(__m64 __a) 4738 { 4739 return __extension__ (__m128i)(__v2di){ (long long)__a, 0 }; 4740 } 4741 4742 /// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit 4743 /// integer vector, zeroing the upper bits. 
4744 /// 4745 /// \headerfile <x86intrin.h> 4746 /// 4747 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 4748 /// 4749 /// \param __a 4750 /// A 128-bit integer vector operand. The lower 64 bits are moved to the 4751 /// destination. 4752 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from 4753 /// the operand. The upper 64 bits are assigned zeros. 4754 static __inline__ __m128i __DEFAULT_FN_ATTRS 4755 _mm_move_epi64(__m128i __a) 4756 { 4757 return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2); 4758 } 4759 4760 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of 4761 /// [2 x double] and interleaves them into a 128-bit vector of [2 x 4762 /// double]. 4763 /// 4764 /// \headerfile <x86intrin.h> 4765 /// 4766 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction. 4767 /// 4768 /// \param __a 4769 /// A 128-bit vector of [2 x double]. \n 4770 /// Bits [127:64] are written to bits [63:0] of the destination. 4771 /// \param __b 4772 /// A 128-bit vector of [2 x double]. \n 4773 /// Bits [127:64] are written to bits [127:64] of the destination. 4774 /// \returns A 128-bit vector of [2 x double] containing the interleaved values. 4775 static __inline__ __m128d __DEFAULT_FN_ATTRS 4776 _mm_unpackhi_pd(__m128d __a, __m128d __b) 4777 { 4778 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1); 4779 } 4780 4781 /// Unpacks the low-order 64-bit elements from two 128-bit vectors 4782 /// of [2 x double] and interleaves them into a 128-bit vector of [2 x 4783 /// double]. 4784 /// 4785 /// \headerfile <x86intrin.h> 4786 /// 4787 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction. 4788 /// 4789 /// \param __a 4790 /// A 128-bit vector of [2 x double]. \n 4791 /// Bits [63:0] are written to bits [63:0] of the destination. 4792 /// \param __b 4793 /// A 128-bit vector of [2 x double]. 
\n 4794 /// Bits [63:0] are written to bits [127:64] of the destination. 4795 /// \returns A 128-bit vector of [2 x double] containing the interleaved values. 4796 static __inline__ __m128d __DEFAULT_FN_ATTRS 4797 _mm_unpacklo_pd(__m128d __a, __m128d __b) 4798 { 4799 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0); 4800 } 4801 4802 /// Extracts the sign bits of the double-precision values in the 128-bit 4803 /// vector of [2 x double], zero-extends the value, and writes it to the 4804 /// low-order bits of the destination. 4805 /// 4806 /// \headerfile <x86intrin.h> 4807 /// 4808 /// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction. 4809 /// 4810 /// \param __a 4811 /// A 128-bit vector of [2 x double] containing the values with sign bits to 4812 /// be extracted. 4813 /// \returns The sign bits from each of the double-precision elements in \a __a, 4814 /// written to bits [1:0]. The remaining bits are assigned values of zero. 4815 static __inline__ int __DEFAULT_FN_ATTRS 4816 _mm_movemask_pd(__m128d __a) 4817 { 4818 return __builtin_ia32_movmskpd((__v2df)__a); 4819 } 4820 4821 4822 /// Constructs a 128-bit floating-point vector of [2 x double] from two 4823 /// 128-bit vector parameters of [2 x double], using the immediate-value 4824 /// parameter as a specifier. 4825 /// 4826 /// \headerfile <x86intrin.h> 4827 /// 4828 /// \code 4829 /// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i); 4830 /// \endcode 4831 /// 4832 /// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction. 4833 /// 4834 /// \param a 4835 /// A 128-bit vector of [2 x double]. 4836 /// \param b 4837 /// A 128-bit vector of [2 x double]. 4838 /// \param i 4839 /// An 8-bit immediate value. The least significant two bits specify which 4840 /// elements to copy from \a a and \a b: \n 4841 /// Bit[0] = 0: lower element of \a a copied to lower element of result. 
\n 4842 /// Bit[0] = 1: upper element of \a a copied to lower element of result. \n 4843 /// Bit[1] = 0: lower element of \a b copied to upper element of result. \n 4844 /// Bit[1] = 1: upper element of \a b copied to upper element of result. \n 4845 /// \returns A 128-bit vector of [2 x double] containing the shuffled values. 4846 #define _mm_shuffle_pd(a, b, i) \ 4847 (__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \ 4848 (int)(i)) 4849 4850 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit 4851 /// floating-point vector of [4 x float]. 4852 /// 4853 /// \headerfile <x86intrin.h> 4854 /// 4855 /// This intrinsic has no corresponding instruction. 4856 /// 4857 /// \param __a 4858 /// A 128-bit floating-point vector of [2 x double]. 4859 /// \returns A 128-bit floating-point vector of [4 x float] containing the same 4860 /// bitwise pattern as the parameter. 4861 static __inline__ __m128 __DEFAULT_FN_ATTRS 4862 _mm_castpd_ps(__m128d __a) 4863 { 4864 return (__m128)__a; 4865 } 4866 4867 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit 4868 /// integer vector. 4869 /// 4870 /// \headerfile <x86intrin.h> 4871 /// 4872 /// This intrinsic has no corresponding instruction. 4873 /// 4874 /// \param __a 4875 /// A 128-bit floating-point vector of [2 x double]. 4876 /// \returns A 128-bit integer vector containing the same bitwise pattern as the 4877 /// parameter. 4878 static __inline__ __m128i __DEFAULT_FN_ATTRS 4879 _mm_castpd_si128(__m128d __a) 4880 { 4881 return (__m128i)__a; 4882 } 4883 4884 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit 4885 /// floating-point vector of [2 x double]. 4886 /// 4887 /// \headerfile <x86intrin.h> 4888 /// 4889 /// This intrinsic has no corresponding instruction. 4890 /// 4891 /// \param __a 4892 /// A 128-bit floating-point vector of [4 x float]. 
4893 /// \returns A 128-bit floating-point vector of [2 x double] containing the same 4894 /// bitwise pattern as the parameter. 4895 static __inline__ __m128d __DEFAULT_FN_ATTRS 4896 _mm_castps_pd(__m128 __a) 4897 { 4898 return (__m128d)__a; 4899 } 4900 4901 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit 4902 /// integer vector. 4903 /// 4904 /// \headerfile <x86intrin.h> 4905 /// 4906 /// This intrinsic has no corresponding instruction. 4907 /// 4908 /// \param __a 4909 /// A 128-bit floating-point vector of [4 x float]. 4910 /// \returns A 128-bit integer vector containing the same bitwise pattern as the 4911 /// parameter. 4912 static __inline__ __m128i __DEFAULT_FN_ATTRS 4913 _mm_castps_si128(__m128 __a) 4914 { 4915 return (__m128i)__a; 4916 } 4917 4918 /// Casts a 128-bit integer vector into a 128-bit floating-point vector 4919 /// of [4 x float]. 4920 /// 4921 /// \headerfile <x86intrin.h> 4922 /// 4923 /// This intrinsic has no corresponding instruction. 4924 /// 4925 /// \param __a 4926 /// A 128-bit integer vector. 4927 /// \returns A 128-bit floating-point vector of [4 x float] containing the same 4928 /// bitwise pattern as the parameter. 4929 static __inline__ __m128 __DEFAULT_FN_ATTRS 4930 _mm_castsi128_ps(__m128i __a) 4931 { 4932 return (__m128)__a; 4933 } 4934 4935 /// Casts a 128-bit integer vector into a 128-bit floating-point vector 4936 /// of [2 x double]. 4937 /// 4938 /// \headerfile <x86intrin.h> 4939 /// 4940 /// This intrinsic has no corresponding instruction. 4941 /// 4942 /// \param __a 4943 /// A 128-bit integer vector. 4944 /// \returns A 128-bit floating-point vector of [2 x double] containing the same 4945 /// bitwise pattern as the parameter. 
4946 static __inline__ __m128d __DEFAULT_FN_ATTRS 4947 _mm_castsi128_pd(__m128i __a) 4948 { 4949 return (__m128d)__a; 4950 } 4951 4952 #if defined(__cplusplus) 4953 extern "C" { 4954 #endif 4955 4956 /// Indicates that a spin loop is being executed for the purposes of 4957 /// optimizing power consumption during the loop. 4958 /// 4959 /// \headerfile <x86intrin.h> 4960 /// 4961 /// This intrinsic corresponds to the <c> PAUSE </c> instruction. 4962 /// 4963 void _mm_pause(void); 4964 4965 #if defined(__cplusplus) 4966 } // extern "C" 4967 #endif 4968 #undef __DEFAULT_FN_ATTRS 4969 #undef __DEFAULT_FN_ATTRS_MMX 4970 4971 #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) 4972 4973 #define _MM_DENORMALS_ZERO_ON (0x0040) 4974 #define _MM_DENORMALS_ZERO_OFF (0x0000) 4975 4976 #define _MM_DENORMALS_ZERO_MASK (0x0040) 4977 4978 #define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK) 4979 #define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x))) 4980 4981 #endif /* __EMMINTRIN_H */ 4982