1 /*===---- emmintrin.h - SSE2 intrinsics ------------------------------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 */ 9 10 #ifndef __EMMINTRIN_H 11 #define __EMMINTRIN_H 12 13 #if !defined(__i386__) && !defined(__x86_64__) 14 #error "This header is only meant to be used on x86 and x64 architecture" 15 #endif 16 17 #include <xmmintrin.h> 18 19 typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16))); 20 typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16))); 21 22 typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1))); 23 typedef long long __m128i_u 24 __attribute__((__vector_size__(16), __aligned__(1))); 25 26 /* Type defines. */ 27 typedef double __v2df __attribute__((__vector_size__(16))); 28 typedef long long __v2di __attribute__((__vector_size__(16))); 29 typedef short __v8hi __attribute__((__vector_size__(16))); 30 typedef char __v16qi __attribute__((__vector_size__(16))); 31 32 /* Unsigned types */ 33 typedef unsigned long long __v2du __attribute__((__vector_size__(16))); 34 typedef unsigned short __v8hu __attribute__((__vector_size__(16))); 35 typedef unsigned char __v16qu __attribute__((__vector_size__(16))); 36 37 /* We need an explicitly signed variant for char. Note that this shouldn't 38 * appear in the interface though. */ 39 typedef signed char __v16qs __attribute__((__vector_size__(16))); 40 41 #ifdef __SSE2__ 42 /* Both _Float16 and __bf16 require SSE2 being enabled. 
*/ 43 typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16))); 44 typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16))); 45 typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1))); 46 47 typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16))); 48 typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16))); 49 #endif 50 51 /* Define the default attributes for the functions in this file. */ 52 #define __DEFAULT_FN_ATTRS \ 53 __attribute__((__always_inline__, __nodebug__, \ 54 __target__("sse2,no-evex512"), __min_vector_width__(128))) 55 #define __DEFAULT_FN_ATTRS_MMX \ 56 __attribute__((__always_inline__, __nodebug__, \ 57 __target__("mmx,sse2,no-evex512"), __min_vector_width__(64))) 58 59 /// Adds lower double-precision values in both operands and returns the 60 /// sum in the lower 64 bits of the result. The upper 64 bits of the result 61 /// are copied from the upper double-precision value of the first operand. 62 /// 63 /// \headerfile <x86intrin.h> 64 /// 65 /// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction. 66 /// 67 /// \param __a 68 /// A 128-bit vector of [2 x double] containing one of the source operands. 69 /// \param __b 70 /// A 128-bit vector of [2 x double] containing one of the source operands. 71 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 72 /// sum of the lower 64 bits of both operands. The upper 64 bits are copied 73 /// from the upper 64 bits of the first source operand. 74 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a, 75 __m128d __b) { 76 __a[0] += __b[0]; 77 return __a; 78 } 79 80 /// Adds two 128-bit vectors of [2 x double]. 81 /// 82 /// \headerfile <x86intrin.h> 83 /// 84 /// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction. 85 /// 86 /// \param __a 87 /// A 128-bit vector of [2 x double] containing one of the source operands. 
88 /// \param __b 89 /// A 128-bit vector of [2 x double] containing one of the source operands. 90 /// \returns A 128-bit vector of [2 x double] containing the sums of both 91 /// operands. 92 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a, 93 __m128d __b) { 94 return (__m128d)((__v2df)__a + (__v2df)__b); 95 } 96 97 /// Subtracts the lower double-precision value of the second operand 98 /// from the lower double-precision value of the first operand and returns 99 /// the difference in the lower 64 bits of the result. The upper 64 bits of 100 /// the result are copied from the upper double-precision value of the first 101 /// operand. 102 /// 103 /// \headerfile <x86intrin.h> 104 /// 105 /// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction. 106 /// 107 /// \param __a 108 /// A 128-bit vector of [2 x double] containing the minuend. 109 /// \param __b 110 /// A 128-bit vector of [2 x double] containing the subtrahend. 111 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 112 /// difference of the lower 64 bits of both operands. The upper 64 bits are 113 /// copied from the upper 64 bits of the first source operand. 114 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a, 115 __m128d __b) { 116 __a[0] -= __b[0]; 117 return __a; 118 } 119 120 /// Subtracts two 128-bit vectors of [2 x double]. 121 /// 122 /// \headerfile <x86intrin.h> 123 /// 124 /// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction. 125 /// 126 /// \param __a 127 /// A 128-bit vector of [2 x double] containing the minuend. 128 /// \param __b 129 /// A 128-bit vector of [2 x double] containing the subtrahend. 130 /// \returns A 128-bit vector of [2 x double] containing the differences between 131 /// both operands. 
132 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a, 133 __m128d __b) { 134 return (__m128d)((__v2df)__a - (__v2df)__b); 135 } 136 137 /// Multiplies lower double-precision values in both operands and returns 138 /// the product in the lower 64 bits of the result. The upper 64 bits of the 139 /// result are copied from the upper double-precision value of the first 140 /// operand. 141 /// 142 /// \headerfile <x86intrin.h> 143 /// 144 /// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction. 145 /// 146 /// \param __a 147 /// A 128-bit vector of [2 x double] containing one of the source operands. 148 /// \param __b 149 /// A 128-bit vector of [2 x double] containing one of the source operands. 150 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 151 /// product of the lower 64 bits of both operands. The upper 64 bits are 152 /// copied from the upper 64 bits of the first source operand. 153 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a, 154 __m128d __b) { 155 __a[0] *= __b[0]; 156 return __a; 157 } 158 159 /// Multiplies two 128-bit vectors of [2 x double]. 160 /// 161 /// \headerfile <x86intrin.h> 162 /// 163 /// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction. 164 /// 165 /// \param __a 166 /// A 128-bit vector of [2 x double] containing one of the operands. 167 /// \param __b 168 /// A 128-bit vector of [2 x double] containing one of the operands. 169 /// \returns A 128-bit vector of [2 x double] containing the products of both 170 /// operands. 171 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a, 172 __m128d __b) { 173 return (__m128d)((__v2df)__a * (__v2df)__b); 174 } 175 176 /// Divides the lower double-precision value of the first operand by the 177 /// lower double-precision value of the second operand and returns the 178 /// quotient in the lower 64 bits of the result. 
The upper 64 bits of the 179 /// result are copied from the upper double-precision value of the first 180 /// operand. 181 /// 182 /// \headerfile <x86intrin.h> 183 /// 184 /// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction. 185 /// 186 /// \param __a 187 /// A 128-bit vector of [2 x double] containing the dividend. 188 /// \param __b 189 /// A 128-bit vector of [2 x double] containing divisor. 190 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 191 /// quotient of the lower 64 bits of both operands. The upper 64 bits are 192 /// copied from the upper 64 bits of the first source operand. 193 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a, 194 __m128d __b) { 195 __a[0] /= __b[0]; 196 return __a; 197 } 198 199 /// Performs an element-by-element division of two 128-bit vectors of 200 /// [2 x double]. 201 /// 202 /// \headerfile <x86intrin.h> 203 /// 204 /// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction. 205 /// 206 /// \param __a 207 /// A 128-bit vector of [2 x double] containing the dividend. 208 /// \param __b 209 /// A 128-bit vector of [2 x double] containing the divisor. 210 /// \returns A 128-bit vector of [2 x double] containing the quotients of both 211 /// operands. 212 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a, 213 __m128d __b) { 214 return (__m128d)((__v2df)__a / (__v2df)__b); 215 } 216 217 /// Calculates the square root of the lower double-precision value of 218 /// the second operand and returns it in the lower 64 bits of the result. 219 /// The upper 64 bits of the result are copied from the upper 220 /// double-precision value of the first operand. 221 /// 222 /// \headerfile <x86intrin.h> 223 /// 224 /// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction. 225 /// 226 /// \param __a 227 /// A 128-bit vector of [2 x double] containing one of the operands. 
The 228 /// upper 64 bits of this operand are copied to the upper 64 bits of the 229 /// result. 230 /// \param __b 231 /// A 128-bit vector of [2 x double] containing one of the operands. The 232 /// square root is calculated using the lower 64 bits of this operand. 233 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 234 /// square root of the lower 64 bits of operand \a __b, and whose upper 64 235 /// bits are copied from the upper 64 bits of operand \a __a. 236 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a, 237 __m128d __b) { 238 __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b); 239 return __extension__(__m128d){__c[0], __a[1]}; 240 } 241 242 /// Calculates the square root of the each of two values stored in a 243 /// 128-bit vector of [2 x double]. 244 /// 245 /// \headerfile <x86intrin.h> 246 /// 247 /// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction. 248 /// 249 /// \param __a 250 /// A 128-bit vector of [2 x double]. 251 /// \returns A 128-bit vector of [2 x double] containing the square roots of the 252 /// values in the operand. 253 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) { 254 return __builtin_ia32_sqrtpd((__v2df)__a); 255 } 256 257 /// Compares lower 64-bit double-precision values of both operands, and 258 /// returns the lesser of the pair of values in the lower 64-bits of the 259 /// result. The upper 64 bits of the result are copied from the upper 260 /// double-precision value of the first operand. 261 /// 262 /// If either value in a comparison is NaN, returns the value from \a __b. 263 /// 264 /// \headerfile <x86intrin.h> 265 /// 266 /// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction. 267 /// 268 /// \param __a 269 /// A 128-bit vector of [2 x double] containing one of the operands. The 270 /// lower 64 bits of this operand are used in the comparison. 
271 /// \param __b 272 /// A 128-bit vector of [2 x double] containing one of the operands. The 273 /// lower 64 bits of this operand are used in the comparison. 274 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 275 /// minimum value between both operands. The upper 64 bits are copied from 276 /// the upper 64 bits of the first source operand. 277 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a, 278 __m128d __b) { 279 return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b); 280 } 281 282 /// Performs element-by-element comparison of the two 128-bit vectors of 283 /// [2 x double] and returns a vector containing the lesser of each pair of 284 /// values. 285 /// 286 /// If either value in a comparison is NaN, returns the value from \a __b. 287 /// 288 /// \headerfile <x86intrin.h> 289 /// 290 /// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction. 291 /// 292 /// \param __a 293 /// A 128-bit vector of [2 x double] containing one of the operands. 294 /// \param __b 295 /// A 128-bit vector of [2 x double] containing one of the operands. 296 /// \returns A 128-bit vector of [2 x double] containing the minimum values 297 /// between both operands. 298 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a, 299 __m128d __b) { 300 return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b); 301 } 302 303 /// Compares lower 64-bit double-precision values of both operands, and 304 /// returns the greater of the pair of values in the lower 64-bits of the 305 /// result. The upper 64 bits of the result are copied from the upper 306 /// double-precision value of the first operand. 307 /// 308 /// If either value in a comparison is NaN, returns the value from \a __b. 309 /// 310 /// \headerfile <x86intrin.h> 311 /// 312 /// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction. 313 /// 314 /// \param __a 315 /// A 128-bit vector of [2 x double] containing one of the operands. 
The 316 /// lower 64 bits of this operand are used in the comparison. 317 /// \param __b 318 /// A 128-bit vector of [2 x double] containing one of the operands. The 319 /// lower 64 bits of this operand are used in the comparison. 320 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 321 /// maximum value between both operands. The upper 64 bits are copied from 322 /// the upper 64 bits of the first source operand. 323 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a, 324 __m128d __b) { 325 return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b); 326 } 327 328 /// Performs element-by-element comparison of the two 128-bit vectors of 329 /// [2 x double] and returns a vector containing the greater of each pair 330 /// of values. 331 /// 332 /// If either value in a comparison is NaN, returns the value from \a __b. 333 /// 334 /// \headerfile <x86intrin.h> 335 /// 336 /// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction. 337 /// 338 /// \param __a 339 /// A 128-bit vector of [2 x double] containing one of the operands. 340 /// \param __b 341 /// A 128-bit vector of [2 x double] containing one of the operands. 342 /// \returns A 128-bit vector of [2 x double] containing the maximum values 343 /// between both operands. 344 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a, 345 __m128d __b) { 346 return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b); 347 } 348 349 /// Performs a bitwise AND of two 128-bit vectors of [2 x double]. 350 /// 351 /// \headerfile <x86intrin.h> 352 /// 353 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction. 354 /// 355 /// \param __a 356 /// A 128-bit vector of [2 x double] containing one of the source operands. 357 /// \param __b 358 /// A 128-bit vector of [2 x double] containing one of the source operands. 359 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the 360 /// values between both operands. 
361 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a, 362 __m128d __b) { 363 return (__m128d)((__v2du)__a & (__v2du)__b); 364 } 365 366 /// Performs a bitwise AND of two 128-bit vectors of [2 x double], using 367 /// the one's complement of the values contained in the first source operand. 368 /// 369 /// \headerfile <x86intrin.h> 370 /// 371 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction. 372 /// 373 /// \param __a 374 /// A 128-bit vector of [2 x double] containing the left source operand. The 375 /// one's complement of this value is used in the bitwise AND. 376 /// \param __b 377 /// A 128-bit vector of [2 x double] containing the right source operand. 378 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the 379 /// values in the second operand and the one's complement of the first 380 /// operand. 381 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a, 382 __m128d __b) { 383 return (__m128d)(~(__v2du)__a & (__v2du)__b); 384 } 385 386 /// Performs a bitwise OR of two 128-bit vectors of [2 x double]. 387 /// 388 /// \headerfile <x86intrin.h> 389 /// 390 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction. 391 /// 392 /// \param __a 393 /// A 128-bit vector of [2 x double] containing one of the source operands. 394 /// \param __b 395 /// A 128-bit vector of [2 x double] containing one of the source operands. 396 /// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the 397 /// values between both operands. 398 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a, 399 __m128d __b) { 400 return (__m128d)((__v2du)__a | (__v2du)__b); 401 } 402 403 /// Performs a bitwise XOR of two 128-bit vectors of [2 x double]. 404 /// 405 /// \headerfile <x86intrin.h> 406 /// 407 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction. 
408 /// 409 /// \param __a 410 /// A 128-bit vector of [2 x double] containing one of the source operands. 411 /// \param __b 412 /// A 128-bit vector of [2 x double] containing one of the source operands. 413 /// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the 414 /// values between both operands. 415 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a, 416 __m128d __b) { 417 return (__m128d)((__v2du)__a ^ (__v2du)__b); 418 } 419 420 /// Compares each of the corresponding double-precision values of the 421 /// 128-bit vectors of [2 x double] for equality. 422 /// 423 /// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 424 /// If either value in a comparison is NaN, returns false. 425 /// 426 /// \headerfile <x86intrin.h> 427 /// 428 /// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction. 429 /// 430 /// \param __a 431 /// A 128-bit vector of [2 x double]. 432 /// \param __b 433 /// A 128-bit vector of [2 x double]. 434 /// \returns A 128-bit vector containing the comparison results. 435 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a, 436 __m128d __b) { 437 return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b); 438 } 439 440 /// Compares each of the corresponding double-precision values of the 441 /// 128-bit vectors of [2 x double] to determine if the values in the first 442 /// operand are less than those in the second operand. 443 /// 444 /// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 445 /// If either value in a comparison is NaN, returns false. 446 /// 447 /// \headerfile <x86intrin.h> 448 /// 449 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction. 450 /// 451 /// \param __a 452 /// A 128-bit vector of [2 x double]. 453 /// \param __b 454 /// A 128-bit vector of [2 x double]. 455 /// \returns A 128-bit vector containing the comparison results. 
456 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a, 457 __m128d __b) { 458 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b); 459 } 460 461 /// Compares each of the corresponding double-precision values of the 462 /// 128-bit vectors of [2 x double] to determine if the values in the first 463 /// operand are less than or equal to those in the second operand. 464 /// 465 /// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 466 /// If either value in a comparison is NaN, returns false. 467 /// 468 /// \headerfile <x86intrin.h> 469 /// 470 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction. 471 /// 472 /// \param __a 473 /// A 128-bit vector of [2 x double]. 474 /// \param __b 475 /// A 128-bit vector of [2 x double]. 476 /// \returns A 128-bit vector containing the comparison results. 477 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a, 478 __m128d __b) { 479 return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b); 480 } 481 482 /// Compares each of the corresponding double-precision values of the 483 /// 128-bit vectors of [2 x double] to determine if the values in the first 484 /// operand are greater than those in the second operand. 485 /// 486 /// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 487 /// If either value in a comparison is NaN, returns false. 488 /// 489 /// \headerfile <x86intrin.h> 490 /// 491 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction. 492 /// 493 /// \param __a 494 /// A 128-bit vector of [2 x double]. 495 /// \param __b 496 /// A 128-bit vector of [2 x double]. 497 /// \returns A 128-bit vector containing the comparison results. 
498 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a, 499 __m128d __b) { 500 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a); 501 } 502 503 /// Compares each of the corresponding double-precision values of the 504 /// 128-bit vectors of [2 x double] to determine if the values in the first 505 /// operand are greater than or equal to those in the second operand. 506 /// 507 /// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 508 /// If either value in a comparison is NaN, returns false. 509 /// 510 /// \headerfile <x86intrin.h> 511 /// 512 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction. 513 /// 514 /// \param __a 515 /// A 128-bit vector of [2 x double]. 516 /// \param __b 517 /// A 128-bit vector of [2 x double]. 518 /// \returns A 128-bit vector containing the comparison results. 519 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a, 520 __m128d __b) { 521 return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a); 522 } 523 524 /// Compares each of the corresponding double-precision values of the 525 /// 128-bit vectors of [2 x double] to determine if the values in the first 526 /// operand are ordered with respect to those in the second operand. 527 /// 528 /// A pair of double-precision values are ordered with respect to each 529 /// other if neither value is a NaN. Each comparison returns 0x0 for false, 530 /// 0xFFFFFFFFFFFFFFFF for true. 531 /// 532 /// \headerfile <x86intrin.h> 533 /// 534 /// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction. 535 /// 536 /// \param __a 537 /// A 128-bit vector of [2 x double]. 538 /// \param __b 539 /// A 128-bit vector of [2 x double]. 540 /// \returns A 128-bit vector containing the comparison results. 
541 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a, 542 __m128d __b) { 543 return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b); 544 } 545 546 /// Compares each of the corresponding double-precision values of the 547 /// 128-bit vectors of [2 x double] to determine if the values in the first 548 /// operand are unordered with respect to those in the second operand. 549 /// 550 /// A pair of double-precision values are unordered with respect to each 551 /// other if one or both values are NaN. Each comparison returns 0x0 for 552 /// false, 0xFFFFFFFFFFFFFFFF for true. 553 /// 554 /// \headerfile <x86intrin.h> 555 /// 556 /// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c> 557 /// instruction. 558 /// 559 /// \param __a 560 /// A 128-bit vector of [2 x double]. 561 /// \param __b 562 /// A 128-bit vector of [2 x double]. 563 /// \returns A 128-bit vector containing the comparison results. 564 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a, 565 __m128d __b) { 566 return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b); 567 } 568 569 /// Compares each of the corresponding double-precision values of the 570 /// 128-bit vectors of [2 x double] to determine if the values in the first 571 /// operand are unequal to those in the second operand. 572 /// 573 /// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 574 /// If either value in a comparison is NaN, returns true. 575 /// 576 /// \headerfile <x86intrin.h> 577 /// 578 /// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction. 579 /// 580 /// \param __a 581 /// A 128-bit vector of [2 x double]. 582 /// \param __b 583 /// A 128-bit vector of [2 x double]. 584 /// \returns A 128-bit vector containing the comparison results. 
585 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a, 586 __m128d __b) { 587 return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b); 588 } 589 590 /// Compares each of the corresponding double-precision values of the 591 /// 128-bit vectors of [2 x double] to determine if the values in the first 592 /// operand are not less than those in the second operand. 593 /// 594 /// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 595 /// If either value in a comparison is NaN, returns true. 596 /// 597 /// \headerfile <x86intrin.h> 598 /// 599 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction. 600 /// 601 /// \param __a 602 /// A 128-bit vector of [2 x double]. 603 /// \param __b 604 /// A 128-bit vector of [2 x double]. 605 /// \returns A 128-bit vector containing the comparison results. 606 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a, 607 __m128d __b) { 608 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b); 609 } 610 611 /// Compares each of the corresponding double-precision values of the 612 /// 128-bit vectors of [2 x double] to determine if the values in the first 613 /// operand are not less than or equal to those in the second operand. 614 /// 615 /// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 616 /// If either value in a comparison is NaN, returns true. 617 /// 618 /// \headerfile <x86intrin.h> 619 /// 620 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction. 621 /// 622 /// \param __a 623 /// A 128-bit vector of [2 x double]. 624 /// \param __b 625 /// A 128-bit vector of [2 x double]. 626 /// \returns A 128-bit vector containing the comparison results. 
627 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a, 628 __m128d __b) { 629 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b); 630 } 631 632 /// Compares each of the corresponding double-precision values of the 633 /// 128-bit vectors of [2 x double] to determine if the values in the first 634 /// operand are not greater than those in the second operand. 635 /// 636 /// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 637 /// If either value in a comparison is NaN, returns true. 638 /// 639 /// \headerfile <x86intrin.h> 640 /// 641 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction. 642 /// 643 /// \param __a 644 /// A 128-bit vector of [2 x double]. 645 /// \param __b 646 /// A 128-bit vector of [2 x double]. 647 /// \returns A 128-bit vector containing the comparison results. 648 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a, 649 __m128d __b) { 650 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a); 651 } 652 653 /// Compares each of the corresponding double-precision values of the 654 /// 128-bit vectors of [2 x double] to determine if the values in the first 655 /// operand are not greater than or equal to those in the second operand. 656 /// 657 /// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 658 /// If either value in a comparison is NaN, returns true. 659 /// 660 /// \headerfile <x86intrin.h> 661 /// 662 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction. 663 /// 664 /// \param __a 665 /// A 128-bit vector of [2 x double]. 666 /// \param __b 667 /// A 128-bit vector of [2 x double]. 668 /// \returns A 128-bit vector containing the comparison results. 
669 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a, 670 __m128d __b) { 671 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a); 672 } 673 674 /// Compares the lower double-precision floating-point values in each of 675 /// the two 128-bit floating-point vectors of [2 x double] for equality. 676 /// 677 /// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 678 /// If either value in a comparison is NaN, returns false. 679 /// 680 /// \headerfile <x86intrin.h> 681 /// 682 /// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction. 683 /// 684 /// \param __a 685 /// A 128-bit vector of [2 x double]. The lower double-precision value is 686 /// compared to the lower double-precision value of \a __b. 687 /// \param __b 688 /// A 128-bit vector of [2 x double]. The lower double-precision value is 689 /// compared to the lower double-precision value of \a __a. 690 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 691 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 692 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a, 693 __m128d __b) { 694 return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b); 695 } 696 697 /// Compares the lower double-precision floating-point values in each of 698 /// the two 128-bit floating-point vectors of [2 x double] to determine if 699 /// the value in the first parameter is less than the corresponding value in 700 /// the second parameter. 701 /// 702 /// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 703 /// If either value in a comparison is NaN, returns false. 704 /// 705 /// \headerfile <x86intrin.h> 706 /// 707 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction. 708 /// 709 /// \param __a 710 /// A 128-bit vector of [2 x double]. The lower double-precision value is 711 /// compared to the lower double-precision value of \a __b. 
712 /// \param __b 713 /// A 128-bit vector of [2 x double]. The lower double-precision value is 714 /// compared to the lower double-precision value of \a __a. 715 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 716 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 717 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a, 718 __m128d __b) { 719 return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b); 720 } 721 722 /// Compares the lower double-precision floating-point values in each of 723 /// the two 128-bit floating-point vectors of [2 x double] to determine if 724 /// the value in the first parameter is less than or equal to the 725 /// corresponding value in the second parameter. 726 /// 727 /// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 728 /// If either value in a comparison is NaN, returns false. 729 /// 730 /// \headerfile <x86intrin.h> 731 /// 732 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction. 733 /// 734 /// \param __a 735 /// A 128-bit vector of [2 x double]. The lower double-precision value is 736 /// compared to the lower double-precision value of \a __b. 737 /// \param __b 738 /// A 128-bit vector of [2 x double]. The lower double-precision value is 739 /// compared to the lower double-precision value of \a __a. 740 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 741 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 742 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a, 743 __m128d __b) { 744 return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b); 745 } 746 747 /// Compares the lower double-precision floating-point values in each of 748 /// the two 128-bit floating-point vectors of [2 x double] to determine if 749 /// the value in the first parameter is greater than the corresponding value 750 /// in the second parameter. 
751 /// 752 /// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 753 /// If either value in a comparison is NaN, returns false. 754 /// 755 /// \headerfile <x86intrin.h> 756 /// 757 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction. 758 /// 759 /// \param __a 760 /// A 128-bit vector of [2 x double]. The lower double-precision value is 761 /// compared to the lower double-precision value of \a __b. 762 /// \param __b 763 /// A 128-bit vector of [2 x double]. The lower double-precision value is 764 /// compared to the lower double-precision value of \a __a. 765 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 766 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 767 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a, 768 __m128d __b) { 769 __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a); 770 return __extension__(__m128d){__c[0], __a[1]}; 771 } 772 773 /// Compares the lower double-precision floating-point values in each of 774 /// the two 128-bit floating-point vectors of [2 x double] to determine if 775 /// the value in the first parameter is greater than or equal to the 776 /// corresponding value in the second parameter. 777 /// 778 /// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 779 /// If either value in a comparison is NaN, returns false. 780 /// 781 /// \headerfile <x86intrin.h> 782 /// 783 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction. 784 /// 785 /// \param __a 786 /// A 128-bit vector of [2 x double]. The lower double-precision value is 787 /// compared to the lower double-precision value of \a __b. 788 /// \param __b 789 /// A 128-bit vector of [2 x double]. The lower double-precision value is 790 /// compared to the lower double-precision value of \a __a. 791 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 792 /// results. 
The upper 64 bits are copied from the upper 64 bits of \a __a. 793 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a, 794 __m128d __b) { 795 __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a); 796 return __extension__(__m128d){__c[0], __a[1]}; 797 } 798 799 /// Compares the lower double-precision floating-point values in each of 800 /// the two 128-bit floating-point vectors of [2 x double] to determine if 801 /// the value in the first parameter is ordered with respect to the 802 /// corresponding value in the second parameter. 803 /// 804 /// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair 805 /// of double-precision values are ordered with respect to each other if 806 /// neither value is a NaN. 807 /// 808 /// \headerfile <x86intrin.h> 809 /// 810 /// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction. 811 /// 812 /// \param __a 813 /// A 128-bit vector of [2 x double]. The lower double-precision value is 814 /// compared to the lower double-precision value of \a __b. 815 /// \param __b 816 /// A 128-bit vector of [2 x double]. The lower double-precision value is 817 /// compared to the lower double-precision value of \a __a. 818 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 819 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 820 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a, 821 __m128d __b) { 822 return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b); 823 } 824 825 /// Compares the lower double-precision floating-point values in each of 826 /// the two 128-bit floating-point vectors of [2 x double] to determine if 827 /// the value in the first parameter is unordered with respect to the 828 /// corresponding value in the second parameter. 829 /// 830 /// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 
A pair 831 /// of double-precision values are unordered with respect to each other if 832 /// one or both values are NaN. 833 /// 834 /// \headerfile <x86intrin.h> 835 /// 836 /// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c> 837 /// instruction. 838 /// 839 /// \param __a 840 /// A 128-bit vector of [2 x double]. The lower double-precision value is 841 /// compared to the lower double-precision value of \a __b. 842 /// \param __b 843 /// A 128-bit vector of [2 x double]. The lower double-precision value is 844 /// compared to the lower double-precision value of \a __a. 845 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 846 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 847 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a, 848 __m128d __b) { 849 return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b); 850 } 851 852 /// Compares the lower double-precision floating-point values in each of 853 /// the two 128-bit floating-point vectors of [2 x double] to determine if 854 /// the value in the first parameter is unequal to the corresponding value in 855 /// the second parameter. 856 /// 857 /// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 858 /// If either value in a comparison is NaN, returns true. 859 /// 860 /// \headerfile <x86intrin.h> 861 /// 862 /// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction. 863 /// 864 /// \param __a 865 /// A 128-bit vector of [2 x double]. The lower double-precision value is 866 /// compared to the lower double-precision value of \a __b. 867 /// \param __b 868 /// A 128-bit vector of [2 x double]. The lower double-precision value is 869 /// compared to the lower double-precision value of \a __a. 870 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 871 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 
872 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a, 873 __m128d __b) { 874 return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b); 875 } 876 877 /// Compares the lower double-precision floating-point values in each of 878 /// the two 128-bit floating-point vectors of [2 x double] to determine if 879 /// the value in the first parameter is not less than the corresponding 880 /// value in the second parameter. 881 /// 882 /// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 883 /// If either value in a comparison is NaN, returns true. 884 /// 885 /// \headerfile <x86intrin.h> 886 /// 887 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction. 888 /// 889 /// \param __a 890 /// A 128-bit vector of [2 x double]. The lower double-precision value is 891 /// compared to the lower double-precision value of \a __b. 892 /// \param __b 893 /// A 128-bit vector of [2 x double]. The lower double-precision value is 894 /// compared to the lower double-precision value of \a __a. 895 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 896 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 897 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a, 898 __m128d __b) { 899 return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b); 900 } 901 902 /// Compares the lower double-precision floating-point values in each of 903 /// the two 128-bit floating-point vectors of [2 x double] to determine if 904 /// the value in the first parameter is not less than or equal to the 905 /// corresponding value in the second parameter. 906 /// 907 /// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 908 /// If either value in a comparison is NaN, returns true. 909 /// 910 /// \headerfile <x86intrin.h> 911 /// 912 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction. 
913 /// 914 /// \param __a 915 /// A 128-bit vector of [2 x double]. The lower double-precision value is 916 /// compared to the lower double-precision value of \a __b. 917 /// \param __b 918 /// A 128-bit vector of [2 x double]. The lower double-precision value is 919 /// compared to the lower double-precision value of \a __a. 920 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 921 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 922 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a, 923 __m128d __b) { 924 return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b); 925 } 926 927 /// Compares the lower double-precision floating-point values in each of 928 /// the two 128-bit floating-point vectors of [2 x double] to determine if 929 /// the value in the first parameter is not greater than the corresponding 930 /// value in the second parameter. 931 /// 932 /// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 933 /// If either value in a comparison is NaN, returns true. 934 /// 935 /// \headerfile <x86intrin.h> 936 /// 937 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction. 938 /// 939 /// \param __a 940 /// A 128-bit vector of [2 x double]. The lower double-precision value is 941 /// compared to the lower double-precision value of \a __b. 942 /// \param __b 943 /// A 128-bit vector of [2 x double]. The lower double-precision value is 944 /// compared to the lower double-precision value of \a __a. 945 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 946 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 
947 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a, 948 __m128d __b) { 949 __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a); 950 return __extension__(__m128d){__c[0], __a[1]}; 951 } 952 953 /// Compares the lower double-precision floating-point values in each of 954 /// the two 128-bit floating-point vectors of [2 x double] to determine if 955 /// the value in the first parameter is not greater than or equal to the 956 /// corresponding value in the second parameter. 957 /// 958 /// The comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 959 /// If either value in a comparison is NaN, returns true. 960 /// 961 /// \headerfile <x86intrin.h> 962 /// 963 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction. 964 /// 965 /// \param __a 966 /// A 128-bit vector of [2 x double]. The lower double-precision value is 967 /// compared to the lower double-precision value of \a __b. 968 /// \param __b 969 /// A 128-bit vector of [2 x double]. The lower double-precision value is 970 /// compared to the lower double-precision value of \a __a. 971 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 972 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 973 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a, 974 __m128d __b) { 975 __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a); 976 return __extension__(__m128d){__c[0], __a[1]}; 977 } 978 979 /// Compares the lower double-precision floating-point values in each of 980 /// the two 128-bit floating-point vectors of [2 x double] for equality. 981 /// 982 /// The comparison returns 0 for false, 1 for true. If either value in a 983 /// comparison is NaN, returns 0. 984 /// 985 /// \headerfile <x86intrin.h> 986 /// 987 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 988 /// 989 /// \param __a 990 /// A 128-bit vector of [2 x double]. 
The lower double-precision value is 991 /// compared to the lower double-precision value of \a __b. 992 /// \param __b 993 /// A 128-bit vector of [2 x double]. The lower double-precision value is 994 /// compared to the lower double-precision value of \a __a. 995 /// \returns An integer containing the comparison results. 996 static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a, 997 __m128d __b) { 998 return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b); 999 } 1000 1001 /// Compares the lower double-precision floating-point values in each of 1002 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1003 /// the value in the first parameter is less than the corresponding value in 1004 /// the second parameter. 1005 /// 1006 /// The comparison returns 0 for false, 1 for true. If either value in a 1007 /// comparison is NaN, returns 0. 1008 /// 1009 /// \headerfile <x86intrin.h> 1010 /// 1011 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1012 /// 1013 /// \param __a 1014 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1015 /// compared to the lower double-precision value of \a __b. 1016 /// \param __b 1017 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1018 /// compared to the lower double-precision value of \a __a. 1019 /// \returns An integer containing the comparison results. 1020 static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a, 1021 __m128d __b) { 1022 return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b); 1023 } 1024 1025 /// Compares the lower double-precision floating-point values in each of 1026 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1027 /// the value in the first parameter is less than or equal to the 1028 /// corresponding value in the second parameter. 1029 /// 1030 /// The comparison returns 0 for false, 1 for true. If either value in a 1031 /// comparison is NaN, returns 0. 
1032 /// 1033 /// \headerfile <x86intrin.h> 1034 /// 1035 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1036 /// 1037 /// \param __a 1038 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1039 /// compared to the lower double-precision value of \a __b. 1040 /// \param __b 1041 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1042 /// compared to the lower double-precision value of \a __a. 1043 /// \returns An integer containing the comparison results. 1044 static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a, 1045 __m128d __b) { 1046 return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b); 1047 } 1048 1049 /// Compares the lower double-precision floating-point values in each of 1050 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1051 /// the value in the first parameter is greater than the corresponding value 1052 /// in the second parameter. 1053 /// 1054 /// The comparison returns 0 for false, 1 for true. If either value in a 1055 /// comparison is NaN, returns 0. 1056 /// 1057 /// \headerfile <x86intrin.h> 1058 /// 1059 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1060 /// 1061 /// \param __a 1062 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1063 /// compared to the lower double-precision value of \a __b. 1064 /// \param __b 1065 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1066 /// compared to the lower double-precision value of \a __a. 1067 /// \returns An integer containing the comparison results. 
1068 static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a, 1069 __m128d __b) { 1070 return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b); 1071 } 1072 1073 /// Compares the lower double-precision floating-point values in each of 1074 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1075 /// the value in the first parameter is greater than or equal to the 1076 /// corresponding value in the second parameter. 1077 /// 1078 /// The comparison returns 0 for false, 1 for true. If either value in a 1079 /// comparison is NaN, returns 0. 1080 /// 1081 /// \headerfile <x86intrin.h> 1082 /// 1083 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1084 /// 1085 /// \param __a 1086 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1087 /// compared to the lower double-precision value of \a __b. 1088 /// \param __b 1089 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1090 /// compared to the lower double-precision value of \a __a. 1091 /// \returns An integer containing the comparison results. 1092 static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a, 1093 __m128d __b) { 1094 return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b); 1095 } 1096 1097 /// Compares the lower double-precision floating-point values in each of 1098 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1099 /// the value in the first parameter is unequal to the corresponding value in 1100 /// the second parameter. 1101 /// 1102 /// The comparison returns 0 for false, 1 for true. If either value in a 1103 /// comparison is NaN, returns 1. 1104 /// 1105 /// \headerfile <x86intrin.h> 1106 /// 1107 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1108 /// 1109 /// \param __a 1110 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1111 /// compared to the lower double-precision value of \a __b. 
1112 /// \param __b 1113 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1114 /// compared to the lower double-precision value of \a __a. 1115 /// \returns An integer containing the comparison results. 1116 static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a, 1117 __m128d __b) { 1118 return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b); 1119 } 1120 1121 /// Compares the lower double-precision floating-point values in each of 1122 /// the two 128-bit floating-point vectors of [2 x double] for equality. 1123 /// 1124 /// The comparison returns 0 for false, 1 for true. If either value in a 1125 /// comparison is NaN, returns 0. 1126 /// 1127 /// \headerfile <x86intrin.h> 1128 /// 1129 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1130 /// 1131 /// \param __a 1132 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1133 /// compared to the lower double-precision value of \a __b. 1134 /// \param __b 1135 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1136 /// compared to the lower double-precision value of \a __a. 1137 /// \returns An integer containing the comparison results. 1138 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a, 1139 __m128d __b) { 1140 return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b); 1141 } 1142 1143 /// Compares the lower double-precision floating-point values in each of 1144 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1145 /// the value in the first parameter is less than the corresponding value in 1146 /// the second parameter. 1147 /// 1148 /// The comparison returns 0 for false, 1 for true. If either value in a 1149 /// comparison is NaN, returns 0. 1150 /// 1151 /// \headerfile <x86intrin.h> 1152 /// 1153 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1154 /// 1155 /// \param __a 1156 /// A 128-bit vector of [2 x double]. 
The lower double-precision value is 1157 /// compared to the lower double-precision value of \a __b. 1158 /// \param __b 1159 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1160 /// compared to the lower double-precision value of \a __a. 1161 /// \returns An integer containing the comparison results. 1162 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a, 1163 __m128d __b) { 1164 return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b); 1165 } 1166 1167 /// Compares the lower double-precision floating-point values in each of 1168 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1169 /// the value in the first parameter is less than or equal to the 1170 /// corresponding value in the second parameter. 1171 /// 1172 /// The comparison returns 0 for false, 1 for true. If either value in a 1173 /// comparison is NaN, returns 0. 1174 /// 1175 /// \headerfile <x86intrin.h> 1176 /// 1177 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1178 /// 1179 /// \param __a 1180 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1181 /// compared to the lower double-precision value of \a __b. 1182 /// \param __b 1183 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1184 /// compared to the lower double-precision value of \a __a. 1185 /// \returns An integer containing the comparison results. 1186 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a, 1187 __m128d __b) { 1188 return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b); 1189 } 1190 1191 /// Compares the lower double-precision floating-point values in each of 1192 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1193 /// the value in the first parameter is greater than the corresponding value 1194 /// in the second parameter. 1195 /// 1196 /// The comparison returns 0 for false, 1 for true. 
If either value in a 1197 /// comparison is NaN, returns 0. 1198 /// 1199 /// \headerfile <x86intrin.h> 1200 /// 1201 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1202 /// 1203 /// \param __a 1204 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1205 /// compared to the lower double-precision value of \a __b. 1206 /// \param __b 1207 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1208 /// compared to the lower double-precision value of \a __a. 1209 /// \returns An integer containing the comparison results. 1210 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a, 1211 __m128d __b) { 1212 return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b); 1213 } 1214 1215 /// Compares the lower double-precision floating-point values in each of 1216 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1217 /// the value in the first parameter is greater than or equal to the 1218 /// corresponding value in the second parameter. 1219 /// 1220 /// The comparison returns 0 for false, 1 for true. If either value in a 1221 /// comparison is NaN, returns 0. 1222 /// 1223 /// \headerfile <x86intrin.h> 1224 /// 1225 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1226 /// 1227 /// \param __a 1228 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1229 /// compared to the lower double-precision value of \a __b. 1230 /// \param __b 1231 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1232 /// compared to the lower double-precision value of \a __a. 1233 /// \returns An integer containing the comparison results. 
1234 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a, 1235 __m128d __b) { 1236 return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b); 1237 } 1238 1239 /// Compares the lower double-precision floating-point values in each of 1240 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1241 /// the value in the first parameter is unequal to the corresponding value in 1242 /// the second parameter. 1243 /// 1244 /// The comparison returns 0 for false, 1 for true. If either value in a 1245 /// comparison is NaN, returns 1. 1246 /// 1247 /// \headerfile <x86intrin.h> 1248 /// 1249 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1250 /// 1251 /// \param __a 1252 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1253 /// compared to the lower double-precision value of \a __b. 1254 /// \param __b 1255 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1256 /// compared to the lower double-precision value of \a __a. 1257 /// \returns An integer containing the comparison result. 1258 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a, 1259 __m128d __b) { 1260 return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b); 1261 } 1262 1263 /// Converts the two double-precision floating-point elements of a 1264 /// 128-bit vector of [2 x double] into two single-precision floating-point 1265 /// values, returned in the lower 64 bits of a 128-bit vector of [4 x float]. 1266 /// The upper 64 bits of the result vector are set to zero. 1267 /// 1268 /// \headerfile <x86intrin.h> 1269 /// 1270 /// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction. 1271 /// 1272 /// \param __a 1273 /// A 128-bit vector of [2 x double]. 1274 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the 1275 /// converted values. The upper 64 bits are set to zero. 
1276 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a) { 1277 return __builtin_ia32_cvtpd2ps((__v2df)__a); 1278 } 1279 1280 /// Converts the lower two single-precision floating-point elements of a 1281 /// 128-bit vector of [4 x float] into two double-precision floating-point 1282 /// values, returned in a 128-bit vector of [2 x double]. The upper two 1283 /// elements of the input vector are unused. 1284 /// 1285 /// \headerfile <x86intrin.h> 1286 /// 1287 /// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction. 1288 /// 1289 /// \param __a 1290 /// A 128-bit vector of [4 x float]. The lower two single-precision 1291 /// floating-point elements are converted to double-precision values. The 1292 /// upper two elements are unused. 1293 /// \returns A 128-bit vector of [2 x double] containing the converted values. 1294 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a) { 1295 return (__m128d) __builtin_convertvector( 1296 __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df); 1297 } 1298 1299 /// Converts the lower two integer elements of a 128-bit vector of 1300 /// [4 x i32] into two double-precision floating-point values, returned in a 1301 /// 128-bit vector of [2 x double]. 1302 /// 1303 /// The upper two elements of the input vector are unused. 1304 /// 1305 /// \headerfile <x86intrin.h> 1306 /// 1307 /// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction. 1308 /// 1309 /// \param __a 1310 /// A 128-bit integer vector of [4 x i32]. The lower two integer elements are 1311 /// converted to double-precision values. 1312 /// 1313 /// The upper two elements are unused. 1314 /// \returns A 128-bit vector of [2 x double] containing the converted values. 
1315 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a) { 1316 return (__m128d) __builtin_convertvector( 1317 __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df); 1318 } 1319 1320 /// Converts the two double-precision floating-point elements of a 1321 /// 128-bit vector of [2 x double] into two signed 32-bit integer values, 1322 /// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper 1323 /// 64 bits of the result vector are set to zero. 1324 /// 1325 /// If a converted value does not fit in a 32-bit integer, raises a 1326 /// floating-point invalid exception. If the exception is masked, returns 1327 /// the most negative integer. 1328 /// 1329 /// \headerfile <x86intrin.h> 1330 /// 1331 /// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction. 1332 /// 1333 /// \param __a 1334 /// A 128-bit vector of [2 x double]. 1335 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the 1336 /// converted values. The upper 64 bits are set to zero. 1337 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a) { 1338 return __builtin_ia32_cvtpd2dq((__v2df)__a); 1339 } 1340 1341 /// Converts the low-order element of a 128-bit vector of [2 x double] 1342 /// into a 32-bit signed integer value. 1343 /// 1344 /// If the converted value does not fit in a 32-bit integer, raises a 1345 /// floating-point invalid exception. If the exception is masked, returns 1346 /// the most negative integer. 1347 /// 1348 /// \headerfile <x86intrin.h> 1349 /// 1350 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction. 1351 /// 1352 /// \param __a 1353 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 1354 /// conversion. 1355 /// \returns A 32-bit signed integer containing the converted value. 
1356 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a) { 1357 return __builtin_ia32_cvtsd2si((__v2df)__a); 1358 } 1359 1360 /// Converts the lower double-precision floating-point element of a 1361 /// 128-bit vector of [2 x double], in the second parameter, into a 1362 /// single-precision floating-point value, returned in the lower 32 bits of a 1363 /// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are 1364 /// copied from the upper 96 bits of the first parameter. 1365 /// 1366 /// \headerfile <x86intrin.h> 1367 /// 1368 /// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction. 1369 /// 1370 /// \param __a 1371 /// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are 1372 /// copied to the upper 96 bits of the result. 1373 /// \param __b 1374 /// A 128-bit vector of [2 x double]. The lower double-precision 1375 /// floating-point element is used in the conversion. 1376 /// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the 1377 /// converted value from the second parameter. The upper 96 bits are copied 1378 /// from the upper 96 bits of the first parameter. 1379 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a, 1380 __m128d __b) { 1381 return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b); 1382 } 1383 1384 /// Converts a 32-bit signed integer value, in the second parameter, into 1385 /// a double-precision floating-point value, returned in the lower 64 bits of 1386 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector 1387 /// are copied from the upper 64 bits of the first parameter. 1388 /// 1389 /// \headerfile <x86intrin.h> 1390 /// 1391 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction. 1392 /// 1393 /// \param __a 1394 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are 1395 /// copied to the upper 64 bits of the result. 
1396 /// \param __b 1397 /// A 32-bit signed integer containing the value to be converted. 1398 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the 1399 /// converted value from the second parameter. The upper 64 bits are copied 1400 /// from the upper 64 bits of the first parameter. 1401 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a, 1402 int __b) { 1403 __a[0] = __b; 1404 return __a; 1405 } 1406 1407 /// Converts the lower single-precision floating-point element of a 1408 /// 128-bit vector of [4 x float], in the second parameter, into a 1409 /// double-precision floating-point value, returned in the lower 64 bits of 1410 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector 1411 /// are copied from the upper 64 bits of the first parameter. 1412 /// 1413 /// \headerfile <x86intrin.h> 1414 /// 1415 /// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction. 1416 /// 1417 /// \param __a 1418 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are 1419 /// copied to the upper 64 bits of the result. 1420 /// \param __b 1421 /// A 128-bit vector of [4 x float]. The lower single-precision 1422 /// floating-point element is used in the conversion. 1423 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the 1424 /// converted value from the second parameter. The upper 64 bits are copied 1425 /// from the upper 64 bits of the first parameter. 1426 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a, 1427 __m128 __b) { 1428 __a[0] = __b[0]; 1429 return __a; 1430 } 1431 1432 /// Converts the two double-precision floating-point elements of a 1433 /// 128-bit vector of [2 x double] into two signed truncated (rounded 1434 /// toward zero) 32-bit integer values, returned in the lower 64 bits 1435 /// of a 128-bit vector of [4 x i32]. 
1436 /// 1437 /// If a converted value does not fit in a 32-bit integer, raises a 1438 /// floating-point invalid exception. If the exception is masked, returns 1439 /// the most negative integer. 1440 /// 1441 /// \headerfile <x86intrin.h> 1442 /// 1443 /// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c> 1444 /// instruction. 1445 /// 1446 /// \param __a 1447 /// A 128-bit vector of [2 x double]. 1448 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the 1449 /// converted values. The upper 64 bits are set to zero. 1450 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a) { 1451 return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a); 1452 } 1453 1454 /// Converts the low-order element of a [2 x double] vector into a 32-bit 1455 /// signed truncated (rounded toward zero) integer value. 1456 /// 1457 /// If the converted value does not fit in a 32-bit integer, raises a 1458 /// floating-point invalid exception. If the exception is masked, returns 1459 /// the most negative integer. 1460 /// 1461 /// \headerfile <x86intrin.h> 1462 /// 1463 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c> 1464 /// instruction. 1465 /// 1466 /// \param __a 1467 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 1468 /// conversion. 1469 /// \returns A 32-bit signed integer containing the converted value. 1470 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) { 1471 return __builtin_ia32_cvttsd2si((__v2df)__a); 1472 } 1473 1474 /// Converts the two double-precision floating-point elements of a 1475 /// 128-bit vector of [2 x double] into two signed 32-bit integer values, 1476 /// returned in a 64-bit vector of [2 x i32]. 1477 /// 1478 /// If a converted value does not fit in a 32-bit integer, raises a 1479 /// floating-point invalid exception. If the exception is masked, returns 1480 /// the most negative integer. 
1481 /// 1482 /// \headerfile <x86intrin.h> 1483 /// 1484 /// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction. 1485 /// 1486 /// \param __a 1487 /// A 128-bit vector of [2 x double]. 1488 /// \returns A 64-bit vector of [2 x i32] containing the converted values. 1489 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) { 1490 return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a); 1491 } 1492 1493 /// Converts the two double-precision floating-point elements of a 1494 /// 128-bit vector of [2 x double] into two signed truncated (rounded toward 1495 /// zero) 32-bit integer values, returned in a 64-bit vector of [2 x i32]. 1496 /// 1497 /// If a converted value does not fit in a 32-bit integer, raises a 1498 /// floating-point invalid exception. If the exception is masked, returns 1499 /// the most negative integer. 1500 /// 1501 /// \headerfile <x86intrin.h> 1502 /// 1503 /// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction. 1504 /// 1505 /// \param __a 1506 /// A 128-bit vector of [2 x double]. 1507 /// \returns A 64-bit vector of [2 x i32] containing the converted values. 1508 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a) { 1509 return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a); 1510 } 1511 1512 /// Converts the two signed 32-bit integer elements of a 64-bit vector of 1513 /// [2 x i32] into two double-precision floating-point values, returned in a 1514 /// 128-bit vector of [2 x double]. 1515 /// 1516 /// \headerfile <x86intrin.h> 1517 /// 1518 /// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction. 1519 /// 1520 /// \param __a 1521 /// A 64-bit vector of [2 x i32]. 1522 /// \returns A 128-bit vector of [2 x double] containing the converted values. 
1523 static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a) { 1524 return __builtin_ia32_cvtpi2pd((__v2si)__a); 1525 } 1526 1527 /// Returns the low-order element of a 128-bit vector of [2 x double] as 1528 /// a double-precision floating-point value. 1529 /// 1530 /// \headerfile <x86intrin.h> 1531 /// 1532 /// This intrinsic has no corresponding instruction. 1533 /// 1534 /// \param __a 1535 /// A 128-bit vector of [2 x double]. The lower 64 bits are returned. 1536 /// \returns A double-precision floating-point value copied from the lower 64 1537 /// bits of \a __a. 1538 static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a) { 1539 return __a[0]; 1540 } 1541 1542 /// Loads a 128-bit floating-point vector of [2 x double] from an aligned 1543 /// memory location. 1544 /// 1545 /// \headerfile <x86intrin.h> 1546 /// 1547 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction. 1548 /// 1549 /// \param __dp 1550 /// A pointer to a 128-bit memory location. The address of the memory 1551 /// location has to be 16-byte aligned. 1552 /// \returns A 128-bit vector of [2 x double] containing the loaded values. 1553 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp) { 1554 return *(const __m128d *)__dp; 1555 } 1556 1557 /// Loads a double-precision floating-point value from a specified memory 1558 /// location and duplicates it to both vector elements of a 128-bit vector of 1559 /// [2 x double]. 1560 /// 1561 /// \headerfile <x86intrin.h> 1562 /// 1563 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction. 1564 /// 1565 /// \param __dp 1566 /// A pointer to a memory location containing a double-precision value. 1567 /// \returns A 128-bit vector of [2 x double] containing the loaded and 1568 /// duplicated values. 
1569 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp) { 1570 struct __mm_load1_pd_struct { 1571 double __u; 1572 } __attribute__((__packed__, __may_alias__)); 1573 double __u = ((const struct __mm_load1_pd_struct *)__dp)->__u; 1574 return __extension__(__m128d){__u, __u}; 1575 } 1576 1577 #define _mm_load_pd1(dp) _mm_load1_pd(dp) 1578 1579 /// Loads two double-precision values, in reverse order, from an aligned 1580 /// memory location into a 128-bit vector of [2 x double]. 1581 /// 1582 /// \headerfile <x86intrin.h> 1583 /// 1584 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction + 1585 /// needed shuffling instructions. In AVX mode, the shuffling may be combined 1586 /// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction. 1587 /// 1588 /// \param __dp 1589 /// A 16-byte aligned pointer to an array of double-precision values to be 1590 /// loaded in reverse order. 1591 /// \returns A 128-bit vector of [2 x double] containing the reversed loaded 1592 /// values. 1593 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp) { 1594 __m128d __u = *(const __m128d *)__dp; 1595 return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0); 1596 } 1597 1598 /// Loads a 128-bit floating-point vector of [2 x double] from an 1599 /// unaligned memory location. 1600 /// 1601 /// \headerfile <x86intrin.h> 1602 /// 1603 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction. 1604 /// 1605 /// \param __dp 1606 /// A pointer to a 128-bit memory location. The address of the memory 1607 /// location does not have to be aligned. 1608 /// \returns A 128-bit vector of [2 x double] containing the loaded values. 
1609 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp) { 1610 struct __loadu_pd { 1611 __m128d_u __v; 1612 } __attribute__((__packed__, __may_alias__)); 1613 return ((const struct __loadu_pd *)__dp)->__v; 1614 } 1615 1616 /// Loads a 64-bit integer value to the low element of a 128-bit integer 1617 /// vector and clears the upper element. 1618 /// 1619 /// \headerfile <x86intrin.h> 1620 /// 1621 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 1622 /// 1623 /// \param __a 1624 /// A pointer to a 64-bit memory location. The address of the memory 1625 /// location does not have to be aligned. 1626 /// \returns A 128-bit vector of [2 x i64] containing the loaded value. 1627 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a) { 1628 struct __loadu_si64 { 1629 long long __v; 1630 } __attribute__((__packed__, __may_alias__)); 1631 long long __u = ((const struct __loadu_si64 *)__a)->__v; 1632 return __extension__(__m128i)(__v2di){__u, 0LL}; 1633 } 1634 1635 /// Loads a 32-bit integer value to the low element of a 128-bit integer 1636 /// vector and clears the upper element. 1637 /// 1638 /// \headerfile <x86intrin.h> 1639 /// 1640 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. 1641 /// 1642 /// \param __a 1643 /// A pointer to a 32-bit memory location. The address of the memory 1644 /// location does not have to be aligned. 1645 /// \returns A 128-bit vector of [4 x i32] containing the loaded value. 1646 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si32(void const *__a) { 1647 struct __loadu_si32 { 1648 int __v; 1649 } __attribute__((__packed__, __may_alias__)); 1650 int __u = ((const struct __loadu_si32 *)__a)->__v; 1651 return __extension__(__m128i)(__v4si){__u, 0, 0, 0}; 1652 } 1653 1654 /// Loads a 16-bit integer value to the low element of a 128-bit integer 1655 /// vector and clears the upper element. 
1656 /// 1657 /// \headerfile <x86intrin.h> 1658 /// 1659 /// This intrinsic does not correspond to a specific instruction. 1660 /// 1661 /// \param __a 1662 /// A pointer to a 16-bit memory location. The address of the memory 1663 /// location does not have to be aligned. 1664 /// \returns A 128-bit vector of [8 x i16] containing the loaded value. 1665 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si16(void const *__a) { 1666 struct __loadu_si16 { 1667 short __v; 1668 } __attribute__((__packed__, __may_alias__)); 1669 short __u = ((const struct __loadu_si16 *)__a)->__v; 1670 return __extension__(__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0}; 1671 } 1672 1673 /// Loads a 64-bit double-precision value to the low element of a 1674 /// 128-bit integer vector and clears the upper element. 1675 /// 1676 /// \headerfile <x86intrin.h> 1677 /// 1678 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction. 1679 /// 1680 /// \param __dp 1681 /// A pointer to a memory location containing a double-precision value. 1682 /// The address of the memory location does not have to be aligned. 1683 /// \returns A 128-bit vector of [2 x double] containing the loaded value. 1684 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp) { 1685 struct __mm_load_sd_struct { 1686 double __u; 1687 } __attribute__((__packed__, __may_alias__)); 1688 double __u = ((const struct __mm_load_sd_struct *)__dp)->__u; 1689 return __extension__(__m128d){__u, 0}; 1690 } 1691 1692 /// Loads a double-precision value into the high-order bits of a 128-bit 1693 /// vector of [2 x double]. The low-order bits are copied from the low-order 1694 /// bits of the first operand. 1695 /// 1696 /// \headerfile <x86intrin.h> 1697 /// 1698 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction. 1699 /// 1700 /// \param __a 1701 /// A 128-bit vector of [2 x double]. \n 1702 /// Bits [63:0] are written to bits [63:0] of the result. 
1703 /// \param __dp 1704 /// A pointer to a 64-bit memory location containing a double-precision 1705 /// floating-point value that is loaded. The loaded value is written to bits 1706 /// [127:64] of the result. The address of the memory location does not have 1707 /// to be aligned. 1708 /// \returns A 128-bit vector of [2 x double] containing the moved values. 1709 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a, 1710 double const *__dp) { 1711 struct __mm_loadh_pd_struct { 1712 double __u; 1713 } __attribute__((__packed__, __may_alias__)); 1714 double __u = ((const struct __mm_loadh_pd_struct *)__dp)->__u; 1715 return __extension__(__m128d){__a[0], __u}; 1716 } 1717 1718 /// Loads a double-precision value into the low-order bits of a 128-bit 1719 /// vector of [2 x double]. The high-order bits are copied from the 1720 /// high-order bits of the first operand. 1721 /// 1722 /// \headerfile <x86intrin.h> 1723 /// 1724 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction. 1725 /// 1726 /// \param __a 1727 /// A 128-bit vector of [2 x double]. \n 1728 /// Bits [127:64] are written to bits [127:64] of the result. 1729 /// \param __dp 1730 /// A pointer to a 64-bit memory location containing a double-precision 1731 /// floating-point value that is loaded. The loaded value is written to bits 1732 /// [63:0] of the result. The address of the memory location does not have to 1733 /// be aligned. 1734 /// \returns A 128-bit vector of [2 x double] containing the moved values. 1735 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a, 1736 double const *__dp) { 1737 struct __mm_loadl_pd_struct { 1738 double __u; 1739 } __attribute__((__packed__, __may_alias__)); 1740 double __u = ((const struct __mm_loadl_pd_struct *)__dp)->__u; 1741 return __extension__(__m128d){__u, __a[1]}; 1742 } 1743 1744 /// Constructs a 128-bit floating-point vector of [2 x double] with 1745 /// unspecified content. 
This could be used as an argument to another 1746 /// intrinsic function where the argument is required but the value is not 1747 /// actually used. 1748 /// 1749 /// \headerfile <x86intrin.h> 1750 /// 1751 /// This intrinsic has no corresponding instruction. 1752 /// 1753 /// \returns A 128-bit floating-point vector of [2 x double] with unspecified 1754 /// content. 1755 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void) { 1756 return (__m128d)__builtin_ia32_undef128(); 1757 } 1758 1759 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower 1760 /// 64 bits of the vector are initialized with the specified double-precision 1761 /// floating-point value. The upper 64 bits are set to zero. 1762 /// 1763 /// \headerfile <x86intrin.h> 1764 /// 1765 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 1766 /// 1767 /// \param __w 1768 /// A double-precision floating-point value used to initialize the lower 64 1769 /// bits of the result. 1770 /// \returns An initialized 128-bit floating-point vector of [2 x double]. The 1771 /// lower 64 bits contain the value of the parameter. The upper 64 bits are 1772 /// set to zero. 1773 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w) { 1774 return __extension__(__m128d){__w, 0.0}; 1775 } 1776 1777 /// Constructs a 128-bit floating-point vector of [2 x double], with each 1778 /// of the two double-precision floating-point vector elements set to the 1779 /// specified double-precision floating-point value. 1780 /// 1781 /// \headerfile <x86intrin.h> 1782 /// 1783 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction. 1784 /// 1785 /// \param __w 1786 /// A double-precision floating-point value used to initialize each vector 1787 /// element of the result. 1788 /// \returns An initialized 128-bit floating-point vector of [2 x double]. 
1789 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w) { 1790 return __extension__(__m128d){__w, __w}; 1791 } 1792 1793 /// Constructs a 128-bit floating-point vector of [2 x double], with each 1794 /// of the two double-precision floating-point vector elements set to the 1795 /// specified double-precision floating-point value. 1796 /// 1797 /// \headerfile <x86intrin.h> 1798 /// 1799 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction. 1800 /// 1801 /// \param __w 1802 /// A double-precision floating-point value used to initialize each vector 1803 /// element of the result. 1804 /// \returns An initialized 128-bit floating-point vector of [2 x double]. 1805 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w) { 1806 return _mm_set1_pd(__w); 1807 } 1808 1809 /// Constructs a 128-bit floating-point vector of [2 x double] 1810 /// initialized with the specified double-precision floating-point values. 1811 /// 1812 /// \headerfile <x86intrin.h> 1813 /// 1814 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction. 1815 /// 1816 /// \param __w 1817 /// A double-precision floating-point value used to initialize the upper 64 1818 /// bits of the result. 1819 /// \param __x 1820 /// A double-precision floating-point value used to initialize the lower 64 1821 /// bits of the result. 1822 /// \returns An initialized 128-bit floating-point vector of [2 x double]. 1823 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w, 1824 double __x) { 1825 return __extension__(__m128d){__x, __w}; 1826 } 1827 1828 /// Constructs a 128-bit floating-point vector of [2 x double], 1829 /// initialized in reverse order with the specified double-precision 1830 /// floating-point values. 1831 /// 1832 /// \headerfile <x86intrin.h> 1833 /// 1834 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction. 
1835 /// 1836 /// \param __w 1837 /// A double-precision floating-point value used to initialize the lower 64 1838 /// bits of the result. 1839 /// \param __x 1840 /// A double-precision floating-point value used to initialize the upper 64 1841 /// bits of the result. 1842 /// \returns An initialized 128-bit floating-point vector of [2 x double]. 1843 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w, 1844 double __x) { 1845 return __extension__(__m128d){__w, __x}; 1846 } 1847 1848 /// Constructs a 128-bit floating-point vector of [2 x double] 1849 /// initialized to zero. 1850 /// 1851 /// \headerfile <x86intrin.h> 1852 /// 1853 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction. 1854 /// 1855 /// \returns An initialized 128-bit floating-point vector of [2 x double] with 1856 /// all elements set to zero. 1857 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void) { 1858 return __extension__(__m128d){0.0, 0.0}; 1859 } 1860 1861 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower 1862 /// 64 bits are set to the lower 64 bits of the second parameter. The upper 1863 /// 64 bits are set to the upper 64 bits of the first parameter. 1864 /// 1865 /// \headerfile <x86intrin.h> 1866 /// 1867 /// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction. 1868 /// 1869 /// \param __a 1870 /// A 128-bit vector of [2 x double]. The upper 64 bits are written to the 1871 /// upper 64 bits of the result. 1872 /// \param __b 1873 /// A 128-bit vector of [2 x double]. The lower 64 bits are written to the 1874 /// lower 64 bits of the result. 1875 /// \returns A 128-bit vector of [2 x double] containing the moved values. 1876 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a, 1877 __m128d __b) { 1878 __a[0] = __b[0]; 1879 return __a; 1880 } 1881 1882 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a 1883 /// memory location. 
1884 /// 1885 /// \headerfile <x86intrin.h> 1886 /// 1887 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction. 1888 /// 1889 /// \param __dp 1890 /// A pointer to a 64-bit memory location. 1891 /// \param __a 1892 /// A 128-bit vector of [2 x double] containing the value to be stored. 1893 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp, 1894 __m128d __a) { 1895 struct __mm_store_sd_struct { 1896 double __u; 1897 } __attribute__((__packed__, __may_alias__)); 1898 ((struct __mm_store_sd_struct *)__dp)->__u = __a[0]; 1899 } 1900 1901 /// Moves packed double-precision values from a 128-bit vector of 1902 /// [2 x double] to a memory location. 1903 /// 1904 /// \headerfile <x86intrin.h> 1905 /// 1906 /// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction. 1907 /// 1908 /// \param __dp 1909 /// A pointer to an aligned memory location that can store two 1910 /// double-precision values. 1911 /// \param __a 1912 /// A packed 128-bit vector of [2 x double] containing the values to be 1913 /// moved. 1914 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp, 1915 __m128d __a) { 1916 *(__m128d *)__dp = __a; 1917 } 1918 1919 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to 1920 /// the upper and lower 64 bits of a memory location. 1921 /// 1922 /// \headerfile <x86intrin.h> 1923 /// 1924 /// This intrinsic corresponds to the 1925 /// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction. 1926 /// 1927 /// \param __dp 1928 /// A pointer to a memory location that can store two double-precision 1929 /// values. 1930 /// \param __a 1931 /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each 1932 /// of the values in \a __dp. 
1933 static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp, 1934 __m128d __a) { 1935 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0); 1936 _mm_store_pd(__dp, __a); 1937 } 1938 1939 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to 1940 /// the upper and lower 64 bits of a memory location. 1941 /// 1942 /// \headerfile <x86intrin.h> 1943 /// 1944 /// This intrinsic corresponds to the 1945 /// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction. 1946 /// 1947 /// \param __dp 1948 /// A pointer to a memory location that can store two double-precision 1949 /// values. 1950 /// \param __a 1951 /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each 1952 /// of the values in \a __dp. 1953 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp, 1954 __m128d __a) { 1955 _mm_store1_pd(__dp, __a); 1956 } 1957 1958 /// Stores a 128-bit vector of [2 x double] into an unaligned memory 1959 /// location. 1960 /// 1961 /// \headerfile <x86intrin.h> 1962 /// 1963 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction. 1964 /// 1965 /// \param __dp 1966 /// A pointer to a 128-bit memory location. The address of the memory 1967 /// location does not have to be aligned. 1968 /// \param __a 1969 /// A 128-bit vector of [2 x double] containing the values to be stored. 1970 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, 1971 __m128d __a) { 1972 struct __storeu_pd { 1973 __m128d_u __v; 1974 } __attribute__((__packed__, __may_alias__)); 1975 ((struct __storeu_pd *)__dp)->__v = __a; 1976 } 1977 1978 /// Stores two double-precision values, in reverse order, from a 128-bit 1979 /// vector of [2 x double] to a 16-byte aligned memory location. 1980 /// 1981 /// \headerfile <x86intrin.h> 1982 /// 1983 /// This intrinsic corresponds to a shuffling instruction followed by a 1984 /// <c> VMOVAPD / MOVAPD </c> instruction. 
1985 /// 1986 /// \param __dp 1987 /// A pointer to a 16-byte aligned memory location that can store two 1988 /// double-precision values. 1989 /// \param __a 1990 /// A 128-bit vector of [2 x double] containing the values to be reversed and 1991 /// stored. 1992 static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp, 1993 __m128d __a) { 1994 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0); 1995 *(__m128d *)__dp = __a; 1996 } 1997 1998 /// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a 1999 /// memory location. 2000 /// 2001 /// \headerfile <x86intrin.h> 2002 /// 2003 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction. 2004 /// 2005 /// \param __dp 2006 /// A pointer to a 64-bit memory location. 2007 /// \param __a 2008 /// A 128-bit vector of [2 x double] containing the value to be stored. 2009 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp, 2010 __m128d __a) { 2011 struct __mm_storeh_pd_struct { 2012 double __u; 2013 } __attribute__((__packed__, __may_alias__)); 2014 ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[1]; 2015 } 2016 2017 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a 2018 /// memory location. 2019 /// 2020 /// \headerfile <x86intrin.h> 2021 /// 2022 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction. 2023 /// 2024 /// \param __dp 2025 /// A pointer to a 64-bit memory location. 2026 /// \param __a 2027 /// A 128-bit vector of [2 x double] containing the value to be stored. 
2028 static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp, 2029 __m128d __a) { 2030 struct __mm_storeh_pd_struct { 2031 double __u; 2032 } __attribute__((__packed__, __may_alias__)); 2033 ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[0]; 2034 } 2035 2036 /// Adds the corresponding elements of two 128-bit vectors of [16 x i8], 2037 /// saving the lower 8 bits of each sum in the corresponding element of a 2038 /// 128-bit result vector of [16 x i8]. 2039 /// 2040 /// The integer elements of both parameters can be either signed or unsigned. 2041 /// 2042 /// \headerfile <x86intrin.h> 2043 /// 2044 /// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction. 2045 /// 2046 /// \param __a 2047 /// A 128-bit vector of [16 x i8]. 2048 /// \param __b 2049 /// A 128-bit vector of [16 x i8]. 2050 /// \returns A 128-bit vector of [16 x i8] containing the sums of both 2051 /// parameters. 2052 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a, 2053 __m128i __b) { 2054 return (__m128i)((__v16qu)__a + (__v16qu)__b); 2055 } 2056 2057 /// Adds the corresponding elements of two 128-bit vectors of [8 x i16], 2058 /// saving the lower 16 bits of each sum in the corresponding element of a 2059 /// 128-bit result vector of [8 x i16]. 2060 /// 2061 /// The integer elements of both parameters can be either signed or unsigned. 2062 /// 2063 /// \headerfile <x86intrin.h> 2064 /// 2065 /// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction. 2066 /// 2067 /// \param __a 2068 /// A 128-bit vector of [8 x i16]. 2069 /// \param __b 2070 /// A 128-bit vector of [8 x i16]. 2071 /// \returns A 128-bit vector of [8 x i16] containing the sums of both 2072 /// parameters. 
2073 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a, 2074 __m128i __b) { 2075 return (__m128i)((__v8hu)__a + (__v8hu)__b); 2076 } 2077 2078 /// Adds the corresponding elements of two 128-bit vectors of [4 x i32], 2079 /// saving the lower 32 bits of each sum in the corresponding element of a 2080 /// 128-bit result vector of [4 x i32]. 2081 /// 2082 /// The integer elements of both parameters can be either signed or unsigned. 2083 /// 2084 /// \headerfile <x86intrin.h> 2085 /// 2086 /// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction. 2087 /// 2088 /// \param __a 2089 /// A 128-bit vector of [4 x i32]. 2090 /// \param __b 2091 /// A 128-bit vector of [4 x i32]. 2092 /// \returns A 128-bit vector of [4 x i32] containing the sums of both 2093 /// parameters. 2094 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a, 2095 __m128i __b) { 2096 return (__m128i)((__v4su)__a + (__v4su)__b); 2097 } 2098 2099 /// Adds two signed or unsigned 64-bit integer values, returning the 2100 /// lower 64 bits of the sum. 2101 /// 2102 /// \headerfile <x86intrin.h> 2103 /// 2104 /// This intrinsic corresponds to the <c> PADDQ </c> instruction. 2105 /// 2106 /// \param __a 2107 /// A 64-bit integer. 2108 /// \param __b 2109 /// A 64-bit integer. 2110 /// \returns A 64-bit integer containing the sum of both parameters. 2111 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a, 2112 __m64 __b) { 2113 return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b); 2114 } 2115 2116 /// Adds the corresponding elements of two 128-bit vectors of [2 x i64], 2117 /// saving the lower 64 bits of each sum in the corresponding element of a 2118 /// 128-bit result vector of [2 x i64]. 2119 /// 2120 /// The integer elements of both parameters can be either signed or unsigned. 2121 /// 2122 /// \headerfile <x86intrin.h> 2123 /// 2124 /// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction. 
2125 /// 2126 /// \param __a 2127 /// A 128-bit vector of [2 x i64]. 2128 /// \param __b 2129 /// A 128-bit vector of [2 x i64]. 2130 /// \returns A 128-bit vector of [2 x i64] containing the sums of both 2131 /// parameters. 2132 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a, 2133 __m128i __b) { 2134 return (__m128i)((__v2du)__a + (__v2du)__b); 2135 } 2136 2137 /// Adds, with saturation, the corresponding elements of two 128-bit 2138 /// signed [16 x i8] vectors, saving each sum in the corresponding element 2139 /// of a 128-bit result vector of [16 x i8]. 2140 /// 2141 /// Positive sums greater than 0x7F are saturated to 0x7F. Negative sums 2142 /// less than 0x80 are saturated to 0x80. 2143 /// 2144 /// \headerfile <x86intrin.h> 2145 /// 2146 /// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction. 2147 /// 2148 /// \param __a 2149 /// A 128-bit signed [16 x i8] vector. 2150 /// \param __b 2151 /// A 128-bit signed [16 x i8] vector. 2152 /// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of 2153 /// both parameters. 2154 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a, 2155 __m128i __b) { 2156 return (__m128i)__builtin_elementwise_add_sat((__v16qs)__a, (__v16qs)__b); 2157 } 2158 2159 /// Adds, with saturation, the corresponding elements of two 128-bit 2160 /// signed [8 x i16] vectors, saving each sum in the corresponding element 2161 /// of a 128-bit result vector of [8 x i16]. 2162 /// 2163 /// Positive sums greater than 0x7FFF are saturated to 0x7FFF. Negative sums 2164 /// less than 0x8000 are saturated to 0x8000. 2165 /// 2166 /// \headerfile <x86intrin.h> 2167 /// 2168 /// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction. 2169 /// 2170 /// \param __a 2171 /// A 128-bit signed [8 x i16] vector. 2172 /// \param __b 2173 /// A 128-bit signed [8 x i16] vector. 
2174 /// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of 2175 /// both parameters. 2176 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a, 2177 __m128i __b) { 2178 return (__m128i)__builtin_elementwise_add_sat((__v8hi)__a, (__v8hi)__b); 2179 } 2180 2181 /// Adds, with saturation, the corresponding elements of two 128-bit 2182 /// unsigned [16 x i8] vectors, saving each sum in the corresponding element 2183 /// of a 128-bit result vector of [16 x i8]. 2184 /// 2185 /// Positive sums greater than 0xFF are saturated to 0xFF. Negative sums are 2186 /// saturated to 0x00. 2187 /// 2188 /// \headerfile <x86intrin.h> 2189 /// 2190 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction. 2191 /// 2192 /// \param __a 2193 /// A 128-bit unsigned [16 x i8] vector. 2194 /// \param __b 2195 /// A 128-bit unsigned [16 x i8] vector. 2196 /// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums 2197 /// of both parameters. 2198 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a, 2199 __m128i __b) { 2200 return (__m128i)__builtin_elementwise_add_sat((__v16qu)__a, (__v16qu)__b); 2201 } 2202 2203 /// Adds, with saturation, the corresponding elements of two 128-bit 2204 /// unsigned [8 x i16] vectors, saving each sum in the corresponding element 2205 /// of a 128-bit result vector of [8 x i16]. 2206 /// 2207 /// Positive sums greater than 0xFFFF are saturated to 0xFFFF. Negative sums 2208 /// are saturated to 0x0000. 2209 /// 2210 /// \headerfile <x86intrin.h> 2211 /// 2212 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction. 2213 /// 2214 /// \param __a 2215 /// A 128-bit unsigned [8 x i16] vector. 2216 /// \param __b 2217 /// A 128-bit unsigned [8 x i16] vector. 2218 /// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums 2219 /// of both parameters. 
2220 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a, 2221 __m128i __b) { 2222 return (__m128i)__builtin_elementwise_add_sat((__v8hu)__a, (__v8hu)__b); 2223 } 2224 2225 /// Computes the rounded averages of corresponding elements of two 2226 /// 128-bit unsigned [16 x i8] vectors, saving each result in the 2227 /// corresponding element of a 128-bit result vector of [16 x i8]. 2228 /// 2229 /// \headerfile <x86intrin.h> 2230 /// 2231 /// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction. 2232 /// 2233 /// \param __a 2234 /// A 128-bit unsigned [16 x i8] vector. 2235 /// \param __b 2236 /// A 128-bit unsigned [16 x i8] vector. 2237 /// \returns A 128-bit unsigned [16 x i8] vector containing the rounded 2238 /// averages of both parameters. 2239 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a, 2240 __m128i __b) { 2241 return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b); 2242 } 2243 2244 /// Computes the rounded averages of corresponding elements of two 2245 /// 128-bit unsigned [8 x i16] vectors, saving each result in the 2246 /// corresponding element of a 128-bit result vector of [8 x i16]. 2247 /// 2248 /// \headerfile <x86intrin.h> 2249 /// 2250 /// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction. 2251 /// 2252 /// \param __a 2253 /// A 128-bit unsigned [8 x i16] vector. 2254 /// \param __b 2255 /// A 128-bit unsigned [8 x i16] vector. 2256 /// \returns A 128-bit unsigned [8 x i16] vector containing the rounded 2257 /// averages of both parameters. 
2258 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a, 2259 __m128i __b) { 2260 return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b); 2261 } 2262 2263 /// Multiplies the corresponding elements of two 128-bit signed [8 x i16] 2264 /// vectors, producing eight intermediate 32-bit signed integer products, and 2265 /// adds the consecutive pairs of 32-bit products to form a 128-bit signed 2266 /// [4 x i32] vector. 2267 /// 2268 /// For example, bits [15:0] of both parameters are multiplied producing a 2269 /// 32-bit product, bits [31:16] of both parameters are multiplied producing 2270 /// a 32-bit product, and the sum of those two products becomes bits [31:0] 2271 /// of the result. 2272 /// 2273 /// \headerfile <x86intrin.h> 2274 /// 2275 /// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction. 2276 /// 2277 /// \param __a 2278 /// A 128-bit signed [8 x i16] vector. 2279 /// \param __b 2280 /// A 128-bit signed [8 x i16] vector. 2281 /// \returns A 128-bit signed [4 x i32] vector containing the sums of products 2282 /// of both parameters. 2283 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a, 2284 __m128i __b) { 2285 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b); 2286 } 2287 2288 /// Compares corresponding elements of two 128-bit signed [8 x i16] 2289 /// vectors, saving the greater value from each comparison in the 2290 /// corresponding element of a 128-bit result vector of [8 x i16]. 2291 /// 2292 /// \headerfile <x86intrin.h> 2293 /// 2294 /// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction. 2295 /// 2296 /// \param __a 2297 /// A 128-bit signed [8 x i16] vector. 2298 /// \param __b 2299 /// A 128-bit signed [8 x i16] vector. 2300 /// \returns A 128-bit signed [8 x i16] vector containing the greater value of 2301 /// each comparison. 
2302 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a, 2303 __m128i __b) { 2304 return (__m128i)__builtin_elementwise_max((__v8hi)__a, (__v8hi)__b); 2305 } 2306 2307 /// Compares corresponding elements of two 128-bit unsigned [16 x i8] 2308 /// vectors, saving the greater value from each comparison in the 2309 /// corresponding element of a 128-bit result vector of [16 x i8]. 2310 /// 2311 /// \headerfile <x86intrin.h> 2312 /// 2313 /// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction. 2314 /// 2315 /// \param __a 2316 /// A 128-bit unsigned [16 x i8] vector. 2317 /// \param __b 2318 /// A 128-bit unsigned [16 x i8] vector. 2319 /// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of 2320 /// each comparison. 2321 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a, 2322 __m128i __b) { 2323 return (__m128i)__builtin_elementwise_max((__v16qu)__a, (__v16qu)__b); 2324 } 2325 2326 /// Compares corresponding elements of two 128-bit signed [8 x i16] 2327 /// vectors, saving the smaller value from each comparison in the 2328 /// corresponding element of a 128-bit result vector of [8 x i16]. 2329 /// 2330 /// \headerfile <x86intrin.h> 2331 /// 2332 /// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction. 2333 /// 2334 /// \param __a 2335 /// A 128-bit signed [8 x i16] vector. 2336 /// \param __b 2337 /// A 128-bit signed [8 x i16] vector. 2338 /// \returns A 128-bit signed [8 x i16] vector containing the smaller value of 2339 /// each comparison. 2340 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a, 2341 __m128i __b) { 2342 return (__m128i)__builtin_elementwise_min((__v8hi)__a, (__v8hi)__b); 2343 } 2344 2345 /// Compares corresponding elements of two 128-bit unsigned [16 x i8] 2346 /// vectors, saving the smaller value from each comparison in the 2347 /// corresponding element of a 128-bit result vector of [16 x i8]. 
2348 /// 2349 /// \headerfile <x86intrin.h> 2350 /// 2351 /// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction. 2352 /// 2353 /// \param __a 2354 /// A 128-bit unsigned [16 x i8] vector. 2355 /// \param __b 2356 /// A 128-bit unsigned [16 x i8] vector. 2357 /// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of 2358 /// each comparison. 2359 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a, 2360 __m128i __b) { 2361 return (__m128i)__builtin_elementwise_min((__v16qu)__a, (__v16qu)__b); 2362 } 2363 2364 /// Multiplies the corresponding elements of two signed [8 x i16] 2365 /// vectors, saving the upper 16 bits of each 32-bit product in the 2366 /// corresponding element of a 128-bit signed [8 x i16] result vector. 2367 /// 2368 /// \headerfile <x86intrin.h> 2369 /// 2370 /// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction. 2371 /// 2372 /// \param __a 2373 /// A 128-bit signed [8 x i16] vector. 2374 /// \param __b 2375 /// A 128-bit signed [8 x i16] vector. 2376 /// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of 2377 /// each of the eight 32-bit products. 2378 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a, 2379 __m128i __b) { 2380 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b); 2381 } 2382 2383 /// Multiplies the corresponding elements of two unsigned [8 x i16] 2384 /// vectors, saving the upper 16 bits of each 32-bit product in the 2385 /// corresponding element of a 128-bit unsigned [8 x i16] result vector. 2386 /// 2387 /// \headerfile <x86intrin.h> 2388 /// 2389 /// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction. 2390 /// 2391 /// \param __a 2392 /// A 128-bit unsigned [8 x i16] vector. 2393 /// \param __b 2394 /// A 128-bit unsigned [8 x i16] vector. 
2395 /// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits 2396 /// of each of the eight 32-bit products. 2397 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a, 2398 __m128i __b) { 2399 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b); 2400 } 2401 2402 /// Multiplies the corresponding elements of two signed [8 x i16] 2403 /// vectors, saving the lower 16 bits of each 32-bit product in the 2404 /// corresponding element of a 128-bit signed [8 x i16] result vector. 2405 /// 2406 /// \headerfile <x86intrin.h> 2407 /// 2408 /// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction. 2409 /// 2410 /// \param __a 2411 /// A 128-bit signed [8 x i16] vector. 2412 /// \param __b 2413 /// A 128-bit signed [8 x i16] vector. 2414 /// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of 2415 /// each of the eight 32-bit products. 2416 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a, 2417 __m128i __b) { 2418 return (__m128i)((__v8hu)__a * (__v8hu)__b); 2419 } 2420 2421 /// Multiplies 32-bit unsigned integer values contained in the lower bits 2422 /// of the two 64-bit integer vectors and returns the 64-bit unsigned 2423 /// product. 2424 /// 2425 /// \headerfile <x86intrin.h> 2426 /// 2427 /// This intrinsic corresponds to the <c> PMULUDQ </c> instruction. 2428 /// 2429 /// \param __a 2430 /// A 64-bit integer containing one of the source operands. 2431 /// \param __b 2432 /// A 64-bit integer containing one of the source operands. 2433 /// \returns A 64-bit integer vector containing the product of both operands. 
2434 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a, 2435 __m64 __b) { 2436 return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b); 2437 } 2438 2439 /// Multiplies 32-bit unsigned integer values contained in the lower 2440 /// bits of the corresponding elements of two [2 x i64] vectors, and returns 2441 /// the 64-bit products in the corresponding elements of a [2 x i64] vector. 2442 /// 2443 /// \headerfile <x86intrin.h> 2444 /// 2445 /// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction. 2446 /// 2447 /// \param __a 2448 /// A [2 x i64] vector containing one of the source operands. 2449 /// \param __b 2450 /// A [2 x i64] vector containing one of the source operands. 2451 /// \returns A [2 x i64] vector containing the product of both operands. 2452 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a, 2453 __m128i __b) { 2454 return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b); 2455 } 2456 2457 /// Computes the absolute differences of corresponding 8-bit integer 2458 /// values in two 128-bit vectors. Sums the first 8 absolute differences, and 2459 /// separately sums the second 8 absolute differences. Packs these two 2460 /// unsigned 16-bit integer sums into the upper and lower elements of a 2461 /// [2 x i64] vector. 2462 /// 2463 /// \headerfile <x86intrin.h> 2464 /// 2465 /// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction. 2466 /// 2467 /// \param __a 2468 /// A 128-bit integer vector containing one of the source operands. 2469 /// \param __b 2470 /// A 128-bit integer vector containing one of the source operands. 2471 /// \returns A [2 x i64] vector containing the sums of the sets of absolute 2472 /// differences between both operands. 
2473 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a, 2474 __m128i __b) { 2475 return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b); 2476 } 2477 2478 /// Subtracts the corresponding 8-bit integer values in the operands. 2479 /// 2480 /// \headerfile <x86intrin.h> 2481 /// 2482 /// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction. 2483 /// 2484 /// \param __a 2485 /// A 128-bit integer vector containing the minuends. 2486 /// \param __b 2487 /// A 128-bit integer vector containing the subtrahends. 2488 /// \returns A 128-bit integer vector containing the differences of the values 2489 /// in the operands. 2490 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a, 2491 __m128i __b) { 2492 return (__m128i)((__v16qu)__a - (__v16qu)__b); 2493 } 2494 2495 /// Subtracts the corresponding 16-bit integer values in the operands. 2496 /// 2497 /// \headerfile <x86intrin.h> 2498 /// 2499 /// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction. 2500 /// 2501 /// \param __a 2502 /// A 128-bit integer vector containing the minuends. 2503 /// \param __b 2504 /// A 128-bit integer vector containing the subtrahends. 2505 /// \returns A 128-bit integer vector containing the differences of the values 2506 /// in the operands. 2507 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a, 2508 __m128i __b) { 2509 return (__m128i)((__v8hu)__a - (__v8hu)__b); 2510 } 2511 2512 /// Subtracts the corresponding 32-bit integer values in the operands. 2513 /// 2514 /// \headerfile <x86intrin.h> 2515 /// 2516 /// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction. 2517 /// 2518 /// \param __a 2519 /// A 128-bit integer vector containing the minuends. 2520 /// \param __b 2521 /// A 128-bit integer vector containing the subtrahends. 2522 /// \returns A 128-bit integer vector containing the differences of the values 2523 /// in the operands. 
2524 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a, 2525 __m128i __b) { 2526 return (__m128i)((__v4su)__a - (__v4su)__b); 2527 } 2528 2529 /// Subtracts signed or unsigned 64-bit integer values and writes the 2530 /// difference to the corresponding bits in the destination. 2531 /// 2532 /// \headerfile <x86intrin.h> 2533 /// 2534 /// This intrinsic corresponds to the <c> PSUBQ </c> instruction. 2535 /// 2536 /// \param __a 2537 /// A 64-bit integer vector containing the minuend. 2538 /// \param __b 2539 /// A 64-bit integer vector containing the subtrahend. 2540 /// \returns A 64-bit integer vector containing the difference of the values in 2541 /// the operands. 2542 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a, 2543 __m64 __b) { 2544 return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b); 2545 } 2546 2547 /// Subtracts the corresponding elements of two [2 x i64] vectors. 2548 /// 2549 /// \headerfile <x86intrin.h> 2550 /// 2551 /// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction. 2552 /// 2553 /// \param __a 2554 /// A 128-bit integer vector containing the minuends. 2555 /// \param __b 2556 /// A 128-bit integer vector containing the subtrahends. 2557 /// \returns A 128-bit integer vector containing the differences of the values 2558 /// in the operands. 2559 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a, 2560 __m128i __b) { 2561 return (__m128i)((__v2du)__a - (__v2du)__b); 2562 } 2563 2564 /// Subtracts, with saturation, corresponding 8-bit signed integer values in 2565 /// the input and returns the differences in the corresponding bytes in the 2566 /// destination. 2567 /// 2568 /// Differences greater than 0x7F are saturated to 0x7F, and differences 2569 /// less than 0x80 are saturated to 0x80. 2570 /// 2571 /// \headerfile <x86intrin.h> 2572 /// 2573 /// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction. 
2574 /// 2575 /// \param __a 2576 /// A 128-bit integer vector containing the minuends. 2577 /// \param __b 2578 /// A 128-bit integer vector containing the subtrahends. 2579 /// \returns A 128-bit integer vector containing the differences of the values 2580 /// in the operands. 2581 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a, 2582 __m128i __b) { 2583 return (__m128i)__builtin_elementwise_sub_sat((__v16qs)__a, (__v16qs)__b); 2584 } 2585 2586 /// Subtracts, with saturation, corresponding 16-bit signed integer values in 2587 /// the input and returns the differences in the corresponding bytes in the 2588 /// destination. 2589 /// 2590 /// Differences greater than 0x7FFF are saturated to 0x7FFF, and values less 2591 /// than 0x8000 are saturated to 0x8000. 2592 /// 2593 /// \headerfile <x86intrin.h> 2594 /// 2595 /// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction. 2596 /// 2597 /// \param __a 2598 /// A 128-bit integer vector containing the minuends. 2599 /// \param __b 2600 /// A 128-bit integer vector containing the subtrahends. 2601 /// \returns A 128-bit integer vector containing the differences of the values 2602 /// in the operands. 2603 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a, 2604 __m128i __b) { 2605 return (__m128i)__builtin_elementwise_sub_sat((__v8hi)__a, (__v8hi)__b); 2606 } 2607 2608 /// Subtracts, with saturation, corresponding 8-bit unsigned integer values in 2609 /// the input and returns the differences in the corresponding bytes in the 2610 /// destination. 2611 /// 2612 /// Differences less than 0x00 are saturated to 0x00. 2613 /// 2614 /// \headerfile <x86intrin.h> 2615 /// 2616 /// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction. 2617 /// 2618 /// \param __a 2619 /// A 128-bit integer vector containing the minuends. 2620 /// \param __b 2621 /// A 128-bit integer vector containing the subtrahends. 
2622 /// \returns A 128-bit integer vector containing the unsigned integer 2623 /// differences of the values in the operands. 2624 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a, 2625 __m128i __b) { 2626 return (__m128i)__builtin_elementwise_sub_sat((__v16qu)__a, (__v16qu)__b); 2627 } 2628 2629 /// Subtracts, with saturation, corresponding 16-bit unsigned integer values in 2630 /// the input and returns the differences in the corresponding bytes in the 2631 /// destination. 2632 /// 2633 /// Differences less than 0x0000 are saturated to 0x0000. 2634 /// 2635 /// \headerfile <x86intrin.h> 2636 /// 2637 /// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction. 2638 /// 2639 /// \param __a 2640 /// A 128-bit integer vector containing the minuends. 2641 /// \param __b 2642 /// A 128-bit integer vector containing the subtrahends. 2643 /// \returns A 128-bit integer vector containing the unsigned integer 2644 /// differences of the values in the operands. 2645 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a, 2646 __m128i __b) { 2647 return (__m128i)__builtin_elementwise_sub_sat((__v8hu)__a, (__v8hu)__b); 2648 } 2649 2650 /// Performs a bitwise AND of two 128-bit integer vectors. 2651 /// 2652 /// \headerfile <x86intrin.h> 2653 /// 2654 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction. 2655 /// 2656 /// \param __a 2657 /// A 128-bit integer vector containing one of the source operands. 2658 /// \param __b 2659 /// A 128-bit integer vector containing one of the source operands. 2660 /// \returns A 128-bit integer vector containing the bitwise AND of the values 2661 /// in both operands. 
2662 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a, 2663 __m128i __b) { 2664 return (__m128i)((__v2du)__a & (__v2du)__b); 2665 } 2666 2667 /// Performs a bitwise AND of two 128-bit integer vectors, using the 2668 /// one's complement of the values contained in the first source operand. 2669 /// 2670 /// \headerfile <x86intrin.h> 2671 /// 2672 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction. 2673 /// 2674 /// \param __a 2675 /// A 128-bit vector containing the left source operand. The one's complement 2676 /// of this value is used in the bitwise AND. 2677 /// \param __b 2678 /// A 128-bit vector containing the right source operand. 2679 /// \returns A 128-bit integer vector containing the bitwise AND of the one's 2680 /// complement of the first operand and the values in the second operand. 2681 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a, 2682 __m128i __b) { 2683 return (__m128i)(~(__v2du)__a & (__v2du)__b); 2684 } 2685 /// Performs a bitwise OR of two 128-bit integer vectors. 2686 /// 2687 /// \headerfile <x86intrin.h> 2688 /// 2689 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction. 2690 /// 2691 /// \param __a 2692 /// A 128-bit integer vector containing one of the source operands. 2693 /// \param __b 2694 /// A 128-bit integer vector containing one of the source operands. 2695 /// \returns A 128-bit integer vector containing the bitwise OR of the values 2696 /// in both operands. 2697 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a, 2698 __m128i __b) { 2699 return (__m128i)((__v2du)__a | (__v2du)__b); 2700 } 2701 2702 /// Performs a bitwise exclusive OR of two 128-bit integer vectors. 2703 /// 2704 /// \headerfile <x86intrin.h> 2705 /// 2706 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction. 2707 /// 2708 /// \param __a 2709 /// A 128-bit integer vector containing one of the source operands. 
2710 /// \param __b 2711 /// A 128-bit integer vector containing one of the source operands. 2712 /// \returns A 128-bit integer vector containing the bitwise exclusive OR of the 2713 /// values in both operands. 2714 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a, 2715 __m128i __b) { 2716 return (__m128i)((__v2du)__a ^ (__v2du)__b); 2717 } 2718 2719 /// Left-shifts the 128-bit integer vector operand by the specified 2720 /// number of bytes. Low-order bits are cleared. 2721 /// 2722 /// \headerfile <x86intrin.h> 2723 /// 2724 /// \code 2725 /// __m128i _mm_slli_si128(__m128i a, const int imm); 2726 /// \endcode 2727 /// 2728 /// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction. 2729 /// 2730 /// \param a 2731 /// A 128-bit integer vector containing the source operand. 2732 /// \param imm 2733 /// An immediate value specifying the number of bytes to left-shift operand 2734 /// \a a. 2735 /// \returns A 128-bit integer vector containing the left-shifted value. 2736 #define _mm_slli_si128(a, imm) \ 2737 ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \ 2738 (int)(imm))) 2739 2740 #define _mm_bslli_si128(a, imm) \ 2741 ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \ 2742 (int)(imm))) 2743 2744 /// Left-shifts each 16-bit value in the 128-bit integer vector operand 2745 /// by the specified number of bits. Low-order bits are cleared. 2746 /// 2747 /// \headerfile <x86intrin.h> 2748 /// 2749 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction. 2750 /// 2751 /// \param __a 2752 /// A 128-bit integer vector containing the source operand. 2753 /// \param __count 2754 /// An integer value specifying the number of bits to left-shift each value 2755 /// in operand \a __a. 2756 /// \returns A 128-bit integer vector containing the left-shifted values. 
2757 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a, 2758 int __count) { 2759 return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count); 2760 } 2761 2762 /// Left-shifts each 16-bit value in the 128-bit integer vector operand 2763 /// by the specified number of bits. Low-order bits are cleared. 2764 /// 2765 /// \headerfile <x86intrin.h> 2766 /// 2767 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction. 2768 /// 2769 /// \param __a 2770 /// A 128-bit integer vector containing the source operand. 2771 /// \param __count 2772 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2773 /// to left-shift each value in operand \a __a. 2774 /// \returns A 128-bit integer vector containing the left-shifted values. 2775 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a, 2776 __m128i __count) { 2777 return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count); 2778 } 2779 2780 /// Left-shifts each 32-bit value in the 128-bit integer vector operand 2781 /// by the specified number of bits. Low-order bits are cleared. 2782 /// 2783 /// \headerfile <x86intrin.h> 2784 /// 2785 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction. 2786 /// 2787 /// \param __a 2788 /// A 128-bit integer vector containing the source operand. 2789 /// \param __count 2790 /// An integer value specifying the number of bits to left-shift each value 2791 /// in operand \a __a. 2792 /// \returns A 128-bit integer vector containing the left-shifted values. 2793 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a, 2794 int __count) { 2795 return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count); 2796 } 2797 2798 /// Left-shifts each 32-bit value in the 128-bit integer vector operand 2799 /// by the specified number of bits. Low-order bits are cleared. 
2800 /// 2801 /// \headerfile <x86intrin.h> 2802 /// 2803 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction. 2804 /// 2805 /// \param __a 2806 /// A 128-bit integer vector containing the source operand. 2807 /// \param __count 2808 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2809 /// to left-shift each value in operand \a __a. 2810 /// \returns A 128-bit integer vector containing the left-shifted values. 2811 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a, 2812 __m128i __count) { 2813 return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count); 2814 } 2815 2816 /// Left-shifts each 64-bit value in the 128-bit integer vector operand 2817 /// by the specified number of bits. Low-order bits are cleared. 2818 /// 2819 /// \headerfile <x86intrin.h> 2820 /// 2821 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction. 2822 /// 2823 /// \param __a 2824 /// A 128-bit integer vector containing the source operand. 2825 /// \param __count 2826 /// An integer value specifying the number of bits to left-shift each value 2827 /// in operand \a __a. 2828 /// \returns A 128-bit integer vector containing the left-shifted values. 2829 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a, 2830 int __count) { 2831 return __builtin_ia32_psllqi128((__v2di)__a, __count); 2832 } 2833 2834 /// Left-shifts each 64-bit value in the 128-bit integer vector operand 2835 /// by the specified number of bits. Low-order bits are cleared. 2836 /// 2837 /// \headerfile <x86intrin.h> 2838 /// 2839 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction. 2840 /// 2841 /// \param __a 2842 /// A 128-bit integer vector containing the source operand. 2843 /// \param __count 2844 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2845 /// to left-shift each value in operand \a __a. 
2846 /// \returns A 128-bit integer vector containing the left-shifted values. 2847 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a, 2848 __m128i __count) { 2849 return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count); 2850 } 2851 2852 /// Right-shifts each 16-bit value in the 128-bit integer vector operand 2853 /// by the specified number of bits. High-order bits are filled with the sign 2854 /// bit of the initial value. 2855 /// 2856 /// \headerfile <x86intrin.h> 2857 /// 2858 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction. 2859 /// 2860 /// \param __a 2861 /// A 128-bit integer vector containing the source operand. 2862 /// \param __count 2863 /// An integer value specifying the number of bits to right-shift each value 2864 /// in operand \a __a. 2865 /// \returns A 128-bit integer vector containing the right-shifted values. 2866 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a, 2867 int __count) { 2868 return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count); 2869 } 2870 2871 /// Right-shifts each 16-bit value in the 128-bit integer vector operand 2872 /// by the specified number of bits. High-order bits are filled with the sign 2873 /// bit of the initial value. 2874 /// 2875 /// \headerfile <x86intrin.h> 2876 /// 2877 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction. 2878 /// 2879 /// \param __a 2880 /// A 128-bit integer vector containing the source operand. 2881 /// \param __count 2882 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2883 /// to right-shift each value in operand \a __a. 2884 /// \returns A 128-bit integer vector containing the right-shifted values. 
2885 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a, 2886 __m128i __count) { 2887 return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count); 2888 } 2889 2890 /// Right-shifts each 32-bit value in the 128-bit integer vector operand 2891 /// by the specified number of bits. High-order bits are filled with the sign 2892 /// bit of the initial value. 2893 /// 2894 /// \headerfile <x86intrin.h> 2895 /// 2896 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction. 2897 /// 2898 /// \param __a 2899 /// A 128-bit integer vector containing the source operand. 2900 /// \param __count 2901 /// An integer value specifying the number of bits to right-shift each value 2902 /// in operand \a __a. 2903 /// \returns A 128-bit integer vector containing the right-shifted values. 2904 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a, 2905 int __count) { 2906 return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count); 2907 } 2908 2909 /// Right-shifts each 32-bit value in the 128-bit integer vector operand 2910 /// by the specified number of bits. High-order bits are filled with the sign 2911 /// bit of the initial value. 2912 /// 2913 /// \headerfile <x86intrin.h> 2914 /// 2915 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction. 2916 /// 2917 /// \param __a 2918 /// A 128-bit integer vector containing the source operand. 2919 /// \param __count 2920 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2921 /// to right-shift each value in operand \a __a. 2922 /// \returns A 128-bit integer vector containing the right-shifted values. 2923 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a, 2924 __m128i __count) { 2925 return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count); 2926 } 2927 2928 /// Right-shifts the 128-bit integer vector operand by the specified 2929 /// number of bytes. High-order bits are cleared. 
2930 /// 2931 /// \headerfile <x86intrin.h> 2932 /// 2933 /// \code 2934 /// __m128i _mm_srli_si128(__m128i a, const int imm); 2935 /// \endcode 2936 /// 2937 /// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction. 2938 /// 2939 /// \param a 2940 /// A 128-bit integer vector containing the source operand. 2941 /// \param imm 2942 /// An immediate value specifying the number of bytes to right-shift operand 2943 /// \a a. 2944 /// \returns A 128-bit integer vector containing the right-shifted value. 2945 #define _mm_srli_si128(a, imm) \ 2946 ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \ 2947 (int)(imm))) 2948 2949 #define _mm_bsrli_si128(a, imm) \ 2950 ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \ 2951 (int)(imm))) 2952 2953 /// Right-shifts each of 16-bit values in the 128-bit integer vector 2954 /// operand by the specified number of bits. High-order bits are cleared. 2955 /// 2956 /// \headerfile <x86intrin.h> 2957 /// 2958 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction. 2959 /// 2960 /// \param __a 2961 /// A 128-bit integer vector containing the source operand. 2962 /// \param __count 2963 /// An integer value specifying the number of bits to right-shift each value 2964 /// in operand \a __a. 2965 /// \returns A 128-bit integer vector containing the right-shifted values. 2966 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a, 2967 int __count) { 2968 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count); 2969 } 2970 2971 /// Right-shifts each of 16-bit values in the 128-bit integer vector 2972 /// operand by the specified number of bits. High-order bits are cleared. 2973 /// 2974 /// \headerfile <x86intrin.h> 2975 /// 2976 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction. 2977 /// 2978 /// \param __a 2979 /// A 128-bit integer vector containing the source operand. 
2980 /// \param __count 2981 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2982 /// to right-shift each value in operand \a __a. 2983 /// \returns A 128-bit integer vector containing the right-shifted values. 2984 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a, 2985 __m128i __count) { 2986 return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count); 2987 } 2988 2989 /// Right-shifts each of 32-bit values in the 128-bit integer vector 2990 /// operand by the specified number of bits. High-order bits are cleared. 2991 /// 2992 /// \headerfile <x86intrin.h> 2993 /// 2994 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction. 2995 /// 2996 /// \param __a 2997 /// A 128-bit integer vector containing the source operand. 2998 /// \param __count 2999 /// An integer value specifying the number of bits to right-shift each value 3000 /// in operand \a __a. 3001 /// \returns A 128-bit integer vector containing the right-shifted values. 3002 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a, 3003 int __count) { 3004 return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count); 3005 } 3006 3007 /// Right-shifts each of 32-bit values in the 128-bit integer vector 3008 /// operand by the specified number of bits. High-order bits are cleared. 3009 /// 3010 /// \headerfile <x86intrin.h> 3011 /// 3012 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction. 3013 /// 3014 /// \param __a 3015 /// A 128-bit integer vector containing the source operand. 3016 /// \param __count 3017 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 3018 /// to right-shift each value in operand \a __a. 3019 /// \returns A 128-bit integer vector containing the right-shifted values. 
3020 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a, 3021 __m128i __count) { 3022 return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count); 3023 } 3024 3025 /// Right-shifts each of 64-bit values in the 128-bit integer vector 3026 /// operand by the specified number of bits. High-order bits are cleared. 3027 /// 3028 /// \headerfile <x86intrin.h> 3029 /// 3030 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction. 3031 /// 3032 /// \param __a 3033 /// A 128-bit integer vector containing the source operand. 3034 /// \param __count 3035 /// An integer value specifying the number of bits to right-shift each value 3036 /// in operand \a __a. 3037 /// \returns A 128-bit integer vector containing the right-shifted values. 3038 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a, 3039 int __count) { 3040 return __builtin_ia32_psrlqi128((__v2di)__a, __count); 3041 } 3042 3043 /// Right-shifts each of 64-bit values in the 128-bit integer vector 3044 /// operand by the specified number of bits. High-order bits are cleared. 3045 /// 3046 /// \headerfile <x86intrin.h> 3047 /// 3048 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction. 3049 /// 3050 /// \param __a 3051 /// A 128-bit integer vector containing the source operand. 3052 /// \param __count 3053 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 3054 /// to right-shift each value in operand \a __a. 3055 /// \returns A 128-bit integer vector containing the right-shifted values. 3056 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a, 3057 __m128i __count) { 3058 return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count); 3059 } 3060 3061 /// Compares each of the corresponding 8-bit values of the 128-bit 3062 /// integer vectors for equality. 3063 /// 3064 /// Each comparison returns 0x0 for false, 0xFF for true. 
3065 /// 3066 /// \headerfile <x86intrin.h> 3067 /// 3068 /// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction. 3069 /// 3070 /// \param __a 3071 /// A 128-bit integer vector. 3072 /// \param __b 3073 /// A 128-bit integer vector. 3074 /// \returns A 128-bit integer vector containing the comparison results. 3075 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a, 3076 __m128i __b) { 3077 return (__m128i)((__v16qi)__a == (__v16qi)__b); 3078 } 3079 3080 /// Compares each of the corresponding 16-bit values of the 128-bit 3081 /// integer vectors for equality. 3082 /// 3083 /// Each comparison returns 0x0 for false, 0xFFFF for true. 3084 /// 3085 /// \headerfile <x86intrin.h> 3086 /// 3087 /// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction. 3088 /// 3089 /// \param __a 3090 /// A 128-bit integer vector. 3091 /// \param __b 3092 /// A 128-bit integer vector. 3093 /// \returns A 128-bit integer vector containing the comparison results. 3094 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a, 3095 __m128i __b) { 3096 return (__m128i)((__v8hi)__a == (__v8hi)__b); 3097 } 3098 3099 /// Compares each of the corresponding 32-bit values of the 128-bit 3100 /// integer vectors for equality. 3101 /// 3102 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true. 3103 /// 3104 /// \headerfile <x86intrin.h> 3105 /// 3106 /// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction. 3107 /// 3108 /// \param __a 3109 /// A 128-bit integer vector. 3110 /// \param __b 3111 /// A 128-bit integer vector. 3112 /// \returns A 128-bit integer vector containing the comparison results. 
3113 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a, 3114 __m128i __b) { 3115 return (__m128i)((__v4si)__a == (__v4si)__b); 3116 } 3117 3118 /// Compares each of the corresponding signed 8-bit values of the 128-bit 3119 /// integer vectors to determine if the values in the first operand are 3120 /// greater than those in the second operand. 3121 /// 3122 /// Each comparison returns 0x0 for false, 0xFF for true. 3123 /// 3124 /// \headerfile <x86intrin.h> 3125 /// 3126 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction. 3127 /// 3128 /// \param __a 3129 /// A 128-bit integer vector. 3130 /// \param __b 3131 /// A 128-bit integer vector. 3132 /// \returns A 128-bit integer vector containing the comparison results. 3133 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a, 3134 __m128i __b) { 3135 /* This function always performs a signed comparison, but __v16qi is a char 3136 which may be signed or unsigned, so use __v16qs. */ 3137 return (__m128i)((__v16qs)__a > (__v16qs)__b); 3138 } 3139 3140 /// Compares each of the corresponding signed 16-bit values of the 3141 /// 128-bit integer vectors to determine if the values in the first operand 3142 /// are greater than those in the second operand. 3143 /// 3144 /// Each comparison returns 0x0 for false, 0xFFFF for true. 3145 /// 3146 /// \headerfile <x86intrin.h> 3147 /// 3148 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction. 3149 /// 3150 /// \param __a 3151 /// A 128-bit integer vector. 3152 /// \param __b 3153 /// A 128-bit integer vector. 3154 /// \returns A 128-bit integer vector containing the comparison results. 
3155 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a, 3156 __m128i __b) { 3157 return (__m128i)((__v8hi)__a > (__v8hi)__b); 3158 } 3159 3160 /// Compares each of the corresponding signed 32-bit values of the 3161 /// 128-bit integer vectors to determine if the values in the first operand 3162 /// are greater than those in the second operand. 3163 /// 3164 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true. 3165 /// 3166 /// \headerfile <x86intrin.h> 3167 /// 3168 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction. 3169 /// 3170 /// \param __a 3171 /// A 128-bit integer vector. 3172 /// \param __b 3173 /// A 128-bit integer vector. 3174 /// \returns A 128-bit integer vector containing the comparison results. 3175 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a, 3176 __m128i __b) { 3177 return (__m128i)((__v4si)__a > (__v4si)__b); 3178 } 3179 3180 /// Compares each of the corresponding signed 8-bit values of the 128-bit 3181 /// integer vectors to determine if the values in the first operand are less 3182 /// than those in the second operand. 3183 /// 3184 /// Each comparison returns 0x0 for false, 0xFF for true. 3185 /// 3186 /// \headerfile <x86intrin.h> 3187 /// 3188 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction. 3189 /// 3190 /// \param __a 3191 /// A 128-bit integer vector. 3192 /// \param __b 3193 /// A 128-bit integer vector. 3194 /// \returns A 128-bit integer vector containing the comparison results. 3195 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a, 3196 __m128i __b) { 3197 return _mm_cmpgt_epi8(__b, __a); 3198 } 3199 3200 /// Compares each of the corresponding signed 16-bit values of the 3201 /// 128-bit integer vectors to determine if the values in the first operand 3202 /// are less than those in the second operand. 3203 /// 3204 /// Each comparison returns 0x0 for false, 0xFFFF for true. 
3205 /// 3206 /// \headerfile <x86intrin.h> 3207 /// 3208 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction. 3209 /// 3210 /// \param __a 3211 /// A 128-bit integer vector. 3212 /// \param __b 3213 /// A 128-bit integer vector. 3214 /// \returns A 128-bit integer vector containing the comparison results. 3215 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a, 3216 __m128i __b) { 3217 return _mm_cmpgt_epi16(__b, __a); 3218 } 3219 3220 /// Compares each of the corresponding signed 32-bit values of the 3221 /// 128-bit integer vectors to determine if the values in the first operand 3222 /// are less than those in the second operand. 3223 /// 3224 /// Each comparison returns 0x0 for false, 0xFFFFFFFF for true. 3225 /// 3226 /// \headerfile <x86intrin.h> 3227 /// 3228 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction. 3229 /// 3230 /// \param __a 3231 /// A 128-bit integer vector. 3232 /// \param __b 3233 /// A 128-bit integer vector. 3234 /// \returns A 128-bit integer vector containing the comparison results. 3235 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a, 3236 __m128i __b) { 3237 return _mm_cmpgt_epi32(__b, __a); 3238 } 3239 3240 #ifdef __x86_64__ 3241 /// Converts a 64-bit signed integer value from the second operand into a 3242 /// double-precision value and returns it in the lower element of a [2 x 3243 /// double] vector; the upper element of the returned vector is copied from 3244 /// the upper element of the first operand. 3245 /// 3246 /// \headerfile <x86intrin.h> 3247 /// 3248 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction. 3249 /// 3250 /// \param __a 3251 /// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are 3252 /// copied to the upper 64 bits of the destination. 3253 /// \param __b 3254 /// A 64-bit signed integer operand containing the value to be converted. 
3255 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 3256 /// converted value of the second operand. The upper 64 bits are copied from 3257 /// the upper 64 bits of the first operand. 3258 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi64_sd(__m128d __a, 3259 long long __b) { 3260 __a[0] = __b; 3261 return __a; 3262 } 3263 3264 /// Converts the first (lower) element of a vector of [2 x double] into a 3265 /// 64-bit signed integer value. 3266 /// 3267 /// If the converted value does not fit in a 64-bit integer, raises a 3268 /// floating-point invalid exception. If the exception is masked, returns 3269 /// the most negative integer. 3270 /// 3271 /// \headerfile <x86intrin.h> 3272 /// 3273 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction. 3274 /// 3275 /// \param __a 3276 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 3277 /// conversion. 3278 /// \returns A 64-bit signed integer containing the converted value. 3279 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsd_si64(__m128d __a) { 3280 return __builtin_ia32_cvtsd2si64((__v2df)__a); 3281 } 3282 3283 /// Converts the first (lower) element of a vector of [2 x double] into a 3284 /// 64-bit signed truncated (rounded toward zero) integer value. 3285 /// 3286 /// If a converted value does not fit in a 64-bit integer, raises a 3287 /// floating-point invalid exception. If the exception is masked, returns 3288 /// the most negative integer. 3289 /// 3290 /// \headerfile <x86intrin.h> 3291 /// 3292 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c> 3293 /// instruction. 3294 /// 3295 /// \param __a 3296 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 3297 /// conversion. 3298 /// \returns A 64-bit signed integer containing the converted value. 
3299 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvttsd_si64(__m128d __a) { 3300 return __builtin_ia32_cvttsd2si64((__v2df)__a); 3301 } 3302 #endif 3303 3304 /// Converts a vector of [4 x i32] into a vector of [4 x float]. 3305 /// 3306 /// \headerfile <x86intrin.h> 3307 /// 3308 /// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction. 3309 /// 3310 /// \param __a 3311 /// A 128-bit integer vector. 3312 /// \returns A 128-bit vector of [4 x float] containing the converted values. 3313 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a) { 3314 return (__m128) __builtin_convertvector((__v4si)__a, __v4sf); 3315 } 3316 3317 /// Converts a vector of [4 x float] into a vector of [4 x i32]. 3318 /// 3319 /// If a converted value does not fit in a 32-bit integer, raises a 3320 /// floating-point invalid exception. If the exception is masked, returns 3321 /// the most negative integer. 3322 /// 3323 /// \headerfile <x86intrin.h> 3324 /// 3325 /// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction. 3326 /// 3327 /// \param __a 3328 /// A 128-bit vector of [4 x float]. 3329 /// \returns A 128-bit integer vector of [4 x i32] containing the converted 3330 /// values. 3331 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a) { 3332 return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a); 3333 } 3334 3335 /// Converts a vector of [4 x float] into four signed truncated (rounded toward 3336 /// zero) 32-bit integers, returned in a vector of [4 x i32]. 3337 /// 3338 /// If a converted value does not fit in a 32-bit integer, raises a 3339 /// floating-point invalid exception. If the exception is masked, returns 3340 /// the most negative integer. 3341 /// 3342 /// \headerfile <x86intrin.h> 3343 /// 3344 /// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c> 3345 /// instruction. 3346 /// 3347 /// \param __a 3348 /// A 128-bit vector of [4 x float]. 
3349 /// \returns A 128-bit vector of [4 x i32] containing the converted values. 3350 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a) { 3351 return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a); 3352 } 3353 3354 /// Returns a vector of [4 x i32] where the lowest element is the input 3355 /// operand and the remaining elements are zero. 3356 /// 3357 /// \headerfile <x86intrin.h> 3358 /// 3359 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. 3360 /// 3361 /// \param __a 3362 /// A 32-bit signed integer operand. 3363 /// \returns A 128-bit vector of [4 x i32]. 3364 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a) { 3365 return __extension__(__m128i)(__v4si){__a, 0, 0, 0}; 3366 } 3367 3368 /// Returns a vector of [2 x i64] where the lower element is the input 3369 /// operand and the upper element is zero. 3370 /// 3371 /// \headerfile <x86intrin.h> 3372 /// 3373 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction 3374 /// in 64-bit mode. 3375 /// 3376 /// \param __a 3377 /// A 64-bit signed integer operand containing the value to be converted. 3378 /// \returns A 128-bit vector of [2 x i64] containing the converted value. 3379 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a) { 3380 return __extension__(__m128i)(__v2di){__a, 0}; 3381 } 3382 3383 /// Moves the least significant 32 bits of a vector of [4 x i32] to a 3384 /// 32-bit signed integer value. 3385 /// 3386 /// \headerfile <x86intrin.h> 3387 /// 3388 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. 3389 /// 3390 /// \param __a 3391 /// A vector of [4 x i32]. The least significant 32 bits are moved to the 3392 /// destination. 3393 /// \returns A 32-bit signed integer containing the moved value. 
3394 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a) { 3395 __v4si __b = (__v4si)__a; 3396 return __b[0]; 3397 } 3398 3399 /// Moves the least significant 64 bits of a vector of [2 x i64] to a 3400 /// 64-bit signed integer value. 3401 /// 3402 /// \headerfile <x86intrin.h> 3403 /// 3404 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 3405 /// 3406 /// \param __a 3407 /// A vector of [2 x i64]. The least significant 64 bits are moved to the 3408 /// destination. 3409 /// \returns A 64-bit signed integer containing the moved value. 3410 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a) { 3411 return __a[0]; 3412 } 3413 3414 /// Moves packed integer values from an aligned 128-bit memory location 3415 /// to elements in a 128-bit integer vector. 3416 /// 3417 /// \headerfile <x86intrin.h> 3418 /// 3419 /// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction. 3420 /// 3421 /// \param __p 3422 /// An aligned pointer to a memory location containing integer values. 3423 /// \returns A 128-bit integer vector containing the moved values. 3424 static __inline__ __m128i __DEFAULT_FN_ATTRS 3425 _mm_load_si128(__m128i const *__p) { 3426 return *__p; 3427 } 3428 3429 /// Moves packed integer values from an unaligned 128-bit memory location 3430 /// to elements in a 128-bit integer vector. 3431 /// 3432 /// \headerfile <x86intrin.h> 3433 /// 3434 /// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction. 3435 /// 3436 /// \param __p 3437 /// A pointer to a memory location containing integer values. 3438 /// \returns A 128-bit integer vector containing the moved values. 
3439 static __inline__ __m128i __DEFAULT_FN_ATTRS 3440 _mm_loadu_si128(__m128i_u const *__p) { 3441 struct __loadu_si128 { 3442 __m128i_u __v; 3443 } __attribute__((__packed__, __may_alias__)); 3444 return ((const struct __loadu_si128 *)__p)->__v; 3445 } 3446 3447 /// Returns a vector of [2 x i64] where the lower element is taken from 3448 /// the lower element of the operand, and the upper element is zero. 3449 /// 3450 /// \headerfile <x86intrin.h> 3451 /// 3452 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 3453 /// 3454 /// \param __p 3455 /// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of 3456 /// the destination. 3457 /// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the 3458 /// moved value. The higher order bits are cleared. 3459 static __inline__ __m128i __DEFAULT_FN_ATTRS 3460 _mm_loadl_epi64(__m128i_u const *__p) { 3461 struct __mm_loadl_epi64_struct { 3462 long long __u; 3463 } __attribute__((__packed__, __may_alias__)); 3464 return __extension__(__m128i){ 3465 ((const struct __mm_loadl_epi64_struct *)__p)->__u, 0}; 3466 } 3467 3468 /// Generates a 128-bit vector of [4 x i32] with unspecified content. 3469 /// This could be used as an argument to another intrinsic function where the 3470 /// argument is required but the value is not actually used. 3471 /// 3472 /// \headerfile <x86intrin.h> 3473 /// 3474 /// This intrinsic has no corresponding instruction. 3475 /// 3476 /// \returns A 128-bit vector of [4 x i32] with unspecified content. 3477 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void) { 3478 return (__m128i)__builtin_ia32_undef128(); 3479 } 3480 3481 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with 3482 /// the specified 64-bit integer values. 3483 /// 3484 /// \headerfile <x86intrin.h> 3485 /// 3486 /// This intrinsic is a utility function and does not correspond to a specific 3487 /// instruction. 
3488 /// 3489 /// \param __q1 3490 /// A 64-bit integer value used to initialize the upper 64 bits of the 3491 /// destination vector of [2 x i64]. 3492 /// \param __q0 3493 /// A 64-bit integer value used to initialize the lower 64 bits of the 3494 /// destination vector of [2 x i64]. 3495 /// \returns An initialized 128-bit vector of [2 x i64] containing the values 3496 /// provided in the operands. 3497 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1, 3498 long long __q0) { 3499 return __extension__(__m128i)(__v2di){__q0, __q1}; 3500 } 3501 3502 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with 3503 /// the specified 64-bit integer values. 3504 /// 3505 /// \headerfile <x86intrin.h> 3506 /// 3507 /// This intrinsic is a utility function and does not correspond to a specific 3508 /// instruction. 3509 /// 3510 /// \param __q1 3511 /// A 64-bit integer value used to initialize the upper 64 bits of the 3512 /// destination vector of [2 x i64]. 3513 /// \param __q0 3514 /// A 64-bit integer value used to initialize the lower 64 bits of the 3515 /// destination vector of [2 x i64]. 3516 /// \returns An initialized 128-bit vector of [2 x i64] containing the values 3517 /// provided in the operands. 3518 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1, 3519 __m64 __q0) { 3520 return _mm_set_epi64x((long long)__q1, (long long)__q0); 3521 } 3522 3523 /// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with 3524 /// the specified 32-bit integer values. 3525 /// 3526 /// \headerfile <x86intrin.h> 3527 /// 3528 /// This intrinsic is a utility function and does not correspond to a specific 3529 /// instruction. 3530 /// 3531 /// \param __i3 3532 /// A 32-bit integer value used to initialize bits [127:96] of the 3533 /// destination vector. 3534 /// \param __i2 3535 /// A 32-bit integer value used to initialize bits [95:64] of the destination 3536 /// vector. 
3537 /// \param __i1 3538 /// A 32-bit integer value used to initialize bits [63:32] of the destination 3539 /// vector. 3540 /// \param __i0 3541 /// A 32-bit integer value used to initialize bits [31:0] of the destination 3542 /// vector. 3543 /// \returns An initialized 128-bit vector of [4 x i32] containing the values 3544 /// provided in the operands. 3545 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2, 3546 int __i1, int __i0) { 3547 return __extension__(__m128i)(__v4si){__i0, __i1, __i2, __i3}; 3548 } 3549 3550 /// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with 3551 /// the specified 16-bit integer values. 3552 /// 3553 /// \headerfile <x86intrin.h> 3554 /// 3555 /// This intrinsic is a utility function and does not correspond to a specific 3556 /// instruction. 3557 /// 3558 /// \param __w7 3559 /// A 16-bit integer value used to initialize bits [127:112] of the 3560 /// destination vector. 3561 /// \param __w6 3562 /// A 16-bit integer value used to initialize bits [111:96] of the 3563 /// destination vector. 3564 /// \param __w5 3565 /// A 16-bit integer value used to initialize bits [95:80] of the destination 3566 /// vector. 3567 /// \param __w4 3568 /// A 16-bit integer value used to initialize bits [79:64] of the destination 3569 /// vector. 3570 /// \param __w3 3571 /// A 16-bit integer value used to initialize bits [63:48] of the destination 3572 /// vector. 3573 /// \param __w2 3574 /// A 16-bit integer value used to initialize bits [47:32] of the destination 3575 /// vector. 3576 /// \param __w1 3577 /// A 16-bit integer value used to initialize bits [31:16] of the destination 3578 /// vector. 3579 /// \param __w0 3580 /// A 16-bit integer value used to initialize bits [15:0] of the destination 3581 /// vector. 3582 /// \returns An initialized 128-bit vector of [8 x i16] containing the values 3583 /// provided in the operands. 
3584 static __inline__ __m128i __DEFAULT_FN_ATTRS 3585 _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, 3586 short __w2, short __w1, short __w0) { 3587 return __extension__(__m128i)(__v8hi){__w0, __w1, __w2, __w3, 3588 __w4, __w5, __w6, __w7}; 3589 } 3590 3591 /// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with 3592 /// the specified 8-bit integer values. 3593 /// 3594 /// \headerfile <x86intrin.h> 3595 /// 3596 /// This intrinsic is a utility function and does not correspond to a specific 3597 /// instruction. 3598 /// 3599 /// \param __b15 3600 /// Initializes bits [127:120] of the destination vector. 3601 /// \param __b14 3602 /// Initializes bits [119:112] of the destination vector. 3603 /// \param __b13 3604 /// Initializes bits [111:104] of the destination vector. 3605 /// \param __b12 3606 /// Initializes bits [103:96] of the destination vector. 3607 /// \param __b11 3608 /// Initializes bits [95:88] of the destination vector. 3609 /// \param __b10 3610 /// Initializes bits [87:80] of the destination vector. 3611 /// \param __b9 3612 /// Initializes bits [79:72] of the destination vector. 3613 /// \param __b8 3614 /// Initializes bits [71:64] of the destination vector. 3615 /// \param __b7 3616 /// Initializes bits [63:56] of the destination vector. 3617 /// \param __b6 3618 /// Initializes bits [55:48] of the destination vector. 3619 /// \param __b5 3620 /// Initializes bits [47:40] of the destination vector. 3621 /// \param __b4 3622 /// Initializes bits [39:32] of the destination vector. 3623 /// \param __b3 3624 /// Initializes bits [31:24] of the destination vector. 3625 /// \param __b2 3626 /// Initializes bits [23:16] of the destination vector. 3627 /// \param __b1 3628 /// Initializes bits [15:8] of the destination vector. 3629 /// \param __b0 3630 /// Initializes bits [7:0] of the destination vector. 
3631 /// \returns An initialized 128-bit vector of [16 x i8] containing the values 3632 /// provided in the operands. 3633 static __inline__ __m128i __DEFAULT_FN_ATTRS 3634 _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, 3635 char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, 3636 char __b4, char __b3, char __b2, char __b1, char __b0) { 3637 return __extension__(__m128i)(__v16qi){ 3638 __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, 3639 __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15}; 3640 } 3641 3642 /// Initializes both values in a 128-bit integer vector with the 3643 /// specified 64-bit integer value. 3644 /// 3645 /// \headerfile <x86intrin.h> 3646 /// 3647 /// This intrinsic is a utility function and does not correspond to a specific 3648 /// instruction. 3649 /// 3650 /// \param __q 3651 /// Integer value used to initialize the elements of the destination integer 3652 /// vector. 3653 /// \returns An initialized 128-bit integer vector of [2 x i64] with both 3654 /// elements containing the value provided in the operand. 3655 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q) { 3656 return _mm_set_epi64x(__q, __q); 3657 } 3658 3659 /// Initializes both values in a 128-bit vector of [2 x i64] with the 3660 /// specified 64-bit value. 3661 /// 3662 /// \headerfile <x86intrin.h> 3663 /// 3664 /// This intrinsic is a utility function and does not correspond to a specific 3665 /// instruction. 3666 /// 3667 /// \param __q 3668 /// A 64-bit value used to initialize the elements of the destination integer 3669 /// vector. 3670 /// \returns An initialized 128-bit vector of [2 x i64] with all elements 3671 /// containing the value provided in the operand. 3672 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q) { 3673 return _mm_set_epi64(__q, __q); 3674 } 3675 3676 /// Initializes all values in a 128-bit vector of [4 x i32] with the 3677 /// specified 32-bit value. 
3678 /// 3679 /// \headerfile <x86intrin.h> 3680 /// 3681 /// This intrinsic is a utility function and does not correspond to a specific 3682 /// instruction. 3683 /// 3684 /// \param __i 3685 /// A 32-bit value used to initialize the elements of the destination integer 3686 /// vector. 3687 /// \returns An initialized 128-bit vector of [4 x i32] with all elements 3688 /// containing the value provided in the operand. 3689 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i) { 3690 return _mm_set_epi32(__i, __i, __i, __i); 3691 } 3692 3693 /// Initializes all values in a 128-bit vector of [8 x i16] with the 3694 /// specified 16-bit value. 3695 /// 3696 /// \headerfile <x86intrin.h> 3697 /// 3698 /// This intrinsic is a utility function and does not correspond to a specific 3699 /// instruction. 3700 /// 3701 /// \param __w 3702 /// A 16-bit value used to initialize the elements of the destination integer 3703 /// vector. 3704 /// \returns An initialized 128-bit vector of [8 x i16] with all elements 3705 /// containing the value provided in the operand. 3706 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w) { 3707 return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w); 3708 } 3709 3710 /// Initializes all values in a 128-bit vector of [16 x i8] with the 3711 /// specified 8-bit value. 3712 /// 3713 /// \headerfile <x86intrin.h> 3714 /// 3715 /// This intrinsic is a utility function and does not correspond to a specific 3716 /// instruction. 3717 /// 3718 /// \param __b 3719 /// An 8-bit value used to initialize the elements of the destination integer 3720 /// vector. 3721 /// \returns An initialized 128-bit vector of [16 x i8] with all elements 3722 /// containing the value provided in the operand. 
3723 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b) { 3724 return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, 3725 __b, __b, __b, __b, __b); 3726 } 3727 3728 /// Constructs a 128-bit integer vector, initialized in reverse order 3729 /// with the specified 64-bit integral values. 3730 /// 3731 /// \headerfile <x86intrin.h> 3732 /// 3733 /// This intrinsic does not correspond to a specific instruction. 3734 /// 3735 /// \param __q0 3736 /// A 64-bit integral value used to initialize the lower 64 bits of the 3737 /// result. 3738 /// \param __q1 3739 /// A 64-bit integral value used to initialize the upper 64 bits of the 3740 /// result. 3741 /// \returns An initialized 128-bit integer vector. 3742 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0, 3743 __m64 __q1) { 3744 return _mm_set_epi64(__q1, __q0); 3745 } 3746 3747 /// Constructs a 128-bit integer vector, initialized in reverse order 3748 /// with the specified 32-bit integral values. 3749 /// 3750 /// \headerfile <x86intrin.h> 3751 /// 3752 /// This intrinsic is a utility function and does not correspond to a specific 3753 /// instruction. 3754 /// 3755 /// \param __i0 3756 /// A 32-bit integral value used to initialize bits [31:0] of the result. 3757 /// \param __i1 3758 /// A 32-bit integral value used to initialize bits [63:32] of the result. 3759 /// \param __i2 3760 /// A 32-bit integral value used to initialize bits [95:64] of the result. 3761 /// \param __i3 3762 /// A 32-bit integral value used to initialize bits [127:96] of the result. 3763 /// \returns An initialized 128-bit integer vector. 3764 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1, 3765 int __i2, 3766 int __i3) { 3767 return _mm_set_epi32(__i3, __i2, __i1, __i0); 3768 } 3769 3770 /// Constructs a 128-bit integer vector, initialized in reverse order 3771 /// with the specified 16-bit integral values. 
3772 /// 3773 /// \headerfile <x86intrin.h> 3774 /// 3775 /// This intrinsic is a utility function and does not correspond to a specific 3776 /// instruction. 3777 /// 3778 /// \param __w0 3779 /// A 16-bit integral value used to initialize bits [15:0] of the result. 3780 /// \param __w1 3781 /// A 16-bit integral value used to initialize bits [31:16] of the result. 3782 /// \param __w2 3783 /// A 16-bit integral value used to initialize bits [47:32] of the result. 3784 /// \param __w3 3785 /// A 16-bit integral value used to initialize bits [63:48] of the result. 3786 /// \param __w4 3787 /// A 16-bit integral value used to initialize bits [79:64] of the result. 3788 /// \param __w5 3789 /// A 16-bit integral value used to initialize bits [95:80] of the result. 3790 /// \param __w6 3791 /// A 16-bit integral value used to initialize bits [111:96] of the result. 3792 /// \param __w7 3793 /// A 16-bit integral value used to initialize bits [127:112] of the result. 3794 /// \returns An initialized 128-bit integer vector. 3795 static __inline__ __m128i __DEFAULT_FN_ATTRS 3796 _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, 3797 short __w5, short __w6, short __w7) { 3798 return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0); 3799 } 3800 3801 /// Constructs a 128-bit integer vector, initialized in reverse order 3802 /// with the specified 8-bit integral values. 3803 /// 3804 /// \headerfile <x86intrin.h> 3805 /// 3806 /// This intrinsic is a utility function and does not correspond to a specific 3807 /// instruction. 3808 /// 3809 /// \param __b0 3810 /// An 8-bit integral value used to initialize bits [7:0] of the result. 3811 /// \param __b1 3812 /// An 8-bit integral value used to initialize bits [15:8] of the result. 3813 /// \param __b2 3814 /// An 8-bit integral value used to initialize bits [23:16] of the result. 3815 /// \param __b3 3816 /// An 8-bit integral value used to initialize bits [31:24] of the result. 
3817 /// \param __b4 3818 /// An 8-bit integral value used to initialize bits [39:32] of the result. 3819 /// \param __b5 3820 /// An 8-bit integral value used to initialize bits [47:40] of the result. 3821 /// \param __b6 3822 /// An 8-bit integral value used to initialize bits [55:48] of the result. 3823 /// \param __b7 3824 /// An 8-bit integral value used to initialize bits [63:56] of the result. 3825 /// \param __b8 3826 /// An 8-bit integral value used to initialize bits [71:64] of the result. 3827 /// \param __b9 3828 /// An 8-bit integral value used to initialize bits [79:72] of the result. 3829 /// \param __b10 3830 /// An 8-bit integral value used to initialize bits [87:80] of the result. 3831 /// \param __b11 3832 /// An 8-bit integral value used to initialize bits [95:88] of the result. 3833 /// \param __b12 3834 /// An 8-bit integral value used to initialize bits [103:96] of the result. 3835 /// \param __b13 3836 /// An 8-bit integral value used to initialize bits [111:104] of the result. 3837 /// \param __b14 3838 /// An 8-bit integral value used to initialize bits [119:112] of the result. 3839 /// \param __b15 3840 /// An 8-bit integral value used to initialize bits [127:120] of the result. 3841 /// \returns An initialized 128-bit integer vector. 3842 static __inline__ __m128i __DEFAULT_FN_ATTRS 3843 _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, 3844 char __b6, char __b7, char __b8, char __b9, char __b10, 3845 char __b11, char __b12, char __b13, char __b14, char __b15) { 3846 return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8, 3847 __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0); 3848 } 3849 3850 /// Creates a 128-bit integer vector initialized to zero. 3851 /// 3852 /// \headerfile <x86intrin.h> 3853 /// 3854 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction. 3855 /// 3856 /// \returns An initialized 128-bit integer vector with all elements set to 3857 /// zero. 
3858 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void) { 3859 return __extension__(__m128i)(__v2di){0LL, 0LL}; 3860 } 3861 3862 /// Stores a 128-bit integer vector to a memory location aligned on a 3863 /// 128-bit boundary. 3864 /// 3865 /// \headerfile <x86intrin.h> 3866 /// 3867 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction. 3868 /// 3869 /// \param __p 3870 /// A pointer to an aligned memory location that will receive the integer 3871 /// values. 3872 /// \param __b 3873 /// A 128-bit integer vector containing the values to be moved. 3874 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p, 3875 __m128i __b) { 3876 *__p = __b; 3877 } 3878 3879 /// Stores a 128-bit integer vector to an unaligned memory location. 3880 /// 3881 /// \headerfile <x86intrin.h> 3882 /// 3883 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction. 3884 /// 3885 /// \param __p 3886 /// A pointer to a memory location that will receive the integer values. 3887 /// \param __b 3888 /// A 128-bit integer vector containing the values to be moved. 3889 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p, 3890 __m128i __b) { 3891 struct __storeu_si128 { 3892 __m128i_u __v; 3893 } __attribute__((__packed__, __may_alias__)); 3894 ((struct __storeu_si128 *)__p)->__v = __b; 3895 } 3896 3897 /// Stores a 64-bit integer value from the low element of a 128-bit integer 3898 /// vector. 3899 /// 3900 /// \headerfile <x86intrin.h> 3901 /// 3902 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 3903 /// 3904 /// \param __p 3905 /// A pointer to a 64-bit memory location. The address of the memory 3906 /// location does not have to be aligned. 3907 /// \param __b 3908 /// A 128-bit integer vector containing the value to be stored. 
3909 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si64(void *__p, 3910 __m128i __b) { 3911 struct __storeu_si64 { 3912 long long __v; 3913 } __attribute__((__packed__, __may_alias__)); 3914 ((struct __storeu_si64 *)__p)->__v = ((__v2di)__b)[0]; 3915 } 3916 3917 /// Stores a 32-bit integer value from the low element of a 128-bit integer 3918 /// vector. 3919 /// 3920 /// \headerfile <x86intrin.h> 3921 /// 3922 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. 3923 /// 3924 /// \param __p 3925 /// A pointer to a 32-bit memory location. The address of the memory 3926 /// location does not have to be aligned. 3927 /// \param __b 3928 /// A 128-bit integer vector containing the value to be stored. 3929 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si32(void *__p, 3930 __m128i __b) { 3931 struct __storeu_si32 { 3932 int __v; 3933 } __attribute__((__packed__, __may_alias__)); 3934 ((struct __storeu_si32 *)__p)->__v = ((__v4si)__b)[0]; 3935 } 3936 3937 /// Stores a 16-bit integer value from the low element of a 128-bit integer 3938 /// vector. 3939 /// 3940 /// \headerfile <x86intrin.h> 3941 /// 3942 /// This intrinsic does not correspond to a specific instruction. 3943 /// 3944 /// \param __p 3945 /// A pointer to a 16-bit memory location. The address of the memory 3946 /// location does not have to be aligned. 3947 /// \param __b 3948 /// A 128-bit integer vector containing the value to be stored. 3949 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si16(void *__p, 3950 __m128i __b) { 3951 struct __storeu_si16 { 3952 short __v; 3953 } __attribute__((__packed__, __may_alias__)); 3954 ((struct __storeu_si16 *)__p)->__v = ((__v8hi)__b)[0]; 3955 } 3956 3957 /// Moves bytes selected by the mask from the first operand to the 3958 /// specified unaligned memory location. When a mask bit is 1, the 3959 /// corresponding byte is written, otherwise it is not written. 
3960 /// 3961 /// To minimize caching, the data is flagged as non-temporal (unlikely to be 3962 /// used again soon). Exception and trap behavior for elements not selected 3963 /// for storage to memory are implementation dependent. 3964 /// 3965 /// \headerfile <x86intrin.h> 3966 /// 3967 /// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c> 3968 /// instruction. 3969 /// 3970 /// \param __d 3971 /// A 128-bit integer vector containing the values to be moved. 3972 /// \param __n 3973 /// A 128-bit integer vector containing the mask. The most significant bit of 3974 /// each byte represents the mask bits. 3975 /// \param __p 3976 /// A pointer to an unaligned 128-bit memory location where the specified 3977 /// values are moved. 3978 static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d, 3979 __m128i __n, 3980 char *__p) { 3981 __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p); 3982 } 3983 3984 /// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to 3985 /// a memory location. 3986 /// 3987 /// \headerfile <x86intrin.h> 3988 /// 3989 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction. 3990 /// 3991 /// \param __p 3992 /// A pointer to a 64-bit memory location that will receive the lower 64 bits 3993 /// of the integer vector parameter. 3994 /// \param __a 3995 /// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the 3996 /// value to be stored. 3997 static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p, 3998 __m128i __a) { 3999 struct __mm_storel_epi64_struct { 4000 long long __u; 4001 } __attribute__((__packed__, __may_alias__)); 4002 ((struct __mm_storel_epi64_struct *)__p)->__u = __a[0]; 4003 } 4004 4005 /// Stores a 128-bit floating point vector of [2 x double] to a 128-bit 4006 /// aligned memory location. 4007 /// 4008 /// To minimize caching, the data is flagged as non-temporal (unlikely to be 4009 /// used again soon). 
4010 /// 4011 /// \headerfile <x86intrin.h> 4012 /// 4013 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction. 4014 /// 4015 /// \param __p 4016 /// A pointer to the 128-bit aligned memory location used to store the value. 4017 /// \param __a 4018 /// A vector of [2 x double] containing the 64-bit values to be stored. 4019 static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(void *__p, 4020 __m128d __a) { 4021 __builtin_nontemporal_store((__v2df)__a, (__v2df *)__p); 4022 } 4023 4024 /// Stores a 128-bit integer vector to a 128-bit aligned memory location. 4025 /// 4026 /// To minimize caching, the data is flagged as non-temporal (unlikely to be 4027 /// used again soon). 4028 /// 4029 /// \headerfile <x86intrin.h> 4030 /// 4031 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction. 4032 /// 4033 /// \param __p 4034 /// A pointer to the 128-bit aligned memory location used to store the value. 4035 /// \param __a 4036 /// A 128-bit integer vector containing the values to be stored. 4037 static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(void *__p, 4038 __m128i __a) { 4039 __builtin_nontemporal_store((__v2di)__a, (__v2di *)__p); 4040 } 4041 4042 /// Stores a 32-bit integer value in the specified memory location. 4043 /// 4044 /// To minimize caching, the data is flagged as non-temporal (unlikely to be 4045 /// used again soon). 4046 /// 4047 /// \headerfile <x86intrin.h> 4048 /// 4049 /// This intrinsic corresponds to the <c> MOVNTI </c> instruction. 4050 /// 4051 /// \param __p 4052 /// A pointer to the 32-bit memory location used to store the value. 4053 /// \param __a 4054 /// A 32-bit integer containing the value to be stored. 
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
    _mm_stream_si32(void *__p, int __a) {
  int *__dst = (int *)__p;
  __builtin_ia32_movnti(__dst, __a);
}

#ifdef __x86_64__
/// Writes a 64-bit integer value to the specified memory location.
///
/// To minimize caching, the data is flagged as non-temporal (unlikely to be
///    used again soon).
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
///
/// \param __p
///    A pointer to the 64-bit memory location used to store the value.
/// \param __a
///    A 64-bit integer containing the value to be stored.
static __inline__ void
    __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
    _mm_stream_si64(void *__p, long long __a) {
  long long *__dst = (long long *)__p;
  __builtin_ia32_movnti64(__dst, __a);
}
#endif

#if defined(__cplusplus)
extern "C" {
#endif

/// Flushes and invalidates the cache line containing \a __p from all caches
///    in the coherency domain.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
///
/// \param __p
///    A pointer to the memory location used to identify the cache line to be
///    flushed.
void _mm_clflush(void const *__p);

/// Forces strong memory ordering (serialization) between load
///    instructions preceding this instruction and load instructions following
///    this instruction, ensuring the system completes all previous loads before
///    executing subsequent loads.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
4106 /// 4107 void _mm_lfence(void); 4108 4109 /// Forces strong memory ordering (serialization) between load and store 4110 /// instructions preceding this instruction and load and store instructions 4111 /// following this instruction, ensuring that the system completes all 4112 /// previous memory accesses before executing subsequent memory accesses. 4113 /// 4114 /// \headerfile <x86intrin.h> 4115 /// 4116 /// This intrinsic corresponds to the <c> MFENCE </c> instruction. 4117 /// 4118 void _mm_mfence(void); 4119 4120 #if defined(__cplusplus) 4121 } // extern "C" 4122 #endif 4123 4124 /// Converts, with saturation, 16-bit signed integers from both 128-bit integer 4125 /// vector operands into 8-bit signed integers, and packs the results into 4126 /// the destination. 4127 /// 4128 /// Positive values greater than 0x7F are saturated to 0x7F. Negative values 4129 /// less than 0x80 are saturated to 0x80. 4130 /// 4131 /// \headerfile <x86intrin.h> 4132 /// 4133 /// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction. 4134 /// 4135 /// \param __a 4136 /// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are 4137 /// written to the lower 64 bits of the result. 4138 /// \param __b 4139 /// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are 4140 /// written to the higher 64 bits of the result. 4141 /// \returns A 128-bit vector of [16 x i8] containing the converted values. 4142 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a, 4143 __m128i __b) { 4144 return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b); 4145 } 4146 4147 /// Converts, with saturation, 32-bit signed integers from both 128-bit integer 4148 /// vector operands into 16-bit signed integers, and packs the results into 4149 /// the destination. 4150 /// 4151 /// Positive values greater than 0x7FFF are saturated to 0x7FFF. Negative 4152 /// values less than 0x8000 are saturated to 0x8000. 
4153 /// 4154 /// \headerfile <x86intrin.h> 4155 /// 4156 /// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction. 4157 /// 4158 /// \param __a 4159 /// A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values 4160 /// are written to the lower 64 bits of the result. 4161 /// \param __b 4162 /// A 128-bit integer vector of [4 x i32]. The converted [4 x i16] values 4163 /// are written to the higher 64 bits of the result. 4164 /// \returns A 128-bit vector of [8 x i16] containing the converted values. 4165 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a, 4166 __m128i __b) { 4167 return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b); 4168 } 4169 4170 /// Converts, with saturation, 16-bit signed integers from both 128-bit integer 4171 /// vector operands into 8-bit unsigned integers, and packs the results into 4172 /// the destination. 4173 /// 4174 /// Values greater than 0xFF are saturated to 0xFF. Values less than 0x00 4175 /// are saturated to 0x00. 4176 /// 4177 /// \headerfile <x86intrin.h> 4178 /// 4179 /// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction. 4180 /// 4181 /// \param __a 4182 /// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are 4183 /// written to the lower 64 bits of the result. 4184 /// \param __b 4185 /// A 128-bit integer vector of [8 x i16]. The converted [8 x i8] values are 4186 /// written to the higher 64 bits of the result. 4187 /// \returns A 128-bit vector of [16 x i8] containing the converted values. 4188 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a, 4189 __m128i __b) { 4190 return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b); 4191 } 4192 4193 /// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using 4194 /// the immediate-value parameter as a selector. 
4195 /// 4196 /// \headerfile <x86intrin.h> 4197 /// 4198 /// \code 4199 /// __m128i _mm_extract_epi16(__m128i a, const int imm); 4200 /// \endcode 4201 /// 4202 /// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction. 4203 /// 4204 /// \param a 4205 /// A 128-bit integer vector. 4206 /// \param imm 4207 /// An immediate value. Bits [2:0] selects values from \a a to be assigned 4208 /// to bits[15:0] of the result. \n 4209 /// 000: assign values from bits [15:0] of \a a. \n 4210 /// 001: assign values from bits [31:16] of \a a. \n 4211 /// 010: assign values from bits [47:32] of \a a. \n 4212 /// 011: assign values from bits [63:48] of \a a. \n 4213 /// 100: assign values from bits [79:64] of \a a. \n 4214 /// 101: assign values from bits [95:80] of \a a. \n 4215 /// 110: assign values from bits [111:96] of \a a. \n 4216 /// 111: assign values from bits [127:112] of \a a. 4217 /// \returns An integer, whose lower 16 bits are selected from the 128-bit 4218 /// integer vector parameter and the remaining bits are assigned zeros. 4219 #define _mm_extract_epi16(a, imm) \ 4220 ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \ 4221 (int)(imm))) 4222 4223 /// Constructs a 128-bit integer vector by first making a copy of the 4224 /// 128-bit integer vector parameter, and then inserting the lower 16 bits 4225 /// of an integer parameter into an offset specified by the immediate-value 4226 /// parameter. 4227 /// 4228 /// \headerfile <x86intrin.h> 4229 /// 4230 /// \code 4231 /// __m128i _mm_insert_epi16(__m128i a, int b, const int imm); 4232 /// \endcode 4233 /// 4234 /// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction. 4235 /// 4236 /// \param a 4237 /// A 128-bit integer vector of [8 x i16]. This vector is copied to the 4238 /// result and then one of the eight elements in the result is replaced by 4239 /// the lower 16 bits of \a b. 4240 /// \param b 4241 /// An integer. 
The lower 16 bits of this parameter are written to the 4242 /// result beginning at an offset specified by \a imm. 4243 /// \param imm 4244 /// An immediate value specifying the bit offset in the result at which the 4245 /// lower 16 bits of \a b are written. 4246 /// \returns A 128-bit integer vector containing the constructed values. 4247 #define _mm_insert_epi16(a, b, imm) \ 4248 ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \ 4249 (int)(imm))) 4250 4251 /// Copies the values of the most significant bits from each 8-bit 4252 /// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask 4253 /// value, zero-extends the value, and writes it to the destination. 4254 /// 4255 /// \headerfile <x86intrin.h> 4256 /// 4257 /// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction. 4258 /// 4259 /// \param __a 4260 /// A 128-bit integer vector containing the values with bits to be extracted. 4261 /// \returns The most significant bits from each 8-bit element in \a __a, 4262 /// written to bits [15:0]. The other bits are assigned zeros. 4263 static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a) { 4264 return __builtin_ia32_pmovmskb128((__v16qi)__a); 4265 } 4266 4267 /// Constructs a 128-bit integer vector by shuffling four 32-bit 4268 /// elements of a 128-bit integer vector parameter, using the immediate-value 4269 /// parameter as a specifier. 4270 /// 4271 /// \headerfile <x86intrin.h> 4272 /// 4273 /// \code 4274 /// __m128i _mm_shuffle_epi32(__m128i a, const int imm); 4275 /// \endcode 4276 /// 4277 /// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction. 4278 /// 4279 /// \param a 4280 /// A 128-bit integer vector containing the values to be copied. 4281 /// \param imm 4282 /// An immediate value containing an 8-bit value specifying which elements to 4283 /// copy from a. 
The destinations within the 128-bit destination are assigned 4284 /// values as follows: \n 4285 /// Bits [1:0] are used to assign values to bits [31:0] of the result. \n 4286 /// Bits [3:2] are used to assign values to bits [63:32] of the result. \n 4287 /// Bits [5:4] are used to assign values to bits [95:64] of the result. \n 4288 /// Bits [7:6] are used to assign values to bits [127:96] of the result. \n 4289 /// Bit value assignments: \n 4290 /// 00: assign values from bits [31:0] of \a a. \n 4291 /// 01: assign values from bits [63:32] of \a a. \n 4292 /// 10: assign values from bits [95:64] of \a a. \n 4293 /// 11: assign values from bits [127:96] of \a a. \n 4294 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro. 4295 /// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form 4296 /// <c>[b6, b4, b2, b0]</c>. 4297 /// \returns A 128-bit integer vector containing the shuffled values. 4298 #define _mm_shuffle_epi32(a, imm) \ 4299 ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm))) 4300 4301 /// Constructs a 128-bit integer vector by shuffling four lower 16-bit 4302 /// elements of a 128-bit integer vector of [8 x i16], using the immediate 4303 /// value parameter as a specifier. 4304 /// 4305 /// \headerfile <x86intrin.h> 4306 /// 4307 /// \code 4308 /// __m128i _mm_shufflelo_epi16(__m128i a, const int imm); 4309 /// \endcode 4310 /// 4311 /// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction. 4312 /// 4313 /// \param a 4314 /// A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits 4315 /// [127:64] of the result. 4316 /// \param imm 4317 /// An 8-bit immediate value specifying which elements to copy from \a a. \n 4318 /// Bits[1:0] are used to assign values to bits [15:0] of the result. \n 4319 /// Bits[3:2] are used to assign values to bits [31:16] of the result. \n 4320 /// Bits[5:4] are used to assign values to bits [47:32] of the result. 
\n 4321 /// Bits[7:6] are used to assign values to bits [63:48] of the result. \n 4322 /// Bit value assignments: \n 4323 /// 00: assign values from bits [15:0] of \a a. \n 4324 /// 01: assign values from bits [31:16] of \a a. \n 4325 /// 10: assign values from bits [47:32] of \a a. \n 4326 /// 11: assign values from bits [63:48] of \a a. \n 4327 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro. 4328 /// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form 4329 /// <c>[b6, b4, b2, b0]</c>. 4330 /// \returns A 128-bit integer vector containing the shuffled values. 4331 #define _mm_shufflelo_epi16(a, imm) \ 4332 ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm))) 4333 4334 /// Constructs a 128-bit integer vector by shuffling four upper 16-bit 4335 /// elements of a 128-bit integer vector of [8 x i16], using the immediate 4336 /// value parameter as a specifier. 4337 /// 4338 /// \headerfile <x86intrin.h> 4339 /// 4340 /// \code 4341 /// __m128i _mm_shufflehi_epi16(__m128i a, const int imm); 4342 /// \endcode 4343 /// 4344 /// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction. 4345 /// 4346 /// \param a 4347 /// A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits 4348 /// [63:0] of the result. 4349 /// \param imm 4350 /// An 8-bit immediate value specifying which elements to copy from \a a. \n 4351 /// Bits[1:0] are used to assign values to bits [79:64] of the result. \n 4352 /// Bits[3:2] are used to assign values to bits [95:80] of the result. \n 4353 /// Bits[5:4] are used to assign values to bits [111:96] of the result. \n 4354 /// Bits[7:6] are used to assign values to bits [127:112] of the result. \n 4355 /// Bit value assignments: \n 4356 /// 00: assign values from bits [79:64] of \a a. \n 4357 /// 01: assign values from bits [95:80] of \a a. \n 4358 /// 10: assign values from bits [111:96] of \a a. \n 4359 /// 11: assign values from bits [127:112] of \a a. 
\n 4360 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro. 4361 /// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form 4362 /// <c>[b6, b4, b2, b0]</c>. 4363 /// \returns A 128-bit integer vector containing the shuffled values. 4364 #define _mm_shufflehi_epi16(a, imm) \ 4365 ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm))) 4366 4367 /// Unpacks the high-order (index 8-15) values from two 128-bit vectors 4368 /// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8]. 4369 /// 4370 /// \headerfile <x86intrin.h> 4371 /// 4372 /// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c> 4373 /// instruction. 4374 /// 4375 /// \param __a 4376 /// A 128-bit vector of [16 x i8]. 4377 /// Bits [71:64] are written to bits [7:0] of the result. \n 4378 /// Bits [79:72] are written to bits [23:16] of the result. \n 4379 /// Bits [87:80] are written to bits [39:32] of the result. \n 4380 /// Bits [95:88] are written to bits [55:48] of the result. \n 4381 /// Bits [103:96] are written to bits [71:64] of the result. \n 4382 /// Bits [111:104] are written to bits [87:80] of the result. \n 4383 /// Bits [119:112] are written to bits [103:96] of the result. \n 4384 /// Bits [127:120] are written to bits [119:112] of the result. 4385 /// \param __b 4386 /// A 128-bit vector of [16 x i8]. \n 4387 /// Bits [71:64] are written to bits [15:8] of the result. \n 4388 /// Bits [79:72] are written to bits [31:24] of the result. \n 4389 /// Bits [87:80] are written to bits [47:40] of the result. \n 4390 /// Bits [95:88] are written to bits [63:56] of the result. \n 4391 /// Bits [103:96] are written to bits [79:72] of the result. \n 4392 /// Bits [111:104] are written to bits [95:88] of the result. \n 4393 /// Bits [119:112] are written to bits [111:104] of the result. \n 4394 /// Bits [127:120] are written to bits [127:120] of the result. 
4395 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values. 4396 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a, 4397 __m128i __b) { 4398 return (__m128i)__builtin_shufflevector( 4399 (__v16qi)__a, (__v16qi)__b, 8, 16 + 8, 9, 16 + 9, 10, 16 + 10, 11, 4400 16 + 11, 12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15); 4401 } 4402 4403 /// Unpacks the high-order (index 4-7) values from two 128-bit vectors of 4404 /// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16]. 4405 /// 4406 /// \headerfile <x86intrin.h> 4407 /// 4408 /// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c> 4409 /// instruction. 4410 /// 4411 /// \param __a 4412 /// A 128-bit vector of [8 x i16]. 4413 /// Bits [79:64] are written to bits [15:0] of the result. \n 4414 /// Bits [95:80] are written to bits [47:32] of the result. \n 4415 /// Bits [111:96] are written to bits [79:64] of the result. \n 4416 /// Bits [127:112] are written to bits [111:96] of the result. 4417 /// \param __b 4418 /// A 128-bit vector of [8 x i16]. 4419 /// Bits [79:64] are written to bits [31:16] of the result. \n 4420 /// Bits [95:80] are written to bits [63:48] of the result. \n 4421 /// Bits [111:96] are written to bits [95:80] of the result. \n 4422 /// Bits [127:112] are written to bits [127:112] of the result. 4423 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values. 4424 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a, 4425 __m128i __b) { 4426 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8 + 4, 5, 4427 8 + 5, 6, 8 + 6, 7, 8 + 7); 4428 } 4429 4430 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of 4431 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32]. 4432 /// 4433 /// \headerfile <x86intrin.h> 4434 /// 4435 /// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c> 4436 /// instruction. 
4437 /// 4438 /// \param __a 4439 /// A 128-bit vector of [4 x i32]. \n 4440 /// Bits [95:64] are written to bits [31:0] of the destination. \n 4441 /// Bits [127:96] are written to bits [95:64] of the destination. 4442 /// \param __b 4443 /// A 128-bit vector of [4 x i32]. \n 4444 /// Bits [95:64] are written to bits [64:32] of the destination. \n 4445 /// Bits [127:96] are written to bits [127:96] of the destination. 4446 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values. 4447 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a, 4448 __m128i __b) { 4449 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4 + 2, 3, 4450 4 + 3); 4451 } 4452 4453 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of 4454 /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64]. 4455 /// 4456 /// \headerfile <x86intrin.h> 4457 /// 4458 /// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c> 4459 /// instruction. 4460 /// 4461 /// \param __a 4462 /// A 128-bit vector of [2 x i64]. \n 4463 /// Bits [127:64] are written to bits [63:0] of the destination. 4464 /// \param __b 4465 /// A 128-bit vector of [2 x i64]. \n 4466 /// Bits [127:64] are written to bits [127:64] of the destination. 4467 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values. 4468 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a, 4469 __m128i __b) { 4470 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2 + 1); 4471 } 4472 4473 /// Unpacks the low-order (index 0-7) values from two 128-bit vectors of 4474 /// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8]. 4475 /// 4476 /// \headerfile <x86intrin.h> 4477 /// 4478 /// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c> 4479 /// instruction. 4480 /// 4481 /// \param __a 4482 /// A 128-bit vector of [16 x i8]. 
\n 4483 /// Bits [7:0] are written to bits [7:0] of the result. \n 4484 /// Bits [15:8] are written to bits [23:16] of the result. \n 4485 /// Bits [23:16] are written to bits [39:32] of the result. \n 4486 /// Bits [31:24] are written to bits [55:48] of the result. \n 4487 /// Bits [39:32] are written to bits [71:64] of the result. \n 4488 /// Bits [47:40] are written to bits [87:80] of the result. \n 4489 /// Bits [55:48] are written to bits [103:96] of the result. \n 4490 /// Bits [63:56] are written to bits [119:112] of the result. 4491 /// \param __b 4492 /// A 128-bit vector of [16 x i8]. 4493 /// Bits [7:0] are written to bits [15:8] of the result. \n 4494 /// Bits [15:8] are written to bits [31:24] of the result. \n 4495 /// Bits [23:16] are written to bits [47:40] of the result. \n 4496 /// Bits [31:24] are written to bits [63:56] of the result. \n 4497 /// Bits [39:32] are written to bits [79:72] of the result. \n 4498 /// Bits [47:40] are written to bits [95:88] of the result. \n 4499 /// Bits [55:48] are written to bits [111:104] of the result. \n 4500 /// Bits [63:56] are written to bits [127:120] of the result. 4501 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values. 4502 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a, 4503 __m128i __b) { 4504 return (__m128i)__builtin_shufflevector( 4505 (__v16qi)__a, (__v16qi)__b, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 4, 4506 16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7); 4507 } 4508 4509 /// Unpacks the low-order (index 0-3) values from each of the two 128-bit 4510 /// vectors of [8 x i16] and interleaves them into a 128-bit vector of 4511 /// [8 x i16]. 4512 /// 4513 /// \headerfile <x86intrin.h> 4514 /// 4515 /// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c> 4516 /// instruction. 4517 /// 4518 /// \param __a 4519 /// A 128-bit vector of [8 x i16]. 4520 /// Bits [15:0] are written to bits [15:0] of the result. 
\n 4521 /// Bits [31:16] are written to bits [47:32] of the result. \n 4522 /// Bits [47:32] are written to bits [79:64] of the result. \n 4523 /// Bits [63:48] are written to bits [111:96] of the result. 4524 /// \param __b 4525 /// A 128-bit vector of [8 x i16]. 4526 /// Bits [15:0] are written to bits [31:16] of the result. \n 4527 /// Bits [31:16] are written to bits [63:48] of the result. \n 4528 /// Bits [47:32] are written to bits [95:80] of the result. \n 4529 /// Bits [63:48] are written to bits [127:112] of the result. 4530 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values. 4531 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a, 4532 __m128i __b) { 4533 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8 + 0, 1, 4534 8 + 1, 2, 8 + 2, 3, 8 + 3); 4535 } 4536 4537 /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of 4538 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32]. 4539 /// 4540 /// \headerfile <x86intrin.h> 4541 /// 4542 /// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c> 4543 /// instruction. 4544 /// 4545 /// \param __a 4546 /// A 128-bit vector of [4 x i32]. \n 4547 /// Bits [31:0] are written to bits [31:0] of the destination. \n 4548 /// Bits [63:32] are written to bits [95:64] of the destination. 4549 /// \param __b 4550 /// A 128-bit vector of [4 x i32]. \n 4551 /// Bits [31:0] are written to bits [64:32] of the destination. \n 4552 /// Bits [63:32] are written to bits [127:96] of the destination. 4553 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values. 
4554 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a, 4555 __m128i __b) { 4556 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4 + 0, 1, 4557 4 + 1); 4558 } 4559 4560 /// Unpacks the low-order 64-bit elements from two 128-bit vectors of 4561 /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64]. 4562 /// 4563 /// \headerfile <x86intrin.h> 4564 /// 4565 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c> 4566 /// instruction. 4567 /// 4568 /// \param __a 4569 /// A 128-bit vector of [2 x i64]. \n 4570 /// Bits [63:0] are written to bits [63:0] of the destination. \n 4571 /// \param __b 4572 /// A 128-bit vector of [2 x i64]. \n 4573 /// Bits [63:0] are written to bits [127:64] of the destination. \n 4574 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values. 4575 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a, 4576 __m128i __b) { 4577 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2 + 0); 4578 } 4579 4580 /// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit 4581 /// integer. 4582 /// 4583 /// \headerfile <x86intrin.h> 4584 /// 4585 /// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction. 4586 /// 4587 /// \param __a 4588 /// A 128-bit integer vector operand. The lower 64 bits are moved to the 4589 /// destination. 4590 /// \returns A 64-bit integer containing the lower 64 bits of the parameter. 4591 static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a) { 4592 return (__m64)__a[0]; 4593 } 4594 4595 /// Moves the 64-bit operand to a 128-bit integer vector, zeroing the 4596 /// upper bits. 4597 /// 4598 /// \headerfile <x86intrin.h> 4599 /// 4600 /// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction. 4601 /// 4602 /// \param __a 4603 /// A 64-bit value. 4604 /// \returns A 128-bit integer vector. 
The lower 64 bits contain the value from 4605 /// the operand. The upper 64 bits are assigned zeros. 4606 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a) { 4607 return __extension__(__m128i)(__v2di){(long long)__a, 0}; 4608 } 4609 4610 /// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit 4611 /// integer vector, zeroing the upper bits. 4612 /// 4613 /// \headerfile <x86intrin.h> 4614 /// 4615 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 4616 /// 4617 /// \param __a 4618 /// A 128-bit integer vector operand. The lower 64 bits are moved to the 4619 /// destination. 4620 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from 4621 /// the operand. The upper 64 bits are assigned zeros. 4622 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a) { 4623 return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2); 4624 } 4625 4626 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of 4627 /// [2 x double] and interleaves them into a 128-bit vector of [2 x 4628 /// double]. 4629 /// 4630 /// \headerfile <x86intrin.h> 4631 /// 4632 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction. 4633 /// 4634 /// \param __a 4635 /// A 128-bit vector of [2 x double]. \n 4636 /// Bits [127:64] are written to bits [63:0] of the destination. 4637 /// \param __b 4638 /// A 128-bit vector of [2 x double]. \n 4639 /// Bits [127:64] are written to bits [127:64] of the destination. 4640 /// \returns A 128-bit vector of [2 x double] containing the interleaved values. 4641 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a, 4642 __m128d __b) { 4643 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2 + 1); 4644 } 4645 4646 /// Unpacks the low-order 64-bit elements from two 128-bit vectors 4647 /// of [2 x double] and interleaves them into a 128-bit vector of [2 x 4648 /// double]. 
4649 /// 4650 /// \headerfile <x86intrin.h> 4651 /// 4652 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction. 4653 /// 4654 /// \param __a 4655 /// A 128-bit vector of [2 x double]. \n 4656 /// Bits [63:0] are written to bits [63:0] of the destination. 4657 /// \param __b 4658 /// A 128-bit vector of [2 x double]. \n 4659 /// Bits [63:0] are written to bits [127:64] of the destination. 4660 /// \returns A 128-bit vector of [2 x double] containing the interleaved values. 4661 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a, 4662 __m128d __b) { 4663 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2 + 0); 4664 } 4665 4666 /// Extracts the sign bits of the double-precision values in the 128-bit 4667 /// vector of [2 x double], zero-extends the value, and writes it to the 4668 /// low-order bits of the destination. 4669 /// 4670 /// \headerfile <x86intrin.h> 4671 /// 4672 /// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction. 4673 /// 4674 /// \param __a 4675 /// A 128-bit vector of [2 x double] containing the values with sign bits to 4676 /// be extracted. 4677 /// \returns The sign bits from each of the double-precision elements in \a __a, 4678 /// written to bits [1:0]. The remaining bits are assigned values of zero. 4679 static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a) { 4680 return __builtin_ia32_movmskpd((__v2df)__a); 4681 } 4682 4683 /// Constructs a 128-bit floating-point vector of [2 x double] from two 4684 /// 128-bit vector parameters of [2 x double], using the immediate-value 4685 /// parameter as a specifier. 4686 /// 4687 /// \headerfile <x86intrin.h> 4688 /// 4689 /// \code 4690 /// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i); 4691 /// \endcode 4692 /// 4693 /// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction. 4694 /// 4695 /// \param a 4696 /// A 128-bit vector of [2 x double]. 
4697 /// \param b 4698 /// A 128-bit vector of [2 x double]. 4699 /// \param i 4700 /// An 8-bit immediate value. The least significant two bits specify which 4701 /// elements to copy from \a a and \a b: \n 4702 /// Bit[0] = 0: lower element of \a a copied to lower element of result. \n 4703 /// Bit[0] = 1: upper element of \a a copied to lower element of result. \n 4704 /// Bit[1] = 0: lower element of \a b copied to upper element of result. \n 4705 /// Bit[1] = 1: upper element of \a b copied to upper element of result. \n 4706 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE2 macro. 4707 /// <c>_MM_SHUFFLE2(b1, b0)</c> can create a 2-bit mask of the form 4708 /// <c>[b1, b0]</c>. 4709 /// \returns A 128-bit vector of [2 x double] containing the shuffled values. 4710 #define _mm_shuffle_pd(a, b, i) \ 4711 ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \ 4712 (int)(i))) 4713 4714 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit 4715 /// floating-point vector of [4 x float]. 4716 /// 4717 /// \headerfile <x86intrin.h> 4718 /// 4719 /// This intrinsic has no corresponding instruction. 4720 /// 4721 /// \param __a 4722 /// A 128-bit floating-point vector of [2 x double]. 4723 /// \returns A 128-bit floating-point vector of [4 x float] containing the same 4724 /// bitwise pattern as the parameter. 4725 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a) { 4726 return (__m128)__a; 4727 } 4728 4729 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit 4730 /// integer vector. 4731 /// 4732 /// \headerfile <x86intrin.h> 4733 /// 4734 /// This intrinsic has no corresponding instruction. 4735 /// 4736 /// \param __a 4737 /// A 128-bit floating-point vector of [2 x double]. 4738 /// \returns A 128-bit integer vector containing the same bitwise pattern as the 4739 /// parameter. 
4740 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a) { 4741 return (__m128i)__a; 4742 } 4743 4744 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit 4745 /// floating-point vector of [2 x double]. 4746 /// 4747 /// \headerfile <x86intrin.h> 4748 /// 4749 /// This intrinsic has no corresponding instruction. 4750 /// 4751 /// \param __a 4752 /// A 128-bit floating-point vector of [4 x float]. 4753 /// \returns A 128-bit floating-point vector of [2 x double] containing the same 4754 /// bitwise pattern as the parameter. 4755 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a) { 4756 return (__m128d)__a; 4757 } 4758 4759 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit 4760 /// integer vector. 4761 /// 4762 /// \headerfile <x86intrin.h> 4763 /// 4764 /// This intrinsic has no corresponding instruction. 4765 /// 4766 /// \param __a 4767 /// A 128-bit floating-point vector of [4 x float]. 4768 /// \returns A 128-bit integer vector containing the same bitwise pattern as the 4769 /// parameter. 4770 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a) { 4771 return (__m128i)__a; 4772 } 4773 4774 /// Casts a 128-bit integer vector into a 128-bit floating-point vector 4775 /// of [4 x float]. 4776 /// 4777 /// \headerfile <x86intrin.h> 4778 /// 4779 /// This intrinsic has no corresponding instruction. 4780 /// 4781 /// \param __a 4782 /// A 128-bit integer vector. 4783 /// \returns A 128-bit floating-point vector of [4 x float] containing the same 4784 /// bitwise pattern as the parameter. 4785 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a) { 4786 return (__m128)__a; 4787 } 4788 4789 /// Casts a 128-bit integer vector into a 128-bit floating-point vector 4790 /// of [2 x double]. 4791 /// 4792 /// \headerfile <x86intrin.h> 4793 /// 4794 /// This intrinsic has no corresponding instruction. 
4795 /// 4796 /// \param __a 4797 /// A 128-bit integer vector. 4798 /// \returns A 128-bit floating-point vector of [2 x double] containing the same 4799 /// bitwise pattern as the parameter. 4800 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a) { 4801 return (__m128d)__a; 4802 } 4803 4804 /// Compares each of the corresponding double-precision values of two 4805 /// 128-bit vectors of [2 x double], using the operation specified by the 4806 /// immediate integer operand. 4807 /// 4808 /// Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 4809 /// If either value in a comparison is NaN, comparisons that are ordered 4810 /// return false, and comparisons that are unordered return true. 4811 /// 4812 /// \headerfile <x86intrin.h> 4813 /// 4814 /// \code 4815 /// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c); 4816 /// \endcode 4817 /// 4818 /// This intrinsic corresponds to the <c> (V)CMPPD </c> instruction. 4819 /// 4820 /// \param a 4821 /// A 128-bit vector of [2 x double]. 4822 /// \param b 4823 /// A 128-bit vector of [2 x double]. 4824 /// \param c 4825 /// An immediate integer operand, with bits [4:0] specifying which comparison 4826 /// operation to use: \n 4827 /// 0x00: Equal (ordered, non-signaling) \n 4828 /// 0x01: Less-than (ordered, signaling) \n 4829 /// 0x02: Less-than-or-equal (ordered, signaling) \n 4830 /// 0x03: Unordered (non-signaling) \n 4831 /// 0x04: Not-equal (unordered, non-signaling) \n 4832 /// 0x05: Not-less-than (unordered, signaling) \n 4833 /// 0x06: Not-less-than-or-equal (unordered, signaling) \n 4834 /// 0x07: Ordered (non-signaling) \n 4835 /// \returns A 128-bit vector of [2 x double] containing the comparison results. 
#define _mm_cmp_pd(a, b, c) \
  ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
                                 (__v2df)(__m128d)(b), \
                                 (c)))

/// Compares each of the corresponding scalar double-precision values of
///    two 128-bit vectors of [2 x double], using the operation specified by the
///    immediate integer operand.
///
///    Each comparison returns 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
///    If either value in a comparison is NaN, comparisons that are ordered
///    return false, and comparisons that are unordered return true.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
/// \endcode
///
/// This intrinsic corresponds to the <c> (V)CMPSD </c> instruction.
///
/// \param a
///    A 128-bit vector of [2 x double].
/// \param b
///    A 128-bit vector of [2 x double].
/// \param c
///    An immediate integer operand, with bits [4:0] specifying which comparison
///    operation to use: \n
///    0x00: Equal (ordered, non-signaling) \n
///    0x01: Less-than (ordered, signaling) \n
///    0x02: Less-than-or-equal (ordered, signaling) \n
///    0x03: Unordered (non-signaling) \n
///    0x04: Not-equal (unordered, non-signaling) \n
///    0x05: Not-less-than (unordered, signaling) \n
///    0x06: Not-less-than-or-equal (unordered, signaling) \n
///    0x07: Ordered (non-signaling)
/// \returns A 128-bit vector of [2 x double] containing the comparison results.
#define _mm_cmp_sd(a, b, c) \
  ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
                                 (__v2df)(__m128d)(b), \
                                 (c)))

#ifdef __cplusplus
extern "C" {
#endif

/// Indicates that a spin loop is being executed for the purposes of
///    optimizing power consumption during the loop.
4882 /// 4883 /// \headerfile <x86intrin.h> 4884 /// 4885 /// This intrinsic corresponds to the <c> PAUSE </c> instruction. 4886 /// 4887 void _mm_pause(void); 4888 4889 #if defined(__cplusplus) 4890 } // extern "C" 4891 #endif 4892 #undef __DEFAULT_FN_ATTRS 4893 #undef __DEFAULT_FN_ATTRS_MMX 4894 4895 #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) 4896 4897 #define _MM_DENORMALS_ZERO_ON (0x0040U) 4898 #define _MM_DENORMALS_ZERO_OFF (0x0000U) 4899 4900 #define _MM_DENORMALS_ZERO_MASK (0x0040U) 4901 4902 #define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK) 4903 #define _MM_SET_DENORMALS_ZERO_MODE(x) \ 4904 (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x))) 4905 4906 #endif /* __EMMINTRIN_H */ 4907