1 /*===---- emmintrin.h - SSE2 intrinsics ------------------------------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 */ 9 10 #ifndef __EMMINTRIN_H 11 #define __EMMINTRIN_H 12 13 #if !defined(__i386__) && !defined(__x86_64__) 14 #error "This header is only meant to be used on x86 and x64 architecture" 15 #endif 16 17 #include <xmmintrin.h> 18 19 typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16))); 20 typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16))); 21 22 typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1))); 23 typedef long long __m128i_u 24 __attribute__((__vector_size__(16), __aligned__(1))); 25 26 /* Type defines. */ 27 typedef double __v2df __attribute__((__vector_size__(16))); 28 typedef long long __v2di __attribute__((__vector_size__(16))); 29 typedef short __v8hi __attribute__((__vector_size__(16))); 30 typedef char __v16qi __attribute__((__vector_size__(16))); 31 32 /* Unsigned types */ 33 typedef unsigned long long __v2du __attribute__((__vector_size__(16))); 34 typedef unsigned short __v8hu __attribute__((__vector_size__(16))); 35 typedef unsigned char __v16qu __attribute__((__vector_size__(16))); 36 37 /* We need an explicitly signed variant for char. Note that this shouldn't 38 * appear in the interface though. */ 39 typedef signed char __v16qs __attribute__((__vector_size__(16))); 40 41 #ifdef __SSE2__ 42 /* Both _Float16 and __bf16 require SSE2 being enabled. */ 43 typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16))); 44 typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16))); 45 typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1))); 46 47 typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16))); 48 typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16))); 49 #endif 50 51 /* Define the default attributes for the functions in this file. */ 52 #define __DEFAULT_FN_ATTRS \ 53 __attribute__((__always_inline__, __nodebug__, __target__("sse2"), \ 54 __min_vector_width__(128))) 55 #define __DEFAULT_FN_ATTRS_MMX \ 56 __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse2"), \ 57 __min_vector_width__(64))) 58 59 /// Adds lower double-precision values in both operands and returns the 60 /// sum in the lower 64 bits of the result. The upper 64 bits of the result 61 /// are copied from the upper double-precision value of the first operand. 62 /// 63 /// \headerfile <x86intrin.h> 64 /// 65 /// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction. 66 /// 67 /// \param __a 68 /// A 128-bit vector of [2 x double] containing one of the source operands. 69 /// \param __b 70 /// A 128-bit vector of [2 x double] containing one of the source operands. 71 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 72 /// sum of the lower 64 bits of both operands. The upper 64 bits are copied 73 /// from the upper 64 bits of the first source operand. 74 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a, 75 __m128d __b) { 76 __a[0] += __b[0]; 77 return __a; 78 } 79 80 /// Adds two 128-bit vectors of [2 x double]. 81 /// 82 /// \headerfile <x86intrin.h> 83 /// 84 /// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction. 85 /// 86 /// \param __a 87 /// A 128-bit vector of [2 x double] containing one of the source operands. 88 /// \param __b 89 /// A 128-bit vector of [2 x double] containing one of the source operands. 90 /// \returns A 128-bit vector of [2 x double] containing the sums of both 91 /// operands. 92 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a, 93 __m128d __b) { 94 return (__m128d)((__v2df)__a + (__v2df)__b); 95 } 96 97 /// Subtracts the lower double-precision value of the second operand 98 /// from the lower double-precision value of the first operand and returns 99 /// the difference in the lower 64 bits of the result. The upper 64 bits of 100 /// the result are copied from the upper double-precision value of the first 101 /// operand. 102 /// 103 /// \headerfile <x86intrin.h> 104 /// 105 /// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction. 106 /// 107 /// \param __a 108 /// A 128-bit vector of [2 x double] containing the minuend. 109 /// \param __b 110 /// A 128-bit vector of [2 x double] containing the subtrahend. 111 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 112 /// difference of the lower 64 bits of both operands. The upper 64 bits are 113 /// copied from the upper 64 bits of the first source operand. 114 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a, 115 __m128d __b) { 116 __a[0] -= __b[0]; 117 return __a; 118 } 119 120 /// Subtracts two 128-bit vectors of [2 x double]. 121 /// 122 /// \headerfile <x86intrin.h> 123 /// 124 /// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction. 125 /// 126 /// \param __a 127 /// A 128-bit vector of [2 x double] containing the minuend. 128 /// \param __b 129 /// A 128-bit vector of [2 x double] containing the subtrahend. 130 /// \returns A 128-bit vector of [2 x double] containing the differences between 131 /// both operands. 132 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a, 133 __m128d __b) { 134 return (__m128d)((__v2df)__a - (__v2df)__b); 135 } 136 137 /// Multiplies lower double-precision values in both operands and returns 138 /// the product in the lower 64 bits of the result. The upper 64 bits of the 139 /// result are copied from the upper double-precision value of the first 140 /// operand. 141 /// 142 /// \headerfile <x86intrin.h> 143 /// 144 /// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction. 145 /// 146 /// \param __a 147 /// A 128-bit vector of [2 x double] containing one of the source operands. 148 /// \param __b 149 /// A 128-bit vector of [2 x double] containing one of the source operands. 150 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 151 /// product of the lower 64 bits of both operands. The upper 64 bits are 152 /// copied from the upper 64 bits of the first source operand. 153 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a, 154 __m128d __b) { 155 __a[0] *= __b[0]; 156 return __a; 157 } 158 159 /// Multiplies two 128-bit vectors of [2 x double]. 160 /// 161 /// \headerfile <x86intrin.h> 162 /// 163 /// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction. 164 /// 165 /// \param __a 166 /// A 128-bit vector of [2 x double] containing one of the operands. 167 /// \param __b 168 /// A 128-bit vector of [2 x double] containing one of the operands. 169 /// \returns A 128-bit vector of [2 x double] containing the products of both 170 /// operands. 171 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a, 172 __m128d __b) { 173 return (__m128d)((__v2df)__a * (__v2df)__b); 174 } 175 176 /// Divides the lower double-precision value of the first operand by the 177 /// lower double-precision value of the second operand and returns the 178 /// quotient in the lower 64 bits of the result. The upper 64 bits of the 179 /// result are copied from the upper double-precision value of the first 180 /// operand. 181 /// 182 /// \headerfile <x86intrin.h> 183 /// 184 /// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction. 185 /// 186 /// \param __a 187 /// A 128-bit vector of [2 x double] containing the dividend. 188 /// \param __b 189 /// A 128-bit vector of [2 x double] containing divisor. 190 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 191 /// quotient of the lower 64 bits of both operands. The upper 64 bits are 192 /// copied from the upper 64 bits of the first source operand. 193 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a, 194 __m128d __b) { 195 __a[0] /= __b[0]; 196 return __a; 197 } 198 199 /// Performs an element-by-element division of two 128-bit vectors of 200 /// [2 x double]. 201 /// 202 /// \headerfile <x86intrin.h> 203 /// 204 /// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction. 205 /// 206 /// \param __a 207 /// A 128-bit vector of [2 x double] containing the dividend. 208 /// \param __b 209 /// A 128-bit vector of [2 x double] containing the divisor. 210 /// \returns A 128-bit vector of [2 x double] containing the quotients of both 211 /// operands. 212 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a, 213 __m128d __b) { 214 return (__m128d)((__v2df)__a / (__v2df)__b); 215 } 216 217 /// Calculates the square root of the lower double-precision value of 218 /// the second operand and returns it in the lower 64 bits of the result. 219 /// The upper 64 bits of the result are copied from the upper 220 /// double-precision value of the first operand. 221 /// 222 /// \headerfile <x86intrin.h> 223 /// 224 /// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction. 225 /// 226 /// \param __a 227 /// A 128-bit vector of [2 x double] containing one of the operands. The 228 /// upper 64 bits of this operand are copied to the upper 64 bits of the 229 /// result. 230 /// \param __b 231 /// A 128-bit vector of [2 x double] containing one of the operands. The 232 /// square root is calculated using the lower 64 bits of this operand. 233 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 234 /// square root of the lower 64 bits of operand \a __b, and whose upper 64 235 /// bits are copied from the upper 64 bits of operand \a __a. 236 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a, 237 __m128d __b) { 238 __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b); 239 return __extension__(__m128d){__c[0], __a[1]}; 240 } 241 242 /// Calculates the square root of the each of two values stored in a 243 /// 128-bit vector of [2 x double]. 244 /// 245 /// \headerfile <x86intrin.h> 246 /// 247 /// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction. 248 /// 249 /// \param __a 250 /// A 128-bit vector of [2 x double]. 251 /// \returns A 128-bit vector of [2 x double] containing the square roots of the 252 /// values in the operand. 253 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) { 254 return __builtin_ia32_sqrtpd((__v2df)__a); 255 } 256 257 /// Compares lower 64-bit double-precision values of both operands, and 258 /// returns the lesser of the pair of values in the lower 64-bits of the 259 /// result. The upper 64 bits of the result are copied from the upper 260 /// double-precision value of the first operand. 261 /// 262 /// \headerfile <x86intrin.h> 263 /// 264 /// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction. 265 /// 266 /// \param __a 267 /// A 128-bit vector of [2 x double] containing one of the operands. The 268 /// lower 64 bits of this operand are used in the comparison. 269 /// \param __b 270 /// A 128-bit vector of [2 x double] containing one of the operands. The 271 /// lower 64 bits of this operand are used in the comparison. 272 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 273 /// minimum value between both operands. The upper 64 bits are copied from 274 /// the upper 64 bits of the first source operand. 275 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a, 276 __m128d __b) { 277 return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b); 278 } 279 280 /// Performs element-by-element comparison of the two 128-bit vectors of 281 /// [2 x double] and returns the vector containing the lesser of each pair of 282 /// values. 283 /// 284 /// \headerfile <x86intrin.h> 285 /// 286 /// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction. 287 /// 288 /// \param __a 289 /// A 128-bit vector of [2 x double] containing one of the operands. 290 /// \param __b 291 /// A 128-bit vector of [2 x double] containing one of the operands. 292 /// \returns A 128-bit vector of [2 x double] containing the minimum values 293 /// between both operands. 294 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a, 295 __m128d __b) { 296 return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b); 297 } 298 299 /// Compares lower 64-bit double-precision values of both operands, and 300 /// returns the greater of the pair of values in the lower 64-bits of the 301 /// result. The upper 64 bits of the result are copied from the upper 302 /// double-precision value of the first operand. 303 /// 304 /// \headerfile <x86intrin.h> 305 /// 306 /// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction. 307 /// 308 /// \param __a 309 /// A 128-bit vector of [2 x double] containing one of the operands. The 310 /// lower 64 bits of this operand are used in the comparison. 311 /// \param __b 312 /// A 128-bit vector of [2 x double] containing one of the operands. The 313 /// lower 64 bits of this operand are used in the comparison. 314 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 315 /// maximum value between both operands. The upper 64 bits are copied from 316 /// the upper 64 bits of the first source operand. 317 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a, 318 __m128d __b) { 319 return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b); 320 } 321 322 /// Performs element-by-element comparison of the two 128-bit vectors of 323 /// [2 x double] and returns the vector containing the greater of each pair 324 /// of values. 325 /// 326 /// \headerfile <x86intrin.h> 327 /// 328 /// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction. 329 /// 330 /// \param __a 331 /// A 128-bit vector of [2 x double] containing one of the operands. 332 /// \param __b 333 /// A 128-bit vector of [2 x double] containing one of the operands. 334 /// \returns A 128-bit vector of [2 x double] containing the maximum values 335 /// between both operands. 336 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a, 337 __m128d __b) { 338 return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b); 339 } 340 341 /// Performs a bitwise AND of two 128-bit vectors of [2 x double]. 342 /// 343 /// \headerfile <x86intrin.h> 344 /// 345 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction. 346 /// 347 /// \param __a 348 /// A 128-bit vector of [2 x double] containing one of the source operands. 349 /// \param __b 350 /// A 128-bit vector of [2 x double] containing one of the source operands. 351 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the 352 /// values between both operands. 353 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a, 354 __m128d __b) { 355 return (__m128d)((__v2du)__a & (__v2du)__b); 356 } 357 358 /// Performs a bitwise AND of two 128-bit vectors of [2 x double], using 359 /// the one's complement of the values contained in the first source operand. 360 /// 361 /// \headerfile <x86intrin.h> 362 /// 363 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction. 364 /// 365 /// \param __a 366 /// A 128-bit vector of [2 x double] containing the left source operand. The 367 /// one's complement of this value is used in the bitwise AND. 368 /// \param __b 369 /// A 128-bit vector of [2 x double] containing the right source operand. 370 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the 371 /// values in the second operand and the one's complement of the first 372 /// operand. 373 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a, 374 __m128d __b) { 375 return (__m128d)(~(__v2du)__a & (__v2du)__b); 376 } 377 378 /// Performs a bitwise OR of two 128-bit vectors of [2 x double]. 379 /// 380 /// \headerfile <x86intrin.h> 381 /// 382 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction. 383 /// 384 /// \param __a 385 /// A 128-bit vector of [2 x double] containing one of the source operands. 386 /// \param __b 387 /// A 128-bit vector of [2 x double] containing one of the source operands. 388 /// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the 389 /// values between both operands. 390 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a, 391 __m128d __b) { 392 return (__m128d)((__v2du)__a | (__v2du)__b); 393 } 394 395 /// Performs a bitwise XOR of two 128-bit vectors of [2 x double]. 396 /// 397 /// \headerfile <x86intrin.h> 398 /// 399 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction. 400 /// 401 /// \param __a 402 /// A 128-bit vector of [2 x double] containing one of the source operands. 403 /// \param __b 404 /// A 128-bit vector of [2 x double] containing one of the source operands. 405 /// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the 406 /// values between both operands. 407 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a, 408 __m128d __b) { 409 return (__m128d)((__v2du)__a ^ (__v2du)__b); 410 } 411 412 /// Compares each of the corresponding double-precision values of the 413 /// 128-bit vectors of [2 x double] for equality. Each comparison yields 0x0 414 /// for false, 0xFFFFFFFFFFFFFFFF for true. 415 /// 416 /// \headerfile <x86intrin.h> 417 /// 418 /// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction. 419 /// 420 /// \param __a 421 /// A 128-bit vector of [2 x double]. 422 /// \param __b 423 /// A 128-bit vector of [2 x double]. 424 /// \returns A 128-bit vector containing the comparison results. 425 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a, 426 __m128d __b) { 427 return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b); 428 } 429 430 /// Compares each of the corresponding double-precision values of the 431 /// 128-bit vectors of [2 x double] to determine if the values in the first 432 /// operand are less than those in the second operand. Each comparison 433 /// yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 434 /// 435 /// \headerfile <x86intrin.h> 436 /// 437 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction. 438 /// 439 /// \param __a 440 /// A 128-bit vector of [2 x double]. 441 /// \param __b 442 /// A 128-bit vector of [2 x double]. 443 /// \returns A 128-bit vector containing the comparison results. 444 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a, 445 __m128d __b) { 446 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b); 447 } 448 449 /// Compares each of the corresponding double-precision values of the 450 /// 128-bit vectors of [2 x double] to determine if the values in the first 451 /// operand are less than or equal to those in the second operand. 452 /// 453 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 454 /// 455 /// \headerfile <x86intrin.h> 456 /// 457 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction. 458 /// 459 /// \param __a 460 /// A 128-bit vector of [2 x double]. 461 /// \param __b 462 /// A 128-bit vector of [2 x double]. 463 /// \returns A 128-bit vector containing the comparison results. 464 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a, 465 __m128d __b) { 466 return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b); 467 } 468 469 /// Compares each of the corresponding double-precision values of the 470 /// 128-bit vectors of [2 x double] to determine if the values in the first 471 /// operand are greater than those in the second operand. 472 /// 473 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 474 /// 475 /// \headerfile <x86intrin.h> 476 /// 477 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction. 478 /// 479 /// \param __a 480 /// A 128-bit vector of [2 x double]. 481 /// \param __b 482 /// A 128-bit vector of [2 x double]. 483 /// \returns A 128-bit vector containing the comparison results. 484 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a, 485 __m128d __b) { 486 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a); 487 } 488 489 /// Compares each of the corresponding double-precision values of the 490 /// 128-bit vectors of [2 x double] to determine if the values in the first 491 /// operand are greater than or equal to those in the second operand. 492 /// 493 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 494 /// 495 /// \headerfile <x86intrin.h> 496 /// 497 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction. 498 /// 499 /// \param __a 500 /// A 128-bit vector of [2 x double]. 501 /// \param __b 502 /// A 128-bit vector of [2 x double]. 503 /// \returns A 128-bit vector containing the comparison results. 504 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a, 505 __m128d __b) { 506 return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a); 507 } 508 509 /// Compares each of the corresponding double-precision values of the 510 /// 128-bit vectors of [2 x double] to determine if the values in the first 511 /// operand are ordered with respect to those in the second operand. 512 /// 513 /// A pair of double-precision values are "ordered" with respect to each 514 /// other if neither value is a NaN. Each comparison yields 0x0 for false, 515 /// 0xFFFFFFFFFFFFFFFF for true. 516 /// 517 /// \headerfile <x86intrin.h> 518 /// 519 /// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction. 520 /// 521 /// \param __a 522 /// A 128-bit vector of [2 x double]. 523 /// \param __b 524 /// A 128-bit vector of [2 x double]. 525 /// \returns A 128-bit vector containing the comparison results. 526 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a, 527 __m128d __b) { 528 return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b); 529 } 530 531 /// Compares each of the corresponding double-precision values of the 532 /// 128-bit vectors of [2 x double] to determine if the values in the first 533 /// operand are unordered with respect to those in the second operand. 534 /// 535 /// A pair of double-precision values are "unordered" with respect to each 536 /// other if one or both values are NaN. Each comparison yields 0x0 for 537 /// false, 0xFFFFFFFFFFFFFFFF for true. 538 /// 539 /// \headerfile <x86intrin.h> 540 /// 541 /// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c> 542 /// instruction. 543 /// 544 /// \param __a 545 /// A 128-bit vector of [2 x double]. 546 /// \param __b 547 /// A 128-bit vector of [2 x double]. 548 /// \returns A 128-bit vector containing the comparison results. 549 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a, 550 __m128d __b) { 551 return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b); 552 } 553 554 /// Compares each of the corresponding double-precision values of the 555 /// 128-bit vectors of [2 x double] to determine if the values in the first 556 /// operand are unequal to those in the second operand. 557 /// 558 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 559 /// 560 /// \headerfile <x86intrin.h> 561 /// 562 /// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction. 563 /// 564 /// \param __a 565 /// A 128-bit vector of [2 x double]. 566 /// \param __b 567 /// A 128-bit vector of [2 x double]. 568 /// \returns A 128-bit vector containing the comparison results. 569 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a, 570 __m128d __b) { 571 return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b); 572 } 573 574 /// Compares each of the corresponding double-precision values of the 575 /// 128-bit vectors of [2 x double] to determine if the values in the first 576 /// operand are not less than those in the second operand. 577 /// 578 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 579 /// 580 /// \headerfile <x86intrin.h> 581 /// 582 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction. 583 /// 584 /// \param __a 585 /// A 128-bit vector of [2 x double]. 586 /// \param __b 587 /// A 128-bit vector of [2 x double]. 588 /// \returns A 128-bit vector containing the comparison results. 589 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a, 590 __m128d __b) { 591 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b); 592 } 593 594 /// Compares each of the corresponding double-precision values of the 595 /// 128-bit vectors of [2 x double] to determine if the values in the first 596 /// operand are not less than or equal to those in the second operand. 597 /// 598 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 599 /// 600 /// \headerfile <x86intrin.h> 601 /// 602 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction. 603 /// 604 /// \param __a 605 /// A 128-bit vector of [2 x double]. 606 /// \param __b 607 /// A 128-bit vector of [2 x double]. 608 /// \returns A 128-bit vector containing the comparison results. 609 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a, 610 __m128d __b) { 611 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b); 612 } 613 614 /// Compares each of the corresponding double-precision values of the 615 /// 128-bit vectors of [2 x double] to determine if the values in the first 616 /// operand are not greater than those in the second operand. 617 /// 618 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 619 /// 620 /// \headerfile <x86intrin.h> 621 /// 622 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction. 623 /// 624 /// \param __a 625 /// A 128-bit vector of [2 x double]. 626 /// \param __b 627 /// A 128-bit vector of [2 x double]. 628 /// \returns A 128-bit vector containing the comparison results. 629 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a, 630 __m128d __b) { 631 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a); 632 } 633 634 /// Compares each of the corresponding double-precision values of the 635 /// 128-bit vectors of [2 x double] to determine if the values in the first 636 /// operand are not greater than or equal to those in the second operand. 637 /// 638 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 639 /// 640 /// \headerfile <x86intrin.h> 641 /// 642 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction. 643 /// 644 /// \param __a 645 /// A 128-bit vector of [2 x double]. 646 /// \param __b 647 /// A 128-bit vector of [2 x double]. 648 /// \returns A 128-bit vector containing the comparison results. 649 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a, 650 __m128d __b) { 651 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a); 652 } 653 654 /// Compares the lower double-precision floating-point values in each of 655 /// the two 128-bit floating-point vectors of [2 x double] for equality. 656 /// 657 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 658 /// 659 /// \headerfile <x86intrin.h> 660 /// 661 /// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction. 662 /// 663 /// \param __a 664 /// A 128-bit vector of [2 x double]. The lower double-precision value is 665 /// compared to the lower double-precision value of \a __b. 666 /// \param __b 667 /// A 128-bit vector of [2 x double]. The lower double-precision value is 668 /// compared to the lower double-precision value of \a __a. 669 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 670 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 671 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a, 672 __m128d __b) { 673 return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b); 674 } 675 676 /// Compares the lower double-precision floating-point values in each of 677 /// the two 128-bit floating-point vectors of [2 x double] to determine if 678 /// the value in the first parameter is less than the corresponding value in 679 /// the second parameter. 680 /// 681 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 682 /// 683 /// \headerfile <x86intrin.h> 684 /// 685 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction. 686 /// 687 /// \param __a 688 /// A 128-bit vector of [2 x double]. The lower double-precision value is 689 /// compared to the lower double-precision value of \a __b. 690 /// \param __b 691 /// A 128-bit vector of [2 x double]. The lower double-precision value is 692 /// compared to the lower double-precision value of \a __a. 693 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 694 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 695 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a, 696 __m128d __b) { 697 return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b); 698 } 699 700 /// Compares the lower double-precision floating-point values in each of 701 /// the two 128-bit floating-point vectors of [2 x double] to determine if 702 /// the value in the first parameter is less than or equal to the 703 /// corresponding value in the second parameter. 704 /// 705 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 706 /// 707 /// \headerfile <x86intrin.h> 708 /// 709 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction. 710 /// 711 /// \param __a 712 /// A 128-bit vector of [2 x double]. The lower double-precision value is 713 /// compared to the lower double-precision value of \a __b. 714 /// \param __b 715 /// A 128-bit vector of [2 x double]. The lower double-precision value is 716 /// compared to the lower double-precision value of \a __a. 717 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 718 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 719 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a, 720 __m128d __b) { 721 return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b); 722 } 723 724 /// Compares the lower double-precision floating-point values in each of 725 /// the two 128-bit floating-point vectors of [2 x double] to determine if 726 /// the value in the first parameter is greater than the corresponding value 727 /// in the second parameter. 728 /// 729 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 730 /// 731 /// \headerfile <x86intrin.h> 732 /// 733 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction. 734 /// 735 /// \param __a 736 /// A 128-bit vector of [2 x double]. The lower double-precision value is 737 /// compared to the lower double-precision value of \a __b. 738 /// \param __b 739 /// A 128-bit vector of [2 x double]. The lower double-precision value is 740 /// compared to the lower double-precision value of \a __a. 741 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 742 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 743 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a, 744 __m128d __b) { 745 __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a); 746 return __extension__(__m128d){__c[0], __a[1]}; 747 } 748 749 /// Compares the lower double-precision floating-point values in each of 750 /// the two 128-bit floating-point vectors of [2 x double] to determine if 751 /// the value in the first parameter is greater than or equal to the 752 /// corresponding value in the second parameter. 753 /// 754 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 755 /// 756 /// \headerfile <x86intrin.h> 757 /// 758 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction. 759 /// 760 /// \param __a 761 /// A 128-bit vector of [2 x double]. The lower double-precision value is 762 /// compared to the lower double-precision value of \a __b. 763 /// \param __b 764 /// A 128-bit vector of [2 x double]. The lower double-precision value is 765 /// compared to the lower double-precision value of \a __a. 766 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 767 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 768 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a, 769 __m128d __b) { 770 __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a); 771 return __extension__(__m128d){__c[0], __a[1]}; 772 } 773 774 /// Compares the lower double-precision floating-point values in each of 775 /// the two 128-bit floating-point vectors of [2 x double] to determine if 776 /// the value in the first parameter is "ordered" with respect to the 777 /// corresponding value in the second parameter. 778 /// 779 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair 780 /// of double-precision values are "ordered" with respect to each other if 781 /// neither value is a NaN. 782 /// 783 /// \headerfile <x86intrin.h> 784 /// 785 /// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction. 786 /// 787 /// \param __a 788 /// A 128-bit vector of [2 x double]. The lower double-precision value is 789 /// compared to the lower double-precision value of \a __b. 790 /// \param __b 791 /// A 128-bit vector of [2 x double]. The lower double-precision value is 792 /// compared to the lower double-precision value of \a __a. 793 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 794 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 795 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a, 796 __m128d __b) { 797 return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b); 798 } 799 800 /// Compares the lower double-precision floating-point values in each of 801 /// the two 128-bit floating-point vectors of [2 x double] to determine if 802 /// the value in the first parameter is "unordered" with respect to the 803 /// corresponding value in the second parameter. 804 /// 805 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair 806 /// of double-precision values are "unordered" with respect to each other if 807 /// one or both values are NaN. 808 /// 809 /// \headerfile <x86intrin.h> 810 /// 811 /// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c> 812 /// instruction. 813 /// 814 /// \param __a 815 /// A 128-bit vector of [2 x double]. The lower double-precision value is 816 /// compared to the lower double-precision value of \a __b. 817 /// \param __b 818 /// A 128-bit vector of [2 x double]. The lower double-precision value is 819 /// compared to the lower double-precision value of \a __a. 820 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 821 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 822 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a, 823 __m128d __b) { 824 return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b); 825 } 826 827 /// Compares the lower double-precision floating-point values in each of 828 /// the two 128-bit floating-point vectors of [2 x double] to determine if 829 /// the value in the first parameter is unequal to the corresponding value in 830 /// the second parameter. 831 /// 832 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 833 /// 834 /// \headerfile <x86intrin.h> 835 /// 836 /// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction. 837 /// 838 /// \param __a 839 /// A 128-bit vector of [2 x double]. The lower double-precision value is 840 /// compared to the lower double-precision value of \a __b. 841 /// \param __b 842 /// A 128-bit vector of [2 x double]. The lower double-precision value is 843 /// compared to the lower double-precision value of \a __a. 844 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 845 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 846 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a, 847 __m128d __b) { 848 return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b); 849 } 850 851 /// Compares the lower double-precision floating-point values in each of 852 /// the two 128-bit floating-point vectors of [2 x double] to determine if 853 /// the value in the first parameter is not less than the corresponding 854 /// value in the second parameter. 855 /// 856 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 857 /// 858 /// \headerfile <x86intrin.h> 859 /// 860 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction. 861 /// 862 /// \param __a 863 /// A 128-bit vector of [2 x double]. The lower double-precision value is 864 /// compared to the lower double-precision value of \a __b. 865 /// \param __b 866 /// A 128-bit vector of [2 x double]. The lower double-precision value is 867 /// compared to the lower double-precision value of \a __a. 868 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 869 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 870 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a, 871 __m128d __b) { 872 return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b); 873 } 874 875 /// Compares the lower double-precision floating-point values in each of 876 /// the two 128-bit floating-point vectors of [2 x double] to determine if 877 /// the value in the first parameter is not less than or equal to the 878 /// corresponding value in the second parameter. 879 /// 880 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 881 /// 882 /// \headerfile <x86intrin.h> 883 /// 884 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction. 885 /// 886 /// \param __a 887 /// A 128-bit vector of [2 x double]. The lower double-precision value is 888 /// compared to the lower double-precision value of \a __b. 889 /// \param __b 890 /// A 128-bit vector of [2 x double]. The lower double-precision value is 891 /// compared to the lower double-precision value of \a __a. 892 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 893 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 894 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a, 895 __m128d __b) { 896 return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b); 897 } 898 899 /// Compares the lower double-precision floating-point values in each of 900 /// the two 128-bit floating-point vectors of [2 x double] to determine if 901 /// the value in the first parameter is not greater than the corresponding 902 /// value in the second parameter. 903 /// 904 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 905 /// 906 /// \headerfile <x86intrin.h> 907 /// 908 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction. 909 /// 910 /// \param __a 911 /// A 128-bit vector of [2 x double]. The lower double-precision value is 912 /// compared to the lower double-precision value of \a __b. 913 /// \param __b 914 /// A 128-bit vector of [2 x double]. The lower double-precision value is 915 /// compared to the lower double-precision value of \a __a. 916 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 917 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 918 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a, 919 __m128d __b) { 920 __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a); 921 return __extension__(__m128d){__c[0], __a[1]}; 922 } 923 924 /// Compares the lower double-precision floating-point values in each of 925 /// the two 128-bit floating-point vectors of [2 x double] to determine if 926 /// the value in the first parameter is not greater than or equal to the 927 /// corresponding value in the second parameter. 928 /// 929 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 930 /// 931 /// \headerfile <x86intrin.h> 932 /// 933 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction. 934 /// 935 /// \param __a 936 /// A 128-bit vector of [2 x double]. The lower double-precision value is 937 /// compared to the lower double-precision value of \a __b. 938 /// \param __b 939 /// A 128-bit vector of [2 x double]. The lower double-precision value is 940 /// compared to the lower double-precision value of \a __a. 941 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 942 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 943 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a, 944 __m128d __b) { 945 __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a); 946 return __extension__(__m128d){__c[0], __a[1]}; 947 } 948 949 /// Compares the lower double-precision floating-point values in each of 950 /// the two 128-bit floating-point vectors of [2 x double] for equality. 951 /// 952 /// The comparison yields 0 for false, 1 for true. If either of the two 953 /// lower double-precision values is NaN, 0 is returned. 954 /// 955 /// \headerfile <x86intrin.h> 956 /// 957 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 958 /// 959 /// \param __a 960 /// A 128-bit vector of [2 x double]. The lower double-precision value is 961 /// compared to the lower double-precision value of \a __b. 962 /// \param __b 963 /// A 128-bit vector of [2 x double]. The lower double-precision value is 964 /// compared to the lower double-precision value of \a __a. 965 /// \returns An integer containing the comparison results. If either of the two 966 /// lower double-precision values is NaN, 0 is returned. 967 static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a, 968 __m128d __b) { 969 return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b); 970 } 971 972 /// Compares the lower double-precision floating-point values in each of 973 /// the two 128-bit floating-point vectors of [2 x double] to determine if 974 /// the value in the first parameter is less than the corresponding value in 975 /// the second parameter. 976 /// 977 /// The comparison yields 0 for false, 1 for true. If either of the two 978 /// lower double-precision values is NaN, 0 is returned. 979 /// 980 /// \headerfile <x86intrin.h> 981 /// 982 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 983 /// 984 /// \param __a 985 /// A 128-bit vector of [2 x double]. The lower double-precision value is 986 /// compared to the lower double-precision value of \a __b. 987 /// \param __b 988 /// A 128-bit vector of [2 x double]. The lower double-precision value is 989 /// compared to the lower double-precision value of \a __a. 990 /// \returns An integer containing the comparison results. If either of the two 991 /// lower double-precision values is NaN, 0 is returned. 992 static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a, 993 __m128d __b) { 994 return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b); 995 } 996 997 /// Compares the lower double-precision floating-point values in each of 998 /// the two 128-bit floating-point vectors of [2 x double] to determine if 999 /// the value in the first parameter is less than or equal to the 1000 /// corresponding value in the second parameter. 1001 /// 1002 /// The comparison yields 0 for false, 1 for true. If either of the two 1003 /// lower double-precision values is NaN, 0 is returned. 1004 /// 1005 /// \headerfile <x86intrin.h> 1006 /// 1007 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1008 /// 1009 /// \param __a 1010 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1011 /// compared to the lower double-precision value of \a __b. 1012 /// \param __b 1013 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1014 /// compared to the lower double-precision value of \a __a. 1015 /// \returns An integer containing the comparison results. If either of the two 1016 /// lower double-precision values is NaN, 0 is returned. 1017 static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a, 1018 __m128d __b) { 1019 return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b); 1020 } 1021 1022 /// Compares the lower double-precision floating-point values in each of 1023 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1024 /// the value in the first parameter is greater than the corresponding value 1025 /// in the second parameter. 1026 /// 1027 /// The comparison yields 0 for false, 1 for true. If either of the two 1028 /// lower double-precision values is NaN, 0 is returned. 1029 /// 1030 /// \headerfile <x86intrin.h> 1031 /// 1032 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1033 /// 1034 /// \param __a 1035 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1036 /// compared to the lower double-precision value of \a __b. 1037 /// \param __b 1038 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1039 /// compared to the lower double-precision value of \a __a. 1040 /// \returns An integer containing the comparison results. If either of the two 1041 /// lower double-precision values is NaN, 0 is returned. 1042 static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a, 1043 __m128d __b) { 1044 return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b); 1045 } 1046 1047 /// Compares the lower double-precision floating-point values in each of 1048 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1049 /// the value in the first parameter is greater than or equal to the 1050 /// corresponding value in the second parameter. 1051 /// 1052 /// The comparison yields 0 for false, 1 for true. If either of the two 1053 /// lower double-precision values is NaN, 0 is returned. 1054 /// 1055 /// \headerfile <x86intrin.h> 1056 /// 1057 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1058 /// 1059 /// \param __a 1060 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1061 /// compared to the lower double-precision value of \a __b. 1062 /// \param __b 1063 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1064 /// compared to the lower double-precision value of \a __a. 1065 /// \returns An integer containing the comparison results. If either of the two 1066 /// lower double-precision values is NaN, 0 is returned. 1067 static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a, 1068 __m128d __b) { 1069 return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b); 1070 } 1071 1072 /// Compares the lower double-precision floating-point values in each of 1073 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1074 /// the value in the first parameter is unequal to the corresponding value in 1075 /// the second parameter. 1076 /// 1077 /// The comparison yields 0 for false, 1 for true. If either of the two 1078 /// lower double-precision values is NaN, 1 is returned. 1079 /// 1080 /// \headerfile <x86intrin.h> 1081 /// 1082 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1083 /// 1084 /// \param __a 1085 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1086 /// compared to the lower double-precision value of \a __b. 1087 /// \param __b 1088 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1089 /// compared to the lower double-precision value of \a __a. 1090 /// \returns An integer containing the comparison results. If either of the two 1091 /// lower double-precision values is NaN, 1 is returned. 1092 static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a, 1093 __m128d __b) { 1094 return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b); 1095 } 1096 1097 /// Compares the lower double-precision floating-point values in each of 1098 /// the two 128-bit floating-point vectors of [2 x double] for equality. The 1099 /// comparison yields 0 for false, 1 for true. 1100 /// 1101 /// If either of the two lower double-precision values is NaN, 0 is returned. 1102 /// 1103 /// \headerfile <x86intrin.h> 1104 /// 1105 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1106 /// 1107 /// \param __a 1108 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1109 /// compared to the lower double-precision value of \a __b. 1110 /// \param __b 1111 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1112 /// compared to the lower double-precision value of \a __a. 1113 /// \returns An integer containing the comparison results. If either of the two 1114 /// lower double-precision values is NaN, 0 is returned. 1115 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a, 1116 __m128d __b) { 1117 return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b); 1118 } 1119 1120 /// Compares the lower double-precision floating-point values in each of 1121 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1122 /// the value in the first parameter is less than the corresponding value in 1123 /// the second parameter. 1124 /// 1125 /// The comparison yields 0 for false, 1 for true. If either of the two lower 1126 /// double-precision values is NaN, 0 is returned. 1127 /// 1128 /// \headerfile <x86intrin.h> 1129 /// 1130 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1131 /// 1132 /// \param __a 1133 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1134 /// compared to the lower double-precision value of \a __b. 1135 /// \param __b 1136 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1137 /// compared to the lower double-precision value of \a __a. 1138 /// \returns An integer containing the comparison results. If either of the two 1139 /// lower double-precision values is NaN, 0 is returned. 1140 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a, 1141 __m128d __b) { 1142 return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b); 1143 } 1144 1145 /// Compares the lower double-precision floating-point values in each of 1146 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1147 /// the value in the first parameter is less than or equal to the 1148 /// corresponding value in the second parameter. 1149 /// 1150 /// The comparison yields 0 for false, 1 for true. If either of the two lower 1151 /// double-precision values is NaN, 0 is returned. 1152 /// 1153 /// \headerfile <x86intrin.h> 1154 /// 1155 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1156 /// 1157 /// \param __a 1158 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1159 /// compared to the lower double-precision value of \a __b. 1160 /// \param __b 1161 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1162 /// compared to the lower double-precision value of \a __a. 1163 /// \returns An integer containing the comparison results. If either of the two 1164 /// lower double-precision values is NaN, 0 is returned. 1165 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a, 1166 __m128d __b) { 1167 return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b); 1168 } 1169 1170 /// Compares the lower double-precision floating-point values in each of 1171 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1172 /// the value in the first parameter is greater than the corresponding value 1173 /// in the second parameter. 1174 /// 1175 /// The comparison yields 0 for false, 1 for true. If either of the two lower 1176 /// double-precision values is NaN, 0 is returned. 1177 /// 1178 /// \headerfile <x86intrin.h> 1179 /// 1180 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1181 /// 1182 /// \param __a 1183 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1184 /// compared to the lower double-precision value of \a __b. 1185 /// \param __b 1186 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1187 /// compared to the lower double-precision value of \a __a. 1188 /// \returns An integer containing the comparison results. If either of the two 1189 /// lower double-precision values is NaN, 0 is returned. 1190 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a, 1191 __m128d __b) { 1192 return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b); 1193 } 1194 1195 /// Compares the lower double-precision floating-point values in each of 1196 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1197 /// the value in the first parameter is greater than or equal to the 1198 /// corresponding value in the second parameter. 1199 /// 1200 /// The comparison yields 0 for false, 1 for true. If either of the two 1201 /// lower double-precision values is NaN, 0 is returned. 1202 /// 1203 /// \headerfile <x86intrin.h> 1204 /// 1205 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1206 /// 1207 /// \param __a 1208 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1209 /// compared to the lower double-precision value of \a __b. 1210 /// \param __b 1211 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1212 /// compared to the lower double-precision value of \a __a. 1213 /// \returns An integer containing the comparison results. If either of the two 1214 /// lower double-precision values is NaN, 0 is returned. 1215 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a, 1216 __m128d __b) { 1217 return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b); 1218 } 1219 1220 /// Compares the lower double-precision floating-point values in each of 1221 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1222 /// the value in the first parameter is unequal to the corresponding value in 1223 /// the second parameter. 1224 /// 1225 /// The comparison yields 0 for false, 1 for true. If either of the two lower 1226 /// double-precision values is NaN, 1 is returned. 1227 /// 1228 /// \headerfile <x86intrin.h> 1229 /// 1230 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1231 /// 1232 /// \param __a 1233 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1234 /// compared to the lower double-precision value of \a __b. 1235 /// \param __b 1236 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1237 /// compared to the lower double-precision value of \a __a. 1238 /// \returns An integer containing the comparison result. If either of the two 1239 /// lower double-precision values is NaN, 1 is returned. 1240 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a, 1241 __m128d __b) { 1242 return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b); 1243 } 1244 1245 /// Converts the two double-precision floating-point elements of a 1246 /// 128-bit vector of [2 x double] into two single-precision floating-point 1247 /// values, returned in the lower 64 bits of a 128-bit vector of [4 x float]. 1248 /// The upper 64 bits of the result vector are set to zero. 1249 /// 1250 /// \headerfile <x86intrin.h> 1251 /// 1252 /// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction. 1253 /// 1254 /// \param __a 1255 /// A 128-bit vector of [2 x double]. 1256 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the 1257 /// converted values. The upper 64 bits are set to zero. 1258 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a) { 1259 return __builtin_ia32_cvtpd2ps((__v2df)__a); 1260 } 1261 1262 /// Converts the lower two single-precision floating-point elements of a 1263 /// 128-bit vector of [4 x float] into two double-precision floating-point 1264 /// values, returned in a 128-bit vector of [2 x double]. The upper two 1265 /// elements of the input vector are unused. 1266 /// 1267 /// \headerfile <x86intrin.h> 1268 /// 1269 /// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction. 1270 /// 1271 /// \param __a 1272 /// A 128-bit vector of [4 x float]. The lower two single-precision 1273 /// floating-point elements are converted to double-precision values. The 1274 /// upper two elements are unused. 1275 /// \returns A 128-bit vector of [2 x double] containing the converted values. 1276 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a) { 1277 return (__m128d) __builtin_convertvector( 1278 __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df); 1279 } 1280 1281 /// Converts the lower two integer elements of a 128-bit vector of 1282 /// [4 x i32] into two double-precision floating-point values, returned in a 1283 /// 128-bit vector of [2 x double]. 1284 /// 1285 /// The upper two elements of the input vector are unused. 1286 /// 1287 /// \headerfile <x86intrin.h> 1288 /// 1289 /// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction. 1290 /// 1291 /// \param __a 1292 /// A 128-bit integer vector of [4 x i32]. The lower two integer elements are 1293 /// converted to double-precision values. 1294 /// 1295 /// The upper two elements are unused. 1296 /// \returns A 128-bit vector of [2 x double] containing the converted values. 1297 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a) { 1298 return (__m128d) __builtin_convertvector( 1299 __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df); 1300 } 1301 1302 /// Converts the two double-precision floating-point elements of a 1303 /// 128-bit vector of [2 x double] into two signed 32-bit integer values, 1304 /// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper 1305 /// 64 bits of the result vector are set to zero. 1306 /// 1307 /// \headerfile <x86intrin.h> 1308 /// 1309 /// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction. 1310 /// 1311 /// \param __a 1312 /// A 128-bit vector of [2 x double]. 1313 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the 1314 /// converted values. The upper 64 bits are set to zero. 1315 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a) { 1316 return __builtin_ia32_cvtpd2dq((__v2df)__a); 1317 } 1318 1319 /// Converts the low-order element of a 128-bit vector of [2 x double] 1320 /// into a 32-bit signed integer value. 1321 /// 1322 /// \headerfile <x86intrin.h> 1323 /// 1324 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction. 1325 /// 1326 /// \param __a 1327 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 1328 /// conversion. 1329 /// \returns A 32-bit signed integer containing the converted value. 1330 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a) { 1331 return __builtin_ia32_cvtsd2si((__v2df)__a); 1332 } 1333 1334 /// Converts the lower double-precision floating-point element of a 1335 /// 128-bit vector of [2 x double], in the second parameter, into a 1336 /// single-precision floating-point value, returned in the lower 32 bits of a 1337 /// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are 1338 /// copied from the upper 96 bits of the first parameter. 1339 /// 1340 /// \headerfile <x86intrin.h> 1341 /// 1342 /// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction. 1343 /// 1344 /// \param __a 1345 /// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are 1346 /// copied to the upper 96 bits of the result. 1347 /// \param __b 1348 /// A 128-bit vector of [2 x double]. The lower double-precision 1349 /// floating-point element is used in the conversion. 1350 /// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the 1351 /// converted value from the second parameter. The upper 96 bits are copied 1352 /// from the upper 96 bits of the first parameter. 1353 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a, 1354 __m128d __b) { 1355 return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b); 1356 } 1357 1358 /// Converts a 32-bit signed integer value, in the second parameter, into 1359 /// a double-precision floating-point value, returned in the lower 64 bits of 1360 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector 1361 /// are copied from the upper 64 bits of the first parameter. 1362 /// 1363 /// \headerfile <x86intrin.h> 1364 /// 1365 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction. 1366 /// 1367 /// \param __a 1368 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are 1369 /// copied to the upper 64 bits of the result. 1370 /// \param __b 1371 /// A 32-bit signed integer containing the value to be converted. 1372 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the 1373 /// converted value from the second parameter. The upper 64 bits are copied 1374 /// from the upper 64 bits of the first parameter. 1375 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a, 1376 int __b) { 1377 __a[0] = __b; 1378 return __a; 1379 } 1380 1381 /// Converts the lower single-precision floating-point element of a 1382 /// 128-bit vector of [4 x float], in the second parameter, into a 1383 /// double-precision floating-point value, returned in the lower 64 bits of 1384 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector 1385 /// are copied from the upper 64 bits of the first parameter. 1386 /// 1387 /// \headerfile <x86intrin.h> 1388 /// 1389 /// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction. 1390 /// 1391 /// \param __a 1392 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are 1393 /// copied to the upper 64 bits of the result. 1394 /// \param __b 1395 /// A 128-bit vector of [4 x float]. The lower single-precision 1396 /// floating-point element is used in the conversion. 1397 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the 1398 /// converted value from the second parameter. The upper 64 bits are copied 1399 /// from the upper 64 bits of the first parameter. 1400 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a, 1401 __m128 __b) { 1402 __a[0] = __b[0]; 1403 return __a; 1404 } 1405 1406 /// Converts the two double-precision floating-point elements of a 1407 /// 128-bit vector of [2 x double] into two signed 32-bit integer values, 1408 /// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. 1409 /// 1410 /// If the result of either conversion is inexact, the result is truncated 1411 /// (rounded towards zero) regardless of the current MXCSR setting. The upper 1412 /// 64 bits of the result vector are set to zero. 1413 /// 1414 /// \headerfile <x86intrin.h> 1415 /// 1416 /// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c> 1417 /// instruction. 1418 /// 1419 /// \param __a 1420 /// A 128-bit vector of [2 x double]. 1421 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the 1422 /// converted values. The upper 64 bits are set to zero. 1423 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a) { 1424 return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a); 1425 } 1426 1427 /// Converts the low-order element of a [2 x double] vector into a 32-bit 1428 /// signed integer value, truncating the result when it is inexact. 1429 /// 1430 /// \headerfile <x86intrin.h> 1431 /// 1432 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c> 1433 /// instruction. 1434 /// 1435 /// \param __a 1436 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 1437 /// conversion. 1438 /// \returns A 32-bit signed integer containing the converted value. 1439 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) { 1440 return __builtin_ia32_cvttsd2si((__v2df)__a); 1441 } 1442 1443 /// Converts the two double-precision floating-point elements of a 1444 /// 128-bit vector of [2 x double] into two signed 32-bit integer values, 1445 /// returned in a 64-bit vector of [2 x i32]. 1446 /// 1447 /// \headerfile <x86intrin.h> 1448 /// 1449 /// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction. 1450 /// 1451 /// \param __a 1452 /// A 128-bit vector of [2 x double]. 1453 /// \returns A 64-bit vector of [2 x i32] containing the converted values. 1454 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) { 1455 return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a); 1456 } 1457 1458 /// Converts the two double-precision floating-point elements of a 1459 /// 128-bit vector of [2 x double] into two signed 32-bit integer values, 1460 /// returned in a 64-bit vector of [2 x i32]. 1461 /// 1462 /// If the result of either conversion is inexact, the result is truncated 1463 /// (rounded towards zero) regardless of the current MXCSR setting. 1464 /// 1465 /// \headerfile <x86intrin.h> 1466 /// 1467 /// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction. 1468 /// 1469 /// \param __a 1470 /// A 128-bit vector of [2 x double]. 1471 /// \returns A 64-bit vector of [2 x i32] containing the converted values. 1472 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a) { 1473 return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a); 1474 } 1475 1476 /// Converts the two signed 32-bit integer elements of a 64-bit vector of 1477 /// [2 x i32] into two double-precision floating-point values, returned in a 1478 /// 128-bit vector of [2 x double]. 1479 /// 1480 /// \headerfile <x86intrin.h> 1481 /// 1482 /// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction. 1483 /// 1484 /// \param __a 1485 /// A 64-bit vector of [2 x i32]. 1486 /// \returns A 128-bit vector of [2 x double] containing the converted values. 1487 static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a) { 1488 return __builtin_ia32_cvtpi2pd((__v2si)__a); 1489 } 1490 1491 /// Returns the low-order element of a 128-bit vector of [2 x double] as 1492 /// a double-precision floating-point value. 1493 /// 1494 /// \headerfile <x86intrin.h> 1495 /// 1496 /// This intrinsic has no corresponding instruction. 1497 /// 1498 /// \param __a 1499 /// A 128-bit vector of [2 x double]. The lower 64 bits are returned. 1500 /// \returns A double-precision floating-point value copied from the lower 64 1501 /// bits of \a __a. 1502 static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a) { 1503 return __a[0]; 1504 } 1505 1506 /// Loads a 128-bit floating-point vector of [2 x double] from an aligned 1507 /// memory location. 1508 /// 1509 /// \headerfile <x86intrin.h> 1510 /// 1511 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction. 1512 /// 1513 /// \param __dp 1514 /// A pointer to a 128-bit memory location. The address of the memory 1515 /// location has to be 16-byte aligned. 1516 /// \returns A 128-bit vector of [2 x double] containing the loaded values. 1517 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp) { 1518 return *(const __m128d *)__dp; 1519 } 1520 1521 /// Loads a double-precision floating-point value from a specified memory 1522 /// location and duplicates it to both vector elements of a 128-bit vector of 1523 /// [2 x double]. 1524 /// 1525 /// \headerfile <x86intrin.h> 1526 /// 1527 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction. 1528 /// 1529 /// \param __dp 1530 /// A pointer to a memory location containing a double-precision value. 1531 /// \returns A 128-bit vector of [2 x double] containing the loaded and 1532 /// duplicated values. 1533 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp) { 1534 struct __mm_load1_pd_struct { 1535 double __u; 1536 } __attribute__((__packed__, __may_alias__)); 1537 double __u = ((const struct __mm_load1_pd_struct *)__dp)->__u; 1538 return __extension__(__m128d){__u, __u}; 1539 } 1540 1541 #define _mm_load_pd1(dp) _mm_load1_pd(dp) 1542 1543 /// Loads two double-precision values, in reverse order, from an aligned 1544 /// memory location into a 128-bit vector of [2 x double]. 1545 /// 1546 /// \headerfile <x86intrin.h> 1547 /// 1548 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction + 1549 /// needed shuffling instructions. In AVX mode, the shuffling may be combined 1550 /// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction. 1551 /// 1552 /// \param __dp 1553 /// A 16-byte aligned pointer to an array of double-precision values to be 1554 /// loaded in reverse order. 1555 /// \returns A 128-bit vector of [2 x double] containing the reversed loaded 1556 /// values. 1557 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp) { 1558 __m128d __u = *(const __m128d *)__dp; 1559 return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0); 1560 } 1561 1562 /// Loads a 128-bit floating-point vector of [2 x double] from an 1563 /// unaligned memory location. 1564 /// 1565 /// \headerfile <x86intrin.h> 1566 /// 1567 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction. 1568 /// 1569 /// \param __dp 1570 /// A pointer to a 128-bit memory location. The address of the memory 1571 /// location does not have to be aligned. 1572 /// \returns A 128-bit vector of [2 x double] containing the loaded values. 1573 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp) { 1574 struct __loadu_pd { 1575 __m128d_u __v; 1576 } __attribute__((__packed__, __may_alias__)); 1577 return ((const struct __loadu_pd *)__dp)->__v; 1578 } 1579 1580 /// Loads a 64-bit integer value to the low element of a 128-bit integer 1581 /// vector and clears the upper element. 1582 /// 1583 /// \headerfile <x86intrin.h> 1584 /// 1585 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 1586 /// 1587 /// \param __a 1588 /// A pointer to a 64-bit memory location. The address of the memory 1589 /// location does not have to be aligned. 1590 /// \returns A 128-bit vector of [2 x i64] containing the loaded value. 1591 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a) { 1592 struct __loadu_si64 { 1593 long long __v; 1594 } __attribute__((__packed__, __may_alias__)); 1595 long long __u = ((const struct __loadu_si64 *)__a)->__v; 1596 return __extension__(__m128i)(__v2di){__u, 0LL}; 1597 } 1598 1599 /// Loads a 32-bit integer value to the low element of a 128-bit integer 1600 /// vector and clears the upper element. 1601 /// 1602 /// \headerfile <x86intrin.h> 1603 /// 1604 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. 1605 /// 1606 /// \param __a 1607 /// A pointer to a 32-bit memory location. The address of the memory 1608 /// location does not have to be aligned. 1609 /// \returns A 128-bit vector of [4 x i32] containing the loaded value. 1610 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si32(void const *__a) { 1611 struct __loadu_si32 { 1612 int __v; 1613 } __attribute__((__packed__, __may_alias__)); 1614 int __u = ((const struct __loadu_si32 *)__a)->__v; 1615 return __extension__(__m128i)(__v4si){__u, 0, 0, 0}; 1616 } 1617 1618 /// Loads a 16-bit integer value to the low element of a 128-bit integer 1619 /// vector and clears the upper element. 1620 /// 1621 /// \headerfile <x86intrin.h> 1622 /// 1623 /// This intrinsic does not correspond to a specific instruction. 1624 /// 1625 /// \param __a 1626 /// A pointer to a 16-bit memory location. The address of the memory 1627 /// location does not have to be aligned. 1628 /// \returns A 128-bit vector of [8 x i16] containing the loaded value. 1629 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si16(void const *__a) { 1630 struct __loadu_si16 { 1631 short __v; 1632 } __attribute__((__packed__, __may_alias__)); 1633 short __u = ((const struct __loadu_si16 *)__a)->__v; 1634 return __extension__(__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0}; 1635 } 1636 1637 /// Loads a 64-bit double-precision value to the low element of a 1638 /// 128-bit integer vector and clears the upper element. 1639 /// 1640 /// \headerfile <x86intrin.h> 1641 /// 1642 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction. 1643 /// 1644 /// \param __dp 1645 /// A pointer to a memory location containing a double-precision value. 1646 /// The address of the memory location does not have to be aligned. 1647 /// \returns A 128-bit vector of [2 x double] containing the loaded value. 1648 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp) { 1649 struct __mm_load_sd_struct { 1650 double __u; 1651 } __attribute__((__packed__, __may_alias__)); 1652 double __u = ((const struct __mm_load_sd_struct *)__dp)->__u; 1653 return __extension__(__m128d){__u, 0}; 1654 } 1655 1656 /// Loads a double-precision value into the high-order bits of a 128-bit 1657 /// vector of [2 x double]. The low-order bits are copied from the low-order 1658 /// bits of the first operand. 1659 /// 1660 /// \headerfile <x86intrin.h> 1661 /// 1662 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction. 1663 /// 1664 /// \param __a 1665 /// A 128-bit vector of [2 x double]. \n 1666 /// Bits [63:0] are written to bits [63:0] of the result. 1667 /// \param __dp 1668 /// A pointer to a 64-bit memory location containing a double-precision 1669 /// floating-point value that is loaded. The loaded value is written to bits 1670 /// [127:64] of the result. The address of the memory location does not have 1671 /// to be aligned. 1672 /// \returns A 128-bit vector of [2 x double] containing the moved values. 1673 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a, 1674 double const *__dp) { 1675 struct __mm_loadh_pd_struct { 1676 double __u; 1677 } __attribute__((__packed__, __may_alias__)); 1678 double __u = ((const struct __mm_loadh_pd_struct *)__dp)->__u; 1679 return __extension__(__m128d){__a[0], __u}; 1680 } 1681 1682 /// Loads a double-precision value into the low-order bits of a 128-bit 1683 /// vector of [2 x double]. The high-order bits are copied from the 1684 /// high-order bits of the first operand. 1685 /// 1686 /// \headerfile <x86intrin.h> 1687 /// 1688 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction. 1689 /// 1690 /// \param __a 1691 /// A 128-bit vector of [2 x double]. \n 1692 /// Bits [127:64] are written to bits [127:64] of the result. 1693 /// \param __dp 1694 /// A pointer to a 64-bit memory location containing a double-precision 1695 /// floating-point value that is loaded. The loaded value is written to bits 1696 /// [63:0] of the result. The address of the memory location does not have to 1697 /// be aligned. 1698 /// \returns A 128-bit vector of [2 x double] containing the moved values. 1699 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a, 1700 double const *__dp) { 1701 struct __mm_loadl_pd_struct { 1702 double __u; 1703 } __attribute__((__packed__, __may_alias__)); 1704 double __u = ((const struct __mm_loadl_pd_struct *)__dp)->__u; 1705 return __extension__(__m128d){__u, __a[1]}; 1706 } 1707 1708 /// Constructs a 128-bit floating-point vector of [2 x double] with 1709 /// unspecified content. This could be used as an argument to another 1710 /// intrinsic function where the argument is required but the value is not 1711 /// actually used. 1712 /// 1713 /// \headerfile <x86intrin.h> 1714 /// 1715 /// This intrinsic has no corresponding instruction. 1716 /// 1717 /// \returns A 128-bit floating-point vector of [2 x double] with unspecified 1718 /// content. 1719 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void) { 1720 return (__m128d)__builtin_ia32_undef128(); 1721 } 1722 1723 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower 1724 /// 64 bits of the vector are initialized with the specified double-precision 1725 /// floating-point value. The upper 64 bits are set to zero. 1726 /// 1727 /// \headerfile <x86intrin.h> 1728 /// 1729 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 1730 /// 1731 /// \param __w 1732 /// A double-precision floating-point value used to initialize the lower 64 1733 /// bits of the result. 1734 /// \returns An initialized 128-bit floating-point vector of [2 x double]. The 1735 /// lower 64 bits contain the value of the parameter. The upper 64 bits are 1736 /// set to zero. 1737 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w) { 1738 return __extension__(__m128d){__w, 0}; 1739 } 1740 1741 /// Constructs a 128-bit floating-point vector of [2 x double], with each 1742 /// of the two double-precision floating-point vector elements set to the 1743 /// specified double-precision floating-point value. 1744 /// 1745 /// \headerfile <x86intrin.h> 1746 /// 1747 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction. 1748 /// 1749 /// \param __w 1750 /// A double-precision floating-point value used to initialize each vector 1751 /// element of the result. 1752 /// \returns An initialized 128-bit floating-point vector of [2 x double]. 1753 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w) { 1754 return __extension__(__m128d){__w, __w}; 1755 } 1756 1757 /// Constructs a 128-bit floating-point vector of [2 x double], with each 1758 /// of the two double-precision floating-point vector elements set to the 1759 /// specified double-precision floating-point value. 1760 /// 1761 /// \headerfile <x86intrin.h> 1762 /// 1763 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction. 1764 /// 1765 /// \param __w 1766 /// A double-precision floating-point value used to initialize each vector 1767 /// element of the result. 1768 /// \returns An initialized 128-bit floating-point vector of [2 x double]. 1769 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w) { 1770 return _mm_set1_pd(__w); 1771 } 1772 1773 /// Constructs a 128-bit floating-point vector of [2 x double] 1774 /// initialized with the specified double-precision floating-point values. 1775 /// 1776 /// \headerfile <x86intrin.h> 1777 /// 1778 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction. 1779 /// 1780 /// \param __w 1781 /// A double-precision floating-point value used to initialize the upper 64 1782 /// bits of the result. 1783 /// \param __x 1784 /// A double-precision floating-point value used to initialize the lower 64 1785 /// bits of the result. 1786 /// \returns An initialized 128-bit floating-point vector of [2 x double]. 1787 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w, 1788 double __x) { 1789 return __extension__(__m128d){__x, __w}; 1790 } 1791 1792 /// Constructs a 128-bit floating-point vector of [2 x double], 1793 /// initialized in reverse order with the specified double-precision 1794 /// floating-point values. 1795 /// 1796 /// \headerfile <x86intrin.h> 1797 /// 1798 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction. 1799 /// 1800 /// \param __w 1801 /// A double-precision floating-point value used to initialize the lower 64 1802 /// bits of the result. 1803 /// \param __x 1804 /// A double-precision floating-point value used to initialize the upper 64 1805 /// bits of the result. 1806 /// \returns An initialized 128-bit floating-point vector of [2 x double]. 1807 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w, 1808 double __x) { 1809 return __extension__(__m128d){__w, __x}; 1810 } 1811 1812 /// Constructs a 128-bit floating-point vector of [2 x double] 1813 /// initialized to zero. 1814 /// 1815 /// \headerfile <x86intrin.h> 1816 /// 1817 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction. 1818 /// 1819 /// \returns An initialized 128-bit floating-point vector of [2 x double] with 1820 /// all elements set to zero. 1821 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void) { 1822 return __extension__(__m128d){0.0, 0.0}; 1823 } 1824 1825 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower 1826 /// 64 bits are set to the lower 64 bits of the second parameter. The upper 1827 /// 64 bits are set to the upper 64 bits of the first parameter. 1828 /// 1829 /// \headerfile <x86intrin.h> 1830 /// 1831 /// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction. 1832 /// 1833 /// \param __a 1834 /// A 128-bit vector of [2 x double]. The upper 64 bits are written to the 1835 /// upper 64 bits of the result. 1836 /// \param __b 1837 /// A 128-bit vector of [2 x double]. The lower 64 bits are written to the 1838 /// lower 64 bits of the result. 1839 /// \returns A 128-bit vector of [2 x double] containing the moved values. 1840 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a, 1841 __m128d __b) { 1842 __a[0] = __b[0]; 1843 return __a; 1844 } 1845 1846 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a 1847 /// memory location. 1848 /// 1849 /// \headerfile <x86intrin.h> 1850 /// 1851 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction. 1852 /// 1853 /// \param __dp 1854 /// A pointer to a 64-bit memory location. 1855 /// \param __a 1856 /// A 128-bit vector of [2 x double] containing the value to be stored. 1857 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp, 1858 __m128d __a) { 1859 struct __mm_store_sd_struct { 1860 double __u; 1861 } __attribute__((__packed__, __may_alias__)); 1862 ((struct __mm_store_sd_struct *)__dp)->__u = __a[0]; 1863 } 1864 1865 /// Moves packed double-precision values from a 128-bit vector of 1866 /// [2 x double] to a memory location. 1867 /// 1868 /// \headerfile <x86intrin.h> 1869 /// 1870 /// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction. 1871 /// 1872 /// \param __dp 1873 /// A pointer to an aligned memory location that can store two 1874 /// double-precision values. 1875 /// \param __a 1876 /// A packed 128-bit vector of [2 x double] containing the values to be 1877 /// moved. 1878 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp, 1879 __m128d __a) { 1880 *(__m128d *)__dp = __a; 1881 } 1882 1883 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to 1884 /// the upper and lower 64 bits of a memory location. 1885 /// 1886 /// \headerfile <x86intrin.h> 1887 /// 1888 /// This intrinsic corresponds to the 1889 /// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction. 1890 /// 1891 /// \param __dp 1892 /// A pointer to a memory location that can store two double-precision 1893 /// values. 1894 /// \param __a 1895 /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each 1896 /// of the values in \a __dp. 1897 static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp, 1898 __m128d __a) { 1899 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0); 1900 _mm_store_pd(__dp, __a); 1901 } 1902 1903 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to 1904 /// the upper and lower 64 bits of a memory location. 1905 /// 1906 /// \headerfile <x86intrin.h> 1907 /// 1908 /// This intrinsic corresponds to the 1909 /// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction. 1910 /// 1911 /// \param __dp 1912 /// A pointer to a memory location that can store two double-precision 1913 /// values. 1914 /// \param __a 1915 /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each 1916 /// of the values in \a __dp. 1917 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp, 1918 __m128d __a) { 1919 _mm_store1_pd(__dp, __a); 1920 } 1921 1922 /// Stores a 128-bit vector of [2 x double] into an unaligned memory 1923 /// location. 1924 /// 1925 /// \headerfile <x86intrin.h> 1926 /// 1927 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction. 1928 /// 1929 /// \param __dp 1930 /// A pointer to a 128-bit memory location. The address of the memory 1931 /// location does not have to be aligned. 1932 /// \param __a 1933 /// A 128-bit vector of [2 x double] containing the values to be stored. 1934 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, 1935 __m128d __a) { 1936 struct __storeu_pd { 1937 __m128d_u __v; 1938 } __attribute__((__packed__, __may_alias__)); 1939 ((struct __storeu_pd *)__dp)->__v = __a; 1940 } 1941 1942 /// Stores two double-precision values, in reverse order, from a 128-bit 1943 /// vector of [2 x double] to a 16-byte aligned memory location. 1944 /// 1945 /// \headerfile <x86intrin.h> 1946 /// 1947 /// This intrinsic corresponds to a shuffling instruction followed by a 1948 /// <c> VMOVAPD / MOVAPD </c> instruction. 1949 /// 1950 /// \param __dp 1951 /// A pointer to a 16-byte aligned memory location that can store two 1952 /// double-precision values. 1953 /// \param __a 1954 /// A 128-bit vector of [2 x double] containing the values to be reversed and 1955 /// stored. 1956 static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp, 1957 __m128d __a) { 1958 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0); 1959 *(__m128d *)__dp = __a; 1960 } 1961 1962 /// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a 1963 /// memory location. 1964 /// 1965 /// \headerfile <x86intrin.h> 1966 /// 1967 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction. 1968 /// 1969 /// \param __dp 1970 /// A pointer to a 64-bit memory location. 1971 /// \param __a 1972 /// A 128-bit vector of [2 x double] containing the value to be stored. 1973 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp, 1974 __m128d __a) { 1975 struct __mm_storeh_pd_struct { 1976 double __u; 1977 } __attribute__((__packed__, __may_alias__)); 1978 ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[1]; 1979 } 1980 1981 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a 1982 /// memory location. 1983 /// 1984 /// \headerfile <x86intrin.h> 1985 /// 1986 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction. 1987 /// 1988 /// \param __dp 1989 /// A pointer to a 64-bit memory location. 1990 /// \param __a 1991 /// A 128-bit vector of [2 x double] containing the value to be stored. 1992 static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp, 1993 __m128d __a) { 1994 struct __mm_storeh_pd_struct { 1995 double __u; 1996 } __attribute__((__packed__, __may_alias__)); 1997 ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[0]; 1998 } 1999 2000 /// Adds the corresponding elements of two 128-bit vectors of [16 x i8], 2001 /// saving the lower 8 bits of each sum in the corresponding element of a 2002 /// 128-bit result vector of [16 x i8]. 2003 /// 2004 /// The integer elements of both parameters can be either signed or unsigned. 2005 /// 2006 /// \headerfile <x86intrin.h> 2007 /// 2008 /// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction. 2009 /// 2010 /// \param __a 2011 /// A 128-bit vector of [16 x i8]. 2012 /// \param __b 2013 /// A 128-bit vector of [16 x i8]. 2014 /// \returns A 128-bit vector of [16 x i8] containing the sums of both 2015 /// parameters. 2016 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a, 2017 __m128i __b) { 2018 return (__m128i)((__v16qu)__a + (__v16qu)__b); 2019 } 2020 2021 /// Adds the corresponding elements of two 128-bit vectors of [8 x i16], 2022 /// saving the lower 16 bits of each sum in the corresponding element of a 2023 /// 128-bit result vector of [8 x i16]. 2024 /// 2025 /// The integer elements of both parameters can be either signed or unsigned. 2026 /// 2027 /// \headerfile <x86intrin.h> 2028 /// 2029 /// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction. 2030 /// 2031 /// \param __a 2032 /// A 128-bit vector of [8 x i16]. 2033 /// \param __b 2034 /// A 128-bit vector of [8 x i16]. 2035 /// \returns A 128-bit vector of [8 x i16] containing the sums of both 2036 /// parameters. 2037 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a, 2038 __m128i __b) { 2039 return (__m128i)((__v8hu)__a + (__v8hu)__b); 2040 } 2041 2042 /// Adds the corresponding elements of two 128-bit vectors of [4 x i32], 2043 /// saving the lower 32 bits of each sum in the corresponding element of a 2044 /// 128-bit result vector of [4 x i32]. 2045 /// 2046 /// The integer elements of both parameters can be either signed or unsigned. 2047 /// 2048 /// \headerfile <x86intrin.h> 2049 /// 2050 /// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction. 2051 /// 2052 /// \param __a 2053 /// A 128-bit vector of [4 x i32]. 2054 /// \param __b 2055 /// A 128-bit vector of [4 x i32]. 2056 /// \returns A 128-bit vector of [4 x i32] containing the sums of both 2057 /// parameters. 2058 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a, 2059 __m128i __b) { 2060 return (__m128i)((__v4su)__a + (__v4su)__b); 2061 } 2062 2063 /// Adds two signed or unsigned 64-bit integer values, returning the 2064 /// lower 64 bits of the sum. 2065 /// 2066 /// \headerfile <x86intrin.h> 2067 /// 2068 /// This intrinsic corresponds to the <c> PADDQ </c> instruction. 2069 /// 2070 /// \param __a 2071 /// A 64-bit integer. 2072 /// \param __b 2073 /// A 64-bit integer. 2074 /// \returns A 64-bit integer containing the sum of both parameters. 2075 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a, 2076 __m64 __b) { 2077 return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b); 2078 } 2079 2080 /// Adds the corresponding elements of two 128-bit vectors of [2 x i64], 2081 /// saving the lower 64 bits of each sum in the corresponding element of a 2082 /// 128-bit result vector of [2 x i64]. 2083 /// 2084 /// The integer elements of both parameters can be either signed or unsigned. 2085 /// 2086 /// \headerfile <x86intrin.h> 2087 /// 2088 /// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction. 2089 /// 2090 /// \param __a 2091 /// A 128-bit vector of [2 x i64]. 2092 /// \param __b 2093 /// A 128-bit vector of [2 x i64]. 2094 /// \returns A 128-bit vector of [2 x i64] containing the sums of both 2095 /// parameters. 2096 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a, 2097 __m128i __b) { 2098 return (__m128i)((__v2du)__a + (__v2du)__b); 2099 } 2100 2101 /// Adds, with saturation, the corresponding elements of two 128-bit 2102 /// signed [16 x i8] vectors, saving each sum in the corresponding element of 2103 /// a 128-bit result vector of [16 x i8]. Positive sums greater than 0x7F are 2104 /// saturated to 0x7F. Negative sums less than 0x80 are saturated to 0x80. 2105 /// 2106 /// \headerfile <x86intrin.h> 2107 /// 2108 /// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction. 2109 /// 2110 /// \param __a 2111 /// A 128-bit signed [16 x i8] vector. 2112 /// \param __b 2113 /// A 128-bit signed [16 x i8] vector. 2114 /// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of 2115 /// both parameters. 2116 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a, 2117 __m128i __b) { 2118 return (__m128i)__builtin_elementwise_add_sat((__v16qs)__a, (__v16qs)__b); 2119 } 2120 2121 /// Adds, with saturation, the corresponding elements of two 128-bit 2122 /// signed [8 x i16] vectors, saving each sum in the corresponding element of 2123 /// a 128-bit result vector of [8 x i16]. Positive sums greater than 0x7FFF 2124 /// are saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to 2125 /// 0x8000. 2126 /// 2127 /// \headerfile <x86intrin.h> 2128 /// 2129 /// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction. 2130 /// 2131 /// \param __a 2132 /// A 128-bit signed [8 x i16] vector. 2133 /// \param __b 2134 /// A 128-bit signed [8 x i16] vector. 2135 /// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of 2136 /// both parameters. 2137 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a, 2138 __m128i __b) { 2139 return (__m128i)__builtin_elementwise_add_sat((__v8hi)__a, (__v8hi)__b); 2140 } 2141 2142 /// Adds, with saturation, the corresponding elements of two 128-bit 2143 /// unsigned [16 x i8] vectors, saving each sum in the corresponding element 2144 /// of a 128-bit result vector of [16 x i8]. Positive sums greater than 0xFF 2145 /// are saturated to 0xFF. Negative sums are saturated to 0x00. 2146 /// 2147 /// \headerfile <x86intrin.h> 2148 /// 2149 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction. 2150 /// 2151 /// \param __a 2152 /// A 128-bit unsigned [16 x i8] vector. 2153 /// \param __b 2154 /// A 128-bit unsigned [16 x i8] vector. 2155 /// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums 2156 /// of both parameters. 2157 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a, 2158 __m128i __b) { 2159 return (__m128i)__builtin_elementwise_add_sat((__v16qu)__a, (__v16qu)__b); 2160 } 2161 2162 /// Adds, with saturation, the corresponding elements of two 128-bit 2163 /// unsigned [8 x i16] vectors, saving each sum in the corresponding element 2164 /// of a 128-bit result vector of [8 x i16]. Positive sums greater than 2165 /// 0xFFFF are saturated to 0xFFFF. Negative sums are saturated to 0x0000. 2166 /// 2167 /// \headerfile <x86intrin.h> 2168 /// 2169 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction. 2170 /// 2171 /// \param __a 2172 /// A 128-bit unsigned [8 x i16] vector. 2173 /// \param __b 2174 /// A 128-bit unsigned [8 x i16] vector. 2175 /// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums 2176 /// of both parameters. 2177 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a, 2178 __m128i __b) { 2179 return (__m128i)__builtin_elementwise_add_sat((__v8hu)__a, (__v8hu)__b); 2180 } 2181 2182 /// Computes the rounded averages of corresponding elements of two 2183 /// 128-bit unsigned [16 x i8] vectors, saving each result in the 2184 /// corresponding element of a 128-bit result vector of [16 x i8]. 2185 /// 2186 /// \headerfile <x86intrin.h> 2187 /// 2188 /// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction. 2189 /// 2190 /// \param __a 2191 /// A 128-bit unsigned [16 x i8] vector. 2192 /// \param __b 2193 /// A 128-bit unsigned [16 x i8] vector. 2194 /// \returns A 128-bit unsigned [16 x i8] vector containing the rounded 2195 /// averages of both parameters. 2196 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a, 2197 __m128i __b) { 2198 return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b); 2199 } 2200 2201 /// Computes the rounded averages of corresponding elements of two 2202 /// 128-bit unsigned [8 x i16] vectors, saving each result in the 2203 /// corresponding element of a 128-bit result vector of [8 x i16]. 2204 /// 2205 /// \headerfile <x86intrin.h> 2206 /// 2207 /// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction. 2208 /// 2209 /// \param __a 2210 /// A 128-bit unsigned [8 x i16] vector. 2211 /// \param __b 2212 /// A 128-bit unsigned [8 x i16] vector. 2213 /// \returns A 128-bit unsigned [8 x i16] vector containing the rounded 2214 /// averages of both parameters. 2215 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a, 2216 __m128i __b) { 2217 return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b); 2218 } 2219 2220 /// Multiplies the corresponding elements of two 128-bit signed [8 x i16] 2221 /// vectors, producing eight intermediate 32-bit signed integer products, and 2222 /// adds the consecutive pairs of 32-bit products to form a 128-bit signed 2223 /// [4 x i32] vector. 2224 /// 2225 /// For example, bits [15:0] of both parameters are multiplied producing a 2226 /// 32-bit product, bits [31:16] of both parameters are multiplied producing 2227 /// a 32-bit product, and the sum of those two products becomes bits [31:0] 2228 /// of the result. 2229 /// 2230 /// \headerfile <x86intrin.h> 2231 /// 2232 /// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction. 2233 /// 2234 /// \param __a 2235 /// A 128-bit signed [8 x i16] vector. 2236 /// \param __b 2237 /// A 128-bit signed [8 x i16] vector. 2238 /// \returns A 128-bit signed [4 x i32] vector containing the sums of products 2239 /// of both parameters. 2240 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a, 2241 __m128i __b) { 2242 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b); 2243 } 2244 2245 /// Compares corresponding elements of two 128-bit signed [8 x i16] 2246 /// vectors, saving the greater value from each comparison in the 2247 /// corresponding element of a 128-bit result vector of [8 x i16]. 2248 /// 2249 /// \headerfile <x86intrin.h> 2250 /// 2251 /// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction. 2252 /// 2253 /// \param __a 2254 /// A 128-bit signed [8 x i16] vector. 2255 /// \param __b 2256 /// A 128-bit signed [8 x i16] vector. 2257 /// \returns A 128-bit signed [8 x i16] vector containing the greater value of 2258 /// each comparison. 2259 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a, 2260 __m128i __b) { 2261 return (__m128i)__builtin_elementwise_max((__v8hi)__a, (__v8hi)__b); 2262 } 2263 2264 /// Compares corresponding elements of two 128-bit unsigned [16 x i8] 2265 /// vectors, saving the greater value from each comparison in the 2266 /// corresponding element of a 128-bit result vector of [16 x i8]. 2267 /// 2268 /// \headerfile <x86intrin.h> 2269 /// 2270 /// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction. 2271 /// 2272 /// \param __a 2273 /// A 128-bit unsigned [16 x i8] vector. 2274 /// \param __b 2275 /// A 128-bit unsigned [16 x i8] vector. 2276 /// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of 2277 /// each comparison. 2278 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a, 2279 __m128i __b) { 2280 return (__m128i)__builtin_elementwise_max((__v16qu)__a, (__v16qu)__b); 2281 } 2282 2283 /// Compares corresponding elements of two 128-bit signed [8 x i16] 2284 /// vectors, saving the smaller value from each comparison in the 2285 /// corresponding element of a 128-bit result vector of [8 x i16]. 2286 /// 2287 /// \headerfile <x86intrin.h> 2288 /// 2289 /// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction. 2290 /// 2291 /// \param __a 2292 /// A 128-bit signed [8 x i16] vector. 2293 /// \param __b 2294 /// A 128-bit signed [8 x i16] vector. 2295 /// \returns A 128-bit signed [8 x i16] vector containing the smaller value of 2296 /// each comparison. 2297 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a, 2298 __m128i __b) { 2299 return (__m128i)__builtin_elementwise_min((__v8hi)__a, (__v8hi)__b); 2300 } 2301 2302 /// Compares corresponding elements of two 128-bit unsigned [16 x i8] 2303 /// vectors, saving the smaller value from each comparison in the 2304 /// corresponding element of a 128-bit result vector of [16 x i8]. 2305 /// 2306 /// \headerfile <x86intrin.h> 2307 /// 2308 /// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction. 2309 /// 2310 /// \param __a 2311 /// A 128-bit unsigned [16 x i8] vector. 2312 /// \param __b 2313 /// A 128-bit unsigned [16 x i8] vector. 2314 /// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of 2315 /// each comparison. 2316 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a, 2317 __m128i __b) { 2318 return (__m128i)__builtin_elementwise_min((__v16qu)__a, (__v16qu)__b); 2319 } 2320 2321 /// Multiplies the corresponding elements of two signed [8 x i16] 2322 /// vectors, saving the upper 16 bits of each 32-bit product in the 2323 /// corresponding element of a 128-bit signed [8 x i16] result vector. 2324 /// 2325 /// \headerfile <x86intrin.h> 2326 /// 2327 /// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction. 2328 /// 2329 /// \param __a 2330 /// A 128-bit signed [8 x i16] vector. 2331 /// \param __b 2332 /// A 128-bit signed [8 x i16] vector. 2333 /// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of 2334 /// each of the eight 32-bit products. 2335 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a, 2336 __m128i __b) { 2337 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b); 2338 } 2339 2340 /// Multiplies the corresponding elements of two unsigned [8 x i16] 2341 /// vectors, saving the upper 16 bits of each 32-bit product in the 2342 /// corresponding element of a 128-bit unsigned [8 x i16] result vector. 2343 /// 2344 /// \headerfile <x86intrin.h> 2345 /// 2346 /// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction. 2347 /// 2348 /// \param __a 2349 /// A 128-bit unsigned [8 x i16] vector. 2350 /// \param __b 2351 /// A 128-bit unsigned [8 x i16] vector. 2352 /// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits 2353 /// of each of the eight 32-bit products. 2354 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a, 2355 __m128i __b) { 2356 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b); 2357 } 2358 2359 /// Multiplies the corresponding elements of two signed [8 x i16] 2360 /// vectors, saving the lower 16 bits of each 32-bit product in the 2361 /// corresponding element of a 128-bit signed [8 x i16] result vector. 2362 /// 2363 /// \headerfile <x86intrin.h> 2364 /// 2365 /// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction. 2366 /// 2367 /// \param __a 2368 /// A 128-bit signed [8 x i16] vector. 2369 /// \param __b 2370 /// A 128-bit signed [8 x i16] vector. 2371 /// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of 2372 /// each of the eight 32-bit products. 2373 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a, 2374 __m128i __b) { 2375 return (__m128i)((__v8hu)__a * (__v8hu)__b); 2376 } 2377 2378 /// Multiplies 32-bit unsigned integer values contained in the lower bits 2379 /// of the two 64-bit integer vectors and returns the 64-bit unsigned 2380 /// product. 2381 /// 2382 /// \headerfile <x86intrin.h> 2383 /// 2384 /// This intrinsic corresponds to the <c> PMULUDQ </c> instruction. 2385 /// 2386 /// \param __a 2387 /// A 64-bit integer containing one of the source operands. 2388 /// \param __b 2389 /// A 64-bit integer containing one of the source operands. 2390 /// \returns A 64-bit integer vector containing the product of both operands. 2391 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a, 2392 __m64 __b) { 2393 return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b); 2394 } 2395 2396 /// Multiplies 32-bit unsigned integer values contained in the lower 2397 /// bits of the corresponding elements of two [2 x i64] vectors, and returns 2398 /// the 64-bit products in the corresponding elements of a [2 x i64] vector. 2399 /// 2400 /// \headerfile <x86intrin.h> 2401 /// 2402 /// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction. 2403 /// 2404 /// \param __a 2405 /// A [2 x i64] vector containing one of the source operands. 2406 /// \param __b 2407 /// A [2 x i64] vector containing one of the source operands. 2408 /// \returns A [2 x i64] vector containing the product of both operands. 2409 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a, 2410 __m128i __b) { 2411 return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b); 2412 } 2413 2414 /// Computes the absolute differences of corresponding 8-bit integer 2415 /// values in two 128-bit vectors. Sums the first 8 absolute differences, and 2416 /// separately sums the second 8 absolute differences. Packs these two 2417 /// unsigned 16-bit integer sums into the upper and lower elements of a 2418 /// [2 x i64] vector. 2419 /// 2420 /// \headerfile <x86intrin.h> 2421 /// 2422 /// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction. 2423 /// 2424 /// \param __a 2425 /// A 128-bit integer vector containing one of the source operands. 2426 /// \param __b 2427 /// A 128-bit integer vector containing one of the source operands. 2428 /// \returns A [2 x i64] vector containing the sums of the sets of absolute 2429 /// differences between both operands. 2430 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a, 2431 __m128i __b) { 2432 return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b); 2433 } 2434 2435 /// Subtracts the corresponding 8-bit integer values in the operands. 2436 /// 2437 /// \headerfile <x86intrin.h> 2438 /// 2439 /// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction. 2440 /// 2441 /// \param __a 2442 /// A 128-bit integer vector containing the minuends. 2443 /// \param __b 2444 /// A 128-bit integer vector containing the subtrahends. 2445 /// \returns A 128-bit integer vector containing the differences of the values 2446 /// in the operands. 2447 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a, 2448 __m128i __b) { 2449 return (__m128i)((__v16qu)__a - (__v16qu)__b); 2450 } 2451 2452 /// Subtracts the corresponding 16-bit integer values in the operands. 2453 /// 2454 /// \headerfile <x86intrin.h> 2455 /// 2456 /// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction. 2457 /// 2458 /// \param __a 2459 /// A 128-bit integer vector containing the minuends. 2460 /// \param __b 2461 /// A 128-bit integer vector containing the subtrahends. 2462 /// \returns A 128-bit integer vector containing the differences of the values 2463 /// in the operands. 2464 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a, 2465 __m128i __b) { 2466 return (__m128i)((__v8hu)__a - (__v8hu)__b); 2467 } 2468 2469 /// Subtracts the corresponding 32-bit integer values in the operands. 2470 /// 2471 /// \headerfile <x86intrin.h> 2472 /// 2473 /// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction. 2474 /// 2475 /// \param __a 2476 /// A 128-bit integer vector containing the minuends. 2477 /// \param __b 2478 /// A 128-bit integer vector containing the subtrahends. 2479 /// \returns A 128-bit integer vector containing the differences of the values 2480 /// in the operands. 2481 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a, 2482 __m128i __b) { 2483 return (__m128i)((__v4su)__a - (__v4su)__b); 2484 } 2485 2486 /// Subtracts signed or unsigned 64-bit integer values and writes the 2487 /// difference to the corresponding bits in the destination. 2488 /// 2489 /// \headerfile <x86intrin.h> 2490 /// 2491 /// This intrinsic corresponds to the <c> PSUBQ </c> instruction. 2492 /// 2493 /// \param __a 2494 /// A 64-bit integer vector containing the minuend. 2495 /// \param __b 2496 /// A 64-bit integer vector containing the subtrahend. 2497 /// \returns A 64-bit integer vector containing the difference of the values in 2498 /// the operands. 2499 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a, 2500 __m64 __b) { 2501 return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b); 2502 } 2503 2504 /// Subtracts the corresponding elements of two [2 x i64] vectors. 2505 /// 2506 /// \headerfile <x86intrin.h> 2507 /// 2508 /// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction. 2509 /// 2510 /// \param __a 2511 /// A 128-bit integer vector containing the minuends. 2512 /// \param __b 2513 /// A 128-bit integer vector containing the subtrahends. 2514 /// \returns A 128-bit integer vector containing the differences of the values 2515 /// in the operands. 2516 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a, 2517 __m128i __b) { 2518 return (__m128i)((__v2du)__a - (__v2du)__b); 2519 } 2520 2521 /// Subtracts corresponding 8-bit signed integer values in the input and 2522 /// returns the differences in the corresponding bytes in the destination. 2523 /// Differences greater than 0x7F are saturated to 0x7F, and differences less 2524 /// than 0x80 are saturated to 0x80. 2525 /// 2526 /// \headerfile <x86intrin.h> 2527 /// 2528 /// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction. 2529 /// 2530 /// \param __a 2531 /// A 128-bit integer vector containing the minuends. 2532 /// \param __b 2533 /// A 128-bit integer vector containing the subtrahends. 2534 /// \returns A 128-bit integer vector containing the differences of the values 2535 /// in the operands. 2536 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a, 2537 __m128i __b) { 2538 return (__m128i)__builtin_elementwise_sub_sat((__v16qs)__a, (__v16qs)__b); 2539 } 2540 2541 /// Subtracts corresponding 16-bit signed integer values in the input and 2542 /// returns the differences in the corresponding bytes in the destination. 2543 /// Differences greater than 0x7FFF are saturated to 0x7FFF, and values less 2544 /// than 0x8000 are saturated to 0x8000. 2545 /// 2546 /// \headerfile <x86intrin.h> 2547 /// 2548 /// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction. 2549 /// 2550 /// \param __a 2551 /// A 128-bit integer vector containing the minuends. 2552 /// \param __b 2553 /// A 128-bit integer vector containing the subtrahends. 2554 /// \returns A 128-bit integer vector containing the differences of the values 2555 /// in the operands. 2556 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a, 2557 __m128i __b) { 2558 return (__m128i)__builtin_elementwise_sub_sat((__v8hi)__a, (__v8hi)__b); 2559 } 2560 2561 /// Subtracts corresponding 8-bit unsigned integer values in the input 2562 /// and returns the differences in the corresponding bytes in the 2563 /// destination. Differences less than 0x00 are saturated to 0x00. 2564 /// 2565 /// \headerfile <x86intrin.h> 2566 /// 2567 /// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction. 2568 /// 2569 /// \param __a 2570 /// A 128-bit integer vector containing the minuends. 2571 /// \param __b 2572 /// A 128-bit integer vector containing the subtrahends. 2573 /// \returns A 128-bit integer vector containing the unsigned integer 2574 /// differences of the values in the operands. 2575 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a, 2576 __m128i __b) { 2577 return (__m128i)__builtin_elementwise_sub_sat((__v16qu)__a, (__v16qu)__b); 2578 } 2579 2580 /// Subtracts corresponding 16-bit unsigned integer values in the input 2581 /// and returns the differences in the corresponding bytes in the 2582 /// destination. Differences less than 0x0000 are saturated to 0x0000. 2583 /// 2584 /// \headerfile <x86intrin.h> 2585 /// 2586 /// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction. 2587 /// 2588 /// \param __a 2589 /// A 128-bit integer vector containing the minuends. 2590 /// \param __b 2591 /// A 128-bit integer vector containing the subtrahends. 2592 /// \returns A 128-bit integer vector containing the unsigned integer 2593 /// differences of the values in the operands. 2594 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a, 2595 __m128i __b) { 2596 return (__m128i)__builtin_elementwise_sub_sat((__v8hu)__a, (__v8hu)__b); 2597 } 2598 2599 /// Performs a bitwise AND of two 128-bit integer vectors. 2600 /// 2601 /// \headerfile <x86intrin.h> 2602 /// 2603 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction. 2604 /// 2605 /// \param __a 2606 /// A 128-bit integer vector containing one of the source operands. 2607 /// \param __b 2608 /// A 128-bit integer vector containing one of the source operands. 2609 /// \returns A 128-bit integer vector containing the bitwise AND of the values 2610 /// in both operands. 2611 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a, 2612 __m128i __b) { 2613 return (__m128i)((__v2du)__a & (__v2du)__b); 2614 } 2615 2616 /// Performs a bitwise AND of two 128-bit integer vectors, using the 2617 /// one's complement of the values contained in the first source operand. 2618 /// 2619 /// \headerfile <x86intrin.h> 2620 /// 2621 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction. 2622 /// 2623 /// \param __a 2624 /// A 128-bit vector containing the left source operand. The one's complement 2625 /// of this value is used in the bitwise AND. 2626 /// \param __b 2627 /// A 128-bit vector containing the right source operand. 2628 /// \returns A 128-bit integer vector containing the bitwise AND of the one's 2629 /// complement of the first operand and the values in the second operand. 2630 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a, 2631 __m128i __b) { 2632 return (__m128i)(~(__v2du)__a & (__v2du)__b); 2633 } 2634 /// Performs a bitwise OR of two 128-bit integer vectors. 2635 /// 2636 /// \headerfile <x86intrin.h> 2637 /// 2638 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction. 2639 /// 2640 /// \param __a 2641 /// A 128-bit integer vector containing one of the source operands. 2642 /// \param __b 2643 /// A 128-bit integer vector containing one of the source operands. 2644 /// \returns A 128-bit integer vector containing the bitwise OR of the values 2645 /// in both operands. 2646 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a, 2647 __m128i __b) { 2648 return (__m128i)((__v2du)__a | (__v2du)__b); 2649 } 2650 2651 /// Performs a bitwise exclusive OR of two 128-bit integer vectors. 2652 /// 2653 /// \headerfile <x86intrin.h> 2654 /// 2655 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction. 2656 /// 2657 /// \param __a 2658 /// A 128-bit integer vector containing one of the source operands. 2659 /// \param __b 2660 /// A 128-bit integer vector containing one of the source operands. 2661 /// \returns A 128-bit integer vector containing the bitwise exclusive OR of the 2662 /// values in both operands. 2663 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a, 2664 __m128i __b) { 2665 return (__m128i)((__v2du)__a ^ (__v2du)__b); 2666 } 2667 2668 /// Left-shifts the 128-bit integer vector operand by the specified 2669 /// number of bytes. Low-order bits are cleared. 2670 /// 2671 /// \headerfile <x86intrin.h> 2672 /// 2673 /// \code 2674 /// __m128i _mm_slli_si128(__m128i a, const int imm); 2675 /// \endcode 2676 /// 2677 /// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction. 2678 /// 2679 /// \param a 2680 /// A 128-bit integer vector containing the source operand. 2681 /// \param imm 2682 /// An immediate value specifying the number of bytes to left-shift operand 2683 /// \a a. 2684 /// \returns A 128-bit integer vector containing the left-shifted value. 2685 #define _mm_slli_si128(a, imm) \ 2686 ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \ 2687 (int)(imm))) 2688 2689 #define _mm_bslli_si128(a, imm) \ 2690 ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \ 2691 (int)(imm))) 2692 2693 /// Left-shifts each 16-bit value in the 128-bit integer vector operand 2694 /// by the specified number of bits. Low-order bits are cleared. 2695 /// 2696 /// \headerfile <x86intrin.h> 2697 /// 2698 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction. 2699 /// 2700 /// \param __a 2701 /// A 128-bit integer vector containing the source operand. 2702 /// \param __count 2703 /// An integer value specifying the number of bits to left-shift each value 2704 /// in operand \a __a. 2705 /// \returns A 128-bit integer vector containing the left-shifted values. 2706 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a, 2707 int __count) { 2708 return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count); 2709 } 2710 2711 /// Left-shifts each 16-bit value in the 128-bit integer vector operand 2712 /// by the specified number of bits. Low-order bits are cleared. 2713 /// 2714 /// \headerfile <x86intrin.h> 2715 /// 2716 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction. 2717 /// 2718 /// \param __a 2719 /// A 128-bit integer vector containing the source operand. 2720 /// \param __count 2721 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2722 /// to left-shift each value in operand \a __a. 2723 /// \returns A 128-bit integer vector containing the left-shifted values. 2724 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a, 2725 __m128i __count) { 2726 return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count); 2727 } 2728 2729 /// Left-shifts each 32-bit value in the 128-bit integer vector operand 2730 /// by the specified number of bits. Low-order bits are cleared. 2731 /// 2732 /// \headerfile <x86intrin.h> 2733 /// 2734 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction. 2735 /// 2736 /// \param __a 2737 /// A 128-bit integer vector containing the source operand. 2738 /// \param __count 2739 /// An integer value specifying the number of bits to left-shift each value 2740 /// in operand \a __a. 2741 /// \returns A 128-bit integer vector containing the left-shifted values. 2742 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a, 2743 int __count) { 2744 return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count); 2745 } 2746 2747 /// Left-shifts each 32-bit value in the 128-bit integer vector operand 2748 /// by the specified number of bits. Low-order bits are cleared. 2749 /// 2750 /// \headerfile <x86intrin.h> 2751 /// 2752 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction. 2753 /// 2754 /// \param __a 2755 /// A 128-bit integer vector containing the source operand. 2756 /// \param __count 2757 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2758 /// to left-shift each value in operand \a __a. 2759 /// \returns A 128-bit integer vector containing the left-shifted values. 2760 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a, 2761 __m128i __count) { 2762 return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count); 2763 } 2764 2765 /// Left-shifts each 64-bit value in the 128-bit integer vector operand 2766 /// by the specified number of bits. Low-order bits are cleared. 2767 /// 2768 /// \headerfile <x86intrin.h> 2769 /// 2770 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction. 2771 /// 2772 /// \param __a 2773 /// A 128-bit integer vector containing the source operand. 2774 /// \param __count 2775 /// An integer value specifying the number of bits to left-shift each value 2776 /// in operand \a __a. 2777 /// \returns A 128-bit integer vector containing the left-shifted values. 2778 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a, 2779 int __count) { 2780 return __builtin_ia32_psllqi128((__v2di)__a, __count); 2781 } 2782 2783 /// Left-shifts each 64-bit value in the 128-bit integer vector operand 2784 /// by the specified number of bits. Low-order bits are cleared. 2785 /// 2786 /// \headerfile <x86intrin.h> 2787 /// 2788 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction. 2789 /// 2790 /// \param __a 2791 /// A 128-bit integer vector containing the source operand. 2792 /// \param __count 2793 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2794 /// to left-shift each value in operand \a __a. 2795 /// \returns A 128-bit integer vector containing the left-shifted values. 2796 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a, 2797 __m128i __count) { 2798 return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count); 2799 } 2800 2801 /// Right-shifts each 16-bit value in the 128-bit integer vector operand 2802 /// by the specified number of bits. High-order bits are filled with the sign 2803 /// bit of the initial value. 2804 /// 2805 /// \headerfile <x86intrin.h> 2806 /// 2807 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction. 2808 /// 2809 /// \param __a 2810 /// A 128-bit integer vector containing the source operand. 2811 /// \param __count 2812 /// An integer value specifying the number of bits to right-shift each value 2813 /// in operand \a __a. 2814 /// \returns A 128-bit integer vector containing the right-shifted values. 2815 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a, 2816 int __count) { 2817 return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count); 2818 } 2819 2820 /// Right-shifts each 16-bit value in the 128-bit integer vector operand 2821 /// by the specified number of bits. High-order bits are filled with the sign 2822 /// bit of the initial value. 2823 /// 2824 /// \headerfile <x86intrin.h> 2825 /// 2826 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction. 2827 /// 2828 /// \param __a 2829 /// A 128-bit integer vector containing the source operand. 2830 /// \param __count 2831 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2832 /// to right-shift each value in operand \a __a. 2833 /// \returns A 128-bit integer vector containing the right-shifted values. 2834 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a, 2835 __m128i __count) { 2836 return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count); 2837 } 2838 2839 /// Right-shifts each 32-bit value in the 128-bit integer vector operand 2840 /// by the specified number of bits. High-order bits are filled with the sign 2841 /// bit of the initial value. 2842 /// 2843 /// \headerfile <x86intrin.h> 2844 /// 2845 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction. 2846 /// 2847 /// \param __a 2848 /// A 128-bit integer vector containing the source operand. 2849 /// \param __count 2850 /// An integer value specifying the number of bits to right-shift each value 2851 /// in operand \a __a. 2852 /// \returns A 128-bit integer vector containing the right-shifted values. 2853 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a, 2854 int __count) { 2855 return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count); 2856 } 2857 2858 /// Right-shifts each 32-bit value in the 128-bit integer vector operand 2859 /// by the specified number of bits. High-order bits are filled with the sign 2860 /// bit of the initial value. 2861 /// 2862 /// \headerfile <x86intrin.h> 2863 /// 2864 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction. 2865 /// 2866 /// \param __a 2867 /// A 128-bit integer vector containing the source operand. 2868 /// \param __count 2869 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2870 /// to right-shift each value in operand \a __a. 2871 /// \returns A 128-bit integer vector containing the right-shifted values. 2872 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a, 2873 __m128i __count) { 2874 return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count); 2875 } 2876 2877 /// Right-shifts the 128-bit integer vector operand by the specified 2878 /// number of bytes. High-order bits are cleared. 2879 /// 2880 /// \headerfile <x86intrin.h> 2881 /// 2882 /// \code 2883 /// __m128i _mm_srli_si128(__m128i a, const int imm); 2884 /// \endcode 2885 /// 2886 /// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction. 2887 /// 2888 /// \param a 2889 /// A 128-bit integer vector containing the source operand. 2890 /// \param imm 2891 /// An immediate value specifying the number of bytes to right-shift operand 2892 /// \a a. 2893 /// \returns A 128-bit integer vector containing the right-shifted value. 2894 #define _mm_srli_si128(a, imm) \ 2895 ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \ 2896 (int)(imm))) 2897 2898 #define _mm_bsrli_si128(a, imm) \ 2899 ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \ 2900 (int)(imm))) 2901 2902 /// Right-shifts each of 16-bit values in the 128-bit integer vector 2903 /// operand by the specified number of bits. High-order bits are cleared. 2904 /// 2905 /// \headerfile <x86intrin.h> 2906 /// 2907 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction. 2908 /// 2909 /// \param __a 2910 /// A 128-bit integer vector containing the source operand. 2911 /// \param __count 2912 /// An integer value specifying the number of bits to right-shift each value 2913 /// in operand \a __a. 2914 /// \returns A 128-bit integer vector containing the right-shifted values. 2915 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a, 2916 int __count) { 2917 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count); 2918 } 2919 2920 /// Right-shifts each of 16-bit values in the 128-bit integer vector 2921 /// operand by the specified number of bits. High-order bits are cleared. 2922 /// 2923 /// \headerfile <x86intrin.h> 2924 /// 2925 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction. 2926 /// 2927 /// \param __a 2928 /// A 128-bit integer vector containing the source operand. 2929 /// \param __count 2930 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2931 /// to right-shift each value in operand \a __a. 2932 /// \returns A 128-bit integer vector containing the right-shifted values. 2933 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a, 2934 __m128i __count) { 2935 return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count); 2936 } 2937 2938 /// Right-shifts each of 32-bit values in the 128-bit integer vector 2939 /// operand by the specified number of bits. High-order bits are cleared. 2940 /// 2941 /// \headerfile <x86intrin.h> 2942 /// 2943 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction. 2944 /// 2945 /// \param __a 2946 /// A 128-bit integer vector containing the source operand. 2947 /// \param __count 2948 /// An integer value specifying the number of bits to right-shift each value 2949 /// in operand \a __a. 2950 /// \returns A 128-bit integer vector containing the right-shifted values. 2951 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a, 2952 int __count) { 2953 return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count); 2954 } 2955 2956 /// Right-shifts each of 32-bit values in the 128-bit integer vector 2957 /// operand by the specified number of bits. High-order bits are cleared. 2958 /// 2959 /// \headerfile <x86intrin.h> 2960 /// 2961 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction. 2962 /// 2963 /// \param __a 2964 /// A 128-bit integer vector containing the source operand. 2965 /// \param __count 2966 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2967 /// to right-shift each value in operand \a __a. 2968 /// \returns A 128-bit integer vector containing the right-shifted values. 2969 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a, 2970 __m128i __count) { 2971 return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count); 2972 } 2973 2974 /// Right-shifts each of 64-bit values in the 128-bit integer vector 2975 /// operand by the specified number of bits. High-order bits are cleared. 2976 /// 2977 /// \headerfile <x86intrin.h> 2978 /// 2979 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction. 2980 /// 2981 /// \param __a 2982 /// A 128-bit integer vector containing the source operand. 2983 /// \param __count 2984 /// An integer value specifying the number of bits to right-shift each value 2985 /// in operand \a __a. 2986 /// \returns A 128-bit integer vector containing the right-shifted values. 2987 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a, 2988 int __count) { 2989 return __builtin_ia32_psrlqi128((__v2di)__a, __count); 2990 } 2991 2992 /// Right-shifts each of 64-bit values in the 128-bit integer vector 2993 /// operand by the specified number of bits. High-order bits are cleared. 2994 /// 2995 /// \headerfile <x86intrin.h> 2996 /// 2997 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction. 2998 /// 2999 /// \param __a 3000 /// A 128-bit integer vector containing the source operand. 3001 /// \param __count 3002 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 3003 /// to right-shift each value in operand \a __a. 3004 /// \returns A 128-bit integer vector containing the right-shifted values. 3005 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a, 3006 __m128i __count) { 3007 return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count); 3008 } 3009 3010 /// Compares each of the corresponding 8-bit values of the 128-bit 3011 /// integer vectors for equality. Each comparison yields 0x0 for false, 0xFF 3012 /// for true. 3013 /// 3014 /// \headerfile <x86intrin.h> 3015 /// 3016 /// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction. 3017 /// 3018 /// \param __a 3019 /// A 128-bit integer vector. 3020 /// \param __b 3021 /// A 128-bit integer vector. 3022 /// \returns A 128-bit integer vector containing the comparison results. 3023 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a, 3024 __m128i __b) { 3025 return (__m128i)((__v16qi)__a == (__v16qi)__b); 3026 } 3027 3028 /// Compares each of the corresponding 16-bit values of the 128-bit 3029 /// integer vectors for equality. Each comparison yields 0x0 for false, 3030 /// 0xFFFF for true. 3031 /// 3032 /// \headerfile <x86intrin.h> 3033 /// 3034 /// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction. 3035 /// 3036 /// \param __a 3037 /// A 128-bit integer vector. 3038 /// \param __b 3039 /// A 128-bit integer vector. 3040 /// \returns A 128-bit integer vector containing the comparison results. 3041 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a, 3042 __m128i __b) { 3043 return (__m128i)((__v8hi)__a == (__v8hi)__b); 3044 } 3045 3046 /// Compares each of the corresponding 32-bit values of the 128-bit 3047 /// integer vectors for equality. Each comparison yields 0x0 for false, 3048 /// 0xFFFFFFFF for true. 3049 /// 3050 /// \headerfile <x86intrin.h> 3051 /// 3052 /// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction. 3053 /// 3054 /// \param __a 3055 /// A 128-bit integer vector. 3056 /// \param __b 3057 /// A 128-bit integer vector. 3058 /// \returns A 128-bit integer vector containing the comparison results. 3059 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a, 3060 __m128i __b) { 3061 return (__m128i)((__v4si)__a == (__v4si)__b); 3062 } 3063 3064 /// Compares each of the corresponding signed 8-bit values of the 128-bit 3065 /// integer vectors to determine if the values in the first operand are 3066 /// greater than those in the second operand. Each comparison yields 0x0 for 3067 /// false, 0xFF for true. 3068 /// 3069 /// \headerfile <x86intrin.h> 3070 /// 3071 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction. 3072 /// 3073 /// \param __a 3074 /// A 128-bit integer vector. 3075 /// \param __b 3076 /// A 128-bit integer vector. 3077 /// \returns A 128-bit integer vector containing the comparison results. 3078 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a, 3079 __m128i __b) { 3080 /* This function always performs a signed comparison, but __v16qi is a char 3081 which may be signed or unsigned, so use __v16qs. */ 3082 return (__m128i)((__v16qs)__a > (__v16qs)__b); 3083 } 3084 3085 /// Compares each of the corresponding signed 16-bit values of the 3086 /// 128-bit integer vectors to determine if the values in the first operand 3087 /// are greater than those in the second operand. 3088 /// 3089 /// Each comparison yields 0x0 for false, 0xFFFF for true. 3090 /// 3091 /// \headerfile <x86intrin.h> 3092 /// 3093 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction. 3094 /// 3095 /// \param __a 3096 /// A 128-bit integer vector. 3097 /// \param __b 3098 /// A 128-bit integer vector. 3099 /// \returns A 128-bit integer vector containing the comparison results. 3100 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a, 3101 __m128i __b) { 3102 return (__m128i)((__v8hi)__a > (__v8hi)__b); 3103 } 3104 3105 /// Compares each of the corresponding signed 32-bit values of the 3106 /// 128-bit integer vectors to determine if the values in the first operand 3107 /// are greater than those in the second operand. 3108 /// 3109 /// Each comparison yields 0x0 for false, 0xFFFFFFFF for true. 3110 /// 3111 /// \headerfile <x86intrin.h> 3112 /// 3113 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction. 3114 /// 3115 /// \param __a 3116 /// A 128-bit integer vector. 3117 /// \param __b 3118 /// A 128-bit integer vector. 3119 /// \returns A 128-bit integer vector containing the comparison results. 3120 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a, 3121 __m128i __b) { 3122 return (__m128i)((__v4si)__a > (__v4si)__b); 3123 } 3124 3125 /// Compares each of the corresponding signed 8-bit values of the 128-bit 3126 /// integer vectors to determine if the values in the first operand are less 3127 /// than those in the second operand. 3128 /// 3129 /// Each comparison yields 0x0 for false, 0xFF for true. 3130 /// 3131 /// \headerfile <x86intrin.h> 3132 /// 3133 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction. 3134 /// 3135 /// \param __a 3136 /// A 128-bit integer vector. 3137 /// \param __b 3138 /// A 128-bit integer vector. 3139 /// \returns A 128-bit integer vector containing the comparison results. 3140 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a, 3141 __m128i __b) { 3142 return _mm_cmpgt_epi8(__b, __a); 3143 } 3144 3145 /// Compares each of the corresponding signed 16-bit values of the 3146 /// 128-bit integer vectors to determine if the values in the first operand 3147 /// are less than those in the second operand. 3148 /// 3149 /// Each comparison yields 0x0 for false, 0xFFFF for true. 3150 /// 3151 /// \headerfile <x86intrin.h> 3152 /// 3153 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction. 3154 /// 3155 /// \param __a 3156 /// A 128-bit integer vector. 3157 /// \param __b 3158 /// A 128-bit integer vector. 3159 /// \returns A 128-bit integer vector containing the comparison results. 3160 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a, 3161 __m128i __b) { 3162 return _mm_cmpgt_epi16(__b, __a); 3163 } 3164 3165 /// Compares each of the corresponding signed 32-bit values of the 3166 /// 128-bit integer vectors to determine if the values in the first operand 3167 /// are less than those in the second operand. 3168 /// 3169 /// Each comparison yields 0x0 for false, 0xFFFFFFFF for true. 3170 /// 3171 /// \headerfile <x86intrin.h> 3172 /// 3173 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction. 3174 /// 3175 /// \param __a 3176 /// A 128-bit integer vector. 3177 /// \param __b 3178 /// A 128-bit integer vector. 3179 /// \returns A 128-bit integer vector containing the comparison results. 3180 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a, 3181 __m128i __b) { 3182 return _mm_cmpgt_epi32(__b, __a); 3183 } 3184 3185 #ifdef __x86_64__ 3186 /// Converts a 64-bit signed integer value from the second operand into a 3187 /// double-precision value and returns it in the lower element of a [2 x 3188 /// double] vector; the upper element of the returned vector is copied from 3189 /// the upper element of the first operand. 3190 /// 3191 /// \headerfile <x86intrin.h> 3192 /// 3193 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction. 3194 /// 3195 /// \param __a 3196 /// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are 3197 /// copied to the upper 64 bits of the destination. 3198 /// \param __b 3199 /// A 64-bit signed integer operand containing the value to be converted. 3200 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 3201 /// converted value of the second operand. The upper 64 bits are copied from 3202 /// the upper 64 bits of the first operand. 3203 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi64_sd(__m128d __a, 3204 long long __b) { 3205 __a[0] = __b; 3206 return __a; 3207 } 3208 3209 /// Converts the first (lower) element of a vector of [2 x double] into a 3210 /// 64-bit signed integer value, according to the current rounding mode. 3211 /// 3212 /// \headerfile <x86intrin.h> 3213 /// 3214 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction. 3215 /// 3216 /// \param __a 3217 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 3218 /// conversion. 3219 /// \returns A 64-bit signed integer containing the converted value. 3220 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsd_si64(__m128d __a) { 3221 return __builtin_ia32_cvtsd2si64((__v2df)__a); 3222 } 3223 3224 /// Converts the first (lower) element of a vector of [2 x double] into a 3225 /// 64-bit signed integer value, truncating the result when it is inexact. 3226 /// 3227 /// \headerfile <x86intrin.h> 3228 /// 3229 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c> 3230 /// instruction. 3231 /// 3232 /// \param __a 3233 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 3234 /// conversion. 3235 /// \returns A 64-bit signed integer containing the converted value. 3236 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvttsd_si64(__m128d __a) { 3237 return __builtin_ia32_cvttsd2si64((__v2df)__a); 3238 } 3239 #endif 3240 3241 /// Converts a vector of [4 x i32] into a vector of [4 x float]. 3242 /// 3243 /// \headerfile <x86intrin.h> 3244 /// 3245 /// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction. 3246 /// 3247 /// \param __a 3248 /// A 128-bit integer vector. 3249 /// \returns A 128-bit vector of [4 x float] containing the converted values. 3250 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a) { 3251 return (__m128) __builtin_convertvector((__v4si)__a, __v4sf); 3252 } 3253 3254 /// Converts a vector of [4 x float] into a vector of [4 x i32]. 3255 /// 3256 /// \headerfile <x86intrin.h> 3257 /// 3258 /// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction. 3259 /// 3260 /// \param __a 3261 /// A 128-bit vector of [4 x float]. 3262 /// \returns A 128-bit integer vector of [4 x i32] containing the converted 3263 /// values. 3264 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a) { 3265 return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a); 3266 } 3267 3268 /// Converts a vector of [4 x float] into a vector of [4 x i32], 3269 /// truncating the result when it is inexact. 3270 /// 3271 /// \headerfile <x86intrin.h> 3272 /// 3273 /// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c> 3274 /// instruction. 3275 /// 3276 /// \param __a 3277 /// A 128-bit vector of [4 x float]. 3278 /// \returns A 128-bit vector of [4 x i32] containing the converted values. 3279 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a) { 3280 return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a); 3281 } 3282 3283 /// Returns a vector of [4 x i32] where the lowest element is the input 3284 /// operand and the remaining elements are zero. 3285 /// 3286 /// \headerfile <x86intrin.h> 3287 /// 3288 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. 3289 /// 3290 /// \param __a 3291 /// A 32-bit signed integer operand. 3292 /// \returns A 128-bit vector of [4 x i32]. 3293 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a) { 3294 return __extension__(__m128i)(__v4si){__a, 0, 0, 0}; 3295 } 3296 3297 /// Returns a vector of [2 x i64] where the lower element is the input 3298 /// operand and the upper element is zero. 3299 /// 3300 /// \headerfile <x86intrin.h> 3301 /// 3302 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction 3303 /// in 64-bit mode. 3304 /// 3305 /// \param __a 3306 /// A 64-bit signed integer operand containing the value to be converted. 3307 /// \returns A 128-bit vector of [2 x i64] containing the converted value. 3308 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a) { 3309 return __extension__(__m128i)(__v2di){__a, 0}; 3310 } 3311 3312 /// Moves the least significant 32 bits of a vector of [4 x i32] to a 3313 /// 32-bit signed integer value. 3314 /// 3315 /// \headerfile <x86intrin.h> 3316 /// 3317 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. 3318 /// 3319 /// \param __a 3320 /// A vector of [4 x i32]. The least significant 32 bits are moved to the 3321 /// destination. 3322 /// \returns A 32-bit signed integer containing the moved value. 3323 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a) { 3324 __v4si __b = (__v4si)__a; 3325 return __b[0]; 3326 } 3327 3328 /// Moves the least significant 64 bits of a vector of [2 x i64] to a 3329 /// 64-bit signed integer value. 3330 /// 3331 /// \headerfile <x86intrin.h> 3332 /// 3333 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 3334 /// 3335 /// \param __a 3336 /// A vector of [2 x i64]. The least significant 64 bits are moved to the 3337 /// destination. 3338 /// \returns A 64-bit signed integer containing the moved value. 3339 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a) { 3340 return __a[0]; 3341 } 3342 3343 /// Moves packed integer values from an aligned 128-bit memory location 3344 /// to elements in a 128-bit integer vector. 3345 /// 3346 /// \headerfile <x86intrin.h> 3347 /// 3348 /// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction. 3349 /// 3350 /// \param __p 3351 /// An aligned pointer to a memory location containing integer values. 3352 /// \returns A 128-bit integer vector containing the moved values. 3353 static __inline__ __m128i __DEFAULT_FN_ATTRS 3354 _mm_load_si128(__m128i const *__p) { 3355 return *__p; 3356 } 3357 3358 /// Moves packed integer values from an unaligned 128-bit memory location 3359 /// to elements in a 128-bit integer vector. 3360 /// 3361 /// \headerfile <x86intrin.h> 3362 /// 3363 /// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction. 3364 /// 3365 /// \param __p 3366 /// A pointer to a memory location containing integer values. 3367 /// \returns A 128-bit integer vector containing the moved values. 3368 static __inline__ __m128i __DEFAULT_FN_ATTRS 3369 _mm_loadu_si128(__m128i_u const *__p) { 3370 struct __loadu_si128 { 3371 __m128i_u __v; 3372 } __attribute__((__packed__, __may_alias__)); 3373 return ((const struct __loadu_si128 *)__p)->__v; 3374 } 3375 3376 /// Returns a vector of [2 x i64] where the lower element is taken from 3377 /// the lower element of the operand, and the upper element is zero. 3378 /// 3379 /// \headerfile <x86intrin.h> 3380 /// 3381 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 3382 /// 3383 /// \param __p 3384 /// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of 3385 /// the destination. 3386 /// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the 3387 /// moved value. The higher order bits are cleared. 3388 static __inline__ __m128i __DEFAULT_FN_ATTRS 3389 _mm_loadl_epi64(__m128i_u const *__p) { 3390 struct __mm_loadl_epi64_struct { 3391 long long __u; 3392 } __attribute__((__packed__, __may_alias__)); 3393 return __extension__(__m128i){ 3394 ((const struct __mm_loadl_epi64_struct *)__p)->__u, 0}; 3395 } 3396 3397 /// Generates a 128-bit vector of [4 x i32] with unspecified content. 3398 /// This could be used as an argument to another intrinsic function where the 3399 /// argument is required but the value is not actually used. 3400 /// 3401 /// \headerfile <x86intrin.h> 3402 /// 3403 /// This intrinsic has no corresponding instruction. 3404 /// 3405 /// \returns A 128-bit vector of [4 x i32] with unspecified content. 3406 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void) { 3407 return (__m128i)__builtin_ia32_undef128(); 3408 } 3409 3410 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with 3411 /// the specified 64-bit integer values. 3412 /// 3413 /// \headerfile <x86intrin.h> 3414 /// 3415 /// This intrinsic is a utility function and does not correspond to a specific 3416 /// instruction. 3417 /// 3418 /// \param __q1 3419 /// A 64-bit integer value used to initialize the upper 64 bits of the 3420 /// destination vector of [2 x i64]. 3421 /// \param __q0 3422 /// A 64-bit integer value used to initialize the lower 64 bits of the 3423 /// destination vector of [2 x i64]. 3424 /// \returns An initialized 128-bit vector of [2 x i64] containing the values 3425 /// provided in the operands. 3426 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1, 3427 long long __q0) { 3428 return __extension__(__m128i)(__v2di){__q0, __q1}; 3429 } 3430 3431 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with 3432 /// the specified 64-bit integer values. 3433 /// 3434 /// \headerfile <x86intrin.h> 3435 /// 3436 /// This intrinsic is a utility function and does not correspond to a specific 3437 /// instruction. 3438 /// 3439 /// \param __q1 3440 /// A 64-bit integer value used to initialize the upper 64 bits of the 3441 /// destination vector of [2 x i64]. 3442 /// \param __q0 3443 /// A 64-bit integer value used to initialize the lower 64 bits of the 3444 /// destination vector of [2 x i64]. 3445 /// \returns An initialized 128-bit vector of [2 x i64] containing the values 3446 /// provided in the operands. 3447 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1, 3448 __m64 __q0) { 3449 return _mm_set_epi64x((long long)__q1, (long long)__q0); 3450 } 3451 3452 /// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with 3453 /// the specified 32-bit integer values. 3454 /// 3455 /// \headerfile <x86intrin.h> 3456 /// 3457 /// This intrinsic is a utility function and does not correspond to a specific 3458 /// instruction. 3459 /// 3460 /// \param __i3 3461 /// A 32-bit integer value used to initialize bits [127:96] of the 3462 /// destination vector. 3463 /// \param __i2 3464 /// A 32-bit integer value used to initialize bits [95:64] of the destination 3465 /// vector. 3466 /// \param __i1 3467 /// A 32-bit integer value used to initialize bits [63:32] of the destination 3468 /// vector. 3469 /// \param __i0 3470 /// A 32-bit integer value used to initialize bits [31:0] of the destination 3471 /// vector. 3472 /// \returns An initialized 128-bit vector of [4 x i32] containing the values 3473 /// provided in the operands. 3474 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2, 3475 int __i1, int __i0) { 3476 return __extension__(__m128i)(__v4si){__i0, __i1, __i2, __i3}; 3477 } 3478 3479 /// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with 3480 /// the specified 16-bit integer values. 3481 /// 3482 /// \headerfile <x86intrin.h> 3483 /// 3484 /// This intrinsic is a utility function and does not correspond to a specific 3485 /// instruction. 3486 /// 3487 /// \param __w7 3488 /// A 16-bit integer value used to initialize bits [127:112] of the 3489 /// destination vector. 3490 /// \param __w6 3491 /// A 16-bit integer value used to initialize bits [111:96] of the 3492 /// destination vector. 3493 /// \param __w5 3494 /// A 16-bit integer value used to initialize bits [95:80] of the destination 3495 /// vector. 3496 /// \param __w4 3497 /// A 16-bit integer value used to initialize bits [79:64] of the destination 3498 /// vector. 3499 /// \param __w3 3500 /// A 16-bit integer value used to initialize bits [63:48] of the destination 3501 /// vector. 3502 /// \param __w2 3503 /// A 16-bit integer value used to initialize bits [47:32] of the destination 3504 /// vector. 3505 /// \param __w1 3506 /// A 16-bit integer value used to initialize bits [31:16] of the destination 3507 /// vector. 3508 /// \param __w0 3509 /// A 16-bit integer value used to initialize bits [15:0] of the destination 3510 /// vector. 3511 /// \returns An initialized 128-bit vector of [8 x i16] containing the values 3512 /// provided in the operands. 3513 static __inline__ __m128i __DEFAULT_FN_ATTRS 3514 _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, 3515 short __w2, short __w1, short __w0) { 3516 return __extension__(__m128i)(__v8hi){__w0, __w1, __w2, __w3, 3517 __w4, __w5, __w6, __w7}; 3518 } 3519 3520 /// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with 3521 /// the specified 8-bit integer values. 3522 /// 3523 /// \headerfile <x86intrin.h> 3524 /// 3525 /// This intrinsic is a utility function and does not correspond to a specific 3526 /// instruction. 3527 /// 3528 /// \param __b15 3529 /// Initializes bits [127:120] of the destination vector. 3530 /// \param __b14 3531 /// Initializes bits [119:112] of the destination vector. 3532 /// \param __b13 3533 /// Initializes bits [111:104] of the destination vector. 3534 /// \param __b12 3535 /// Initializes bits [103:96] of the destination vector. 3536 /// \param __b11 3537 /// Initializes bits [95:88] of the destination vector. 3538 /// \param __b10 3539 /// Initializes bits [87:80] of the destination vector. 3540 /// \param __b9 3541 /// Initializes bits [79:72] of the destination vector. 3542 /// \param __b8 3543 /// Initializes bits [71:64] of the destination vector. 3544 /// \param __b7 3545 /// Initializes bits [63:56] of the destination vector. 3546 /// \param __b6 3547 /// Initializes bits [55:48] of the destination vector. 3548 /// \param __b5 3549 /// Initializes bits [47:40] of the destination vector. 3550 /// \param __b4 3551 /// Initializes bits [39:32] of the destination vector. 3552 /// \param __b3 3553 /// Initializes bits [31:24] of the destination vector. 3554 /// \param __b2 3555 /// Initializes bits [23:16] of the destination vector. 3556 /// \param __b1 3557 /// Initializes bits [15:8] of the destination vector. 3558 /// \param __b0 3559 /// Initializes bits [7:0] of the destination vector. 3560 /// \returns An initialized 128-bit vector of [16 x i8] containing the values 3561 /// provided in the operands. 3562 static __inline__ __m128i __DEFAULT_FN_ATTRS 3563 _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, 3564 char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, 3565 char __b4, char __b3, char __b2, char __b1, char __b0) { 3566 return __extension__(__m128i)(__v16qi){ 3567 __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, 3568 __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15}; 3569 } 3570 3571 /// Initializes both values in a 128-bit integer vector with the 3572 /// specified 64-bit integer value. 3573 /// 3574 /// \headerfile <x86intrin.h> 3575 /// 3576 /// This intrinsic is a utility function and does not correspond to a specific 3577 /// instruction. 3578 /// 3579 /// \param __q 3580 /// Integer value used to initialize the elements of the destination integer 3581 /// vector. 3582 /// \returns An initialized 128-bit integer vector of [2 x i64] with both 3583 /// elements containing the value provided in the operand. 3584 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q) { 3585 return _mm_set_epi64x(__q, __q); 3586 } 3587 3588 /// Initializes both values in a 128-bit vector of [2 x i64] with the 3589 /// specified 64-bit value. 3590 /// 3591 /// \headerfile <x86intrin.h> 3592 /// 3593 /// This intrinsic is a utility function and does not correspond to a specific 3594 /// instruction. 3595 /// 3596 /// \param __q 3597 /// A 64-bit value used to initialize the elements of the destination integer 3598 /// vector. 3599 /// \returns An initialized 128-bit vector of [2 x i64] with all elements 3600 /// containing the value provided in the operand. 3601 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q) { 3602 return _mm_set_epi64(__q, __q); 3603 } 3604 3605 /// Initializes all values in a 128-bit vector of [4 x i32] with the 3606 /// specified 32-bit value. 3607 /// 3608 /// \headerfile <x86intrin.h> 3609 /// 3610 /// This intrinsic is a utility function and does not correspond to a specific 3611 /// instruction. 3612 /// 3613 /// \param __i 3614 /// A 32-bit value used to initialize the elements of the destination integer 3615 /// vector. 3616 /// \returns An initialized 128-bit vector of [4 x i32] with all elements 3617 /// containing the value provided in the operand. 3618 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i) { 3619 return _mm_set_epi32(__i, __i, __i, __i); 3620 } 3621 3622 /// Initializes all values in a 128-bit vector of [8 x i16] with the 3623 /// specified 16-bit value. 3624 /// 3625 /// \headerfile <x86intrin.h> 3626 /// 3627 /// This intrinsic is a utility function and does not correspond to a specific 3628 /// instruction. 3629 /// 3630 /// \param __w 3631 /// A 16-bit value used to initialize the elements of the destination integer 3632 /// vector. 3633 /// \returns An initialized 128-bit vector of [8 x i16] with all elements 3634 /// containing the value provided in the operand. 3635 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w) { 3636 return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w); 3637 } 3638 3639 /// Initializes all values in a 128-bit vector of [16 x i8] with the 3640 /// specified 8-bit value. 3641 /// 3642 /// \headerfile <x86intrin.h> 3643 /// 3644 /// This intrinsic is a utility function and does not correspond to a specific 3645 /// instruction. 3646 /// 3647 /// \param __b 3648 /// An 8-bit value used to initialize the elements of the destination integer 3649 /// vector. 3650 /// \returns An initialized 128-bit vector of [16 x i8] with all elements 3651 /// containing the value provided in the operand. 3652 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b) { 3653 return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, 3654 __b, __b, __b, __b, __b); 3655 } 3656 3657 /// Constructs a 128-bit integer vector, initialized in reverse order 3658 /// with the specified 64-bit integral values. 3659 /// 3660 /// \headerfile <x86intrin.h> 3661 /// 3662 /// This intrinsic does not correspond to a specific instruction. 3663 /// 3664 /// \param __q0 3665 /// A 64-bit integral value used to initialize the lower 64 bits of the 3666 /// result. 3667 /// \param __q1 3668 /// A 64-bit integral value used to initialize the upper 64 bits of the 3669 /// result. 3670 /// \returns An initialized 128-bit integer vector. 3671 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0, 3672 __m64 __q1) { 3673 return _mm_set_epi64(__q1, __q0); 3674 } 3675 3676 /// Constructs a 128-bit integer vector, initialized in reverse order 3677 /// with the specified 32-bit integral values. 3678 /// 3679 /// \headerfile <x86intrin.h> 3680 /// 3681 /// This intrinsic is a utility function and does not correspond to a specific 3682 /// instruction. 3683 /// 3684 /// \param __i0 3685 /// A 32-bit integral value used to initialize bits [31:0] of the result. 3686 /// \param __i1 3687 /// A 32-bit integral value used to initialize bits [63:32] of the result. 3688 /// \param __i2 3689 /// A 32-bit integral value used to initialize bits [95:64] of the result. 3690 /// \param __i3 3691 /// A 32-bit integral value used to initialize bits [127:96] of the result. 3692 /// \returns An initialized 128-bit integer vector. 3693 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1, 3694 int __i2, 3695 int __i3) { 3696 return _mm_set_epi32(__i3, __i2, __i1, __i0); 3697 } 3698 3699 /// Constructs a 128-bit integer vector, initialized in reverse order 3700 /// with the specified 16-bit integral values. 3701 /// 3702 /// \headerfile <x86intrin.h> 3703 /// 3704 /// This intrinsic is a utility function and does not correspond to a specific 3705 /// instruction. 3706 /// 3707 /// \param __w0 3708 /// A 16-bit integral value used to initialize bits [15:0] of the result. 3709 /// \param __w1 3710 /// A 16-bit integral value used to initialize bits [31:16] of the result. 3711 /// \param __w2 3712 /// A 16-bit integral value used to initialize bits [47:32] of the result. 3713 /// \param __w3 3714 /// A 16-bit integral value used to initialize bits [63:48] of the result. 3715 /// \param __w4 3716 /// A 16-bit integral value used to initialize bits [79:64] of the result. 3717 /// \param __w5 3718 /// A 16-bit integral value used to initialize bits [95:80] of the result. 3719 /// \param __w6 3720 /// A 16-bit integral value used to initialize bits [111:96] of the result. 3721 /// \param __w7 3722 /// A 16-bit integral value used to initialize bits [127:112] of the result. 3723 /// \returns An initialized 128-bit integer vector. 3724 static __inline__ __m128i __DEFAULT_FN_ATTRS 3725 _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, 3726 short __w5, short __w6, short __w7) { 3727 return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0); 3728 } 3729 3730 /// Constructs a 128-bit integer vector, initialized in reverse order 3731 /// with the specified 8-bit integral values. 3732 /// 3733 /// \headerfile <x86intrin.h> 3734 /// 3735 /// This intrinsic is a utility function and does not correspond to a specific 3736 /// instruction. 3737 /// 3738 /// \param __b0 3739 /// An 8-bit integral value used to initialize bits [7:0] of the result. 3740 /// \param __b1 3741 /// An 8-bit integral value used to initialize bits [15:8] of the result. 3742 /// \param __b2 3743 /// An 8-bit integral value used to initialize bits [23:16] of the result. 3744 /// \param __b3 3745 /// An 8-bit integral value used to initialize bits [31:24] of the result. 3746 /// \param __b4 3747 /// An 8-bit integral value used to initialize bits [39:32] of the result. 3748 /// \param __b5 3749 /// An 8-bit integral value used to initialize bits [47:40] of the result. 3750 /// \param __b6 3751 /// An 8-bit integral value used to initialize bits [55:48] of the result. 3752 /// \param __b7 3753 /// An 8-bit integral value used to initialize bits [63:56] of the result. 3754 /// \param __b8 3755 /// An 8-bit integral value used to initialize bits [71:64] of the result. 3756 /// \param __b9 3757 /// An 8-bit integral value used to initialize bits [79:72] of the result. 3758 /// \param __b10 3759 /// An 8-bit integral value used to initialize bits [87:80] of the result. 3760 /// \param __b11 3761 /// An 8-bit integral value used to initialize bits [95:88] of the result. 3762 /// \param __b12 3763 /// An 8-bit integral value used to initialize bits [103:96] of the result. 3764 /// \param __b13 3765 /// An 8-bit integral value used to initialize bits [111:104] of the result. 3766 /// \param __b14 3767 /// An 8-bit integral value used to initialize bits [119:112] of the result. 3768 /// \param __b15 3769 /// An 8-bit integral value used to initialize bits [127:120] of the result. 3770 /// \returns An initialized 128-bit integer vector. 3771 static __inline__ __m128i __DEFAULT_FN_ATTRS 3772 _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, 3773 char __b6, char __b7, char __b8, char __b9, char __b10, 3774 char __b11, char __b12, char __b13, char __b14, char __b15) { 3775 return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8, 3776 __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0); 3777 } 3778 3779 /// Creates a 128-bit integer vector initialized to zero. 3780 /// 3781 /// \headerfile <x86intrin.h> 3782 /// 3783 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction. 3784 /// 3785 /// \returns An initialized 128-bit integer vector with all elements set to 3786 /// zero. 3787 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void) { 3788 return __extension__(__m128i)(__v2di){0LL, 0LL}; 3789 } 3790 3791 /// Stores a 128-bit integer vector to a memory location aligned on a 3792 /// 128-bit boundary. 3793 /// 3794 /// \headerfile <x86intrin.h> 3795 /// 3796 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction. 3797 /// 3798 /// \param __p 3799 /// A pointer to an aligned memory location that will receive the integer 3800 /// values. 3801 /// \param __b 3802 /// A 128-bit integer vector containing the values to be moved. 3803 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p, 3804 __m128i __b) { 3805 *__p = __b; 3806 } 3807 3808 /// Stores a 128-bit integer vector to an unaligned memory location. 3809 /// 3810 /// \headerfile <x86intrin.h> 3811 /// 3812 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction. 3813 /// 3814 /// \param __p 3815 /// A pointer to a memory location that will receive the integer values. 3816 /// \param __b 3817 /// A 128-bit integer vector containing the values to be moved. 3818 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p, 3819 __m128i __b) { 3820 struct __storeu_si128 { 3821 __m128i_u __v; 3822 } __attribute__((__packed__, __may_alias__)); 3823 ((struct __storeu_si128 *)__p)->__v = __b; 3824 } 3825 3826 /// Stores a 64-bit integer value from the low element of a 128-bit integer 3827 /// vector. 3828 /// 3829 /// \headerfile <x86intrin.h> 3830 /// 3831 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 3832 /// 3833 /// \param __p 3834 /// A pointer to a 64-bit memory location. The address of the memory 3835 /// location does not have to be aligned. 3836 /// \param __b 3837 /// A 128-bit integer vector containing the value to be stored. 3838 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si64(void *__p, 3839 __m128i __b) { 3840 struct __storeu_si64 { 3841 long long __v; 3842 } __attribute__((__packed__, __may_alias__)); 3843 ((struct __storeu_si64 *)__p)->__v = ((__v2di)__b)[0]; 3844 } 3845 3846 /// Stores a 32-bit integer value from the low element of a 128-bit integer 3847 /// vector. 3848 /// 3849 /// \headerfile <x86intrin.h> 3850 /// 3851 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. 3852 /// 3853 /// \param __p 3854 /// A pointer to a 32-bit memory location. The address of the memory 3855 /// location does not have to be aligned. 3856 /// \param __b 3857 /// A 128-bit integer vector containing the value to be stored. 3858 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si32(void *__p, 3859 __m128i __b) { 3860 struct __storeu_si32 { 3861 int __v; 3862 } __attribute__((__packed__, __may_alias__)); 3863 ((struct __storeu_si32 *)__p)->__v = ((__v4si)__b)[0]; 3864 } 3865 3866 /// Stores a 16-bit integer value from the low element of a 128-bit integer 3867 /// vector. 3868 /// 3869 /// \headerfile <x86intrin.h> 3870 /// 3871 /// This intrinsic does not correspond to a specific instruction. 3872 /// 3873 /// \param __p 3874 /// A pointer to a 16-bit memory location. The address of the memory 3875 /// location does not have to be aligned. 3876 /// \param __b 3877 /// A 128-bit integer vector containing the value to be stored. 3878 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si16(void *__p, 3879 __m128i __b) { 3880 struct __storeu_si16 { 3881 short __v; 3882 } __attribute__((__packed__, __may_alias__)); 3883 ((struct __storeu_si16 *)__p)->__v = ((__v8hi)__b)[0]; 3884 } 3885 3886 /// Moves bytes selected by the mask from the first operand to the 3887 /// specified unaligned memory location. When a mask bit is 1, the 3888 /// corresponding byte is written, otherwise it is not written. 3889 /// 3890 /// To minimize caching, the data is flagged as non-temporal (unlikely to be 3891 /// used again soon). Exception and trap behavior for elements not selected 3892 /// for storage to memory are implementation dependent. 3893 /// 3894 /// \headerfile <x86intrin.h> 3895 /// 3896 /// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c> 3897 /// instruction. 3898 /// 3899 /// \param __d 3900 /// A 128-bit integer vector containing the values to be moved. 3901 /// \param __n 3902 /// A 128-bit integer vector containing the mask. The most significant bit of 3903 /// each byte represents the mask bits. 3904 /// \param __p 3905 /// A pointer to an unaligned 128-bit memory location where the specified 3906 /// values are moved. 3907 static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d, 3908 __m128i __n, 3909 char *__p) { 3910 __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p); 3911 } 3912 3913 /// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to 3914 /// a memory location. 3915 /// 3916 /// \headerfile <x86intrin.h> 3917 /// 3918 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction. 3919 /// 3920 /// \param __p 3921 /// A pointer to a 64-bit memory location that will receive the lower 64 bits 3922 /// of the integer vector parameter. 3923 /// \param __a 3924 /// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the 3925 /// value to be stored. 3926 static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p, 3927 __m128i __a) { 3928 struct __mm_storel_epi64_struct { 3929 long long __u; 3930 } __attribute__((__packed__, __may_alias__)); 3931 ((struct __mm_storel_epi64_struct *)__p)->__u = __a[0]; 3932 } 3933 3934 /// Stores a 128-bit floating point vector of [2 x double] to a 128-bit 3935 /// aligned memory location. 3936 /// 3937 /// To minimize caching, the data is flagged as non-temporal (unlikely to be 3938 /// used again soon). 3939 /// 3940 /// \headerfile <x86intrin.h> 3941 /// 3942 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction. 3943 /// 3944 /// \param __p 3945 /// A pointer to the 128-bit aligned memory location used to store the value. 3946 /// \param __a 3947 /// A vector of [2 x double] containing the 64-bit values to be stored. 3948 static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(double *__p, 3949 __m128d __a) { 3950 __builtin_nontemporal_store((__v2df)__a, (__v2df *)__p); 3951 } 3952 3953 /// Stores a 128-bit integer vector to a 128-bit aligned memory location. 3954 /// 3955 /// To minimize caching, the data is flagged as non-temporal (unlikely to be 3956 /// used again soon). 3957 /// 3958 /// \headerfile <x86intrin.h> 3959 /// 3960 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction. 3961 /// 3962 /// \param __p 3963 /// A pointer to the 128-bit aligned memory location used to store the value. 3964 /// \param __a 3965 /// A 128-bit integer vector containing the values to be stored. 3966 static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(__m128i *__p, 3967 __m128i __a) { 3968 __builtin_nontemporal_store((__v2di)__a, (__v2di *)__p); 3969 } 3970 3971 /// Stores a 32-bit integer value in the specified memory location. 3972 /// 3973 /// To minimize caching, the data is flagged as non-temporal (unlikely to be 3974 /// used again soon). 3975 /// 3976 /// \headerfile <x86intrin.h> 3977 /// 3978 /// This intrinsic corresponds to the <c> MOVNTI </c> instruction. 3979 /// 3980 /// \param __p 3981 /// A pointer to the 32-bit memory location used to store the value. 3982 /// \param __a 3983 /// A 32-bit integer containing the value to be stored. 3984 static __inline__ void 3985 __attribute__((__always_inline__, __nodebug__, __target__("sse2"))) 3986 _mm_stream_si32(int *__p, int __a) { 3987 __builtin_ia32_movnti(__p, __a); 3988 } 3989 3990 #ifdef __x86_64__ 3991 /// Stores a 64-bit integer value in the specified memory location. 3992 /// 3993 /// To minimize caching, the data is flagged as non-temporal (unlikely to be 3994 /// used again soon). 3995 /// 3996 /// \headerfile <x86intrin.h> 3997 /// 3998 /// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction. 3999 /// 4000 /// \param __p 4001 /// A pointer to the 64-bit memory location used to store the value. 4002 /// \param __a 4003 /// A 64-bit integer containing the value to be stored. 4004 static __inline__ void 4005 __attribute__((__always_inline__, __nodebug__, __target__("sse2"))) 4006 _mm_stream_si64(long long *__p, long long __a) { 4007 __builtin_ia32_movnti64(__p, __a); 4008 } 4009 #endif 4010 4011 #if defined(__cplusplus) 4012 extern "C" { 4013 #endif 4014 4015 /// The cache line containing \a __p is flushed and invalidated from all 4016 /// caches in the coherency domain. 4017 /// 4018 /// \headerfile <x86intrin.h> 4019 /// 4020 /// This intrinsic corresponds to the <c> CLFLUSH </c> instruction. 4021 /// 4022 /// \param __p 4023 /// A pointer to the memory location used to identify the cache line to be 4024 /// flushed. 4025 void _mm_clflush(void const *__p); 4026 4027 /// Forces strong memory ordering (serialization) between load 4028 /// instructions preceding this instruction and load instructions following 4029 /// this instruction, ensuring the system completes all previous loads before 4030 /// executing subsequent loads. 4031 /// 4032 /// \headerfile <x86intrin.h> 4033 /// 4034 /// This intrinsic corresponds to the <c> LFENCE </c> instruction. 4035 /// 4036 void _mm_lfence(void); 4037 4038 /// Forces strong memory ordering (serialization) between load and store 4039 /// instructions preceding this instruction and load and store instructions 4040 /// following this instruction, ensuring that the system completes all 4041 /// previous memory accesses before executing subsequent memory accesses. 4042 /// 4043 /// \headerfile <x86intrin.h> 4044 /// 4045 /// This intrinsic corresponds to the <c> MFENCE </c> instruction. 4046 /// 4047 void _mm_mfence(void); 4048 4049 #if defined(__cplusplus) 4050 } // extern "C" 4051 #endif 4052 4053 /// Converts 16-bit signed integers from both 128-bit integer vector 4054 /// operands into 8-bit signed integers, and packs the results into the 4055 /// destination. Positive values greater than 0x7F are saturated to 0x7F. 4056 /// Negative values less than 0x80 are saturated to 0x80. 4057 /// 4058 /// \headerfile <x86intrin.h> 4059 /// 4060 /// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction. 4061 /// 4062 /// \param __a 4063 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as 4064 /// a signed integer and is converted to a 8-bit signed integer with 4065 /// saturation. Values greater than 0x7F are saturated to 0x7F. Values less 4066 /// than 0x80 are saturated to 0x80. The converted [8 x i8] values are 4067 /// written to the lower 64 bits of the result. 4068 /// \param __b 4069 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as 4070 /// a signed integer and is converted to a 8-bit signed integer with 4071 /// saturation. Values greater than 0x7F are saturated to 0x7F. Values less 4072 /// than 0x80 are saturated to 0x80. The converted [8 x i8] values are 4073 /// written to the higher 64 bits of the result. 4074 /// \returns A 128-bit vector of [16 x i8] containing the converted values. 4075 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a, 4076 __m128i __b) { 4077 return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b); 4078 } 4079 4080 /// Converts 32-bit signed integers from both 128-bit integer vector 4081 /// operands into 16-bit signed integers, and packs the results into the 4082 /// destination. Positive values greater than 0x7FFF are saturated to 0x7FFF. 4083 /// Negative values less than 0x8000 are saturated to 0x8000. 4084 /// 4085 /// \headerfile <x86intrin.h> 4086 /// 4087 /// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction. 4088 /// 4089 /// \param __a 4090 /// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as 4091 /// a signed integer and is converted to a 16-bit signed integer with 4092 /// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values 4093 /// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values 4094 /// are written to the lower 64 bits of the result. 4095 /// \param __b 4096 /// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as 4097 /// a signed integer and is converted to a 16-bit signed integer with 4098 /// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values 4099 /// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values 4100 /// are written to the higher 64 bits of the result. 4101 /// \returns A 128-bit vector of [8 x i16] containing the converted values. 4102 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a, 4103 __m128i __b) { 4104 return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b); 4105 } 4106 4107 /// Converts 16-bit signed integers from both 128-bit integer vector 4108 /// operands into 8-bit unsigned integers, and packs the results into the 4109 /// destination. Values greater than 0xFF are saturated to 0xFF. Values less 4110 /// than 0x00 are saturated to 0x00. 4111 /// 4112 /// \headerfile <x86intrin.h> 4113 /// 4114 /// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction. 4115 /// 4116 /// \param __a 4117 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as 4118 /// a signed integer and is converted to an 8-bit unsigned integer with 4119 /// saturation. Values greater than 0xFF are saturated to 0xFF. Values less 4120 /// than 0x00 are saturated to 0x00. The converted [8 x i8] values are 4121 /// written to the lower 64 bits of the result. 4122 /// \param __b 4123 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as 4124 /// a signed integer and is converted to an 8-bit unsigned integer with 4125 /// saturation. Values greater than 0xFF are saturated to 0xFF. Values less 4126 /// than 0x00 are saturated to 0x00. The converted [8 x i8] values are 4127 /// written to the higher 64 bits of the result. 4128 /// \returns A 128-bit vector of [16 x i8] containing the converted values. 4129 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a, 4130 __m128i __b) { 4131 return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b); 4132 } 4133 4134 /// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using 4135 /// the immediate-value parameter as a selector. 4136 /// 4137 /// \headerfile <x86intrin.h> 4138 /// 4139 /// \code 4140 /// __m128i _mm_extract_epi16(__m128i a, const int imm); 4141 /// \endcode 4142 /// 4143 /// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction. 4144 /// 4145 /// \param a 4146 /// A 128-bit integer vector. 4147 /// \param imm 4148 /// An immediate value. Bits [2:0] selects values from \a a to be assigned 4149 /// to bits[15:0] of the result. \n 4150 /// 000: assign values from bits [15:0] of \a a. \n 4151 /// 001: assign values from bits [31:16] of \a a. \n 4152 /// 010: assign values from bits [47:32] of \a a. \n 4153 /// 011: assign values from bits [63:48] of \a a. \n 4154 /// 100: assign values from bits [79:64] of \a a. \n 4155 /// 101: assign values from bits [95:80] of \a a. \n 4156 /// 110: assign values from bits [111:96] of \a a. \n 4157 /// 111: assign values from bits [127:112] of \a a. 4158 /// \returns An integer, whose lower 16 bits are selected from the 128-bit 4159 /// integer vector parameter and the remaining bits are assigned zeros. 4160 #define _mm_extract_epi16(a, imm) \ 4161 ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \ 4162 (int)(imm))) 4163 4164 /// Constructs a 128-bit integer vector by first making a copy of the 4165 /// 128-bit integer vector parameter, and then inserting the lower 16 bits 4166 /// of an integer parameter into an offset specified by the immediate-value 4167 /// parameter. 4168 /// 4169 /// \headerfile <x86intrin.h> 4170 /// 4171 /// \code 4172 /// __m128i _mm_insert_epi16(__m128i a, int b, const int imm); 4173 /// \endcode 4174 /// 4175 /// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction. 4176 /// 4177 /// \param a 4178 /// A 128-bit integer vector of [8 x i16]. This vector is copied to the 4179 /// result and then one of the eight elements in the result is replaced by 4180 /// the lower 16 bits of \a b. 4181 /// \param b 4182 /// An integer. The lower 16 bits of this parameter are written to the 4183 /// result beginning at an offset specified by \a imm. 4184 /// \param imm 4185 /// An immediate value specifying the bit offset in the result at which the 4186 /// lower 16 bits of \a b are written. 4187 /// \returns A 128-bit integer vector containing the constructed values. 4188 #define _mm_insert_epi16(a, b, imm) \ 4189 ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \ 4190 (int)(imm))) 4191 4192 /// Copies the values of the most significant bits from each 8-bit 4193 /// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask 4194 /// value, zero-extends the value, and writes it to the destination. 4195 /// 4196 /// \headerfile <x86intrin.h> 4197 /// 4198 /// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction. 4199 /// 4200 /// \param __a 4201 /// A 128-bit integer vector containing the values with bits to be extracted. 4202 /// \returns The most significant bits from each 8-bit element in \a __a, 4203 /// written to bits [15:0]. The other bits are assigned zeros. 4204 static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a) { 4205 return __builtin_ia32_pmovmskb128((__v16qi)__a); 4206 } 4207 4208 /// Constructs a 128-bit integer vector by shuffling four 32-bit 4209 /// elements of a 128-bit integer vector parameter, using the immediate-value 4210 /// parameter as a specifier. 4211 /// 4212 /// \headerfile <x86intrin.h> 4213 /// 4214 /// \code 4215 /// __m128i _mm_shuffle_epi32(__m128i a, const int imm); 4216 /// \endcode 4217 /// 4218 /// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction. 4219 /// 4220 /// \param a 4221 /// A 128-bit integer vector containing the values to be copied. 4222 /// \param imm 4223 /// An immediate value containing an 8-bit value specifying which elements to 4224 /// copy from a. The destinations within the 128-bit destination are assigned 4225 /// values as follows: \n 4226 /// Bits [1:0] are used to assign values to bits [31:0] of the result. \n 4227 /// Bits [3:2] are used to assign values to bits [63:32] of the result. \n 4228 /// Bits [5:4] are used to assign values to bits [95:64] of the result. \n 4229 /// Bits [7:6] are used to assign values to bits [127:96] of the result. \n 4230 /// Bit value assignments: \n 4231 /// 00: assign values from bits [31:0] of \a a. \n 4232 /// 01: assign values from bits [63:32] of \a a. \n 4233 /// 10: assign values from bits [95:64] of \a a. \n 4234 /// 11: assign values from bits [127:96] of \a a. \n 4235 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro. 4236 /// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form 4237 /// <c>[b6, b4, b2, b0]</c>. 4238 /// \returns A 128-bit integer vector containing the shuffled values. 4239 #define _mm_shuffle_epi32(a, imm) \ 4240 ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm))) 4241 4242 /// Constructs a 128-bit integer vector by shuffling four lower 16-bit 4243 /// elements of a 128-bit integer vector of [8 x i16], using the immediate 4244 /// value parameter as a specifier. 4245 /// 4246 /// \headerfile <x86intrin.h> 4247 /// 4248 /// \code 4249 /// __m128i _mm_shufflelo_epi16(__m128i a, const int imm); 4250 /// \endcode 4251 /// 4252 /// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction. 4253 /// 4254 /// \param a 4255 /// A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits 4256 /// [127:64] of the result. 4257 /// \param imm 4258 /// An 8-bit immediate value specifying which elements to copy from \a a. \n 4259 /// Bits[1:0] are used to assign values to bits [15:0] of the result. \n 4260 /// Bits[3:2] are used to assign values to bits [31:16] of the result. \n 4261 /// Bits[5:4] are used to assign values to bits [47:32] of the result. \n 4262 /// Bits[7:6] are used to assign values to bits [63:48] of the result. \n 4263 /// Bit value assignments: \n 4264 /// 00: assign values from bits [15:0] of \a a. \n 4265 /// 01: assign values from bits [31:16] of \a a. \n 4266 /// 10: assign values from bits [47:32] of \a a. \n 4267 /// 11: assign values from bits [63:48] of \a a. \n 4268 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro. 4269 /// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form 4270 /// <c>[b6, b4, b2, b0]</c>. 4271 /// \returns A 128-bit integer vector containing the shuffled values. 4272 #define _mm_shufflelo_epi16(a, imm) \ 4273 ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm))) 4274 4275 /// Constructs a 128-bit integer vector by shuffling four upper 16-bit 4276 /// elements of a 128-bit integer vector of [8 x i16], using the immediate 4277 /// value parameter as a specifier. 4278 /// 4279 /// \headerfile <x86intrin.h> 4280 /// 4281 /// \code 4282 /// __m128i _mm_shufflehi_epi16(__m128i a, const int imm); 4283 /// \endcode 4284 /// 4285 /// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction. 4286 /// 4287 /// \param a 4288 /// A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits 4289 /// [63:0] of the result. 4290 /// \param imm 4291 /// An 8-bit immediate value specifying which elements to copy from \a a. \n 4292 /// Bits[1:0] are used to assign values to bits [79:64] of the result. \n 4293 /// Bits[3:2] are used to assign values to bits [95:80] of the result. \n 4294 /// Bits[5:4] are used to assign values to bits [111:96] of the result. \n 4295 /// Bits[7:6] are used to assign values to bits [127:112] of the result. \n 4296 /// Bit value assignments: \n 4297 /// 00: assign values from bits [79:64] of \a a. \n 4298 /// 01: assign values from bits [95:80] of \a a. \n 4299 /// 10: assign values from bits [111:96] of \a a. \n 4300 /// 11: assign values from bits [127:112] of \a a. \n 4301 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro. 4302 /// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form 4303 /// <c>[b6, b4, b2, b0]</c>. 4304 /// \returns A 128-bit integer vector containing the shuffled values. 4305 #define _mm_shufflehi_epi16(a, imm) \ 4306 ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm))) 4307 4308 /// Unpacks the high-order (index 8-15) values from two 128-bit vectors 4309 /// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8]. 4310 /// 4311 /// \headerfile <x86intrin.h> 4312 /// 4313 /// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c> 4314 /// instruction. 4315 /// 4316 /// \param __a 4317 /// A 128-bit vector of [16 x i8]. 4318 /// Bits [71:64] are written to bits [7:0] of the result. \n 4319 /// Bits [79:72] are written to bits [23:16] of the result. \n 4320 /// Bits [87:80] are written to bits [39:32] of the result. \n 4321 /// Bits [95:88] are written to bits [55:48] of the result. \n 4322 /// Bits [103:96] are written to bits [71:64] of the result. \n 4323 /// Bits [111:104] are written to bits [87:80] of the result. \n 4324 /// Bits [119:112] are written to bits [103:96] of the result. \n 4325 /// Bits [127:120] are written to bits [119:112] of the result. 4326 /// \param __b 4327 /// A 128-bit vector of [16 x i8]. \n 4328 /// Bits [71:64] are written to bits [15:8] of the result. \n 4329 /// Bits [79:72] are written to bits [31:24] of the result. \n 4330 /// Bits [87:80] are written to bits [47:40] of the result. \n 4331 /// Bits [95:88] are written to bits [63:56] of the result. \n 4332 /// Bits [103:96] are written to bits [79:72] of the result. \n 4333 /// Bits [111:104] are written to bits [95:88] of the result. \n 4334 /// Bits [119:112] are written to bits [111:104] of the result. \n 4335 /// Bits [127:120] are written to bits [127:120] of the result. 4336 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values. 4337 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a, 4338 __m128i __b) { 4339 return (__m128i)__builtin_shufflevector( 4340 (__v16qi)__a, (__v16qi)__b, 8, 16 + 8, 9, 16 + 9, 10, 16 + 10, 11, 4341 16 + 11, 12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15); 4342 } 4343 4344 /// Unpacks the high-order (index 4-7) values from two 128-bit vectors of 4345 /// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16]. 4346 /// 4347 /// \headerfile <x86intrin.h> 4348 /// 4349 /// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c> 4350 /// instruction. 4351 /// 4352 /// \param __a 4353 /// A 128-bit vector of [8 x i16]. 4354 /// Bits [79:64] are written to bits [15:0] of the result. \n 4355 /// Bits [95:80] are written to bits [47:32] of the result. \n 4356 /// Bits [111:96] are written to bits [79:64] of the result. \n 4357 /// Bits [127:112] are written to bits [111:96] of the result. 4358 /// \param __b 4359 /// A 128-bit vector of [8 x i16]. 4360 /// Bits [79:64] are written to bits [31:16] of the result. \n 4361 /// Bits [95:80] are written to bits [63:48] of the result. \n 4362 /// Bits [111:96] are written to bits [95:80] of the result. \n 4363 /// Bits [127:112] are written to bits [127:112] of the result. 4364 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values. 4365 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a, 4366 __m128i __b) { 4367 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8 + 4, 5, 4368 8 + 5, 6, 8 + 6, 7, 8 + 7); 4369 } 4370 4371 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of 4372 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32]. 4373 /// 4374 /// \headerfile <x86intrin.h> 4375 /// 4376 /// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c> 4377 /// instruction. 4378 /// 4379 /// \param __a 4380 /// A 128-bit vector of [4 x i32]. \n 4381 /// Bits [95:64] are written to bits [31:0] of the destination. \n 4382 /// Bits [127:96] are written to bits [95:64] of the destination. 4383 /// \param __b 4384 /// A 128-bit vector of [4 x i32]. \n 4385 /// Bits [95:64] are written to bits [64:32] of the destination. \n 4386 /// Bits [127:96] are written to bits [127:96] of the destination. 4387 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values. 4388 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a, 4389 __m128i __b) { 4390 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4 + 2, 3, 4391 4 + 3); 4392 } 4393 4394 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of 4395 /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64]. 4396 /// 4397 /// \headerfile <x86intrin.h> 4398 /// 4399 /// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c> 4400 /// instruction. 4401 /// 4402 /// \param __a 4403 /// A 128-bit vector of [2 x i64]. \n 4404 /// Bits [127:64] are written to bits [63:0] of the destination. 4405 /// \param __b 4406 /// A 128-bit vector of [2 x i64]. \n 4407 /// Bits [127:64] are written to bits [127:64] of the destination. 4408 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values. 4409 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a, 4410 __m128i __b) { 4411 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2 + 1); 4412 } 4413 4414 /// Unpacks the low-order (index 0-7) values from two 128-bit vectors of 4415 /// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8]. 4416 /// 4417 /// \headerfile <x86intrin.h> 4418 /// 4419 /// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c> 4420 /// instruction. 4421 /// 4422 /// \param __a 4423 /// A 128-bit vector of [16 x i8]. \n 4424 /// Bits [7:0] are written to bits [7:0] of the result. \n 4425 /// Bits [15:8] are written to bits [23:16] of the result. \n 4426 /// Bits [23:16] are written to bits [39:32] of the result. \n 4427 /// Bits [31:24] are written to bits [55:48] of the result. \n 4428 /// Bits [39:32] are written to bits [71:64] of the result. \n 4429 /// Bits [47:40] are written to bits [87:80] of the result. \n 4430 /// Bits [55:48] are written to bits [103:96] of the result. \n 4431 /// Bits [63:56] are written to bits [119:112] of the result. 4432 /// \param __b 4433 /// A 128-bit vector of [16 x i8]. 4434 /// Bits [7:0] are written to bits [15:8] of the result. \n 4435 /// Bits [15:8] are written to bits [31:24] of the result. \n 4436 /// Bits [23:16] are written to bits [47:40] of the result. \n 4437 /// Bits [31:24] are written to bits [63:56] of the result. \n 4438 /// Bits [39:32] are written to bits [79:72] of the result. \n 4439 /// Bits [47:40] are written to bits [95:88] of the result. \n 4440 /// Bits [55:48] are written to bits [111:104] of the result. \n 4441 /// Bits [63:56] are written to bits [127:120] of the result. 4442 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values. 4443 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a, 4444 __m128i __b) { 4445 return (__m128i)__builtin_shufflevector( 4446 (__v16qi)__a, (__v16qi)__b, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 4, 4447 16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7); 4448 } 4449 4450 /// Unpacks the low-order (index 0-3) values from each of the two 128-bit 4451 /// vectors of [8 x i16] and interleaves them into a 128-bit vector of 4452 /// [8 x i16]. 4453 /// 4454 /// \headerfile <x86intrin.h> 4455 /// 4456 /// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c> 4457 /// instruction. 4458 /// 4459 /// \param __a 4460 /// A 128-bit vector of [8 x i16]. 4461 /// Bits [15:0] are written to bits [15:0] of the result. \n 4462 /// Bits [31:16] are written to bits [47:32] of the result. \n 4463 /// Bits [47:32] are written to bits [79:64] of the result. \n 4464 /// Bits [63:48] are written to bits [111:96] of the result. 4465 /// \param __b 4466 /// A 128-bit vector of [8 x i16]. 4467 /// Bits [15:0] are written to bits [31:16] of the result. \n 4468 /// Bits [31:16] are written to bits [63:48] of the result. \n 4469 /// Bits [47:32] are written to bits [95:80] of the result. \n 4470 /// Bits [63:48] are written to bits [127:112] of the result. 4471 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values. 4472 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a, 4473 __m128i __b) { 4474 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8 + 0, 1, 4475 8 + 1, 2, 8 + 2, 3, 8 + 3); 4476 } 4477 4478 /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of 4479 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32]. 4480 /// 4481 /// \headerfile <x86intrin.h> 4482 /// 4483 /// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c> 4484 /// instruction. 4485 /// 4486 /// \param __a 4487 /// A 128-bit vector of [4 x i32]. \n 4488 /// Bits [31:0] are written to bits [31:0] of the destination. \n 4489 /// Bits [63:32] are written to bits [95:64] of the destination. 4490 /// \param __b 4491 /// A 128-bit vector of [4 x i32]. \n 4492 /// Bits [31:0] are written to bits [64:32] of the destination. \n 4493 /// Bits [63:32] are written to bits [127:96] of the destination. 4494 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values. 4495 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a, 4496 __m128i __b) { 4497 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4 + 0, 1, 4498 4 + 1); 4499 } 4500 4501 /// Unpacks the low-order 64-bit elements from two 128-bit vectors of 4502 /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64]. 4503 /// 4504 /// \headerfile <x86intrin.h> 4505 /// 4506 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c> 4507 /// instruction. 4508 /// 4509 /// \param __a 4510 /// A 128-bit vector of [2 x i64]. \n 4511 /// Bits [63:0] are written to bits [63:0] of the destination. \n 4512 /// \param __b 4513 /// A 128-bit vector of [2 x i64]. \n 4514 /// Bits [63:0] are written to bits [127:64] of the destination. \n 4515 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values. 4516 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a, 4517 __m128i __b) { 4518 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2 + 0); 4519 } 4520 4521 /// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit 4522 /// integer. 4523 /// 4524 /// \headerfile <x86intrin.h> 4525 /// 4526 /// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction. 4527 /// 4528 /// \param __a 4529 /// A 128-bit integer vector operand. The lower 64 bits are moved to the 4530 /// destination. 4531 /// \returns A 64-bit integer containing the lower 64 bits of the parameter. 4532 static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a) { 4533 return (__m64)__a[0]; 4534 } 4535 4536 /// Moves the 64-bit operand to a 128-bit integer vector, zeroing the 4537 /// upper bits. 4538 /// 4539 /// \headerfile <x86intrin.h> 4540 /// 4541 /// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction. 4542 /// 4543 /// \param __a 4544 /// A 64-bit value. 4545 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from 4546 /// the operand. The upper 64 bits are assigned zeros. 4547 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a) { 4548 return __extension__(__m128i)(__v2di){(long long)__a, 0}; 4549 } 4550 4551 /// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit 4552 /// integer vector, zeroing the upper bits. 4553 /// 4554 /// \headerfile <x86intrin.h> 4555 /// 4556 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 4557 /// 4558 /// \param __a 4559 /// A 128-bit integer vector operand. The lower 64 bits are moved to the 4560 /// destination. 4561 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from 4562 /// the operand. The upper 64 bits are assigned zeros. 4563 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a) { 4564 return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2); 4565 } 4566 4567 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of 4568 /// [2 x double] and interleaves them into a 128-bit vector of [2 x 4569 /// double]. 4570 /// 4571 /// \headerfile <x86intrin.h> 4572 /// 4573 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction. 4574 /// 4575 /// \param __a 4576 /// A 128-bit vector of [2 x double]. \n 4577 /// Bits [127:64] are written to bits [63:0] of the destination. 4578 /// \param __b 4579 /// A 128-bit vector of [2 x double]. \n 4580 /// Bits [127:64] are written to bits [127:64] of the destination. 4581 /// \returns A 128-bit vector of [2 x double] containing the interleaved values. 4582 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a, 4583 __m128d __b) { 4584 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2 + 1); 4585 } 4586 4587 /// Unpacks the low-order 64-bit elements from two 128-bit vectors 4588 /// of [2 x double] and interleaves them into a 128-bit vector of [2 x 4589 /// double]. 4590 /// 4591 /// \headerfile <x86intrin.h> 4592 /// 4593 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction. 4594 /// 4595 /// \param __a 4596 /// A 128-bit vector of [2 x double]. \n 4597 /// Bits [63:0] are written to bits [63:0] of the destination. 4598 /// \param __b 4599 /// A 128-bit vector of [2 x double]. \n 4600 /// Bits [63:0] are written to bits [127:64] of the destination. 4601 /// \returns A 128-bit vector of [2 x double] containing the interleaved values. 4602 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a, 4603 __m128d __b) { 4604 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2 + 0); 4605 } 4606 4607 /// Extracts the sign bits of the double-precision values in the 128-bit 4608 /// vector of [2 x double], zero-extends the value, and writes it to the 4609 /// low-order bits of the destination. 4610 /// 4611 /// \headerfile <x86intrin.h> 4612 /// 4613 /// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction. 4614 /// 4615 /// \param __a 4616 /// A 128-bit vector of [2 x double] containing the values with sign bits to 4617 /// be extracted. 4618 /// \returns The sign bits from each of the double-precision elements in \a __a, 4619 /// written to bits [1:0]. The remaining bits are assigned values of zero. 4620 static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a) { 4621 return __builtin_ia32_movmskpd((__v2df)__a); 4622 } 4623 4624 /// Constructs a 128-bit floating-point vector of [2 x double] from two 4625 /// 128-bit vector parameters of [2 x double], using the immediate-value 4626 /// parameter as a specifier. 4627 /// 4628 /// \headerfile <x86intrin.h> 4629 /// 4630 /// \code 4631 /// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i); 4632 /// \endcode 4633 /// 4634 /// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction. 4635 /// 4636 /// \param a 4637 /// A 128-bit vector of [2 x double]. 4638 /// \param b 4639 /// A 128-bit vector of [2 x double]. 4640 /// \param i 4641 /// An 8-bit immediate value. The least significant two bits specify which 4642 /// elements to copy from \a a and \a b: \n 4643 /// Bit[0] = 0: lower element of \a a copied to lower element of result. \n 4644 /// Bit[0] = 1: upper element of \a a copied to lower element of result. \n 4645 /// Bit[1] = 0: lower element of \a b copied to upper element of result. \n 4646 /// Bit[1] = 1: upper element of \a b copied to upper element of result. \n 4647 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE2 macro. 4648 /// <c>_MM_SHUFFLE2(b1, b0)</c> can create a 2-bit mask of the form 4649 /// <c>[b1, b0]</c>. 4650 /// \returns A 128-bit vector of [2 x double] containing the shuffled values. 4651 #define _mm_shuffle_pd(a, b, i) \ 4652 ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \ 4653 (int)(i))) 4654 4655 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit 4656 /// floating-point vector of [4 x float]. 4657 /// 4658 /// \headerfile <x86intrin.h> 4659 /// 4660 /// This intrinsic has no corresponding instruction. 4661 /// 4662 /// \param __a 4663 /// A 128-bit floating-point vector of [2 x double]. 4664 /// \returns A 128-bit floating-point vector of [4 x float] containing the same 4665 /// bitwise pattern as the parameter. 4666 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a) { 4667 return (__m128)__a; 4668 } 4669 4670 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit 4671 /// integer vector. 4672 /// 4673 /// \headerfile <x86intrin.h> 4674 /// 4675 /// This intrinsic has no corresponding instruction. 4676 /// 4677 /// \param __a 4678 /// A 128-bit floating-point vector of [2 x double]. 4679 /// \returns A 128-bit integer vector containing the same bitwise pattern as the 4680 /// parameter. 4681 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a) { 4682 return (__m128i)__a; 4683 } 4684 4685 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit 4686 /// floating-point vector of [2 x double]. 4687 /// 4688 /// \headerfile <x86intrin.h> 4689 /// 4690 /// This intrinsic has no corresponding instruction. 4691 /// 4692 /// \param __a 4693 /// A 128-bit floating-point vector of [4 x float]. 4694 /// \returns A 128-bit floating-point vector of [2 x double] containing the same 4695 /// bitwise pattern as the parameter. 4696 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a) { 4697 return (__m128d)__a; 4698 } 4699 4700 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit 4701 /// integer vector. 4702 /// 4703 /// \headerfile <x86intrin.h> 4704 /// 4705 /// This intrinsic has no corresponding instruction. 4706 /// 4707 /// \param __a 4708 /// A 128-bit floating-point vector of [4 x float]. 4709 /// \returns A 128-bit integer vector containing the same bitwise pattern as the 4710 /// parameter. 4711 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a) { 4712 return (__m128i)__a; 4713 } 4714 4715 /// Casts a 128-bit integer vector into a 128-bit floating-point vector 4716 /// of [4 x float]. 4717 /// 4718 /// \headerfile <x86intrin.h> 4719 /// 4720 /// This intrinsic has no corresponding instruction. 4721 /// 4722 /// \param __a 4723 /// A 128-bit integer vector. 4724 /// \returns A 128-bit floating-point vector of [4 x float] containing the same 4725 /// bitwise pattern as the parameter. 4726 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a) { 4727 return (__m128)__a; 4728 } 4729 4730 /// Casts a 128-bit integer vector into a 128-bit floating-point vector 4731 /// of [2 x double]. 4732 /// 4733 /// \headerfile <x86intrin.h> 4734 /// 4735 /// This intrinsic has no corresponding instruction. 4736 /// 4737 /// \param __a 4738 /// A 128-bit integer vector. 4739 /// \returns A 128-bit floating-point vector of [2 x double] containing the same 4740 /// bitwise pattern as the parameter. 4741 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a) { 4742 return (__m128d)__a; 4743 } 4744 4745 #if defined(__cplusplus) 4746 extern "C" { 4747 #endif 4748 4749 /// Indicates that a spin loop is being executed for the purposes of 4750 /// optimizing power consumption during the loop. 4751 /// 4752 /// \headerfile <x86intrin.h> 4753 /// 4754 /// This intrinsic corresponds to the <c> PAUSE </c> instruction. 4755 /// 4756 void _mm_pause(void); 4757 4758 #if defined(__cplusplus) 4759 } // extern "C" 4760 #endif 4761 #undef __DEFAULT_FN_ATTRS 4762 #undef __DEFAULT_FN_ATTRS_MMX 4763 4764 #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) 4765 4766 #define _MM_DENORMALS_ZERO_ON (0x0040U) 4767 #define _MM_DENORMALS_ZERO_OFF (0x0000U) 4768 4769 #define _MM_DENORMALS_ZERO_MASK (0x0040U) 4770 4771 #define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK) 4772 #define _MM_SET_DENORMALS_ZERO_MODE(x) \ 4773 (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x))) 4774 4775 #endif /* __EMMINTRIN_H */ 4776