1 /*===---- emmintrin.h - SSE2 intrinsics ------------------------------------=== 2 * 3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 * See https://llvm.org/LICENSE.txt for license information. 5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 * 7 *===-----------------------------------------------------------------------=== 8 */ 9 10 #ifndef __EMMINTRIN_H 11 #define __EMMINTRIN_H 12 13 #if !defined(__i386__) && !defined(__x86_64__) 14 #error "This header is only meant to be used on x86 and x64 architecture" 15 #endif 16 17 #include <xmmintrin.h> 18 19 typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16))); 20 typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16))); 21 22 typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1))); 23 typedef long long __m128i_u 24 __attribute__((__vector_size__(16), __aligned__(1))); 25 26 /* Type defines. */ 27 typedef double __v2df __attribute__((__vector_size__(16))); 28 typedef long long __v2di __attribute__((__vector_size__(16))); 29 typedef short __v8hi __attribute__((__vector_size__(16))); 30 typedef char __v16qi __attribute__((__vector_size__(16))); 31 32 /* Unsigned types */ 33 typedef unsigned long long __v2du __attribute__((__vector_size__(16))); 34 typedef unsigned short __v8hu __attribute__((__vector_size__(16))); 35 typedef unsigned char __v16qu __attribute__((__vector_size__(16))); 36 37 /* We need an explicitly signed variant for char. Note that this shouldn't 38 * appear in the interface though. */ 39 typedef signed char __v16qs __attribute__((__vector_size__(16))); 40 41 /* Define the default attributes for the functions in this file. 
*/
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("sse2"),           \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_MMX                                                 \
  __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse2"),       \
                 __min_vector_width__(64)))

/// Adds lower double-precision values in both operands and returns the
///    sum in the lower 64 bits of the result. The upper 64 bits of the result
///    are copied from the upper double-precision value of the first operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
///
/// \param __a
///    A 128-bit vector of [2 x double] containing one of the source operands.
/// \param __b
///    A 128-bit vector of [2 x double] containing one of the source operands.
/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
///    sum of the lower 64 bits of both operands. The upper 64 bits are copied
///    from the upper 64 bits of the first source operand.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a,
                                                        __m128d __b) {
  // __a is a by-value copy: only element 0 is updated, element 1 passes
  // through unchanged.
  __a[0] += __b[0];
  return __a;
}

/// Adds two 128-bit vectors of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
///
/// \param __a
///    A 128-bit vector of [2 x double] containing one of the source operands.
/// \param __b
///    A 128-bit vector of [2 x double] containing one of the source operands.
/// \returns A 128-bit vector of [2 x double] containing the sums of both
///    operands.
82 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a, 83 __m128d __b) { 84 return (__m128d)((__v2df)__a + (__v2df)__b); 85 } 86 87 /// Subtracts the lower double-precision value of the second operand 88 /// from the lower double-precision value of the first operand and returns 89 /// the difference in the lower 64 bits of the result. The upper 64 bits of 90 /// the result are copied from the upper double-precision value of the first 91 /// operand. 92 /// 93 /// \headerfile <x86intrin.h> 94 /// 95 /// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction. 96 /// 97 /// \param __a 98 /// A 128-bit vector of [2 x double] containing the minuend. 99 /// \param __b 100 /// A 128-bit vector of [2 x double] containing the subtrahend. 101 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 102 /// difference of the lower 64 bits of both operands. The upper 64 bits are 103 /// copied from the upper 64 bits of the first source operand. 104 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a, 105 __m128d __b) { 106 __a[0] -= __b[0]; 107 return __a; 108 } 109 110 /// Subtracts two 128-bit vectors of [2 x double]. 111 /// 112 /// \headerfile <x86intrin.h> 113 /// 114 /// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction. 115 /// 116 /// \param __a 117 /// A 128-bit vector of [2 x double] containing the minuend. 118 /// \param __b 119 /// A 128-bit vector of [2 x double] containing the subtrahend. 120 /// \returns A 128-bit vector of [2 x double] containing the differences between 121 /// both operands. 122 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a, 123 __m128d __b) { 124 return (__m128d)((__v2df)__a - (__v2df)__b); 125 } 126 127 /// Multiplies lower double-precision values in both operands and returns 128 /// the product in the lower 64 bits of the result. 
The upper 64 bits of the 129 /// result are copied from the upper double-precision value of the first 130 /// operand. 131 /// 132 /// \headerfile <x86intrin.h> 133 /// 134 /// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction. 135 /// 136 /// \param __a 137 /// A 128-bit vector of [2 x double] containing one of the source operands. 138 /// \param __b 139 /// A 128-bit vector of [2 x double] containing one of the source operands. 140 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 141 /// product of the lower 64 bits of both operands. The upper 64 bits are 142 /// copied from the upper 64 bits of the first source operand. 143 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a, 144 __m128d __b) { 145 __a[0] *= __b[0]; 146 return __a; 147 } 148 149 /// Multiplies two 128-bit vectors of [2 x double]. 150 /// 151 /// \headerfile <x86intrin.h> 152 /// 153 /// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction. 154 /// 155 /// \param __a 156 /// A 128-bit vector of [2 x double] containing one of the operands. 157 /// \param __b 158 /// A 128-bit vector of [2 x double] containing one of the operands. 159 /// \returns A 128-bit vector of [2 x double] containing the products of both 160 /// operands. 161 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a, 162 __m128d __b) { 163 return (__m128d)((__v2df)__a * (__v2df)__b); 164 } 165 166 /// Divides the lower double-precision value of the first operand by the 167 /// lower double-precision value of the second operand and returns the 168 /// quotient in the lower 64 bits of the result. The upper 64 bits of the 169 /// result are copied from the upper double-precision value of the first 170 /// operand. 171 /// 172 /// \headerfile <x86intrin.h> 173 /// 174 /// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction. 175 /// 176 /// \param __a 177 /// A 128-bit vector of [2 x double] containing the dividend. 
178 /// \param __b 179 /// A 128-bit vector of [2 x double] containing divisor. 180 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 181 /// quotient of the lower 64 bits of both operands. The upper 64 bits are 182 /// copied from the upper 64 bits of the first source operand. 183 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a, 184 __m128d __b) { 185 __a[0] /= __b[0]; 186 return __a; 187 } 188 189 /// Performs an element-by-element division of two 128-bit vectors of 190 /// [2 x double]. 191 /// 192 /// \headerfile <x86intrin.h> 193 /// 194 /// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction. 195 /// 196 /// \param __a 197 /// A 128-bit vector of [2 x double] containing the dividend. 198 /// \param __b 199 /// A 128-bit vector of [2 x double] containing the divisor. 200 /// \returns A 128-bit vector of [2 x double] containing the quotients of both 201 /// operands. 202 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a, 203 __m128d __b) { 204 return (__m128d)((__v2df)__a / (__v2df)__b); 205 } 206 207 /// Calculates the square root of the lower double-precision value of 208 /// the second operand and returns it in the lower 64 bits of the result. 209 /// The upper 64 bits of the result are copied from the upper 210 /// double-precision value of the first operand. 211 /// 212 /// \headerfile <x86intrin.h> 213 /// 214 /// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction. 215 /// 216 /// \param __a 217 /// A 128-bit vector of [2 x double] containing one of the operands. The 218 /// upper 64 bits of this operand are copied to the upper 64 bits of the 219 /// result. 220 /// \param __b 221 /// A 128-bit vector of [2 x double] containing one of the operands. The 222 /// square root is calculated using the lower 64 bits of this operand. 
223 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 224 /// square root of the lower 64 bits of operand \a __b, and whose upper 64 225 /// bits are copied from the upper 64 bits of operand \a __a. 226 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a, 227 __m128d __b) { 228 __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b); 229 return __extension__(__m128d){__c[0], __a[1]}; 230 } 231 232 /// Calculates the square root of the each of two values stored in a 233 /// 128-bit vector of [2 x double]. 234 /// 235 /// \headerfile <x86intrin.h> 236 /// 237 /// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction. 238 /// 239 /// \param __a 240 /// A 128-bit vector of [2 x double]. 241 /// \returns A 128-bit vector of [2 x double] containing the square roots of the 242 /// values in the operand. 243 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) { 244 return __builtin_ia32_sqrtpd((__v2df)__a); 245 } 246 247 /// Compares lower 64-bit double-precision values of both operands, and 248 /// returns the lesser of the pair of values in the lower 64-bits of the 249 /// result. The upper 64 bits of the result are copied from the upper 250 /// double-precision value of the first operand. 251 /// 252 /// \headerfile <x86intrin.h> 253 /// 254 /// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction. 255 /// 256 /// \param __a 257 /// A 128-bit vector of [2 x double] containing one of the operands. The 258 /// lower 64 bits of this operand are used in the comparison. 259 /// \param __b 260 /// A 128-bit vector of [2 x double] containing one of the operands. The 261 /// lower 64 bits of this operand are used in the comparison. 262 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 263 /// minimum value between both operands. The upper 64 bits are copied from 264 /// the upper 64 bits of the first source operand. 
265 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a, 266 __m128d __b) { 267 return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b); 268 } 269 270 /// Performs element-by-element comparison of the two 128-bit vectors of 271 /// [2 x double] and returns the vector containing the lesser of each pair of 272 /// values. 273 /// 274 /// \headerfile <x86intrin.h> 275 /// 276 /// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction. 277 /// 278 /// \param __a 279 /// A 128-bit vector of [2 x double] containing one of the operands. 280 /// \param __b 281 /// A 128-bit vector of [2 x double] containing one of the operands. 282 /// \returns A 128-bit vector of [2 x double] containing the minimum values 283 /// between both operands. 284 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a, 285 __m128d __b) { 286 return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b); 287 } 288 289 /// Compares lower 64-bit double-precision values of both operands, and 290 /// returns the greater of the pair of values in the lower 64-bits of the 291 /// result. The upper 64 bits of the result are copied from the upper 292 /// double-precision value of the first operand. 293 /// 294 /// \headerfile <x86intrin.h> 295 /// 296 /// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction. 297 /// 298 /// \param __a 299 /// A 128-bit vector of [2 x double] containing one of the operands. The 300 /// lower 64 bits of this operand are used in the comparison. 301 /// \param __b 302 /// A 128-bit vector of [2 x double] containing one of the operands. The 303 /// lower 64 bits of this operand are used in the comparison. 304 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 305 /// maximum value between both operands. The upper 64 bits are copied from 306 /// the upper 64 bits of the first source operand. 
307 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a, 308 __m128d __b) { 309 return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b); 310 } 311 312 /// Performs element-by-element comparison of the two 128-bit vectors of 313 /// [2 x double] and returns the vector containing the greater of each pair 314 /// of values. 315 /// 316 /// \headerfile <x86intrin.h> 317 /// 318 /// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction. 319 /// 320 /// \param __a 321 /// A 128-bit vector of [2 x double] containing one of the operands. 322 /// \param __b 323 /// A 128-bit vector of [2 x double] containing one of the operands. 324 /// \returns A 128-bit vector of [2 x double] containing the maximum values 325 /// between both operands. 326 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a, 327 __m128d __b) { 328 return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b); 329 } 330 331 /// Performs a bitwise AND of two 128-bit vectors of [2 x double]. 332 /// 333 /// \headerfile <x86intrin.h> 334 /// 335 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction. 336 /// 337 /// \param __a 338 /// A 128-bit vector of [2 x double] containing one of the source operands. 339 /// \param __b 340 /// A 128-bit vector of [2 x double] containing one of the source operands. 341 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the 342 /// values between both operands. 343 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a, 344 __m128d __b) { 345 return (__m128d)((__v2du)__a & (__v2du)__b); 346 } 347 348 /// Performs a bitwise AND of two 128-bit vectors of [2 x double], using 349 /// the one's complement of the values contained in the first source operand. 350 /// 351 /// \headerfile <x86intrin.h> 352 /// 353 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction. 354 /// 355 /// \param __a 356 /// A 128-bit vector of [2 x double] containing the left source operand. 
The 357 /// one's complement of this value is used in the bitwise AND. 358 /// \param __b 359 /// A 128-bit vector of [2 x double] containing the right source operand. 360 /// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the 361 /// values in the second operand and the one's complement of the first 362 /// operand. 363 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a, 364 __m128d __b) { 365 return (__m128d)(~(__v2du)__a & (__v2du)__b); 366 } 367 368 /// Performs a bitwise OR of two 128-bit vectors of [2 x double]. 369 /// 370 /// \headerfile <x86intrin.h> 371 /// 372 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction. 373 /// 374 /// \param __a 375 /// A 128-bit vector of [2 x double] containing one of the source operands. 376 /// \param __b 377 /// A 128-bit vector of [2 x double] containing one of the source operands. 378 /// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the 379 /// values between both operands. 380 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a, 381 __m128d __b) { 382 return (__m128d)((__v2du)__a | (__v2du)__b); 383 } 384 385 /// Performs a bitwise XOR of two 128-bit vectors of [2 x double]. 386 /// 387 /// \headerfile <x86intrin.h> 388 /// 389 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction. 390 /// 391 /// \param __a 392 /// A 128-bit vector of [2 x double] containing one of the source operands. 393 /// \param __b 394 /// A 128-bit vector of [2 x double] containing one of the source operands. 395 /// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the 396 /// values between both operands. 397 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a, 398 __m128d __b) { 399 return (__m128d)((__v2du)__a ^ (__v2du)__b); 400 } 401 402 /// Compares each of the corresponding double-precision values of the 403 /// 128-bit vectors of [2 x double] for equality. 
Each comparison yields 0x0 404 /// for false, 0xFFFFFFFFFFFFFFFF for true. 405 /// 406 /// \headerfile <x86intrin.h> 407 /// 408 /// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction. 409 /// 410 /// \param __a 411 /// A 128-bit vector of [2 x double]. 412 /// \param __b 413 /// A 128-bit vector of [2 x double]. 414 /// \returns A 128-bit vector containing the comparison results. 415 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a, 416 __m128d __b) { 417 return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b); 418 } 419 420 /// Compares each of the corresponding double-precision values of the 421 /// 128-bit vectors of [2 x double] to determine if the values in the first 422 /// operand are less than those in the second operand. Each comparison 423 /// yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 424 /// 425 /// \headerfile <x86intrin.h> 426 /// 427 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction. 428 /// 429 /// \param __a 430 /// A 128-bit vector of [2 x double]. 431 /// \param __b 432 /// A 128-bit vector of [2 x double]. 433 /// \returns A 128-bit vector containing the comparison results. 434 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a, 435 __m128d __b) { 436 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b); 437 } 438 439 /// Compares each of the corresponding double-precision values of the 440 /// 128-bit vectors of [2 x double] to determine if the values in the first 441 /// operand are less than or equal to those in the second operand. 442 /// 443 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 444 /// 445 /// \headerfile <x86intrin.h> 446 /// 447 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction. 448 /// 449 /// \param __a 450 /// A 128-bit vector of [2 x double]. 451 /// \param __b 452 /// A 128-bit vector of [2 x double]. 
453 /// \returns A 128-bit vector containing the comparison results. 454 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a, 455 __m128d __b) { 456 return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b); 457 } 458 459 /// Compares each of the corresponding double-precision values of the 460 /// 128-bit vectors of [2 x double] to determine if the values in the first 461 /// operand are greater than those in the second operand. 462 /// 463 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 464 /// 465 /// \headerfile <x86intrin.h> 466 /// 467 /// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction. 468 /// 469 /// \param __a 470 /// A 128-bit vector of [2 x double]. 471 /// \param __b 472 /// A 128-bit vector of [2 x double]. 473 /// \returns A 128-bit vector containing the comparison results. 474 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a, 475 __m128d __b) { 476 return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a); 477 } 478 479 /// Compares each of the corresponding double-precision values of the 480 /// 128-bit vectors of [2 x double] to determine if the values in the first 481 /// operand are greater than or equal to those in the second operand. 482 /// 483 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 484 /// 485 /// \headerfile <x86intrin.h> 486 /// 487 /// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction. 488 /// 489 /// \param __a 490 /// A 128-bit vector of [2 x double]. 491 /// \param __b 492 /// A 128-bit vector of [2 x double]. 493 /// \returns A 128-bit vector containing the comparison results. 
494 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a, 495 __m128d __b) { 496 return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a); 497 } 498 499 /// Compares each of the corresponding double-precision values of the 500 /// 128-bit vectors of [2 x double] to determine if the values in the first 501 /// operand are ordered with respect to those in the second operand. 502 /// 503 /// A pair of double-precision values are "ordered" with respect to each 504 /// other if neither value is a NaN. Each comparison yields 0x0 for false, 505 /// 0xFFFFFFFFFFFFFFFF for true. 506 /// 507 /// \headerfile <x86intrin.h> 508 /// 509 /// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction. 510 /// 511 /// \param __a 512 /// A 128-bit vector of [2 x double]. 513 /// \param __b 514 /// A 128-bit vector of [2 x double]. 515 /// \returns A 128-bit vector containing the comparison results. 516 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a, 517 __m128d __b) { 518 return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b); 519 } 520 521 /// Compares each of the corresponding double-precision values of the 522 /// 128-bit vectors of [2 x double] to determine if the values in the first 523 /// operand are unordered with respect to those in the second operand. 524 /// 525 /// A pair of double-precision values are "unordered" with respect to each 526 /// other if one or both values are NaN. Each comparison yields 0x0 for 527 /// false, 0xFFFFFFFFFFFFFFFF for true. 528 /// 529 /// \headerfile <x86intrin.h> 530 /// 531 /// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c> 532 /// instruction. 533 /// 534 /// \param __a 535 /// A 128-bit vector of [2 x double]. 536 /// \param __b 537 /// A 128-bit vector of [2 x double]. 538 /// \returns A 128-bit vector containing the comparison results. 
539 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a, 540 __m128d __b) { 541 return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b); 542 } 543 544 /// Compares each of the corresponding double-precision values of the 545 /// 128-bit vectors of [2 x double] to determine if the values in the first 546 /// operand are unequal to those in the second operand. 547 /// 548 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 549 /// 550 /// \headerfile <x86intrin.h> 551 /// 552 /// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction. 553 /// 554 /// \param __a 555 /// A 128-bit vector of [2 x double]. 556 /// \param __b 557 /// A 128-bit vector of [2 x double]. 558 /// \returns A 128-bit vector containing the comparison results. 559 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a, 560 __m128d __b) { 561 return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b); 562 } 563 564 /// Compares each of the corresponding double-precision values of the 565 /// 128-bit vectors of [2 x double] to determine if the values in the first 566 /// operand are not less than those in the second operand. 567 /// 568 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 569 /// 570 /// \headerfile <x86intrin.h> 571 /// 572 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction. 573 /// 574 /// \param __a 575 /// A 128-bit vector of [2 x double]. 576 /// \param __b 577 /// A 128-bit vector of [2 x double]. 578 /// \returns A 128-bit vector containing the comparison results. 
579 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a, 580 __m128d __b) { 581 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b); 582 } 583 584 /// Compares each of the corresponding double-precision values of the 585 /// 128-bit vectors of [2 x double] to determine if the values in the first 586 /// operand are not less than or equal to those in the second operand. 587 /// 588 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 589 /// 590 /// \headerfile <x86intrin.h> 591 /// 592 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction. 593 /// 594 /// \param __a 595 /// A 128-bit vector of [2 x double]. 596 /// \param __b 597 /// A 128-bit vector of [2 x double]. 598 /// \returns A 128-bit vector containing the comparison results. 599 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a, 600 __m128d __b) { 601 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b); 602 } 603 604 /// Compares each of the corresponding double-precision values of the 605 /// 128-bit vectors of [2 x double] to determine if the values in the first 606 /// operand are not greater than those in the second operand. 607 /// 608 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 609 /// 610 /// \headerfile <x86intrin.h> 611 /// 612 /// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction. 613 /// 614 /// \param __a 615 /// A 128-bit vector of [2 x double]. 616 /// \param __b 617 /// A 128-bit vector of [2 x double]. 618 /// \returns A 128-bit vector containing the comparison results. 
619 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a, 620 __m128d __b) { 621 return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a); 622 } 623 624 /// Compares each of the corresponding double-precision values of the 625 /// 128-bit vectors of [2 x double] to determine if the values in the first 626 /// operand are not greater than or equal to those in the second operand. 627 /// 628 /// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 629 /// 630 /// \headerfile <x86intrin.h> 631 /// 632 /// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction. 633 /// 634 /// \param __a 635 /// A 128-bit vector of [2 x double]. 636 /// \param __b 637 /// A 128-bit vector of [2 x double]. 638 /// \returns A 128-bit vector containing the comparison results. 639 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a, 640 __m128d __b) { 641 return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a); 642 } 643 644 /// Compares the lower double-precision floating-point values in each of 645 /// the two 128-bit floating-point vectors of [2 x double] for equality. 646 /// 647 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 648 /// 649 /// \headerfile <x86intrin.h> 650 /// 651 /// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction. 652 /// 653 /// \param __a 654 /// A 128-bit vector of [2 x double]. The lower double-precision value is 655 /// compared to the lower double-precision value of \a __b. 656 /// \param __b 657 /// A 128-bit vector of [2 x double]. The lower double-precision value is 658 /// compared to the lower double-precision value of \a __a. 659 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 660 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 
661 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a, 662 __m128d __b) { 663 return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b); 664 } 665 666 /// Compares the lower double-precision floating-point values in each of 667 /// the two 128-bit floating-point vectors of [2 x double] to determine if 668 /// the value in the first parameter is less than the corresponding value in 669 /// the second parameter. 670 /// 671 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 672 /// 673 /// \headerfile <x86intrin.h> 674 /// 675 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction. 676 /// 677 /// \param __a 678 /// A 128-bit vector of [2 x double]. The lower double-precision value is 679 /// compared to the lower double-precision value of \a __b. 680 /// \param __b 681 /// A 128-bit vector of [2 x double]. The lower double-precision value is 682 /// compared to the lower double-precision value of \a __a. 683 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 684 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 685 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a, 686 __m128d __b) { 687 return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b); 688 } 689 690 /// Compares the lower double-precision floating-point values in each of 691 /// the two 128-bit floating-point vectors of [2 x double] to determine if 692 /// the value in the first parameter is less than or equal to the 693 /// corresponding value in the second parameter. 694 /// 695 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 696 /// 697 /// \headerfile <x86intrin.h> 698 /// 699 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction. 700 /// 701 /// \param __a 702 /// A 128-bit vector of [2 x double]. The lower double-precision value is 703 /// compared to the lower double-precision value of \a __b. 
704 /// \param __b 705 /// A 128-bit vector of [2 x double]. The lower double-precision value is 706 /// compared to the lower double-precision value of \a __a. 707 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 708 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 709 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a, 710 __m128d __b) { 711 return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b); 712 } 713 714 /// Compares the lower double-precision floating-point values in each of 715 /// the two 128-bit floating-point vectors of [2 x double] to determine if 716 /// the value in the first parameter is greater than the corresponding value 717 /// in the second parameter. 718 /// 719 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 720 /// 721 /// \headerfile <x86intrin.h> 722 /// 723 /// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction. 724 /// 725 /// \param __a 726 /// A 128-bit vector of [2 x double]. The lower double-precision value is 727 /// compared to the lower double-precision value of \a __b. 728 /// \param __b 729 /// A 128-bit vector of [2 x double]. The lower double-precision value is 730 /// compared to the lower double-precision value of \a __a. 731 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 732 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 733 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a, 734 __m128d __b) { 735 __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a); 736 return __extension__(__m128d){__c[0], __a[1]}; 737 } 738 739 /// Compares the lower double-precision floating-point values in each of 740 /// the two 128-bit floating-point vectors of [2 x double] to determine if 741 /// the value in the first parameter is greater than or equal to the 742 /// corresponding value in the second parameter. 
743 /// 744 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 745 /// 746 /// \headerfile <x86intrin.h> 747 /// 748 /// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction. 749 /// 750 /// \param __a 751 /// A 128-bit vector of [2 x double]. The lower double-precision value is 752 /// compared to the lower double-precision value of \a __b. 753 /// \param __b 754 /// A 128-bit vector of [2 x double]. The lower double-precision value is 755 /// compared to the lower double-precision value of \a __a. 756 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 757 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 758 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a, 759 __m128d __b) { 760 __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a); 761 return __extension__(__m128d){__c[0], __a[1]}; 762 } 763 764 /// Compares the lower double-precision floating-point values in each of 765 /// the two 128-bit floating-point vectors of [2 x double] to determine if 766 /// the value in the first parameter is "ordered" with respect to the 767 /// corresponding value in the second parameter. 768 /// 769 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair 770 /// of double-precision values are "ordered" with respect to each other if 771 /// neither value is a NaN. 772 /// 773 /// \headerfile <x86intrin.h> 774 /// 775 /// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction. 776 /// 777 /// \param __a 778 /// A 128-bit vector of [2 x double]. The lower double-precision value is 779 /// compared to the lower double-precision value of \a __b. 780 /// \param __b 781 /// A 128-bit vector of [2 x double]. The lower double-precision value is 782 /// compared to the lower double-precision value of \a __a. 783 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 784 /// results. 
The upper 64 bits are copied from the upper 64 bits of \a __a. 785 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a, 786 __m128d __b) { 787 return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b); 788 } 789 790 /// Compares the lower double-precision floating-point values in each of 791 /// the two 128-bit floating-point vectors of [2 x double] to determine if 792 /// the value in the first parameter is "unordered" with respect to the 793 /// corresponding value in the second parameter. 794 /// 795 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair 796 /// of double-precision values are "unordered" with respect to each other if 797 /// one or both values are NaN. 798 /// 799 /// \headerfile <x86intrin.h> 800 /// 801 /// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c> 802 /// instruction. 803 /// 804 /// \param __a 805 /// A 128-bit vector of [2 x double]. The lower double-precision value is 806 /// compared to the lower double-precision value of \a __b. 807 /// \param __b 808 /// A 128-bit vector of [2 x double]. The lower double-precision value is 809 /// compared to the lower double-precision value of \a __a. 810 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 811 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 812 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a, 813 __m128d __b) { 814 return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b); 815 } 816 817 /// Compares the lower double-precision floating-point values in each of 818 /// the two 128-bit floating-point vectors of [2 x double] to determine if 819 /// the value in the first parameter is unequal to the corresponding value in 820 /// the second parameter. 821 /// 822 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 
823 /// 824 /// \headerfile <x86intrin.h> 825 /// 826 /// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction. 827 /// 828 /// \param __a 829 /// A 128-bit vector of [2 x double]. The lower double-precision value is 830 /// compared to the lower double-precision value of \a __b. 831 /// \param __b 832 /// A 128-bit vector of [2 x double]. The lower double-precision value is 833 /// compared to the lower double-precision value of \a __a. 834 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 835 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 836 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a, 837 __m128d __b) { 838 return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b); 839 } 840 841 /// Compares the lower double-precision floating-point values in each of 842 /// the two 128-bit floating-point vectors of [2 x double] to determine if 843 /// the value in the first parameter is not less than the corresponding 844 /// value in the second parameter. 845 /// 846 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 847 /// 848 /// \headerfile <x86intrin.h> 849 /// 850 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction. 851 /// 852 /// \param __a 853 /// A 128-bit vector of [2 x double]. The lower double-precision value is 854 /// compared to the lower double-precision value of \a __b. 855 /// \param __b 856 /// A 128-bit vector of [2 x double]. The lower double-precision value is 857 /// compared to the lower double-precision value of \a __a. 858 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 859 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 
860 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a, 861 __m128d __b) { 862 return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b); 863 } 864 865 /// Compares the lower double-precision floating-point values in each of 866 /// the two 128-bit floating-point vectors of [2 x double] to determine if 867 /// the value in the first parameter is not less than or equal to the 868 /// corresponding value in the second parameter. 869 /// 870 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 871 /// 872 /// \headerfile <x86intrin.h> 873 /// 874 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction. 875 /// 876 /// \param __a 877 /// A 128-bit vector of [2 x double]. The lower double-precision value is 878 /// compared to the lower double-precision value of \a __b. 879 /// \param __b 880 /// A 128-bit vector of [2 x double]. The lower double-precision value is 881 /// compared to the lower double-precision value of \a __a. 882 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 883 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 884 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a, 885 __m128d __b) { 886 return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b); 887 } 888 889 /// Compares the lower double-precision floating-point values in each of 890 /// the two 128-bit floating-point vectors of [2 x double] to determine if 891 /// the value in the first parameter is not greater than the corresponding 892 /// value in the second parameter. 893 /// 894 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 895 /// 896 /// \headerfile <x86intrin.h> 897 /// 898 /// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction. 899 /// 900 /// \param __a 901 /// A 128-bit vector of [2 x double]. The lower double-precision value is 902 /// compared to the lower double-precision value of \a __b. 
903 /// \param __b 904 /// A 128-bit vector of [2 x double]. The lower double-precision value is 905 /// compared to the lower double-precision value of \a __a. 906 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 907 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 908 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a, 909 __m128d __b) { 910 __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a); 911 return __extension__(__m128d){__c[0], __a[1]}; 912 } 913 914 /// Compares the lower double-precision floating-point values in each of 915 /// the two 128-bit floating-point vectors of [2 x double] to determine if 916 /// the value in the first parameter is not greater than or equal to the 917 /// corresponding value in the second parameter. 918 /// 919 /// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. 920 /// 921 /// \headerfile <x86intrin.h> 922 /// 923 /// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction. 924 /// 925 /// \param __a 926 /// A 128-bit vector of [2 x double]. The lower double-precision value is 927 /// compared to the lower double-precision value of \a __b. 928 /// \param __b 929 /// A 128-bit vector of [2 x double]. The lower double-precision value is 930 /// compared to the lower double-precision value of \a __a. 931 /// \returns A 128-bit vector. The lower 64 bits contains the comparison 932 /// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 933 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a, 934 __m128d __b) { 935 __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a); 936 return __extension__(__m128d){__c[0], __a[1]}; 937 } 938 939 /// Compares the lower double-precision floating-point values in each of 940 /// the two 128-bit floating-point vectors of [2 x double] for equality. 941 /// 942 /// The comparison yields 0 for false, 1 for true. 
If either of the two 943 /// lower double-precision values is NaN, 0 is returned. 944 /// 945 /// \headerfile <x86intrin.h> 946 /// 947 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 948 /// 949 /// \param __a 950 /// A 128-bit vector of [2 x double]. The lower double-precision value is 951 /// compared to the lower double-precision value of \a __b. 952 /// \param __b 953 /// A 128-bit vector of [2 x double]. The lower double-precision value is 954 /// compared to the lower double-precision value of \a __a. 955 /// \returns An integer containing the comparison results. If either of the two 956 /// lower double-precision values is NaN, 0 is returned. 957 static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a, 958 __m128d __b) { 959 return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b); 960 } 961 962 /// Compares the lower double-precision floating-point values in each of 963 /// the two 128-bit floating-point vectors of [2 x double] to determine if 964 /// the value in the first parameter is less than the corresponding value in 965 /// the second parameter. 966 /// 967 /// The comparison yields 0 for false, 1 for true. If either of the two 968 /// lower double-precision values is NaN, 0 is returned. 969 /// 970 /// \headerfile <x86intrin.h> 971 /// 972 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 973 /// 974 /// \param __a 975 /// A 128-bit vector of [2 x double]. The lower double-precision value is 976 /// compared to the lower double-precision value of \a __b. 977 /// \param __b 978 /// A 128-bit vector of [2 x double]. The lower double-precision value is 979 /// compared to the lower double-precision value of \a __a. 980 /// \returns An integer containing the comparison results. If either of the two 981 /// lower double-precision values is NaN, 0 is returned. 
982 static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a, 983 __m128d __b) { 984 return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b); 985 } 986 987 /// Compares the lower double-precision floating-point values in each of 988 /// the two 128-bit floating-point vectors of [2 x double] to determine if 989 /// the value in the first parameter is less than or equal to the 990 /// corresponding value in the second parameter. 991 /// 992 /// The comparison yields 0 for false, 1 for true. If either of the two 993 /// lower double-precision values is NaN, 0 is returned. 994 /// 995 /// \headerfile <x86intrin.h> 996 /// 997 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 998 /// 999 /// \param __a 1000 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1001 /// compared to the lower double-precision value of \a __b. 1002 /// \param __b 1003 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1004 /// compared to the lower double-precision value of \a __a. 1005 /// \returns An integer containing the comparison results. If either of the two 1006 /// lower double-precision values is NaN, 0 is returned. 1007 static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a, 1008 __m128d __b) { 1009 return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b); 1010 } 1011 1012 /// Compares the lower double-precision floating-point values in each of 1013 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1014 /// the value in the first parameter is greater than the corresponding value 1015 /// in the second parameter. 1016 /// 1017 /// The comparison yields 0 for false, 1 for true. If either of the two 1018 /// lower double-precision values is NaN, 0 is returned. 1019 /// 1020 /// \headerfile <x86intrin.h> 1021 /// 1022 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1023 /// 1024 /// \param __a 1025 /// A 128-bit vector of [2 x double]. 
The lower double-precision value is 1026 /// compared to the lower double-precision value of \a __b. 1027 /// \param __b 1028 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1029 /// compared to the lower double-precision value of \a __a. 1030 /// \returns An integer containing the comparison results. If either of the two 1031 /// lower double-precision values is NaN, 0 is returned. 1032 static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a, 1033 __m128d __b) { 1034 return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b); 1035 } 1036 1037 /// Compares the lower double-precision floating-point values in each of 1038 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1039 /// the value in the first parameter is greater than or equal to the 1040 /// corresponding value in the second parameter. 1041 /// 1042 /// The comparison yields 0 for false, 1 for true. If either of the two 1043 /// lower double-precision values is NaN, 0 is returned. 1044 /// 1045 /// \headerfile <x86intrin.h> 1046 /// 1047 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1048 /// 1049 /// \param __a 1050 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1051 /// compared to the lower double-precision value of \a __b. 1052 /// \param __b 1053 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1054 /// compared to the lower double-precision value of \a __a. 1055 /// \returns An integer containing the comparison results. If either of the two 1056 /// lower double-precision values is NaN, 0 is returned. 
1057 static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a, 1058 __m128d __b) { 1059 return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b); 1060 } 1061 1062 /// Compares the lower double-precision floating-point values in each of 1063 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1064 /// the value in the first parameter is unequal to the corresponding value in 1065 /// the second parameter. 1066 /// 1067 /// The comparison yields 0 for false, 1 for true. If either of the two 1068 /// lower double-precision values is NaN, 1 is returned. 1069 /// 1070 /// \headerfile <x86intrin.h> 1071 /// 1072 /// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction. 1073 /// 1074 /// \param __a 1075 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1076 /// compared to the lower double-precision value of \a __b. 1077 /// \param __b 1078 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1079 /// compared to the lower double-precision value of \a __a. 1080 /// \returns An integer containing the comparison results. If either of the two 1081 /// lower double-precision values is NaN, 1 is returned. 1082 static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a, 1083 __m128d __b) { 1084 return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b); 1085 } 1086 1087 /// Compares the lower double-precision floating-point values in each of 1088 /// the two 128-bit floating-point vectors of [2 x double] for equality. The 1089 /// comparison yields 0 for false, 1 for true. 1090 /// 1091 /// If either of the two lower double-precision values is NaN, 0 is returned. 1092 /// 1093 /// \headerfile <x86intrin.h> 1094 /// 1095 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1096 /// 1097 /// \param __a 1098 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1099 /// compared to the lower double-precision value of \a __b. 
1100 /// \param __b 1101 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1102 /// compared to the lower double-precision value of \a __a. 1103 /// \returns An integer containing the comparison results. If either of the two 1104 /// lower double-precision values is NaN, 0 is returned. 1105 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a, 1106 __m128d __b) { 1107 return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b); 1108 } 1109 1110 /// Compares the lower double-precision floating-point values in each of 1111 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1112 /// the value in the first parameter is less than the corresponding value in 1113 /// the second parameter. 1114 /// 1115 /// The comparison yields 0 for false, 1 for true. If either of the two lower 1116 /// double-precision values is NaN, 0 is returned. 1117 /// 1118 /// \headerfile <x86intrin.h> 1119 /// 1120 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1121 /// 1122 /// \param __a 1123 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1124 /// compared to the lower double-precision value of \a __b. 1125 /// \param __b 1126 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1127 /// compared to the lower double-precision value of \a __a. 1128 /// \returns An integer containing the comparison results. If either of the two 1129 /// lower double-precision values is NaN, 0 is returned. 1130 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a, 1131 __m128d __b) { 1132 return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b); 1133 } 1134 1135 /// Compares the lower double-precision floating-point values in each of 1136 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1137 /// the value in the first parameter is less than or equal to the 1138 /// corresponding value in the second parameter. 
1139 /// 1140 /// The comparison yields 0 for false, 1 for true. If either of the two lower 1141 /// double-precision values is NaN, 0 is returned. 1142 /// 1143 /// \headerfile <x86intrin.h> 1144 /// 1145 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1146 /// 1147 /// \param __a 1148 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1149 /// compared to the lower double-precision value of \a __b. 1150 /// \param __b 1151 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1152 /// compared to the lower double-precision value of \a __a. 1153 /// \returns An integer containing the comparison results. If either of the two 1154 /// lower double-precision values is NaN, 0 is returned. 1155 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a, 1156 __m128d __b) { 1157 return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b); 1158 } 1159 1160 /// Compares the lower double-precision floating-point values in each of 1161 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1162 /// the value in the first parameter is greater than the corresponding value 1163 /// in the second parameter. 1164 /// 1165 /// The comparison yields 0 for false, 1 for true. If either of the two lower 1166 /// double-precision values is NaN, 0 is returned. 1167 /// 1168 /// \headerfile <x86intrin.h> 1169 /// 1170 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1171 /// 1172 /// \param __a 1173 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1174 /// compared to the lower double-precision value of \a __b. 1175 /// \param __b 1176 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1177 /// compared to the lower double-precision value of \a __a. 1178 /// \returns An integer containing the comparison results. If either of the two 1179 /// lower double-precision values is NaN, 0 is returned. 
1180 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a, 1181 __m128d __b) { 1182 return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b); 1183 } 1184 1185 /// Compares the lower double-precision floating-point values in each of 1186 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1187 /// the value in the first parameter is greater than or equal to the 1188 /// corresponding value in the second parameter. 1189 /// 1190 /// The comparison yields 0 for false, 1 for true. If either of the two 1191 /// lower double-precision values is NaN, 0 is returned. 1192 /// 1193 /// \headerfile <x86intrin.h> 1194 /// 1195 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 1196 /// 1197 /// \param __a 1198 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1199 /// compared to the lower double-precision value of \a __b. 1200 /// \param __b 1201 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1202 /// compared to the lower double-precision value of \a __a. 1203 /// \returns An integer containing the comparison results. If either of the two 1204 /// lower double-precision values is NaN, 0 is returned. 1205 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a, 1206 __m128d __b) { 1207 return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b); 1208 } 1209 1210 /// Compares the lower double-precision floating-point values in each of 1211 /// the two 128-bit floating-point vectors of [2 x double] to determine if 1212 /// the value in the first parameter is unequal to the corresponding value in 1213 /// the second parameter. 1214 /// 1215 /// The comparison yields 0 for false, 1 for true. If either of the two lower 1216 /// double-precision values is NaN, 1 is returned. 1217 /// 1218 /// \headerfile <x86intrin.h> 1219 /// 1220 /// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction. 
1221 /// 1222 /// \param __a 1223 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1224 /// compared to the lower double-precision value of \a __b. 1225 /// \param __b 1226 /// A 128-bit vector of [2 x double]. The lower double-precision value is 1227 /// compared to the lower double-precision value of \a __a. 1228 /// \returns An integer containing the comparison result. If either of the two 1229 /// lower double-precision values is NaN, 1 is returned. 1230 static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a, 1231 __m128d __b) { 1232 return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b); 1233 } 1234 1235 /// Converts the two double-precision floating-point elements of a 1236 /// 128-bit vector of [2 x double] into two single-precision floating-point 1237 /// values, returned in the lower 64 bits of a 128-bit vector of [4 x float]. 1238 /// The upper 64 bits of the result vector are set to zero. 1239 /// 1240 /// \headerfile <x86intrin.h> 1241 /// 1242 /// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction. 1243 /// 1244 /// \param __a 1245 /// A 128-bit vector of [2 x double]. 1246 /// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the 1247 /// converted values. The upper 64 bits are set to zero. 1248 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a) { 1249 return __builtin_ia32_cvtpd2ps((__v2df)__a); 1250 } 1251 1252 /// Converts the lower two single-precision floating-point elements of a 1253 /// 128-bit vector of [4 x float] into two double-precision floating-point 1254 /// values, returned in a 128-bit vector of [2 x double]. The upper two 1255 /// elements of the input vector are unused. 1256 /// 1257 /// \headerfile <x86intrin.h> 1258 /// 1259 /// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction. 1260 /// 1261 /// \param __a 1262 /// A 128-bit vector of [4 x float]. 
The lower two single-precision 1263 /// floating-point elements are converted to double-precision values. The 1264 /// upper two elements are unused. 1265 /// \returns A 128-bit vector of [2 x double] containing the converted values. 1266 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a) { 1267 return (__m128d) __builtin_convertvector( 1268 __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df); 1269 } 1270 1271 /// Converts the lower two integer elements of a 128-bit vector of 1272 /// [4 x i32] into two double-precision floating-point values, returned in a 1273 /// 128-bit vector of [2 x double]. 1274 /// 1275 /// The upper two elements of the input vector are unused. 1276 /// 1277 /// \headerfile <x86intrin.h> 1278 /// 1279 /// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction. 1280 /// 1281 /// \param __a 1282 /// A 128-bit integer vector of [4 x i32]. The lower two integer elements are 1283 /// converted to double-precision values. 1284 /// 1285 /// The upper two elements are unused. 1286 /// \returns A 128-bit vector of [2 x double] containing the converted values. 1287 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a) { 1288 return (__m128d) __builtin_convertvector( 1289 __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df); 1290 } 1291 1292 /// Converts the two double-precision floating-point elements of a 1293 /// 128-bit vector of [2 x double] into two signed 32-bit integer values, 1294 /// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper 1295 /// 64 bits of the result vector are set to zero. 1296 /// 1297 /// \headerfile <x86intrin.h> 1298 /// 1299 /// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction. 1300 /// 1301 /// \param __a 1302 /// A 128-bit vector of [2 x double]. 1303 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the 1304 /// converted values. The upper 64 bits are set to zero. 
1305 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a) { 1306 return __builtin_ia32_cvtpd2dq((__v2df)__a); 1307 } 1308 1309 /// Converts the low-order element of a 128-bit vector of [2 x double] 1310 /// into a 32-bit signed integer value. 1311 /// 1312 /// \headerfile <x86intrin.h> 1313 /// 1314 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction. 1315 /// 1316 /// \param __a 1317 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 1318 /// conversion. 1319 /// \returns A 32-bit signed integer containing the converted value. 1320 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a) { 1321 return __builtin_ia32_cvtsd2si((__v2df)__a); 1322 } 1323 1324 /// Converts the lower double-precision floating-point element of a 1325 /// 128-bit vector of [2 x double], in the second parameter, into a 1326 /// single-precision floating-point value, returned in the lower 32 bits of a 1327 /// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are 1328 /// copied from the upper 96 bits of the first parameter. 1329 /// 1330 /// \headerfile <x86intrin.h> 1331 /// 1332 /// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction. 1333 /// 1334 /// \param __a 1335 /// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are 1336 /// copied to the upper 96 bits of the result. 1337 /// \param __b 1338 /// A 128-bit vector of [2 x double]. The lower double-precision 1339 /// floating-point element is used in the conversion. 1340 /// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the 1341 /// converted value from the second parameter. The upper 96 bits are copied 1342 /// from the upper 96 bits of the first parameter. 
1343 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a, 1344 __m128d __b) { 1345 return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b); 1346 } 1347 1348 /// Converts a 32-bit signed integer value, in the second parameter, into 1349 /// a double-precision floating-point value, returned in the lower 64 bits of 1350 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector 1351 /// are copied from the upper 64 bits of the first parameter. 1352 /// 1353 /// \headerfile <x86intrin.h> 1354 /// 1355 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction. 1356 /// 1357 /// \param __a 1358 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are 1359 /// copied to the upper 64 bits of the result. 1360 /// \param __b 1361 /// A 32-bit signed integer containing the value to be converted. 1362 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the 1363 /// converted value from the second parameter. The upper 64 bits are copied 1364 /// from the upper 64 bits of the first parameter. 1365 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a, 1366 int __b) { 1367 __a[0] = __b; 1368 return __a; 1369 } 1370 1371 /// Converts the lower single-precision floating-point element of a 1372 /// 128-bit vector of [4 x float], in the second parameter, into a 1373 /// double-precision floating-point value, returned in the lower 64 bits of 1374 /// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector 1375 /// are copied from the upper 64 bits of the first parameter. 1376 /// 1377 /// \headerfile <x86intrin.h> 1378 /// 1379 /// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction. 1380 /// 1381 /// \param __a 1382 /// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are 1383 /// copied to the upper 64 bits of the result. 1384 /// \param __b 1385 /// A 128-bit vector of [4 x float]. 
The lower single-precision 1386 /// floating-point element is used in the conversion. 1387 /// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the 1388 /// converted value from the second parameter. The upper 64 bits are copied 1389 /// from the upper 64 bits of the first parameter. 1390 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a, 1391 __m128 __b) { 1392 __a[0] = __b[0]; 1393 return __a; 1394 } 1395 1396 /// Converts the two double-precision floating-point elements of a 1397 /// 128-bit vector of [2 x double] into two signed 32-bit integer values, 1398 /// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. 1399 /// 1400 /// If the result of either conversion is inexact, the result is truncated 1401 /// (rounded towards zero) regardless of the current MXCSR setting. The upper 1402 /// 64 bits of the result vector are set to zero. 1403 /// 1404 /// \headerfile <x86intrin.h> 1405 /// 1406 /// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c> 1407 /// instruction. 1408 /// 1409 /// \param __a 1410 /// A 128-bit vector of [2 x double]. 1411 /// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the 1412 /// converted values. The upper 64 bits are set to zero. 1413 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a) { 1414 return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a); 1415 } 1416 1417 /// Converts the low-order element of a [2 x double] vector into a 32-bit 1418 /// signed integer value, truncating the result when it is inexact. 1419 /// 1420 /// \headerfile <x86intrin.h> 1421 /// 1422 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c> 1423 /// instruction. 1424 /// 1425 /// \param __a 1426 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 1427 /// conversion. 1428 /// \returns A 32-bit signed integer containing the converted value. 
1429 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) { 1430 return __builtin_ia32_cvttsd2si((__v2df)__a); 1431 } 1432 1433 /// Converts the two double-precision floating-point elements of a 1434 /// 128-bit vector of [2 x double] into two signed 32-bit integer values, 1435 /// returned in a 64-bit vector of [2 x i32]. 1436 /// 1437 /// \headerfile <x86intrin.h> 1438 /// 1439 /// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction. 1440 /// 1441 /// \param __a 1442 /// A 128-bit vector of [2 x double]. 1443 /// \returns A 64-bit vector of [2 x i32] containing the converted values. 1444 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) { 1445 return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a); 1446 } 1447 1448 /// Converts the two double-precision floating-point elements of a 1449 /// 128-bit vector of [2 x double] into two signed 32-bit integer values, 1450 /// returned in a 64-bit vector of [2 x i32]. 1451 /// 1452 /// If the result of either conversion is inexact, the result is truncated 1453 /// (rounded towards zero) regardless of the current MXCSR setting. 1454 /// 1455 /// \headerfile <x86intrin.h> 1456 /// 1457 /// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction. 1458 /// 1459 /// \param __a 1460 /// A 128-bit vector of [2 x double]. 1461 /// \returns A 64-bit vector of [2 x i32] containing the converted values. 1462 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a) { 1463 return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a); 1464 } 1465 1466 /// Converts the two signed 32-bit integer elements of a 64-bit vector of 1467 /// [2 x i32] into two double-precision floating-point values, returned in a 1468 /// 128-bit vector of [2 x double]. 1469 /// 1470 /// \headerfile <x86intrin.h> 1471 /// 1472 /// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction. 1473 /// 1474 /// \param __a 1475 /// A 64-bit vector of [2 x i32]. 
1476 /// \returns A 128-bit vector of [2 x double] containing the converted values. 1477 static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a) { 1478 return __builtin_ia32_cvtpi2pd((__v2si)__a); 1479 } 1480 1481 /// Returns the low-order element of a 128-bit vector of [2 x double] as 1482 /// a double-precision floating-point value. 1483 /// 1484 /// \headerfile <x86intrin.h> 1485 /// 1486 /// This intrinsic has no corresponding instruction. 1487 /// 1488 /// \param __a 1489 /// A 128-bit vector of [2 x double]. The lower 64 bits are returned. 1490 /// \returns A double-precision floating-point value copied from the lower 64 1491 /// bits of \a __a. 1492 static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a) { 1493 return __a[0]; 1494 } 1495 1496 /// Loads a 128-bit floating-point vector of [2 x double] from an aligned 1497 /// memory location. 1498 /// 1499 /// \headerfile <x86intrin.h> 1500 /// 1501 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction. 1502 /// 1503 /// \param __dp 1504 /// A pointer to a 128-bit memory location. The address of the memory 1505 /// location has to be 16-byte aligned. 1506 /// \returns A 128-bit vector of [2 x double] containing the loaded values. 1507 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp) { 1508 return *(const __m128d *)__dp; 1509 } 1510 1511 /// Loads a double-precision floating-point value from a specified memory 1512 /// location and duplicates it to both vector elements of a 128-bit vector of 1513 /// [2 x double]. 1514 /// 1515 /// \headerfile <x86intrin.h> 1516 /// 1517 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction. 1518 /// 1519 /// \param __dp 1520 /// A pointer to a memory location containing a double-precision value. 1521 /// \returns A 128-bit vector of [2 x double] containing the loaded and 1522 /// duplicated values. 
1523 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp) { 1524 struct __mm_load1_pd_struct { 1525 double __u; 1526 } __attribute__((__packed__, __may_alias__)); 1527 double __u = ((const struct __mm_load1_pd_struct *)__dp)->__u; 1528 return __extension__(__m128d){__u, __u}; 1529 } 1530 1531 #define _mm_load_pd1(dp) _mm_load1_pd(dp) 1532 1533 /// Loads two double-precision values, in reverse order, from an aligned 1534 /// memory location into a 128-bit vector of [2 x double]. 1535 /// 1536 /// \headerfile <x86intrin.h> 1537 /// 1538 /// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction + 1539 /// needed shuffling instructions. In AVX mode, the shuffling may be combined 1540 /// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction. 1541 /// 1542 /// \param __dp 1543 /// A 16-byte aligned pointer to an array of double-precision values to be 1544 /// loaded in reverse order. 1545 /// \returns A 128-bit vector of [2 x double] containing the reversed loaded 1546 /// values. 1547 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp) { 1548 __m128d __u = *(const __m128d *)__dp; 1549 return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0); 1550 } 1551 1552 /// Loads a 128-bit floating-point vector of [2 x double] from an 1553 /// unaligned memory location. 1554 /// 1555 /// \headerfile <x86intrin.h> 1556 /// 1557 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction. 1558 /// 1559 /// \param __dp 1560 /// A pointer to a 128-bit memory location. The address of the memory 1561 /// location does not have to be aligned. 1562 /// \returns A 128-bit vector of [2 x double] containing the loaded values. 
1563 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp) { 1564 struct __loadu_pd { 1565 __m128d_u __v; 1566 } __attribute__((__packed__, __may_alias__)); 1567 return ((const struct __loadu_pd *)__dp)->__v; 1568 } 1569 1570 /// Loads a 64-bit integer value to the low element of a 128-bit integer 1571 /// vector and clears the upper element. 1572 /// 1573 /// \headerfile <x86intrin.h> 1574 /// 1575 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 1576 /// 1577 /// \param __a 1578 /// A pointer to a 64-bit memory location. The address of the memory 1579 /// location does not have to be aligned. 1580 /// \returns A 128-bit vector of [2 x i64] containing the loaded value. 1581 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a) { 1582 struct __loadu_si64 { 1583 long long __v; 1584 } __attribute__((__packed__, __may_alias__)); 1585 long long __u = ((const struct __loadu_si64 *)__a)->__v; 1586 return __extension__(__m128i)(__v2di){__u, 0LL}; 1587 } 1588 1589 /// Loads a 32-bit integer value to the low element of a 128-bit integer 1590 /// vector and clears the upper element. 1591 /// 1592 /// \headerfile <x86intrin.h> 1593 /// 1594 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. 1595 /// 1596 /// \param __a 1597 /// A pointer to a 32-bit memory location. The address of the memory 1598 /// location does not have to be aligned. 1599 /// \returns A 128-bit vector of [4 x i32] containing the loaded value. 1600 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si32(void const *__a) { 1601 struct __loadu_si32 { 1602 int __v; 1603 } __attribute__((__packed__, __may_alias__)); 1604 int __u = ((const struct __loadu_si32 *)__a)->__v; 1605 return __extension__(__m128i)(__v4si){__u, 0, 0, 0}; 1606 } 1607 1608 /// Loads a 16-bit integer value to the low element of a 128-bit integer 1609 /// vector and clears the upper element. 
1610 /// 1611 /// \headerfile <x86intrin.h> 1612 /// 1613 /// This intrinsic does not correspond to a specific instruction. 1614 /// 1615 /// \param __a 1616 /// A pointer to a 16-bit memory location. The address of the memory 1617 /// location does not have to be aligned. 1618 /// \returns A 128-bit vector of [8 x i16] containing the loaded value. 1619 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si16(void const *__a) { 1620 struct __loadu_si16 { 1621 short __v; 1622 } __attribute__((__packed__, __may_alias__)); 1623 short __u = ((const struct __loadu_si16 *)__a)->__v; 1624 return __extension__(__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0}; 1625 } 1626 1627 /// Loads a 64-bit double-precision value to the low element of a 1628 /// 128-bit integer vector and clears the upper element. 1629 /// 1630 /// \headerfile <x86intrin.h> 1631 /// 1632 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction. 1633 /// 1634 /// \param __dp 1635 /// A pointer to a memory location containing a double-precision value. 1636 /// The address of the memory location does not have to be aligned. 1637 /// \returns A 128-bit vector of [2 x double] containing the loaded value. 1638 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp) { 1639 struct __mm_load_sd_struct { 1640 double __u; 1641 } __attribute__((__packed__, __may_alias__)); 1642 double __u = ((const struct __mm_load_sd_struct *)__dp)->__u; 1643 return __extension__(__m128d){__u, 0}; 1644 } 1645 1646 /// Loads a double-precision value into the high-order bits of a 128-bit 1647 /// vector of [2 x double]. The low-order bits are copied from the low-order 1648 /// bits of the first operand. 1649 /// 1650 /// \headerfile <x86intrin.h> 1651 /// 1652 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction. 1653 /// 1654 /// \param __a 1655 /// A 128-bit vector of [2 x double]. \n 1656 /// Bits [63:0] are written to bits [63:0] of the result. 
1657 /// \param __dp 1658 /// A pointer to a 64-bit memory location containing a double-precision 1659 /// floating-point value that is loaded. The loaded value is written to bits 1660 /// [127:64] of the result. The address of the memory location does not have 1661 /// to be aligned. 1662 /// \returns A 128-bit vector of [2 x double] containing the moved values. 1663 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a, 1664 double const *__dp) { 1665 struct __mm_loadh_pd_struct { 1666 double __u; 1667 } __attribute__((__packed__, __may_alias__)); 1668 double __u = ((const struct __mm_loadh_pd_struct *)__dp)->__u; 1669 return __extension__(__m128d){__a[0], __u}; 1670 } 1671 1672 /// Loads a double-precision value into the low-order bits of a 128-bit 1673 /// vector of [2 x double]. The high-order bits are copied from the 1674 /// high-order bits of the first operand. 1675 /// 1676 /// \headerfile <x86intrin.h> 1677 /// 1678 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction. 1679 /// 1680 /// \param __a 1681 /// A 128-bit vector of [2 x double]. \n 1682 /// Bits [127:64] are written to bits [127:64] of the result. 1683 /// \param __dp 1684 /// A pointer to a 64-bit memory location containing a double-precision 1685 /// floating-point value that is loaded. The loaded value is written to bits 1686 /// [63:0] of the result. The address of the memory location does not have to 1687 /// be aligned. 1688 /// \returns A 128-bit vector of [2 x double] containing the moved values. 1689 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a, 1690 double const *__dp) { 1691 struct __mm_loadl_pd_struct { 1692 double __u; 1693 } __attribute__((__packed__, __may_alias__)); 1694 double __u = ((const struct __mm_loadl_pd_struct *)__dp)->__u; 1695 return __extension__(__m128d){__u, __a[1]}; 1696 } 1697 1698 /// Constructs a 128-bit floating-point vector of [2 x double] with 1699 /// unspecified content. 
This could be used as an argument to another 1700 /// intrinsic function where the argument is required but the value is not 1701 /// actually used. 1702 /// 1703 /// \headerfile <x86intrin.h> 1704 /// 1705 /// This intrinsic has no corresponding instruction. 1706 /// 1707 /// \returns A 128-bit floating-point vector of [2 x double] with unspecified 1708 /// content. 1709 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void) { 1710 return (__m128d)__builtin_ia32_undef128(); 1711 } 1712 1713 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower 1714 /// 64 bits of the vector are initialized with the specified double-precision 1715 /// floating-point value. The upper 64 bits are set to zero. 1716 /// 1717 /// \headerfile <x86intrin.h> 1718 /// 1719 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 1720 /// 1721 /// \param __w 1722 /// A double-precision floating-point value used to initialize the lower 64 1723 /// bits of the result. 1724 /// \returns An initialized 128-bit floating-point vector of [2 x double]. The 1725 /// lower 64 bits contain the value of the parameter. The upper 64 bits are 1726 /// set to zero. 1727 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w) { 1728 return __extension__(__m128d){__w, 0}; 1729 } 1730 1731 /// Constructs a 128-bit floating-point vector of [2 x double], with each 1732 /// of the two double-precision floating-point vector elements set to the 1733 /// specified double-precision floating-point value. 1734 /// 1735 /// \headerfile <x86intrin.h> 1736 /// 1737 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction. 1738 /// 1739 /// \param __w 1740 /// A double-precision floating-point value used to initialize each vector 1741 /// element of the result. 1742 /// \returns An initialized 128-bit floating-point vector of [2 x double]. 
1743 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w) { 1744 return __extension__(__m128d){__w, __w}; 1745 } 1746 1747 /// Constructs a 128-bit floating-point vector of [2 x double], with each 1748 /// of the two double-precision floating-point vector elements set to the 1749 /// specified double-precision floating-point value. 1750 /// 1751 /// \headerfile <x86intrin.h> 1752 /// 1753 /// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction. 1754 /// 1755 /// \param __w 1756 /// A double-precision floating-point value used to initialize each vector 1757 /// element of the result. 1758 /// \returns An initialized 128-bit floating-point vector of [2 x double]. 1759 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w) { 1760 return _mm_set1_pd(__w); 1761 } 1762 1763 /// Constructs a 128-bit floating-point vector of [2 x double] 1764 /// initialized with the specified double-precision floating-point values. 1765 /// 1766 /// \headerfile <x86intrin.h> 1767 /// 1768 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction. 1769 /// 1770 /// \param __w 1771 /// A double-precision floating-point value used to initialize the upper 64 1772 /// bits of the result. 1773 /// \param __x 1774 /// A double-precision floating-point value used to initialize the lower 64 1775 /// bits of the result. 1776 /// \returns An initialized 128-bit floating-point vector of [2 x double]. 1777 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w, 1778 double __x) { 1779 return __extension__(__m128d){__x, __w}; 1780 } 1781 1782 /// Constructs a 128-bit floating-point vector of [2 x double], 1783 /// initialized in reverse order with the specified double-precision 1784 /// floating-point values. 1785 /// 1786 /// \headerfile <x86intrin.h> 1787 /// 1788 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction. 
1789 /// 1790 /// \param __w 1791 /// A double-precision floating-point value used to initialize the lower 64 1792 /// bits of the result. 1793 /// \param __x 1794 /// A double-precision floating-point value used to initialize the upper 64 1795 /// bits of the result. 1796 /// \returns An initialized 128-bit floating-point vector of [2 x double]. 1797 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w, 1798 double __x) { 1799 return __extension__(__m128d){__w, __x}; 1800 } 1801 1802 /// Constructs a 128-bit floating-point vector of [2 x double] 1803 /// initialized to zero. 1804 /// 1805 /// \headerfile <x86intrin.h> 1806 /// 1807 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction. 1808 /// 1809 /// \returns An initialized 128-bit floating-point vector of [2 x double] with 1810 /// all elements set to zero. 1811 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void) { 1812 return __extension__(__m128d){0, 0}; 1813 } 1814 1815 /// Constructs a 128-bit floating-point vector of [2 x double]. The lower 1816 /// 64 bits are set to the lower 64 bits of the second parameter. The upper 1817 /// 64 bits are set to the upper 64 bits of the first parameter. 1818 /// 1819 /// \headerfile <x86intrin.h> 1820 /// 1821 /// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction. 1822 /// 1823 /// \param __a 1824 /// A 128-bit vector of [2 x double]. The upper 64 bits are written to the 1825 /// upper 64 bits of the result. 1826 /// \param __b 1827 /// A 128-bit vector of [2 x double]. The lower 64 bits are written to the 1828 /// lower 64 bits of the result. 1829 /// \returns A 128-bit vector of [2 x double] containing the moved values. 1830 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a, 1831 __m128d __b) { 1832 __a[0] = __b[0]; 1833 return __a; 1834 } 1835 1836 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a 1837 /// memory location. 
1838 /// 1839 /// \headerfile <x86intrin.h> 1840 /// 1841 /// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction. 1842 /// 1843 /// \param __dp 1844 /// A pointer to a 64-bit memory location. 1845 /// \param __a 1846 /// A 128-bit vector of [2 x double] containing the value to be stored. 1847 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp, 1848 __m128d __a) { 1849 struct __mm_store_sd_struct { 1850 double __u; 1851 } __attribute__((__packed__, __may_alias__)); 1852 ((struct __mm_store_sd_struct *)__dp)->__u = __a[0]; 1853 } 1854 1855 /// Moves packed double-precision values from a 128-bit vector of 1856 /// [2 x double] to a memory location. 1857 /// 1858 /// \headerfile <x86intrin.h> 1859 /// 1860 /// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction. 1861 /// 1862 /// \param __dp 1863 /// A pointer to an aligned memory location that can store two 1864 /// double-precision values. 1865 /// \param __a 1866 /// A packed 128-bit vector of [2 x double] containing the values to be 1867 /// moved. 1868 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp, 1869 __m128d __a) { 1870 *(__m128d *)__dp = __a; 1871 } 1872 1873 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to 1874 /// the upper and lower 64 bits of a memory location. 1875 /// 1876 /// \headerfile <x86intrin.h> 1877 /// 1878 /// This intrinsic corresponds to the 1879 /// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction. 1880 /// 1881 /// \param __dp 1882 /// A pointer to a memory location that can store two double-precision 1883 /// values. 1884 /// \param __a 1885 /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each 1886 /// of the values in \a __dp. 
1887 static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp, 1888 __m128d __a) { 1889 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0); 1890 _mm_store_pd(__dp, __a); 1891 } 1892 1893 /// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to 1894 /// the upper and lower 64 bits of a memory location. 1895 /// 1896 /// \headerfile <x86intrin.h> 1897 /// 1898 /// This intrinsic corresponds to the 1899 /// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction. 1900 /// 1901 /// \param __dp 1902 /// A pointer to a memory location that can store two double-precision 1903 /// values. 1904 /// \param __a 1905 /// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each 1906 /// of the values in \a __dp. 1907 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp, 1908 __m128d __a) { 1909 _mm_store1_pd(__dp, __a); 1910 } 1911 1912 /// Stores a 128-bit vector of [2 x double] into an unaligned memory 1913 /// location. 1914 /// 1915 /// \headerfile <x86intrin.h> 1916 /// 1917 /// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction. 1918 /// 1919 /// \param __dp 1920 /// A pointer to a 128-bit memory location. The address of the memory 1921 /// location does not have to be aligned. 1922 /// \param __a 1923 /// A 128-bit vector of [2 x double] containing the values to be stored. 1924 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp, 1925 __m128d __a) { 1926 struct __storeu_pd { 1927 __m128d_u __v; 1928 } __attribute__((__packed__, __may_alias__)); 1929 ((struct __storeu_pd *)__dp)->__v = __a; 1930 } 1931 1932 /// Stores two double-precision values, in reverse order, from a 128-bit 1933 /// vector of [2 x double] to a 16-byte aligned memory location. 1934 /// 1935 /// \headerfile <x86intrin.h> 1936 /// 1937 /// This intrinsic corresponds to a shuffling instruction followed by a 1938 /// <c> VMOVAPD / MOVAPD </c> instruction. 
1939 /// 1940 /// \param __dp 1941 /// A pointer to a 16-byte aligned memory location that can store two 1942 /// double-precision values. 1943 /// \param __a 1944 /// A 128-bit vector of [2 x double] containing the values to be reversed and 1945 /// stored. 1946 static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp, 1947 __m128d __a) { 1948 __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0); 1949 *(__m128d *)__dp = __a; 1950 } 1951 1952 /// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a 1953 /// memory location. 1954 /// 1955 /// \headerfile <x86intrin.h> 1956 /// 1957 /// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction. 1958 /// 1959 /// \param __dp 1960 /// A pointer to a 64-bit memory location. 1961 /// \param __a 1962 /// A 128-bit vector of [2 x double] containing the value to be stored. 1963 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp, 1964 __m128d __a) { 1965 struct __mm_storeh_pd_struct { 1966 double __u; 1967 } __attribute__((__packed__, __may_alias__)); 1968 ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[1]; 1969 } 1970 1971 /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a 1972 /// memory location. 1973 /// 1974 /// \headerfile <x86intrin.h> 1975 /// 1976 /// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction. 1977 /// 1978 /// \param __dp 1979 /// A pointer to a 64-bit memory location. 1980 /// \param __a 1981 /// A 128-bit vector of [2 x double] containing the value to be stored. 
1982 static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp, 1983 __m128d __a) { 1984 struct __mm_storeh_pd_struct { 1985 double __u; 1986 } __attribute__((__packed__, __may_alias__)); 1987 ((struct __mm_storeh_pd_struct *)__dp)->__u = __a[0]; 1988 } 1989 1990 /// Adds the corresponding elements of two 128-bit vectors of [16 x i8], 1991 /// saving the lower 8 bits of each sum in the corresponding element of a 1992 /// 128-bit result vector of [16 x i8]. 1993 /// 1994 /// The integer elements of both parameters can be either signed or unsigned. 1995 /// 1996 /// \headerfile <x86intrin.h> 1997 /// 1998 /// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction. 1999 /// 2000 /// \param __a 2001 /// A 128-bit vector of [16 x i8]. 2002 /// \param __b 2003 /// A 128-bit vector of [16 x i8]. 2004 /// \returns A 128-bit vector of [16 x i8] containing the sums of both 2005 /// parameters. 2006 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a, 2007 __m128i __b) { 2008 return (__m128i)((__v16qu)__a + (__v16qu)__b); 2009 } 2010 2011 /// Adds the corresponding elements of two 128-bit vectors of [8 x i16], 2012 /// saving the lower 16 bits of each sum in the corresponding element of a 2013 /// 128-bit result vector of [8 x i16]. 2014 /// 2015 /// The integer elements of both parameters can be either signed or unsigned. 2016 /// 2017 /// \headerfile <x86intrin.h> 2018 /// 2019 /// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction. 2020 /// 2021 /// \param __a 2022 /// A 128-bit vector of [8 x i16]. 2023 /// \param __b 2024 /// A 128-bit vector of [8 x i16]. 2025 /// \returns A 128-bit vector of [8 x i16] containing the sums of both 2026 /// parameters. 
2027 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a, 2028 __m128i __b) { 2029 return (__m128i)((__v8hu)__a + (__v8hu)__b); 2030 } 2031 2032 /// Adds the corresponding elements of two 128-bit vectors of [4 x i32], 2033 /// saving the lower 32 bits of each sum in the corresponding element of a 2034 /// 128-bit result vector of [4 x i32]. 2035 /// 2036 /// The integer elements of both parameters can be either signed or unsigned. 2037 /// 2038 /// \headerfile <x86intrin.h> 2039 /// 2040 /// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction. 2041 /// 2042 /// \param __a 2043 /// A 128-bit vector of [4 x i32]. 2044 /// \param __b 2045 /// A 128-bit vector of [4 x i32]. 2046 /// \returns A 128-bit vector of [4 x i32] containing the sums of both 2047 /// parameters. 2048 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a, 2049 __m128i __b) { 2050 return (__m128i)((__v4su)__a + (__v4su)__b); 2051 } 2052 2053 /// Adds two signed or unsigned 64-bit integer values, returning the 2054 /// lower 64 bits of the sum. 2055 /// 2056 /// \headerfile <x86intrin.h> 2057 /// 2058 /// This intrinsic corresponds to the <c> PADDQ </c> instruction. 2059 /// 2060 /// \param __a 2061 /// A 64-bit integer. 2062 /// \param __b 2063 /// A 64-bit integer. 2064 /// \returns A 64-bit integer containing the sum of both parameters. 2065 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a, 2066 __m64 __b) { 2067 return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b); 2068 } 2069 2070 /// Adds the corresponding elements of two 128-bit vectors of [2 x i64], 2071 /// saving the lower 64 bits of each sum in the corresponding element of a 2072 /// 128-bit result vector of [2 x i64]. 2073 /// 2074 /// The integer elements of both parameters can be either signed or unsigned. 2075 /// 2076 /// \headerfile <x86intrin.h> 2077 /// 2078 /// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction. 
2079 /// 2080 /// \param __a 2081 /// A 128-bit vector of [2 x i64]. 2082 /// \param __b 2083 /// A 128-bit vector of [2 x i64]. 2084 /// \returns A 128-bit vector of [2 x i64] containing the sums of both 2085 /// parameters. 2086 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a, 2087 __m128i __b) { 2088 return (__m128i)((__v2du)__a + (__v2du)__b); 2089 } 2090 2091 /// Adds, with saturation, the corresponding elements of two 128-bit 2092 /// signed [16 x i8] vectors, saving each sum in the corresponding element of 2093 /// a 128-bit result vector of [16 x i8]. Positive sums greater than 0x7F are 2094 /// saturated to 0x7F. Negative sums less than 0x80 are saturated to 0x80. 2095 /// 2096 /// \headerfile <x86intrin.h> 2097 /// 2098 /// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction. 2099 /// 2100 /// \param __a 2101 /// A 128-bit signed [16 x i8] vector. 2102 /// \param __b 2103 /// A 128-bit signed [16 x i8] vector. 2104 /// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of 2105 /// both parameters. 2106 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a, 2107 __m128i __b) { 2108 return (__m128i)__builtin_elementwise_add_sat((__v16qs)__a, (__v16qs)__b); 2109 } 2110 2111 /// Adds, with saturation, the corresponding elements of two 128-bit 2112 /// signed [8 x i16] vectors, saving each sum in the corresponding element of 2113 /// a 128-bit result vector of [8 x i16]. Positive sums greater than 0x7FFF 2114 /// are saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to 2115 /// 0x8000. 2116 /// 2117 /// \headerfile <x86intrin.h> 2118 /// 2119 /// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction. 2120 /// 2121 /// \param __a 2122 /// A 128-bit signed [8 x i16] vector. 2123 /// \param __b 2124 /// A 128-bit signed [8 x i16] vector. 2125 /// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of 2126 /// both parameters. 
2127 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a, 2128 __m128i __b) { 2129 return (__m128i)__builtin_elementwise_add_sat((__v8hi)__a, (__v8hi)__b); 2130 } 2131 2132 /// Adds, with saturation, the corresponding elements of two 128-bit 2133 /// unsigned [16 x i8] vectors, saving each sum in the corresponding element 2134 /// of a 128-bit result vector of [16 x i8]. Positive sums greater than 0xFF 2135 /// are saturated to 0xFF. Negative sums are saturated to 0x00. 2136 /// 2137 /// \headerfile <x86intrin.h> 2138 /// 2139 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction. 2140 /// 2141 /// \param __a 2142 /// A 128-bit unsigned [16 x i8] vector. 2143 /// \param __b 2144 /// A 128-bit unsigned [16 x i8] vector. 2145 /// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums 2146 /// of both parameters. 2147 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a, 2148 __m128i __b) { 2149 return (__m128i)__builtin_elementwise_add_sat((__v16qu)__a, (__v16qu)__b); 2150 } 2151 2152 /// Adds, with saturation, the corresponding elements of two 128-bit 2153 /// unsigned [8 x i16] vectors, saving each sum in the corresponding element 2154 /// of a 128-bit result vector of [8 x i16]. Positive sums greater than 2155 /// 0xFFFF are saturated to 0xFFFF. Negative sums are saturated to 0x0000. 2156 /// 2157 /// \headerfile <x86intrin.h> 2158 /// 2159 /// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction. 2160 /// 2161 /// \param __a 2162 /// A 128-bit unsigned [8 x i16] vector. 2163 /// \param __b 2164 /// A 128-bit unsigned [8 x i16] vector. 2165 /// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums 2166 /// of both parameters. 
2167 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a, 2168 __m128i __b) { 2169 return (__m128i)__builtin_elementwise_add_sat((__v8hu)__a, (__v8hu)__b); 2170 } 2171 2172 /// Computes the rounded averages of corresponding elements of two 2173 /// 128-bit unsigned [16 x i8] vectors, saving each result in the 2174 /// corresponding element of a 128-bit result vector of [16 x i8]. 2175 /// 2176 /// \headerfile <x86intrin.h> 2177 /// 2178 /// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction. 2179 /// 2180 /// \param __a 2181 /// A 128-bit unsigned [16 x i8] vector. 2182 /// \param __b 2183 /// A 128-bit unsigned [16 x i8] vector. 2184 /// \returns A 128-bit unsigned [16 x i8] vector containing the rounded 2185 /// averages of both parameters. 2186 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a, 2187 __m128i __b) { 2188 return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b); 2189 } 2190 2191 /// Computes the rounded averages of corresponding elements of two 2192 /// 128-bit unsigned [8 x i16] vectors, saving each result in the 2193 /// corresponding element of a 128-bit result vector of [8 x i16]. 2194 /// 2195 /// \headerfile <x86intrin.h> 2196 /// 2197 /// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction. 2198 /// 2199 /// \param __a 2200 /// A 128-bit unsigned [8 x i16] vector. 2201 /// \param __b 2202 /// A 128-bit unsigned [8 x i16] vector. 2203 /// \returns A 128-bit unsigned [8 x i16] vector containing the rounded 2204 /// averages of both parameters. 
2205 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a, 2206 __m128i __b) { 2207 return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b); 2208 } 2209 2210 /// Multiplies the corresponding elements of two 128-bit signed [8 x i16] 2211 /// vectors, producing eight intermediate 32-bit signed integer products, and 2212 /// adds the consecutive pairs of 32-bit products to form a 128-bit signed 2213 /// [4 x i32] vector. 2214 /// 2215 /// For example, bits [15:0] of both parameters are multiplied producing a 2216 /// 32-bit product, bits [31:16] of both parameters are multiplied producing 2217 /// a 32-bit product, and the sum of those two products becomes bits [31:0] 2218 /// of the result. 2219 /// 2220 /// \headerfile <x86intrin.h> 2221 /// 2222 /// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction. 2223 /// 2224 /// \param __a 2225 /// A 128-bit signed [8 x i16] vector. 2226 /// \param __b 2227 /// A 128-bit signed [8 x i16] vector. 2228 /// \returns A 128-bit signed [4 x i32] vector containing the sums of products 2229 /// of both parameters. 2230 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a, 2231 __m128i __b) { 2232 return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b); 2233 } 2234 2235 /// Compares corresponding elements of two 128-bit signed [8 x i16] 2236 /// vectors, saving the greater value from each comparison in the 2237 /// corresponding element of a 128-bit result vector of [8 x i16]. 2238 /// 2239 /// \headerfile <x86intrin.h> 2240 /// 2241 /// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction. 2242 /// 2243 /// \param __a 2244 /// A 128-bit signed [8 x i16] vector. 2245 /// \param __b 2246 /// A 128-bit signed [8 x i16] vector. 2247 /// \returns A 128-bit signed [8 x i16] vector containing the greater value of 2248 /// each comparison. 
2249 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a, 2250 __m128i __b) { 2251 return (__m128i)__builtin_elementwise_max((__v8hi)__a, (__v8hi)__b); 2252 } 2253 2254 /// Compares corresponding elements of two 128-bit unsigned [16 x i8] 2255 /// vectors, saving the greater value from each comparison in the 2256 /// corresponding element of a 128-bit result vector of [16 x i8]. 2257 /// 2258 /// \headerfile <x86intrin.h> 2259 /// 2260 /// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction. 2261 /// 2262 /// \param __a 2263 /// A 128-bit unsigned [16 x i8] vector. 2264 /// \param __b 2265 /// A 128-bit unsigned [16 x i8] vector. 2266 /// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of 2267 /// each comparison. 2268 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a, 2269 __m128i __b) { 2270 return (__m128i)__builtin_elementwise_max((__v16qu)__a, (__v16qu)__b); 2271 } 2272 2273 /// Compares corresponding elements of two 128-bit signed [8 x i16] 2274 /// vectors, saving the smaller value from each comparison in the 2275 /// corresponding element of a 128-bit result vector of [8 x i16]. 2276 /// 2277 /// \headerfile <x86intrin.h> 2278 /// 2279 /// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction. 2280 /// 2281 /// \param __a 2282 /// A 128-bit signed [8 x i16] vector. 2283 /// \param __b 2284 /// A 128-bit signed [8 x i16] vector. 2285 /// \returns A 128-bit signed [8 x i16] vector containing the smaller value of 2286 /// each comparison. 2287 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a, 2288 __m128i __b) { 2289 return (__m128i)__builtin_elementwise_min((__v8hi)__a, (__v8hi)__b); 2290 } 2291 2292 /// Compares corresponding elements of two 128-bit unsigned [16 x i8] 2293 /// vectors, saving the smaller value from each comparison in the 2294 /// corresponding element of a 128-bit result vector of [16 x i8]. 
2295 /// 2296 /// \headerfile <x86intrin.h> 2297 /// 2298 /// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction. 2299 /// 2300 /// \param __a 2301 /// A 128-bit unsigned [16 x i8] vector. 2302 /// \param __b 2303 /// A 128-bit unsigned [16 x i8] vector. 2304 /// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of 2305 /// each comparison. 2306 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a, 2307 __m128i __b) { 2308 return (__m128i)__builtin_elementwise_min((__v16qu)__a, (__v16qu)__b); 2309 } 2310 2311 /// Multiplies the corresponding elements of two signed [8 x i16] 2312 /// vectors, saving the upper 16 bits of each 32-bit product in the 2313 /// corresponding element of a 128-bit signed [8 x i16] result vector. 2314 /// 2315 /// \headerfile <x86intrin.h> 2316 /// 2317 /// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction. 2318 /// 2319 /// \param __a 2320 /// A 128-bit signed [8 x i16] vector. 2321 /// \param __b 2322 /// A 128-bit signed [8 x i16] vector. 2323 /// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of 2324 /// each of the eight 32-bit products. 2325 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a, 2326 __m128i __b) { 2327 return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b); 2328 } 2329 2330 /// Multiplies the corresponding elements of two unsigned [8 x i16] 2331 /// vectors, saving the upper 16 bits of each 32-bit product in the 2332 /// corresponding element of a 128-bit unsigned [8 x i16] result vector. 2333 /// 2334 /// \headerfile <x86intrin.h> 2335 /// 2336 /// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction. 2337 /// 2338 /// \param __a 2339 /// A 128-bit unsigned [8 x i16] vector. 2340 /// \param __b 2341 /// A 128-bit unsigned [8 x i16] vector. 
2342 /// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits 2343 /// of each of the eight 32-bit products. 2344 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a, 2345 __m128i __b) { 2346 return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b); 2347 } 2348 2349 /// Multiplies the corresponding elements of two signed [8 x i16] 2350 /// vectors, saving the lower 16 bits of each 32-bit product in the 2351 /// corresponding element of a 128-bit signed [8 x i16] result vector. 2352 /// 2353 /// \headerfile <x86intrin.h> 2354 /// 2355 /// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction. 2356 /// 2357 /// \param __a 2358 /// A 128-bit signed [8 x i16] vector. 2359 /// \param __b 2360 /// A 128-bit signed [8 x i16] vector. 2361 /// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of 2362 /// each of the eight 32-bit products. 2363 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a, 2364 __m128i __b) { 2365 return (__m128i)((__v8hu)__a * (__v8hu)__b); 2366 } 2367 2368 /// Multiplies 32-bit unsigned integer values contained in the lower bits 2369 /// of the two 64-bit integer vectors and returns the 64-bit unsigned 2370 /// product. 2371 /// 2372 /// \headerfile <x86intrin.h> 2373 /// 2374 /// This intrinsic corresponds to the <c> PMULUDQ </c> instruction. 2375 /// 2376 /// \param __a 2377 /// A 64-bit integer containing one of the source operands. 2378 /// \param __b 2379 /// A 64-bit integer containing one of the source operands. 2380 /// \returns A 64-bit integer vector containing the product of both operands. 
2381 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a, 2382 __m64 __b) { 2383 return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b); 2384 } 2385 2386 /// Multiplies 32-bit unsigned integer values contained in the lower 2387 /// bits of the corresponding elements of two [2 x i64] vectors, and returns 2388 /// the 64-bit products in the corresponding elements of a [2 x i64] vector. 2389 /// 2390 /// \headerfile <x86intrin.h> 2391 /// 2392 /// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction. 2393 /// 2394 /// \param __a 2395 /// A [2 x i64] vector containing one of the source operands. 2396 /// \param __b 2397 /// A [2 x i64] vector containing one of the source operands. 2398 /// \returns A [2 x i64] vector containing the product of both operands. 2399 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a, 2400 __m128i __b) { 2401 return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b); 2402 } 2403 2404 /// Computes the absolute differences of corresponding 8-bit integer 2405 /// values in two 128-bit vectors. Sums the first 8 absolute differences, and 2406 /// separately sums the second 8 absolute differences. Packs these two 2407 /// unsigned 16-bit integer sums into the upper and lower elements of a 2408 /// [2 x i64] vector. 2409 /// 2410 /// \headerfile <x86intrin.h> 2411 /// 2412 /// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction. 2413 /// 2414 /// \param __a 2415 /// A 128-bit integer vector containing one of the source operands. 2416 /// \param __b 2417 /// A 128-bit integer vector containing one of the source operands. 2418 /// \returns A [2 x i64] vector containing the sums of the sets of absolute 2419 /// differences between both operands. 
2420 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a, 2421 __m128i __b) { 2422 return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b); 2423 } 2424 2425 /// Subtracts the corresponding 8-bit integer values in the operands. 2426 /// 2427 /// \headerfile <x86intrin.h> 2428 /// 2429 /// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction. 2430 /// 2431 /// \param __a 2432 /// A 128-bit integer vector containing the minuends. 2433 /// \param __b 2434 /// A 128-bit integer vector containing the subtrahends. 2435 /// \returns A 128-bit integer vector containing the differences of the values 2436 /// in the operands. 2437 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a, 2438 __m128i __b) { 2439 return (__m128i)((__v16qu)__a - (__v16qu)__b); 2440 } 2441 2442 /// Subtracts the corresponding 16-bit integer values in the operands. 2443 /// 2444 /// \headerfile <x86intrin.h> 2445 /// 2446 /// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction. 2447 /// 2448 /// \param __a 2449 /// A 128-bit integer vector containing the minuends. 2450 /// \param __b 2451 /// A 128-bit integer vector containing the subtrahends. 2452 /// \returns A 128-bit integer vector containing the differences of the values 2453 /// in the operands. 2454 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a, 2455 __m128i __b) { 2456 return (__m128i)((__v8hu)__a - (__v8hu)__b); 2457 } 2458 2459 /// Subtracts the corresponding 32-bit integer values in the operands. 2460 /// 2461 /// \headerfile <x86intrin.h> 2462 /// 2463 /// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction. 2464 /// 2465 /// \param __a 2466 /// A 128-bit integer vector containing the minuends. 2467 /// \param __b 2468 /// A 128-bit integer vector containing the subtrahends. 2469 /// \returns A 128-bit integer vector containing the differences of the values 2470 /// in the operands. 
2471 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a, 2472 __m128i __b) { 2473 return (__m128i)((__v4su)__a - (__v4su)__b); 2474 } 2475 2476 /// Subtracts signed or unsigned 64-bit integer values and writes the 2477 /// difference to the corresponding bits in the destination. 2478 /// 2479 /// \headerfile <x86intrin.h> 2480 /// 2481 /// This intrinsic corresponds to the <c> PSUBQ </c> instruction. 2482 /// 2483 /// \param __a 2484 /// A 64-bit integer vector containing the minuend. 2485 /// \param __b 2486 /// A 64-bit integer vector containing the subtrahend. 2487 /// \returns A 64-bit integer vector containing the difference of the values in 2488 /// the operands. 2489 static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a, 2490 __m64 __b) { 2491 return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b); 2492 } 2493 2494 /// Subtracts the corresponding elements of two [2 x i64] vectors. 2495 /// 2496 /// \headerfile <x86intrin.h> 2497 /// 2498 /// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction. 2499 /// 2500 /// \param __a 2501 /// A 128-bit integer vector containing the minuends. 2502 /// \param __b 2503 /// A 128-bit integer vector containing the subtrahends. 2504 /// \returns A 128-bit integer vector containing the differences of the values 2505 /// in the operands. 2506 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a, 2507 __m128i __b) { 2508 return (__m128i)((__v2du)__a - (__v2du)__b); 2509 } 2510 2511 /// Subtracts corresponding 8-bit signed integer values in the input and 2512 /// returns the differences in the corresponding bytes in the destination. 2513 /// Differences greater than 0x7F are saturated to 0x7F, and differences less 2514 /// than 0x80 are saturated to 0x80. 2515 /// 2516 /// \headerfile <x86intrin.h> 2517 /// 2518 /// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction. 
2519 /// 2520 /// \param __a 2521 /// A 128-bit integer vector containing the minuends. 2522 /// \param __b 2523 /// A 128-bit integer vector containing the subtrahends. 2524 /// \returns A 128-bit integer vector containing the differences of the values 2525 /// in the operands. 2526 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a, 2527 __m128i __b) { 2528 return (__m128i)__builtin_elementwise_sub_sat((__v16qs)__a, (__v16qs)__b); 2529 } 2530 2531 /// Subtracts corresponding 16-bit signed integer values in the input and 2532 /// returns the differences in the corresponding bytes in the destination. 2533 /// Differences greater than 0x7FFF are saturated to 0x7FFF, and values less 2534 /// than 0x8000 are saturated to 0x8000. 2535 /// 2536 /// \headerfile <x86intrin.h> 2537 /// 2538 /// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction. 2539 /// 2540 /// \param __a 2541 /// A 128-bit integer vector containing the minuends. 2542 /// \param __b 2543 /// A 128-bit integer vector containing the subtrahends. 2544 /// \returns A 128-bit integer vector containing the differences of the values 2545 /// in the operands. 2546 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a, 2547 __m128i __b) { 2548 return (__m128i)__builtin_elementwise_sub_sat((__v8hi)__a, (__v8hi)__b); 2549 } 2550 2551 /// Subtracts corresponding 8-bit unsigned integer values in the input 2552 /// and returns the differences in the corresponding bytes in the 2553 /// destination. Differences less than 0x00 are saturated to 0x00. 2554 /// 2555 /// \headerfile <x86intrin.h> 2556 /// 2557 /// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction. 2558 /// 2559 /// \param __a 2560 /// A 128-bit integer vector containing the minuends. 2561 /// \param __b 2562 /// A 128-bit integer vector containing the subtrahends. 
2563 /// \returns A 128-bit integer vector containing the unsigned integer 2564 /// differences of the values in the operands. 2565 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a, 2566 __m128i __b) { 2567 return (__m128i)__builtin_elementwise_sub_sat((__v16qu)__a, (__v16qu)__b); 2568 } 2569 2570 /// Subtracts corresponding 16-bit unsigned integer values in the input 2571 /// and returns the differences in the corresponding bytes in the 2572 /// destination. Differences less than 0x0000 are saturated to 0x0000. 2573 /// 2574 /// \headerfile <x86intrin.h> 2575 /// 2576 /// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction. 2577 /// 2578 /// \param __a 2579 /// A 128-bit integer vector containing the minuends. 2580 /// \param __b 2581 /// A 128-bit integer vector containing the subtrahends. 2582 /// \returns A 128-bit integer vector containing the unsigned integer 2583 /// differences of the values in the operands. 2584 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a, 2585 __m128i __b) { 2586 return (__m128i)__builtin_elementwise_sub_sat((__v8hu)__a, (__v8hu)__b); 2587 } 2588 2589 /// Performs a bitwise AND of two 128-bit integer vectors. 2590 /// 2591 /// \headerfile <x86intrin.h> 2592 /// 2593 /// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction. 2594 /// 2595 /// \param __a 2596 /// A 128-bit integer vector containing one of the source operands. 2597 /// \param __b 2598 /// A 128-bit integer vector containing one of the source operands. 2599 /// \returns A 128-bit integer vector containing the bitwise AND of the values 2600 /// in both operands. 2601 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a, 2602 __m128i __b) { 2603 return (__m128i)((__v2du)__a & (__v2du)__b); 2604 } 2605 2606 /// Performs a bitwise AND of two 128-bit integer vectors, using the 2607 /// one's complement of the values contained in the first source operand. 
2608 /// 2609 /// \headerfile <x86intrin.h> 2610 /// 2611 /// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction. 2612 /// 2613 /// \param __a 2614 /// A 128-bit vector containing the left source operand. The one's complement 2615 /// of this value is used in the bitwise AND. 2616 /// \param __b 2617 /// A 128-bit vector containing the right source operand. 2618 /// \returns A 128-bit integer vector containing the bitwise AND of the one's 2619 /// complement of the first operand and the values in the second operand. 2620 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a, 2621 __m128i __b) { 2622 return (__m128i)(~(__v2du)__a & (__v2du)__b); 2623 } 2624 /// Performs a bitwise OR of two 128-bit integer vectors. 2625 /// 2626 /// \headerfile <x86intrin.h> 2627 /// 2628 /// This intrinsic corresponds to the <c> VPOR / POR </c> instruction. 2629 /// 2630 /// \param __a 2631 /// A 128-bit integer vector containing one of the source operands. 2632 /// \param __b 2633 /// A 128-bit integer vector containing one of the source operands. 2634 /// \returns A 128-bit integer vector containing the bitwise OR of the values 2635 /// in both operands. 2636 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a, 2637 __m128i __b) { 2638 return (__m128i)((__v2du)__a | (__v2du)__b); 2639 } 2640 2641 /// Performs a bitwise exclusive OR of two 128-bit integer vectors. 2642 /// 2643 /// \headerfile <x86intrin.h> 2644 /// 2645 /// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction. 2646 /// 2647 /// \param __a 2648 /// A 128-bit integer vector containing one of the source operands. 2649 /// \param __b 2650 /// A 128-bit integer vector containing one of the source operands. 2651 /// \returns A 128-bit integer vector containing the bitwise exclusive OR of the 2652 /// values in both operands. 
2653 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a, 2654 __m128i __b) { 2655 return (__m128i)((__v2du)__a ^ (__v2du)__b); 2656 } 2657 2658 /// Left-shifts the 128-bit integer vector operand by the specified 2659 /// number of bytes. Low-order bits are cleared. 2660 /// 2661 /// \headerfile <x86intrin.h> 2662 /// 2663 /// \code 2664 /// __m128i _mm_slli_si128(__m128i a, const int imm); 2665 /// \endcode 2666 /// 2667 /// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction. 2668 /// 2669 /// \param a 2670 /// A 128-bit integer vector containing the source operand. 2671 /// \param imm 2672 /// An immediate value specifying the number of bytes to left-shift operand 2673 /// \a a. 2674 /// \returns A 128-bit integer vector containing the left-shifted value. 2675 #define _mm_slli_si128(a, imm) \ 2676 ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \ 2677 (int)(imm))) 2678 2679 #define _mm_bslli_si128(a, imm) \ 2680 ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \ 2681 (int)(imm))) 2682 2683 /// Left-shifts each 16-bit value in the 128-bit integer vector operand 2684 /// by the specified number of bits. Low-order bits are cleared. 2685 /// 2686 /// \headerfile <x86intrin.h> 2687 /// 2688 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction. 2689 /// 2690 /// \param __a 2691 /// A 128-bit integer vector containing the source operand. 2692 /// \param __count 2693 /// An integer value specifying the number of bits to left-shift each value 2694 /// in operand \a __a. 2695 /// \returns A 128-bit integer vector containing the left-shifted values. 2696 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a, 2697 int __count) { 2698 return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count); 2699 } 2700 2701 /// Left-shifts each 16-bit value in the 128-bit integer vector operand 2702 /// by the specified number of bits. Low-order bits are cleared. 
2703 /// 2704 /// \headerfile <x86intrin.h> 2705 /// 2706 /// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction. 2707 /// 2708 /// \param __a 2709 /// A 128-bit integer vector containing the source operand. 2710 /// \param __count 2711 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2712 /// to left-shift each value in operand \a __a. 2713 /// \returns A 128-bit integer vector containing the left-shifted values. 2714 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a, 2715 __m128i __count) { 2716 return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count); 2717 } 2718 2719 /// Left-shifts each 32-bit value in the 128-bit integer vector operand 2720 /// by the specified number of bits. Low-order bits are cleared. 2721 /// 2722 /// \headerfile <x86intrin.h> 2723 /// 2724 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction. 2725 /// 2726 /// \param __a 2727 /// A 128-bit integer vector containing the source operand. 2728 /// \param __count 2729 /// An integer value specifying the number of bits to left-shift each value 2730 /// in operand \a __a. 2731 /// \returns A 128-bit integer vector containing the left-shifted values. 2732 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a, 2733 int __count) { 2734 return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count); 2735 } 2736 2737 /// Left-shifts each 32-bit value in the 128-bit integer vector operand 2738 /// by the specified number of bits. Low-order bits are cleared. 2739 /// 2740 /// \headerfile <x86intrin.h> 2741 /// 2742 /// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction. 2743 /// 2744 /// \param __a 2745 /// A 128-bit integer vector containing the source operand. 2746 /// \param __count 2747 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2748 /// to left-shift each value in operand \a __a. 
2749 /// \returns A 128-bit integer vector containing the left-shifted values. 2750 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a, 2751 __m128i __count) { 2752 return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count); 2753 } 2754 2755 /// Left-shifts each 64-bit value in the 128-bit integer vector operand 2756 /// by the specified number of bits. Low-order bits are cleared. 2757 /// 2758 /// \headerfile <x86intrin.h> 2759 /// 2760 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction. 2761 /// 2762 /// \param __a 2763 /// A 128-bit integer vector containing the source operand. 2764 /// \param __count 2765 /// An integer value specifying the number of bits to left-shift each value 2766 /// in operand \a __a. 2767 /// \returns A 128-bit integer vector containing the left-shifted values. 2768 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a, 2769 int __count) { 2770 return __builtin_ia32_psllqi128((__v2di)__a, __count); 2771 } 2772 2773 /// Left-shifts each 64-bit value in the 128-bit integer vector operand 2774 /// by the specified number of bits. Low-order bits are cleared. 2775 /// 2776 /// \headerfile <x86intrin.h> 2777 /// 2778 /// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction. 2779 /// 2780 /// \param __a 2781 /// A 128-bit integer vector containing the source operand. 2782 /// \param __count 2783 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2784 /// to left-shift each value in operand \a __a. 2785 /// \returns A 128-bit integer vector containing the left-shifted values. 2786 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a, 2787 __m128i __count) { 2788 return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count); 2789 } 2790 2791 /// Right-shifts each 16-bit value in the 128-bit integer vector operand 2792 /// by the specified number of bits. 
High-order bits are filled with the sign 2793 /// bit of the initial value. 2794 /// 2795 /// \headerfile <x86intrin.h> 2796 /// 2797 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction. 2798 /// 2799 /// \param __a 2800 /// A 128-bit integer vector containing the source operand. 2801 /// \param __count 2802 /// An integer value specifying the number of bits to right-shift each value 2803 /// in operand \a __a. 2804 /// \returns A 128-bit integer vector containing the right-shifted values. 2805 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a, 2806 int __count) { 2807 return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count); 2808 } 2809 2810 /// Right-shifts each 16-bit value in the 128-bit integer vector operand 2811 /// by the specified number of bits. High-order bits are filled with the sign 2812 /// bit of the initial value. 2813 /// 2814 /// \headerfile <x86intrin.h> 2815 /// 2816 /// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction. 2817 /// 2818 /// \param __a 2819 /// A 128-bit integer vector containing the source operand. 2820 /// \param __count 2821 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2822 /// to right-shift each value in operand \a __a. 2823 /// \returns A 128-bit integer vector containing the right-shifted values. 2824 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a, 2825 __m128i __count) { 2826 return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count); 2827 } 2828 2829 /// Right-shifts each 32-bit value in the 128-bit integer vector operand 2830 /// by the specified number of bits. High-order bits are filled with the sign 2831 /// bit of the initial value. 2832 /// 2833 /// \headerfile <x86intrin.h> 2834 /// 2835 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction. 2836 /// 2837 /// \param __a 2838 /// A 128-bit integer vector containing the source operand. 
2839 /// \param __count 2840 /// An integer value specifying the number of bits to right-shift each value 2841 /// in operand \a __a. 2842 /// \returns A 128-bit integer vector containing the right-shifted values. 2843 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a, 2844 int __count) { 2845 return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count); 2846 } 2847 2848 /// Right-shifts each 32-bit value in the 128-bit integer vector operand 2849 /// by the specified number of bits. High-order bits are filled with the sign 2850 /// bit of the initial value. 2851 /// 2852 /// \headerfile <x86intrin.h> 2853 /// 2854 /// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction. 2855 /// 2856 /// \param __a 2857 /// A 128-bit integer vector containing the source operand. 2858 /// \param __count 2859 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2860 /// to right-shift each value in operand \a __a. 2861 /// \returns A 128-bit integer vector containing the right-shifted values. 2862 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a, 2863 __m128i __count) { 2864 return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count); 2865 } 2866 2867 /// Right-shifts the 128-bit integer vector operand by the specified 2868 /// number of bytes. High-order bits are cleared. 2869 /// 2870 /// \headerfile <x86intrin.h> 2871 /// 2872 /// \code 2873 /// __m128i _mm_srli_si128(__m128i a, const int imm); 2874 /// \endcode 2875 /// 2876 /// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction. 2877 /// 2878 /// \param a 2879 /// A 128-bit integer vector containing the source operand. 2880 /// \param imm 2881 /// An immediate value specifying the number of bytes to right-shift operand 2882 /// \a a. 2883 /// \returns A 128-bit integer vector containing the right-shifted value. 
2884 #define _mm_srli_si128(a, imm) \ 2885 ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \ 2886 (int)(imm))) 2887 2888 #define _mm_bsrli_si128(a, imm) \ 2889 ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \ 2890 (int)(imm))) 2891 2892 /// Right-shifts each of 16-bit values in the 128-bit integer vector 2893 /// operand by the specified number of bits. High-order bits are cleared. 2894 /// 2895 /// \headerfile <x86intrin.h> 2896 /// 2897 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction. 2898 /// 2899 /// \param __a 2900 /// A 128-bit integer vector containing the source operand. 2901 /// \param __count 2902 /// An integer value specifying the number of bits to right-shift each value 2903 /// in operand \a __a. 2904 /// \returns A 128-bit integer vector containing the right-shifted values. 2905 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a, 2906 int __count) { 2907 return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count); 2908 } 2909 2910 /// Right-shifts each of 16-bit values in the 128-bit integer vector 2911 /// operand by the specified number of bits. High-order bits are cleared. 2912 /// 2913 /// \headerfile <x86intrin.h> 2914 /// 2915 /// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction. 2916 /// 2917 /// \param __a 2918 /// A 128-bit integer vector containing the source operand. 2919 /// \param __count 2920 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2921 /// to right-shift each value in operand \a __a. 2922 /// \returns A 128-bit integer vector containing the right-shifted values. 2923 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a, 2924 __m128i __count) { 2925 return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count); 2926 } 2927 2928 /// Right-shifts each of 32-bit values in the 128-bit integer vector 2929 /// operand by the specified number of bits. 
High-order bits are cleared. 2930 /// 2931 /// \headerfile <x86intrin.h> 2932 /// 2933 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction. 2934 /// 2935 /// \param __a 2936 /// A 128-bit integer vector containing the source operand. 2937 /// \param __count 2938 /// An integer value specifying the number of bits to right-shift each value 2939 /// in operand \a __a. 2940 /// \returns A 128-bit integer vector containing the right-shifted values. 2941 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a, 2942 int __count) { 2943 return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count); 2944 } 2945 2946 /// Right-shifts each of 32-bit values in the 128-bit integer vector 2947 /// operand by the specified number of bits. High-order bits are cleared. 2948 /// 2949 /// \headerfile <x86intrin.h> 2950 /// 2951 /// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction. 2952 /// 2953 /// \param __a 2954 /// A 128-bit integer vector containing the source operand. 2955 /// \param __count 2956 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2957 /// to right-shift each value in operand \a __a. 2958 /// \returns A 128-bit integer vector containing the right-shifted values. 2959 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a, 2960 __m128i __count) { 2961 return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count); 2962 } 2963 2964 /// Right-shifts each of 64-bit values in the 128-bit integer vector 2965 /// operand by the specified number of bits. High-order bits are cleared. 2966 /// 2967 /// \headerfile <x86intrin.h> 2968 /// 2969 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction. 2970 /// 2971 /// \param __a 2972 /// A 128-bit integer vector containing the source operand. 2973 /// \param __count 2974 /// An integer value specifying the number of bits to right-shift each value 2975 /// in operand \a __a. 
2976 /// \returns A 128-bit integer vector containing the right-shifted values. 2977 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a, 2978 int __count) { 2979 return __builtin_ia32_psrlqi128((__v2di)__a, __count); 2980 } 2981 2982 /// Right-shifts each of 64-bit values in the 128-bit integer vector 2983 /// operand by the specified number of bits. High-order bits are cleared. 2984 /// 2985 /// \headerfile <x86intrin.h> 2986 /// 2987 /// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction. 2988 /// 2989 /// \param __a 2990 /// A 128-bit integer vector containing the source operand. 2991 /// \param __count 2992 /// A 128-bit integer vector in which bits [63:0] specify the number of bits 2993 /// to right-shift each value in operand \a __a. 2994 /// \returns A 128-bit integer vector containing the right-shifted values. 2995 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a, 2996 __m128i __count) { 2997 return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count); 2998 } 2999 3000 /// Compares each of the corresponding 8-bit values of the 128-bit 3001 /// integer vectors for equality. Each comparison yields 0x0 for false, 0xFF 3002 /// for true. 3003 /// 3004 /// \headerfile <x86intrin.h> 3005 /// 3006 /// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction. 3007 /// 3008 /// \param __a 3009 /// A 128-bit integer vector. 3010 /// \param __b 3011 /// A 128-bit integer vector. 3012 /// \returns A 128-bit integer vector containing the comparison results. 3013 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a, 3014 __m128i __b) { 3015 return (__m128i)((__v16qi)__a == (__v16qi)__b); 3016 } 3017 3018 /// Compares each of the corresponding 16-bit values of the 128-bit 3019 /// integer vectors for equality. Each comparison yields 0x0 for false, 3020 /// 0xFFFF for true. 
3021 /// 3022 /// \headerfile <x86intrin.h> 3023 /// 3024 /// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction. 3025 /// 3026 /// \param __a 3027 /// A 128-bit integer vector. 3028 /// \param __b 3029 /// A 128-bit integer vector. 3030 /// \returns A 128-bit integer vector containing the comparison results. 3031 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a, 3032 __m128i __b) { 3033 return (__m128i)((__v8hi)__a == (__v8hi)__b); 3034 } 3035 3036 /// Compares each of the corresponding 32-bit values of the 128-bit 3037 /// integer vectors for equality. Each comparison yields 0x0 for false, 3038 /// 0xFFFFFFFF for true. 3039 /// 3040 /// \headerfile <x86intrin.h> 3041 /// 3042 /// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction. 3043 /// 3044 /// \param __a 3045 /// A 128-bit integer vector. 3046 /// \param __b 3047 /// A 128-bit integer vector. 3048 /// \returns A 128-bit integer vector containing the comparison results. 3049 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a, 3050 __m128i __b) { 3051 return (__m128i)((__v4si)__a == (__v4si)__b); 3052 } 3053 3054 /// Compares each of the corresponding signed 8-bit values of the 128-bit 3055 /// integer vectors to determine if the values in the first operand are 3056 /// greater than those in the second operand. Each comparison yields 0x0 for 3057 /// false, 0xFF for true. 3058 /// 3059 /// \headerfile <x86intrin.h> 3060 /// 3061 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction. 3062 /// 3063 /// \param __a 3064 /// A 128-bit integer vector. 3065 /// \param __b 3066 /// A 128-bit integer vector. 3067 /// \returns A 128-bit integer vector containing the comparison results. 
3068 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a, 3069 __m128i __b) { 3070 /* This function always performs a signed comparison, but __v16qi is a char 3071 which may be signed or unsigned, so use __v16qs. */ 3072 return (__m128i)((__v16qs)__a > (__v16qs)__b); 3073 } 3074 3075 /// Compares each of the corresponding signed 16-bit values of the 3076 /// 128-bit integer vectors to determine if the values in the first operand 3077 /// are greater than those in the second operand. 3078 /// 3079 /// Each comparison yields 0x0 for false, 0xFFFF for true. 3080 /// 3081 /// \headerfile <x86intrin.h> 3082 /// 3083 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction. 3084 /// 3085 /// \param __a 3086 /// A 128-bit integer vector. 3087 /// \param __b 3088 /// A 128-bit integer vector. 3089 /// \returns A 128-bit integer vector containing the comparison results. 3090 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a, 3091 __m128i __b) { 3092 return (__m128i)((__v8hi)__a > (__v8hi)__b); 3093 } 3094 3095 /// Compares each of the corresponding signed 32-bit values of the 3096 /// 128-bit integer vectors to determine if the values in the first operand 3097 /// are greater than those in the second operand. 3098 /// 3099 /// Each comparison yields 0x0 for false, 0xFFFFFFFF for true. 3100 /// 3101 /// \headerfile <x86intrin.h> 3102 /// 3103 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction. 3104 /// 3105 /// \param __a 3106 /// A 128-bit integer vector. 3107 /// \param __b 3108 /// A 128-bit integer vector. 3109 /// \returns A 128-bit integer vector containing the comparison results. 
3110 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a, 3111 __m128i __b) { 3112 return (__m128i)((__v4si)__a > (__v4si)__b); 3113 } 3114 3115 /// Compares each of the corresponding signed 8-bit values of the 128-bit 3116 /// integer vectors to determine if the values in the first operand are less 3117 /// than those in the second operand. 3118 /// 3119 /// Each comparison yields 0x0 for false, 0xFF for true. 3120 /// 3121 /// \headerfile <x86intrin.h> 3122 /// 3123 /// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction. 3124 /// 3125 /// \param __a 3126 /// A 128-bit integer vector. 3127 /// \param __b 3128 /// A 128-bit integer vector. 3129 /// \returns A 128-bit integer vector containing the comparison results. 3130 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a, 3131 __m128i __b) { 3132 return _mm_cmpgt_epi8(__b, __a); 3133 } 3134 3135 /// Compares each of the corresponding signed 16-bit values of the 3136 /// 128-bit integer vectors to determine if the values in the first operand 3137 /// are less than those in the second operand. 3138 /// 3139 /// Each comparison yields 0x0 for false, 0xFFFF for true. 3140 /// 3141 /// \headerfile <x86intrin.h> 3142 /// 3143 /// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction. 3144 /// 3145 /// \param __a 3146 /// A 128-bit integer vector. 3147 /// \param __b 3148 /// A 128-bit integer vector. 3149 /// \returns A 128-bit integer vector containing the comparison results. 3150 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a, 3151 __m128i __b) { 3152 return _mm_cmpgt_epi16(__b, __a); 3153 } 3154 3155 /// Compares each of the corresponding signed 32-bit values of the 3156 /// 128-bit integer vectors to determine if the values in the first operand 3157 /// are less than those in the second operand. 3158 /// 3159 /// Each comparison yields 0x0 for false, 0xFFFFFFFF for true. 
3160 /// 3161 /// \headerfile <x86intrin.h> 3162 /// 3163 /// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction. 3164 /// 3165 /// \param __a 3166 /// A 128-bit integer vector. 3167 /// \param __b 3168 /// A 128-bit integer vector. 3169 /// \returns A 128-bit integer vector containing the comparison results. 3170 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a, 3171 __m128i __b) { 3172 return _mm_cmpgt_epi32(__b, __a); 3173 } 3174 3175 #ifdef __x86_64__ 3176 /// Converts a 64-bit signed integer value from the second operand into a 3177 /// double-precision value and returns it in the lower element of a [2 x 3178 /// double] vector; the upper element of the returned vector is copied from 3179 /// the upper element of the first operand. 3180 /// 3181 /// \headerfile <x86intrin.h> 3182 /// 3183 /// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction. 3184 /// 3185 /// \param __a 3186 /// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are 3187 /// copied to the upper 64 bits of the destination. 3188 /// \param __b 3189 /// A 64-bit signed integer operand containing the value to be converted. 3190 /// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the 3191 /// converted value of the second operand. The upper 64 bits are copied from 3192 /// the upper 64 bits of the first operand. 3193 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi64_sd(__m128d __a, 3194 long long __b) { 3195 __a[0] = __b; 3196 return __a; 3197 } 3198 3199 /// Converts the first (lower) element of a vector of [2 x double] into a 3200 /// 64-bit signed integer value, according to the current rounding mode. 3201 /// 3202 /// \headerfile <x86intrin.h> 3203 /// 3204 /// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction. 3205 /// 3206 /// \param __a 3207 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 3208 /// conversion. 
3209 /// \returns A 64-bit signed integer containing the converted value. 3210 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsd_si64(__m128d __a) { 3211 return __builtin_ia32_cvtsd2si64((__v2df)__a); 3212 } 3213 3214 /// Converts the first (lower) element of a vector of [2 x double] into a 3215 /// 64-bit signed integer value, truncating the result when it is inexact. 3216 /// 3217 /// \headerfile <x86intrin.h> 3218 /// 3219 /// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c> 3220 /// instruction. 3221 /// 3222 /// \param __a 3223 /// A 128-bit vector of [2 x double]. The lower 64 bits are used in the 3224 /// conversion. 3225 /// \returns A 64-bit signed integer containing the converted value. 3226 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvttsd_si64(__m128d __a) { 3227 return __builtin_ia32_cvttsd2si64((__v2df)__a); 3228 } 3229 #endif 3230 3231 /// Converts a vector of [4 x i32] into a vector of [4 x float]. 3232 /// 3233 /// \headerfile <x86intrin.h> 3234 /// 3235 /// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction. 3236 /// 3237 /// \param __a 3238 /// A 128-bit integer vector. 3239 /// \returns A 128-bit vector of [4 x float] containing the converted values. 3240 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a) { 3241 return (__m128) __builtin_convertvector((__v4si)__a, __v4sf); 3242 } 3243 3244 /// Converts a vector of [4 x float] into a vector of [4 x i32]. 3245 /// 3246 /// \headerfile <x86intrin.h> 3247 /// 3248 /// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction. 3249 /// 3250 /// \param __a 3251 /// A 128-bit vector of [4 x float]. 3252 /// \returns A 128-bit integer vector of [4 x i32] containing the converted 3253 /// values. 
3254 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a) { 3255 return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a); 3256 } 3257 3258 /// Converts a vector of [4 x float] into a vector of [4 x i32], 3259 /// truncating the result when it is inexact. 3260 /// 3261 /// \headerfile <x86intrin.h> 3262 /// 3263 /// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c> 3264 /// instruction. 3265 /// 3266 /// \param __a 3267 /// A 128-bit vector of [4 x float]. 3268 /// \returns A 128-bit vector of [4 x i32] containing the converted values. 3269 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a) { 3270 return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a); 3271 } 3272 3273 /// Returns a vector of [4 x i32] where the lowest element is the input 3274 /// operand and the remaining elements are zero. 3275 /// 3276 /// \headerfile <x86intrin.h> 3277 /// 3278 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. 3279 /// 3280 /// \param __a 3281 /// A 32-bit signed integer operand. 3282 /// \returns A 128-bit vector of [4 x i32]. 3283 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a) { 3284 return __extension__(__m128i)(__v4si){__a, 0, 0, 0}; 3285 } 3286 3287 /// Returns a vector of [2 x i64] where the lower element is the input 3288 /// operand and the upper element is zero. 3289 /// 3290 /// \headerfile <x86intrin.h> 3291 /// 3292 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction 3293 /// in 64-bit mode. 3294 /// 3295 /// \param __a 3296 /// A 64-bit signed integer operand containing the value to be converted. 3297 /// \returns A 128-bit vector of [2 x i64] containing the converted value. 3298 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a) { 3299 return __extension__(__m128i)(__v2di){__a, 0}; 3300 } 3301 3302 /// Moves the least significant 32 bits of a vector of [4 x i32] to a 3303 /// 32-bit signed integer value. 
3304 /// 3305 /// \headerfile <x86intrin.h> 3306 /// 3307 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. 3308 /// 3309 /// \param __a 3310 /// A vector of [4 x i32]. The least significant 32 bits are moved to the 3311 /// destination. 3312 /// \returns A 32-bit signed integer containing the moved value. 3313 static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a) { 3314 __v4si __b = (__v4si)__a; 3315 return __b[0]; 3316 } 3317 3318 /// Moves the least significant 64 bits of a vector of [2 x i64] to a 3319 /// 64-bit signed integer value. 3320 /// 3321 /// \headerfile <x86intrin.h> 3322 /// 3323 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 3324 /// 3325 /// \param __a 3326 /// A vector of [2 x i64]. The least significant 64 bits are moved to the 3327 /// destination. 3328 /// \returns A 64-bit signed integer containing the moved value. 3329 static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a) { 3330 return __a[0]; 3331 } 3332 3333 /// Moves packed integer values from an aligned 128-bit memory location 3334 /// to elements in a 128-bit integer vector. 3335 /// 3336 /// \headerfile <x86intrin.h> 3337 /// 3338 /// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction. 3339 /// 3340 /// \param __p 3341 /// An aligned pointer to a memory location containing integer values. 3342 /// \returns A 128-bit integer vector containing the moved values. 3343 static __inline__ __m128i __DEFAULT_FN_ATTRS 3344 _mm_load_si128(__m128i const *__p) { 3345 return *__p; 3346 } 3347 3348 /// Moves packed integer values from an unaligned 128-bit memory location 3349 /// to elements in a 128-bit integer vector. 3350 /// 3351 /// \headerfile <x86intrin.h> 3352 /// 3353 /// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction. 3354 /// 3355 /// \param __p 3356 /// A pointer to a memory location containing integer values. 
3357 /// \returns A 128-bit integer vector containing the moved values. 3358 static __inline__ __m128i __DEFAULT_FN_ATTRS 3359 _mm_loadu_si128(__m128i_u const *__p) { 3360 struct __loadu_si128 { 3361 __m128i_u __v; 3362 } __attribute__((__packed__, __may_alias__)); 3363 return ((const struct __loadu_si128 *)__p)->__v; 3364 } 3365 3366 /// Returns a vector of [2 x i64] where the lower element is taken from 3367 /// the lower element of the operand, and the upper element is zero. 3368 /// 3369 /// \headerfile <x86intrin.h> 3370 /// 3371 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 3372 /// 3373 /// \param __p 3374 /// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of 3375 /// the destination. 3376 /// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the 3377 /// moved value. The higher order bits are cleared. 3378 static __inline__ __m128i __DEFAULT_FN_ATTRS 3379 _mm_loadl_epi64(__m128i_u const *__p) { 3380 struct __mm_loadl_epi64_struct { 3381 long long __u; 3382 } __attribute__((__packed__, __may_alias__)); 3383 return __extension__(__m128i){ 3384 ((const struct __mm_loadl_epi64_struct *)__p)->__u, 0}; 3385 } 3386 3387 /// Generates a 128-bit vector of [4 x i32] with unspecified content. 3388 /// This could be used as an argument to another intrinsic function where the 3389 /// argument is required but the value is not actually used. 3390 /// 3391 /// \headerfile <x86intrin.h> 3392 /// 3393 /// This intrinsic has no corresponding instruction. 3394 /// 3395 /// \returns A 128-bit vector of [4 x i32] with unspecified content. 3396 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void) { 3397 return (__m128i)__builtin_ia32_undef128(); 3398 } 3399 3400 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with 3401 /// the specified 64-bit integer values. 
3402 /// 3403 /// \headerfile <x86intrin.h> 3404 /// 3405 /// This intrinsic is a utility function and does not correspond to a specific 3406 /// instruction. 3407 /// 3408 /// \param __q1 3409 /// A 64-bit integer value used to initialize the upper 64 bits of the 3410 /// destination vector of [2 x i64]. 3411 /// \param __q0 3412 /// A 64-bit integer value used to initialize the lower 64 bits of the 3413 /// destination vector of [2 x i64]. 3414 /// \returns An initialized 128-bit vector of [2 x i64] containing the values 3415 /// provided in the operands. 3416 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1, 3417 long long __q0) { 3418 return __extension__(__m128i)(__v2di){__q0, __q1}; 3419 } 3420 3421 /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with 3422 /// the specified 64-bit integer values. 3423 /// 3424 /// \headerfile <x86intrin.h> 3425 /// 3426 /// This intrinsic is a utility function and does not correspond to a specific 3427 /// instruction. 3428 /// 3429 /// \param __q1 3430 /// A 64-bit integer value used to initialize the upper 64 bits of the 3431 /// destination vector of [2 x i64]. 3432 /// \param __q0 3433 /// A 64-bit integer value used to initialize the lower 64 bits of the 3434 /// destination vector of [2 x i64]. 3435 /// \returns An initialized 128-bit vector of [2 x i64] containing the values 3436 /// provided in the operands. 3437 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1, 3438 __m64 __q0) { 3439 return _mm_set_epi64x((long long)__q1, (long long)__q0); 3440 } 3441 3442 /// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with 3443 /// the specified 32-bit integer values. 3444 /// 3445 /// \headerfile <x86intrin.h> 3446 /// 3447 /// This intrinsic is a utility function and does not correspond to a specific 3448 /// instruction. 
3449 /// 3450 /// \param __i3 3451 /// A 32-bit integer value used to initialize bits [127:96] of the 3452 /// destination vector. 3453 /// \param __i2 3454 /// A 32-bit integer value used to initialize bits [95:64] of the destination 3455 /// vector. 3456 /// \param __i1 3457 /// A 32-bit integer value used to initialize bits [63:32] of the destination 3458 /// vector. 3459 /// \param __i0 3460 /// A 32-bit integer value used to initialize bits [31:0] of the destination 3461 /// vector. 3462 /// \returns An initialized 128-bit vector of [4 x i32] containing the values 3463 /// provided in the operands. 3464 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2, 3465 int __i1, int __i0) { 3466 return __extension__(__m128i)(__v4si){__i0, __i1, __i2, __i3}; 3467 } 3468 3469 /// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with 3470 /// the specified 16-bit integer values. 3471 /// 3472 /// \headerfile <x86intrin.h> 3473 /// 3474 /// This intrinsic is a utility function and does not correspond to a specific 3475 /// instruction. 3476 /// 3477 /// \param __w7 3478 /// A 16-bit integer value used to initialize bits [127:112] of the 3479 /// destination vector. 3480 /// \param __w6 3481 /// A 16-bit integer value used to initialize bits [111:96] of the 3482 /// destination vector. 3483 /// \param __w5 3484 /// A 16-bit integer value used to initialize bits [95:80] of the destination 3485 /// vector. 3486 /// \param __w4 3487 /// A 16-bit integer value used to initialize bits [79:64] of the destination 3488 /// vector. 3489 /// \param __w3 3490 /// A 16-bit integer value used to initialize bits [63:48] of the destination 3491 /// vector. 3492 /// \param __w2 3493 /// A 16-bit integer value used to initialize bits [47:32] of the destination 3494 /// vector. 3495 /// \param __w1 3496 /// A 16-bit integer value used to initialize bits [31:16] of the destination 3497 /// vector. 
3498 /// \param __w0 3499 /// A 16-bit integer value used to initialize bits [15:0] of the destination 3500 /// vector. 3501 /// \returns An initialized 128-bit vector of [8 x i16] containing the values 3502 /// provided in the operands. 3503 static __inline__ __m128i __DEFAULT_FN_ATTRS 3504 _mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, 3505 short __w2, short __w1, short __w0) { 3506 return __extension__(__m128i)(__v8hi){__w0, __w1, __w2, __w3, 3507 __w4, __w5, __w6, __w7}; 3508 } 3509 3510 /// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with 3511 /// the specified 8-bit integer values. 3512 /// 3513 /// \headerfile <x86intrin.h> 3514 /// 3515 /// This intrinsic is a utility function and does not correspond to a specific 3516 /// instruction. 3517 /// 3518 /// \param __b15 3519 /// Initializes bits [127:120] of the destination vector. 3520 /// \param __b14 3521 /// Initializes bits [119:112] of the destination vector. 3522 /// \param __b13 3523 /// Initializes bits [111:104] of the destination vector. 3524 /// \param __b12 3525 /// Initializes bits [103:96] of the destination vector. 3526 /// \param __b11 3527 /// Initializes bits [95:88] of the destination vector. 3528 /// \param __b10 3529 /// Initializes bits [87:80] of the destination vector. 3530 /// \param __b9 3531 /// Initializes bits [79:72] of the destination vector. 3532 /// \param __b8 3533 /// Initializes bits [71:64] of the destination vector. 3534 /// \param __b7 3535 /// Initializes bits [63:56] of the destination vector. 3536 /// \param __b6 3537 /// Initializes bits [55:48] of the destination vector. 3538 /// \param __b5 3539 /// Initializes bits [47:40] of the destination vector. 3540 /// \param __b4 3541 /// Initializes bits [39:32] of the destination vector. 3542 /// \param __b3 3543 /// Initializes bits [31:24] of the destination vector. 3544 /// \param __b2 3545 /// Initializes bits [23:16] of the destination vector. 
3546 /// \param __b1 3547 /// Initializes bits [15:8] of the destination vector. 3548 /// \param __b0 3549 /// Initializes bits [7:0] of the destination vector. 3550 /// \returns An initialized 128-bit vector of [16 x i8] containing the values 3551 /// provided in the operands. 3552 static __inline__ __m128i __DEFAULT_FN_ATTRS 3553 _mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, 3554 char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, 3555 char __b4, char __b3, char __b2, char __b1, char __b0) { 3556 return __extension__(__m128i)(__v16qi){ 3557 __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, 3558 __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15}; 3559 } 3560 3561 /// Initializes both values in a 128-bit integer vector with the 3562 /// specified 64-bit integer value. 3563 /// 3564 /// \headerfile <x86intrin.h> 3565 /// 3566 /// This intrinsic is a utility function and does not correspond to a specific 3567 /// instruction. 3568 /// 3569 /// \param __q 3570 /// Integer value used to initialize the elements of the destination integer 3571 /// vector. 3572 /// \returns An initialized 128-bit integer vector of [2 x i64] with both 3573 /// elements containing the value provided in the operand. 3574 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q) { 3575 return _mm_set_epi64x(__q, __q); 3576 } 3577 3578 /// Initializes both values in a 128-bit vector of [2 x i64] with the 3579 /// specified 64-bit value. 3580 /// 3581 /// \headerfile <x86intrin.h> 3582 /// 3583 /// This intrinsic is a utility function and does not correspond to a specific 3584 /// instruction. 3585 /// 3586 /// \param __q 3587 /// A 64-bit value used to initialize the elements of the destination integer 3588 /// vector. 3589 /// \returns An initialized 128-bit vector of [2 x i64] with all elements 3590 /// containing the value provided in the operand. 
3591 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q) { 3592 return _mm_set_epi64(__q, __q); 3593 } 3594 3595 /// Initializes all values in a 128-bit vector of [4 x i32] with the 3596 /// specified 32-bit value. 3597 /// 3598 /// \headerfile <x86intrin.h> 3599 /// 3600 /// This intrinsic is a utility function and does not correspond to a specific 3601 /// instruction. 3602 /// 3603 /// \param __i 3604 /// A 32-bit value used to initialize the elements of the destination integer 3605 /// vector. 3606 /// \returns An initialized 128-bit vector of [4 x i32] with all elements 3607 /// containing the value provided in the operand. 3608 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i) { 3609 return _mm_set_epi32(__i, __i, __i, __i); 3610 } 3611 3612 /// Initializes all values in a 128-bit vector of [8 x i16] with the 3613 /// specified 16-bit value. 3614 /// 3615 /// \headerfile <x86intrin.h> 3616 /// 3617 /// This intrinsic is a utility function and does not correspond to a specific 3618 /// instruction. 3619 /// 3620 /// \param __w 3621 /// A 16-bit value used to initialize the elements of the destination integer 3622 /// vector. 3623 /// \returns An initialized 128-bit vector of [8 x i16] with all elements 3624 /// containing the value provided in the operand. 3625 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w) { 3626 return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w); 3627 } 3628 3629 /// Initializes all values in a 128-bit vector of [16 x i8] with the 3630 /// specified 8-bit value. 3631 /// 3632 /// \headerfile <x86intrin.h> 3633 /// 3634 /// This intrinsic is a utility function and does not correspond to a specific 3635 /// instruction. 3636 /// 3637 /// \param __b 3638 /// An 8-bit value used to initialize the elements of the destination integer 3639 /// vector. 
3640 /// \returns An initialized 128-bit vector of [16 x i8] with all elements 3641 /// containing the value provided in the operand. 3642 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b) { 3643 return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, 3644 __b, __b, __b, __b, __b); 3645 } 3646 3647 /// Constructs a 128-bit integer vector, initialized in reverse order 3648 /// with the specified 64-bit integral values. 3649 /// 3650 /// \headerfile <x86intrin.h> 3651 /// 3652 /// This intrinsic does not correspond to a specific instruction. 3653 /// 3654 /// \param __q0 3655 /// A 64-bit integral value used to initialize the lower 64 bits of the 3656 /// result. 3657 /// \param __q1 3658 /// A 64-bit integral value used to initialize the upper 64 bits of the 3659 /// result. 3660 /// \returns An initialized 128-bit integer vector. 3661 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi64(__m64 __q0, 3662 __m64 __q1) { 3663 return _mm_set_epi64(__q1, __q0); 3664 } 3665 3666 /// Constructs a 128-bit integer vector, initialized in reverse order 3667 /// with the specified 32-bit integral values. 3668 /// 3669 /// \headerfile <x86intrin.h> 3670 /// 3671 /// This intrinsic is a utility function and does not correspond to a specific 3672 /// instruction. 3673 /// 3674 /// \param __i0 3675 /// A 32-bit integral value used to initialize bits [31:0] of the result. 3676 /// \param __i1 3677 /// A 32-bit integral value used to initialize bits [63:32] of the result. 3678 /// \param __i2 3679 /// A 32-bit integral value used to initialize bits [95:64] of the result. 3680 /// \param __i3 3681 /// A 32-bit integral value used to initialize bits [127:96] of the result. 3682 /// \returns An initialized 128-bit integer vector. 
3683 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setr_epi32(int __i0, int __i1, 3684 int __i2, 3685 int __i3) { 3686 return _mm_set_epi32(__i3, __i2, __i1, __i0); 3687 } 3688 3689 /// Constructs a 128-bit integer vector, initialized in reverse order 3690 /// with the specified 16-bit integral values. 3691 /// 3692 /// \headerfile <x86intrin.h> 3693 /// 3694 /// This intrinsic is a utility function and does not correspond to a specific 3695 /// instruction. 3696 /// 3697 /// \param __w0 3698 /// A 16-bit integral value used to initialize bits [15:0] of the result. 3699 /// \param __w1 3700 /// A 16-bit integral value used to initialize bits [31:16] of the result. 3701 /// \param __w2 3702 /// A 16-bit integral value used to initialize bits [47:32] of the result. 3703 /// \param __w3 3704 /// A 16-bit integral value used to initialize bits [63:48] of the result. 3705 /// \param __w4 3706 /// A 16-bit integral value used to initialize bits [79:64] of the result. 3707 /// \param __w5 3708 /// A 16-bit integral value used to initialize bits [95:80] of the result. 3709 /// \param __w6 3710 /// A 16-bit integral value used to initialize bits [111:96] of the result. 3711 /// \param __w7 3712 /// A 16-bit integral value used to initialize bits [127:112] of the result. 3713 /// \returns An initialized 128-bit integer vector. 3714 static __inline__ __m128i __DEFAULT_FN_ATTRS 3715 _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, 3716 short __w5, short __w6, short __w7) { 3717 return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0); 3718 } 3719 3720 /// Constructs a 128-bit integer vector, initialized in reverse order 3721 /// with the specified 8-bit integral values. 3722 /// 3723 /// \headerfile <x86intrin.h> 3724 /// 3725 /// This intrinsic is a utility function and does not correspond to a specific 3726 /// instruction. 3727 /// 3728 /// \param __b0 3729 /// An 8-bit integral value used to initialize bits [7:0] of the result. 
3730 /// \param __b1 3731 /// An 8-bit integral value used to initialize bits [15:8] of the result. 3732 /// \param __b2 3733 /// An 8-bit integral value used to initialize bits [23:16] of the result. 3734 /// \param __b3 3735 /// An 8-bit integral value used to initialize bits [31:24] of the result. 3736 /// \param __b4 3737 /// An 8-bit integral value used to initialize bits [39:32] of the result. 3738 /// \param __b5 3739 /// An 8-bit integral value used to initialize bits [47:40] of the result. 3740 /// \param __b6 3741 /// An 8-bit integral value used to initialize bits [55:48] of the result. 3742 /// \param __b7 3743 /// An 8-bit integral value used to initialize bits [63:56] of the result. 3744 /// \param __b8 3745 /// An 8-bit integral value used to initialize bits [71:64] of the result. 3746 /// \param __b9 3747 /// An 8-bit integral value used to initialize bits [79:72] of the result. 3748 /// \param __b10 3749 /// An 8-bit integral value used to initialize bits [87:80] of the result. 3750 /// \param __b11 3751 /// An 8-bit integral value used to initialize bits [95:88] of the result. 3752 /// \param __b12 3753 /// An 8-bit integral value used to initialize bits [103:96] of the result. 3754 /// \param __b13 3755 /// An 8-bit integral value used to initialize bits [111:104] of the result. 3756 /// \param __b14 3757 /// An 8-bit integral value used to initialize bits [119:112] of the result. 3758 /// \param __b15 3759 /// An 8-bit integral value used to initialize bits [127:120] of the result. 3760 /// \returns An initialized 128-bit integer vector. 
3761 static __inline__ __m128i __DEFAULT_FN_ATTRS 3762 _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, 3763 char __b6, char __b7, char __b8, char __b9, char __b10, 3764 char __b11, char __b12, char __b13, char __b14, char __b15) { 3765 return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8, 3766 __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0); 3767 } 3768 3769 /// Creates a 128-bit integer vector initialized to zero. 3770 /// 3771 /// \headerfile <x86intrin.h> 3772 /// 3773 /// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction. 3774 /// 3775 /// \returns An initialized 128-bit integer vector with all elements set to 3776 /// zero. 3777 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void) { 3778 return __extension__(__m128i)(__v2di){0LL, 0LL}; 3779 } 3780 3781 /// Stores a 128-bit integer vector to a memory location aligned on a 3782 /// 128-bit boundary. 3783 /// 3784 /// \headerfile <x86intrin.h> 3785 /// 3786 /// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction. 3787 /// 3788 /// \param __p 3789 /// A pointer to an aligned memory location that will receive the integer 3790 /// values. 3791 /// \param __b 3792 /// A 128-bit integer vector containing the values to be moved. 3793 static __inline__ void __DEFAULT_FN_ATTRS _mm_store_si128(__m128i *__p, 3794 __m128i __b) { 3795 *__p = __b; 3796 } 3797 3798 /// Stores a 128-bit integer vector to an unaligned memory location. 3799 /// 3800 /// \headerfile <x86intrin.h> 3801 /// 3802 /// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction. 3803 /// 3804 /// \param __p 3805 /// A pointer to a memory location that will receive the integer values. 3806 /// \param __b 3807 /// A 128-bit integer vector containing the values to be moved. 
3808 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p, 3809 __m128i __b) { 3810 struct __storeu_si128 { 3811 __m128i_u __v; 3812 } __attribute__((__packed__, __may_alias__)); 3813 ((struct __storeu_si128 *)__p)->__v = __b; 3814 } 3815 3816 /// Stores a 64-bit integer value from the low element of a 128-bit integer 3817 /// vector. 3818 /// 3819 /// \headerfile <x86intrin.h> 3820 /// 3821 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 3822 /// 3823 /// \param __p 3824 /// A pointer to a 64-bit memory location. The address of the memory 3825 /// location does not have to be aligned. 3826 /// \param __b 3827 /// A 128-bit integer vector containing the value to be stored. 3828 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si64(void *__p, 3829 __m128i __b) { 3830 struct __storeu_si64 { 3831 long long __v; 3832 } __attribute__((__packed__, __may_alias__)); 3833 ((struct __storeu_si64 *)__p)->__v = ((__v2di)__b)[0]; 3834 } 3835 3836 /// Stores a 32-bit integer value from the low element of a 128-bit integer 3837 /// vector. 3838 /// 3839 /// \headerfile <x86intrin.h> 3840 /// 3841 /// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. 3842 /// 3843 /// \param __p 3844 /// A pointer to a 32-bit memory location. The address of the memory 3845 /// location does not have to be aligned. 3846 /// \param __b 3847 /// A 128-bit integer vector containing the value to be stored. 3848 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si32(void *__p, 3849 __m128i __b) { 3850 struct __storeu_si32 { 3851 int __v; 3852 } __attribute__((__packed__, __may_alias__)); 3853 ((struct __storeu_si32 *)__p)->__v = ((__v4si)__b)[0]; 3854 } 3855 3856 /// Stores a 16-bit integer value from the low element of a 128-bit integer 3857 /// vector. 3858 /// 3859 /// \headerfile <x86intrin.h> 3860 /// 3861 /// This intrinsic does not correspond to a specific instruction. 
3862 /// 3863 /// \param __p 3864 /// A pointer to a 16-bit memory location. The address of the memory 3865 /// location does not have to be aligned. 3866 /// \param __b 3867 /// A 128-bit integer vector containing the value to be stored. 3868 static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si16(void *__p, 3869 __m128i __b) { 3870 struct __storeu_si16 { 3871 short __v; 3872 } __attribute__((__packed__, __may_alias__)); 3873 ((struct __storeu_si16 *)__p)->__v = ((__v8hi)__b)[0]; 3874 } 3875 3876 /// Moves bytes selected by the mask from the first operand to the 3877 /// specified unaligned memory location. When a mask bit is 1, the 3878 /// corresponding byte is written, otherwise it is not written. 3879 /// 3880 /// To minimize caching, the data is flagged as non-temporal (unlikely to be 3881 /// used again soon). Exception and trap behavior for elements not selected 3882 /// for storage to memory are implementation dependent. 3883 /// 3884 /// \headerfile <x86intrin.h> 3885 /// 3886 /// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c> 3887 /// instruction. 3888 /// 3889 /// \param __d 3890 /// A 128-bit integer vector containing the values to be moved. 3891 /// \param __n 3892 /// A 128-bit integer vector containing the mask. The most significant bit of 3893 /// each byte represents the mask bits. 3894 /// \param __p 3895 /// A pointer to an unaligned 128-bit memory location where the specified 3896 /// values are moved. 3897 static __inline__ void __DEFAULT_FN_ATTRS _mm_maskmoveu_si128(__m128i __d, 3898 __m128i __n, 3899 char *__p) { 3900 __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p); 3901 } 3902 3903 /// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to 3904 /// a memory location. 3905 /// 3906 /// \headerfile <x86intrin.h> 3907 /// 3908 /// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction. 
3909 /// 3910 /// \param __p 3911 /// A pointer to a 64-bit memory location that will receive the lower 64 bits 3912 /// of the integer vector parameter. 3913 /// \param __a 3914 /// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the 3915 /// value to be stored. 3916 static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p, 3917 __m128i __a) { 3918 struct __mm_storel_epi64_struct { 3919 long long __u; 3920 } __attribute__((__packed__, __may_alias__)); 3921 ((struct __mm_storel_epi64_struct *)__p)->__u = __a[0]; 3922 } 3923 3924 /// Stores a 128-bit floating point vector of [2 x double] to a 128-bit 3925 /// aligned memory location. 3926 /// 3927 /// To minimize caching, the data is flagged as non-temporal (unlikely to be 3928 /// used again soon). 3929 /// 3930 /// \headerfile <x86intrin.h> 3931 /// 3932 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction. 3933 /// 3934 /// \param __p 3935 /// A pointer to the 128-bit aligned memory location used to store the value. 3936 /// \param __a 3937 /// A vector of [2 x double] containing the 64-bit values to be stored. 3938 static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(double *__p, 3939 __m128d __a) { 3940 __builtin_nontemporal_store((__v2df)__a, (__v2df *)__p); 3941 } 3942 3943 /// Stores a 128-bit integer vector to a 128-bit aligned memory location. 3944 /// 3945 /// To minimize caching, the data is flagged as non-temporal (unlikely to be 3946 /// used again soon). 3947 /// 3948 /// \headerfile <x86intrin.h> 3949 /// 3950 /// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction. 3951 /// 3952 /// \param __p 3953 /// A pointer to the 128-bit aligned memory location used to store the value. 3954 /// \param __a 3955 /// A 128-bit integer vector containing the values to be stored. 
3956 static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(__m128i *__p, 3957 __m128i __a) { 3958 __builtin_nontemporal_store((__v2di)__a, (__v2di *)__p); 3959 } 3960 3961 /// Stores a 32-bit integer value in the specified memory location. 3962 /// 3963 /// To minimize caching, the data is flagged as non-temporal (unlikely to be 3964 /// used again soon). 3965 /// 3966 /// \headerfile <x86intrin.h> 3967 /// 3968 /// This intrinsic corresponds to the <c> MOVNTI </c> instruction. 3969 /// 3970 /// \param __p 3971 /// A pointer to the 32-bit memory location used to store the value. 3972 /// \param __a 3973 /// A 32-bit integer containing the value to be stored. 3974 static __inline__ void 3975 __attribute__((__always_inline__, __nodebug__, __target__("sse2"))) 3976 _mm_stream_si32(int *__p, int __a) { 3977 __builtin_ia32_movnti(__p, __a); 3978 } 3979 3980 #ifdef __x86_64__ 3981 /// Stores a 64-bit integer value in the specified memory location. 3982 /// 3983 /// To minimize caching, the data is flagged as non-temporal (unlikely to be 3984 /// used again soon). 3985 /// 3986 /// \headerfile <x86intrin.h> 3987 /// 3988 /// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction. 3989 /// 3990 /// \param __p 3991 /// A pointer to the 64-bit memory location used to store the value. 3992 /// \param __a 3993 /// A 64-bit integer containing the value to be stored. 3994 static __inline__ void 3995 __attribute__((__always_inline__, __nodebug__, __target__("sse2"))) 3996 _mm_stream_si64(long long *__p, long long __a) { 3997 __builtin_ia32_movnti64(__p, __a); 3998 } 3999 #endif 4000 4001 #if defined(__cplusplus) 4002 extern "C" { 4003 #endif 4004 4005 /// The cache line containing \a __p is flushed and invalidated from all 4006 /// caches in the coherency domain. 4007 /// 4008 /// \headerfile <x86intrin.h> 4009 /// 4010 /// This intrinsic corresponds to the <c> CLFLUSH </c> instruction. 
///
/// \param __p
///    A pointer to the memory location used to identify the cache line to be
///    flushed.
void _mm_clflush(const void *__p);

/// Forces strong memory ordering (serialization) between load
///    instructions preceding this instruction and load instructions following
///    this instruction, ensuring the system completes all previous loads before
///    executing subsequent loads.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
///
void _mm_lfence(void);

/// Forces strong memory ordering (serialization) between load and store
///    instructions preceding this instruction and load and store instructions
///    following this instruction, ensuring that the system completes all
///    previous memory accesses before executing subsequent memory accesses.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
///
void _mm_mfence(void);

#if defined(__cplusplus)
} // extern "C"
#endif

/// Converts 16-bit signed integers from both 128-bit integer vector
///    operands into 8-bit signed integers, and packs the results into the
///    destination. Positive values greater than 0x7F are saturated to 0x7F.
///    Negative values less than 0x80 are saturated to 0x80.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
///
/// \param __a
///    A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
///    a signed integer and is converted to a 8-bit signed integer with
///    saturation. Values greater than 0x7F are saturated to 0x7F. Values less
///    than 0x80 are saturated to 0x80. The converted [8 x i8] values are
///    written to the lower 64 bits of the result.
4058 /// \param __b 4059 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as 4060 /// a signed integer and is converted to a 8-bit signed integer with 4061 /// saturation. Values greater than 0x7F are saturated to 0x7F. Values less 4062 /// than 0x80 are saturated to 0x80. The converted [8 x i8] values are 4063 /// written to the higher 64 bits of the result. 4064 /// \returns A 128-bit vector of [16 x i8] containing the converted values. 4065 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi16(__m128i __a, 4066 __m128i __b) { 4067 return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b); 4068 } 4069 4070 /// Converts 32-bit signed integers from both 128-bit integer vector 4071 /// operands into 16-bit signed integers, and packs the results into the 4072 /// destination. Positive values greater than 0x7FFF are saturated to 0x7FFF. 4073 /// Negative values less than 0x8000 are saturated to 0x8000. 4074 /// 4075 /// \headerfile <x86intrin.h> 4076 /// 4077 /// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction. 4078 /// 4079 /// \param __a 4080 /// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as 4081 /// a signed integer and is converted to a 16-bit signed integer with 4082 /// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values 4083 /// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values 4084 /// are written to the lower 64 bits of the result. 4085 /// \param __b 4086 /// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as 4087 /// a signed integer and is converted to a 16-bit signed integer with 4088 /// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values 4089 /// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values 4090 /// are written to the higher 64 bits of the result. 4091 /// \returns A 128-bit vector of [8 x i16] containing the converted values. 
4092 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packs_epi32(__m128i __a, 4093 __m128i __b) { 4094 return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b); 4095 } 4096 4097 /// Converts 16-bit signed integers from both 128-bit integer vector 4098 /// operands into 8-bit unsigned integers, and packs the results into the 4099 /// destination. Values greater than 0xFF are saturated to 0xFF. Values less 4100 /// than 0x00 are saturated to 0x00. 4101 /// 4102 /// \headerfile <x86intrin.h> 4103 /// 4104 /// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction. 4105 /// 4106 /// \param __a 4107 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as 4108 /// a signed integer and is converted to an 8-bit unsigned integer with 4109 /// saturation. Values greater than 0xFF are saturated to 0xFF. Values less 4110 /// than 0x00 are saturated to 0x00. The converted [8 x i8] values are 4111 /// written to the lower 64 bits of the result. 4112 /// \param __b 4113 /// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as 4114 /// a signed integer and is converted to an 8-bit unsigned integer with 4115 /// saturation. Values greater than 0xFF are saturated to 0xFF. Values less 4116 /// than 0x00 are saturated to 0x00. The converted [8 x i8] values are 4117 /// written to the higher 64 bits of the result. 4118 /// \returns A 128-bit vector of [16 x i8] containing the converted values. 4119 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_packus_epi16(__m128i __a, 4120 __m128i __b) { 4121 return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b); 4122 } 4123 4124 /// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using 4125 /// the immediate-value parameter as a selector. 
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
///
/// \param a
///    A 128-bit integer vector.
/// \param imm
///    An immediate value. Bits [2:0] select values from \a a to be assigned
///    to bits[15:0] of the result. \n
///    000: assign values from bits [15:0] of \a a. \n
///    001: assign values from bits [31:16] of \a a. \n
///    010: assign values from bits [47:32] of \a a. \n
///    011: assign values from bits [63:48] of \a a. \n
///    100: assign values from bits [79:64] of \a a. \n
///    101: assign values from bits [95:80] of \a a. \n
///    110: assign values from bits [111:96] of \a a. \n
///    111: assign values from bits [127:112] of \a a.
/// \returns An integer, whose lower 16 bits are selected from the 128-bit
///    integer vector parameter and the remaining bits are assigned zeros.
#define _mm_extract_epi16(a, imm) \
  ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi( \
      (__v8hi)(__m128i)(a), (int)(imm)))

/// Constructs a 128-bit integer vector by first making a copy of the
///    128-bit integer vector parameter, and then inserting the lower 16 bits
///    of an integer parameter into an offset specified by the immediate-value
///    parameter.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi16(__m128i a, int b, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. This vector is copied to the
///    result and then one of the eight elements in the result is replaced by
///    the lower 16 bits of \a b.
/// \param b
///    An integer.
The lower 16 bits of this parameter are written to the 4173 /// result beginning at an offset specified by \a imm. 4174 /// \param imm 4175 /// An immediate value specifying the bit offset in the result at which the 4176 /// lower 16 bits of \a b are written. 4177 /// \returns A 128-bit integer vector containing the constructed values. 4178 #define _mm_insert_epi16(a, b, imm) \ 4179 ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \ 4180 (int)(imm))) 4181 4182 /// Copies the values of the most significant bits from each 8-bit 4183 /// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask 4184 /// value, zero-extends the value, and writes it to the destination. 4185 /// 4186 /// \headerfile <x86intrin.h> 4187 /// 4188 /// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction. 4189 /// 4190 /// \param __a 4191 /// A 128-bit integer vector containing the values with bits to be extracted. 4192 /// \returns The most significant bits from each 8-bit element in \a __a, 4193 /// written to bits [15:0]. The other bits are assigned zeros. 4194 static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a) { 4195 return __builtin_ia32_pmovmskb128((__v16qi)__a); 4196 } 4197 4198 /// Constructs a 128-bit integer vector by shuffling four 32-bit 4199 /// elements of a 128-bit integer vector parameter, using the immediate-value 4200 /// parameter as a specifier. 4201 /// 4202 /// \headerfile <x86intrin.h> 4203 /// 4204 /// \code 4205 /// __m128i _mm_shuffle_epi32(__m128i a, const int imm); 4206 /// \endcode 4207 /// 4208 /// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction. 4209 /// 4210 /// \param a 4211 /// A 128-bit integer vector containing the values to be copied. 4212 /// \param imm 4213 /// An immediate value containing an 8-bit value specifying which elements to 4214 /// copy from a. 
///    The destinations within the 128-bit destination are assigned
///    values as follows: \n
///    Bits [1:0] are used to assign values to bits [31:0] of the result. \n
///    Bits [3:2] are used to assign values to bits [63:32] of the result. \n
///    Bits [5:4] are used to assign values to bits [95:64] of the result. \n
///    Bits [7:6] are used to assign values to bits [127:96] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [31:0] of \a a. \n
///    01: assign values from bits [63:32] of \a a. \n
///    10: assign values from bits [95:64] of \a a. \n
///    11: assign values from bits [127:96] of \a a. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shuffle_epi32(a, imm) \
  ((__m128i)__builtin_ia32_pshufd( \
      (__v4si)(__m128i)(a), (int)(imm)))

/// Constructs a 128-bit integer vector by shuffling four lower 16-bit
///    elements of a 128-bit integer vector of [8 x i16], using the immediate
///    value parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
///    [127:64] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
///    Bits[1:0] are used to assign values to bits [15:0] of the result. \n
///    Bits[3:2] are used to assign values to bits [31:16] of the result. \n
///    Bits[5:4] are used to assign values to bits [47:32] of the result.
///    \n
///    Bits[7:6] are used to assign values to bits [63:48] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [15:0] of \a a. \n
///    01: assign values from bits [31:16] of \a a. \n
///    10: assign values from bits [47:32] of \a a. \n
///    11: assign values from bits [63:48] of \a a. \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflelo_epi16(a, imm) \
  ((__m128i)__builtin_ia32_pshuflw( \
      (__v8hi)(__m128i)(a), (int)(imm)))

/// Constructs a 128-bit integer vector by shuffling four upper 16-bit
///    elements of a 128-bit integer vector of [8 x i16], using the immediate
///    value parameter as a specifier.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
///
/// \param a
///    A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
///    [63:0] of the result.
/// \param imm
///    An 8-bit immediate value specifying which elements to copy from \a a. \n
///    Bits[1:0] are used to assign values to bits [79:64] of the result. \n
///    Bits[3:2] are used to assign values to bits [95:80] of the result. \n
///    Bits[5:4] are used to assign values to bits [111:96] of the result. \n
///    Bits[7:6] are used to assign values to bits [127:112] of the result. \n
///    Bit value assignments: \n
///    00: assign values from bits [79:64] of \a a. \n
///    01: assign values from bits [95:80] of \a a. \n
///    10: assign values from bits [111:96] of \a a. \n
///    11: assign values from bits [127:112] of \a a.
///    \n
///    Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
///    <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
///    <c>[b6, b4, b2, b0]</c>.
/// \returns A 128-bit integer vector containing the shuffled values.
#define _mm_shufflehi_epi16(a, imm) \
  ((__m128i)__builtin_ia32_pshufhw( \
      (__v8hi)(__m128i)(a), (int)(imm)))

/// Unpacks the high-order (index 8-15) values from two 128-bit vectors
///    of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
///   instruction.
///
/// \param __a
///    A 128-bit vector of [16 x i8]. \n
///    Bits [71:64] are written to bits [7:0] of the result. \n
///    Bits [79:72] are written to bits [23:16] of the result. \n
///    Bits [87:80] are written to bits [39:32] of the result. \n
///    Bits [95:88] are written to bits [55:48] of the result. \n
///    Bits [103:96] are written to bits [71:64] of the result. \n
///    Bits [111:104] are written to bits [87:80] of the result. \n
///    Bits [119:112] are written to bits [103:96] of the result. \n
///    Bits [127:120] are written to bits [119:112] of the result.
/// \param __b
///    A 128-bit vector of [16 x i8]. \n
///    Bits [71:64] are written to bits [15:8] of the result. \n
///    Bits [79:72] are written to bits [31:24] of the result. \n
///    Bits [87:80] are written to bits [47:40] of the result. \n
///    Bits [95:88] are written to bits [63:56] of the result. \n
///    Bits [103:96] are written to bits [79:72] of the result. \n
///    Bits [111:104] are written to bits [95:88] of the result. \n
///    Bits [119:112] are written to bits [111:104] of the result. \n
///    Bits [127:120] are written to bits [127:120] of the result.
4326 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values. 4327 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a, 4328 __m128i __b) { 4329 return (__m128i)__builtin_shufflevector( 4330 (__v16qi)__a, (__v16qi)__b, 8, 16 + 8, 9, 16 + 9, 10, 16 + 10, 11, 4331 16 + 11, 12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15); 4332 } 4333 4334 /// Unpacks the high-order (index 4-7) values from two 128-bit vectors of 4335 /// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16]. 4336 /// 4337 /// \headerfile <x86intrin.h> 4338 /// 4339 /// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c> 4340 /// instruction. 4341 /// 4342 /// \param __a 4343 /// A 128-bit vector of [8 x i16]. 4344 /// Bits [79:64] are written to bits [15:0] of the result. \n 4345 /// Bits [95:80] are written to bits [47:32] of the result. \n 4346 /// Bits [111:96] are written to bits [79:64] of the result. \n 4347 /// Bits [127:112] are written to bits [111:96] of the result. 4348 /// \param __b 4349 /// A 128-bit vector of [8 x i16]. 4350 /// Bits [79:64] are written to bits [31:16] of the result. \n 4351 /// Bits [95:80] are written to bits [63:48] of the result. \n 4352 /// Bits [111:96] are written to bits [95:80] of the result. \n 4353 /// Bits [127:112] are written to bits [127:112] of the result. 4354 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values. 4355 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a, 4356 __m128i __b) { 4357 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8 + 4, 5, 4358 8 + 5, 6, 8 + 6, 7, 8 + 7); 4359 } 4360 4361 /// Unpacks the high-order (index 2,3) values from two 128-bit vectors of 4362 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32]. 4363 /// 4364 /// \headerfile <x86intrin.h> 4365 /// 4366 /// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c> 4367 /// instruction. 
4368 /// 4369 /// \param __a 4370 /// A 128-bit vector of [4 x i32]. \n 4371 /// Bits [95:64] are written to bits [31:0] of the destination. \n 4372 /// Bits [127:96] are written to bits [95:64] of the destination. 4373 /// \param __b 4374 /// A 128-bit vector of [4 x i32]. \n 4375 /// Bits [95:64] are written to bits [64:32] of the destination. \n 4376 /// Bits [127:96] are written to bits [127:96] of the destination. 4377 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values. 4378 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a, 4379 __m128i __b) { 4380 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4 + 2, 3, 4381 4 + 3); 4382 } 4383 4384 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of 4385 /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64]. 4386 /// 4387 /// \headerfile <x86intrin.h> 4388 /// 4389 /// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c> 4390 /// instruction. 4391 /// 4392 /// \param __a 4393 /// A 128-bit vector of [2 x i64]. \n 4394 /// Bits [127:64] are written to bits [63:0] of the destination. 4395 /// \param __b 4396 /// A 128-bit vector of [2 x i64]. \n 4397 /// Bits [127:64] are written to bits [127:64] of the destination. 4398 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values. 4399 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a, 4400 __m128i __b) { 4401 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2 + 1); 4402 } 4403 4404 /// Unpacks the low-order (index 0-7) values from two 128-bit vectors of 4405 /// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8]. 4406 /// 4407 /// \headerfile <x86intrin.h> 4408 /// 4409 /// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c> 4410 /// instruction. 4411 /// 4412 /// \param __a 4413 /// A 128-bit vector of [16 x i8]. 
\n 4414 /// Bits [7:0] are written to bits [7:0] of the result. \n 4415 /// Bits [15:8] are written to bits [23:16] of the result. \n 4416 /// Bits [23:16] are written to bits [39:32] of the result. \n 4417 /// Bits [31:24] are written to bits [55:48] of the result. \n 4418 /// Bits [39:32] are written to bits [71:64] of the result. \n 4419 /// Bits [47:40] are written to bits [87:80] of the result. \n 4420 /// Bits [55:48] are written to bits [103:96] of the result. \n 4421 /// Bits [63:56] are written to bits [119:112] of the result. 4422 /// \param __b 4423 /// A 128-bit vector of [16 x i8]. 4424 /// Bits [7:0] are written to bits [15:8] of the result. \n 4425 /// Bits [15:8] are written to bits [31:24] of the result. \n 4426 /// Bits [23:16] are written to bits [47:40] of the result. \n 4427 /// Bits [31:24] are written to bits [63:56] of the result. \n 4428 /// Bits [39:32] are written to bits [79:72] of the result. \n 4429 /// Bits [47:40] are written to bits [95:88] of the result. \n 4430 /// Bits [55:48] are written to bits [111:104] of the result. \n 4431 /// Bits [63:56] are written to bits [127:120] of the result. 4432 /// \returns A 128-bit vector of [16 x i8] containing the interleaved values. 4433 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a, 4434 __m128i __b) { 4435 return (__m128i)__builtin_shufflevector( 4436 (__v16qi)__a, (__v16qi)__b, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 4, 4437 16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7); 4438 } 4439 4440 /// Unpacks the low-order (index 0-3) values from each of the two 128-bit 4441 /// vectors of [8 x i16] and interleaves them into a 128-bit vector of 4442 /// [8 x i16]. 4443 /// 4444 /// \headerfile <x86intrin.h> 4445 /// 4446 /// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c> 4447 /// instruction. 4448 /// 4449 /// \param __a 4450 /// A 128-bit vector of [8 x i16]. 4451 /// Bits [15:0] are written to bits [15:0] of the result. 
\n 4452 /// Bits [31:16] are written to bits [47:32] of the result. \n 4453 /// Bits [47:32] are written to bits [79:64] of the result. \n 4454 /// Bits [63:48] are written to bits [111:96] of the result. 4455 /// \param __b 4456 /// A 128-bit vector of [8 x i16]. 4457 /// Bits [15:0] are written to bits [31:16] of the result. \n 4458 /// Bits [31:16] are written to bits [63:48] of the result. \n 4459 /// Bits [47:32] are written to bits [95:80] of the result. \n 4460 /// Bits [63:48] are written to bits [127:112] of the result. 4461 /// \returns A 128-bit vector of [8 x i16] containing the interleaved values. 4462 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a, 4463 __m128i __b) { 4464 return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8 + 0, 1, 4465 8 + 1, 2, 8 + 2, 3, 8 + 3); 4466 } 4467 4468 /// Unpacks the low-order (index 0,1) values from two 128-bit vectors of 4469 /// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32]. 4470 /// 4471 /// \headerfile <x86intrin.h> 4472 /// 4473 /// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c> 4474 /// instruction. 4475 /// 4476 /// \param __a 4477 /// A 128-bit vector of [4 x i32]. \n 4478 /// Bits [31:0] are written to bits [31:0] of the destination. \n 4479 /// Bits [63:32] are written to bits [95:64] of the destination. 4480 /// \param __b 4481 /// A 128-bit vector of [4 x i32]. \n 4482 /// Bits [31:0] are written to bits [64:32] of the destination. \n 4483 /// Bits [63:32] are written to bits [127:96] of the destination. 4484 /// \returns A 128-bit vector of [4 x i32] containing the interleaved values. 
4485 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a, 4486 __m128i __b) { 4487 return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4 + 0, 1, 4488 4 + 1); 4489 } 4490 4491 /// Unpacks the low-order 64-bit elements from two 128-bit vectors of 4492 /// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64]. 4493 /// 4494 /// \headerfile <x86intrin.h> 4495 /// 4496 /// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c> 4497 /// instruction. 4498 /// 4499 /// \param __a 4500 /// A 128-bit vector of [2 x i64]. \n 4501 /// Bits [63:0] are written to bits [63:0] of the destination. \n 4502 /// \param __b 4503 /// A 128-bit vector of [2 x i64]. \n 4504 /// Bits [63:0] are written to bits [127:64] of the destination. \n 4505 /// \returns A 128-bit vector of [2 x i64] containing the interleaved values. 4506 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a, 4507 __m128i __b) { 4508 return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2 + 0); 4509 } 4510 4511 /// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit 4512 /// integer. 4513 /// 4514 /// \headerfile <x86intrin.h> 4515 /// 4516 /// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction. 4517 /// 4518 /// \param __a 4519 /// A 128-bit integer vector operand. The lower 64 bits are moved to the 4520 /// destination. 4521 /// \returns A 64-bit integer containing the lower 64 bits of the parameter. 4522 static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a) { 4523 return (__m64)__a[0]; 4524 } 4525 4526 /// Moves the 64-bit operand to a 128-bit integer vector, zeroing the 4527 /// upper bits. 4528 /// 4529 /// \headerfile <x86intrin.h> 4530 /// 4531 /// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction. 4532 /// 4533 /// \param __a 4534 /// A 64-bit value. 4535 /// \returns A 128-bit integer vector. 
The lower 64 bits contain the value from 4536 /// the operand. The upper 64 bits are assigned zeros. 4537 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a) { 4538 return __extension__(__m128i)(__v2di){(long long)__a, 0}; 4539 } 4540 4541 /// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit 4542 /// integer vector, zeroing the upper bits. 4543 /// 4544 /// \headerfile <x86intrin.h> 4545 /// 4546 /// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction. 4547 /// 4548 /// \param __a 4549 /// A 128-bit integer vector operand. The lower 64 bits are moved to the 4550 /// destination. 4551 /// \returns A 128-bit integer vector. The lower 64 bits contain the value from 4552 /// the operand. The upper 64 bits are assigned zeros. 4553 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a) { 4554 return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2); 4555 } 4556 4557 /// Unpacks the high-order 64-bit elements from two 128-bit vectors of 4558 /// [2 x double] and interleaves them into a 128-bit vector of [2 x 4559 /// double]. 4560 /// 4561 /// \headerfile <x86intrin.h> 4562 /// 4563 /// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction. 4564 /// 4565 /// \param __a 4566 /// A 128-bit vector of [2 x double]. \n 4567 /// Bits [127:64] are written to bits [63:0] of the destination. 4568 /// \param __b 4569 /// A 128-bit vector of [2 x double]. \n 4570 /// Bits [127:64] are written to bits [127:64] of the destination. 4571 /// \returns A 128-bit vector of [2 x double] containing the interleaved values. 4572 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a, 4573 __m128d __b) { 4574 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2 + 1); 4575 } 4576 4577 /// Unpacks the low-order 64-bit elements from two 128-bit vectors 4578 /// of [2 x double] and interleaves them into a 128-bit vector of [2 x 4579 /// double]. 
4580 /// 4581 /// \headerfile <x86intrin.h> 4582 /// 4583 /// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction. 4584 /// 4585 /// \param __a 4586 /// A 128-bit vector of [2 x double]. \n 4587 /// Bits [63:0] are written to bits [63:0] of the destination. 4588 /// \param __b 4589 /// A 128-bit vector of [2 x double]. \n 4590 /// Bits [63:0] are written to bits [127:64] of the destination. 4591 /// \returns A 128-bit vector of [2 x double] containing the interleaved values. 4592 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a, 4593 __m128d __b) { 4594 return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2 + 0); 4595 } 4596 4597 /// Extracts the sign bits of the double-precision values in the 128-bit 4598 /// vector of [2 x double], zero-extends the value, and writes it to the 4599 /// low-order bits of the destination. 4600 /// 4601 /// \headerfile <x86intrin.h> 4602 /// 4603 /// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction. 4604 /// 4605 /// \param __a 4606 /// A 128-bit vector of [2 x double] containing the values with sign bits to 4607 /// be extracted. 4608 /// \returns The sign bits from each of the double-precision elements in \a __a, 4609 /// written to bits [1:0]. The remaining bits are assigned values of zero. 4610 static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a) { 4611 return __builtin_ia32_movmskpd((__v2df)__a); 4612 } 4613 4614 /// Constructs a 128-bit floating-point vector of [2 x double] from two 4615 /// 128-bit vector parameters of [2 x double], using the immediate-value 4616 /// parameter as a specifier. 4617 /// 4618 /// \headerfile <x86intrin.h> 4619 /// 4620 /// \code 4621 /// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i); 4622 /// \endcode 4623 /// 4624 /// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction. 4625 /// 4626 /// \param a 4627 /// A 128-bit vector of [2 x double]. 
4628 /// \param b 4629 /// A 128-bit vector of [2 x double]. 4630 /// \param i 4631 /// An 8-bit immediate value. The least significant two bits specify which 4632 /// elements to copy from \a a and \a b: \n 4633 /// Bit[0] = 0: lower element of \a a copied to lower element of result. \n 4634 /// Bit[0] = 1: upper element of \a a copied to lower element of result. \n 4635 /// Bit[1] = 0: lower element of \a b copied to upper element of result. \n 4636 /// Bit[1] = 1: upper element of \a b copied to upper element of result. \n 4637 /// Note: To generate a mask, you can use the \c _MM_SHUFFLE2 macro. 4638 /// <c>_MM_SHUFFLE2(b1, b0)</c> can create a 2-bit mask of the form 4639 /// <c>[b1, b0]</c>. 4640 /// \returns A 128-bit vector of [2 x double] containing the shuffled values. 4641 #define _mm_shuffle_pd(a, b, i) \ 4642 ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \ 4643 (int)(i))) 4644 4645 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit 4646 /// floating-point vector of [4 x float]. 4647 /// 4648 /// \headerfile <x86intrin.h> 4649 /// 4650 /// This intrinsic has no corresponding instruction. 4651 /// 4652 /// \param __a 4653 /// A 128-bit floating-point vector of [2 x double]. 4654 /// \returns A 128-bit floating-point vector of [4 x float] containing the same 4655 /// bitwise pattern as the parameter. 4656 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a) { 4657 return (__m128)__a; 4658 } 4659 4660 /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit 4661 /// integer vector. 4662 /// 4663 /// \headerfile <x86intrin.h> 4664 /// 4665 /// This intrinsic has no corresponding instruction. 4666 /// 4667 /// \param __a 4668 /// A 128-bit floating-point vector of [2 x double]. 4669 /// \returns A 128-bit integer vector containing the same bitwise pattern as the 4670 /// parameter. 
4671 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a) { 4672 return (__m128i)__a; 4673 } 4674 4675 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit 4676 /// floating-point vector of [2 x double]. 4677 /// 4678 /// \headerfile <x86intrin.h> 4679 /// 4680 /// This intrinsic has no corresponding instruction. 4681 /// 4682 /// \param __a 4683 /// A 128-bit floating-point vector of [4 x float]. 4684 /// \returns A 128-bit floating-point vector of [2 x double] containing the same 4685 /// bitwise pattern as the parameter. 4686 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a) { 4687 return (__m128d)__a; 4688 } 4689 4690 /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit 4691 /// integer vector. 4692 /// 4693 /// \headerfile <x86intrin.h> 4694 /// 4695 /// This intrinsic has no corresponding instruction. 4696 /// 4697 /// \param __a 4698 /// A 128-bit floating-point vector of [4 x float]. 4699 /// \returns A 128-bit integer vector containing the same bitwise pattern as the 4700 /// parameter. 4701 static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a) { 4702 return (__m128i)__a; 4703 } 4704 4705 /// Casts a 128-bit integer vector into a 128-bit floating-point vector 4706 /// of [4 x float]. 4707 /// 4708 /// \headerfile <x86intrin.h> 4709 /// 4710 /// This intrinsic has no corresponding instruction. 4711 /// 4712 /// \param __a 4713 /// A 128-bit integer vector. 4714 /// \returns A 128-bit floating-point vector of [4 x float] containing the same 4715 /// bitwise pattern as the parameter. 4716 static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a) { 4717 return (__m128)__a; 4718 } 4719 4720 /// Casts a 128-bit integer vector into a 128-bit floating-point vector 4721 /// of [2 x double]. 4722 /// 4723 /// \headerfile <x86intrin.h> 4724 /// 4725 /// This intrinsic has no corresponding instruction. 
4726 /// 4727 /// \param __a 4728 /// A 128-bit integer vector. 4729 /// \returns A 128-bit floating-point vector of [2 x double] containing the same 4730 /// bitwise pattern as the parameter. 4731 static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a) { 4732 return (__m128d)__a; 4733 } 4734 4735 #if defined(__cplusplus) 4736 extern "C" { 4737 #endif 4738 4739 /// Indicates that a spin loop is being executed for the purposes of 4740 /// optimizing power consumption during the loop. 4741 /// 4742 /// \headerfile <x86intrin.h> 4743 /// 4744 /// This intrinsic corresponds to the <c> PAUSE </c> instruction. 4745 /// 4746 void _mm_pause(void); 4747 4748 #if defined(__cplusplus) 4749 } // extern "C" 4750 #endif 4751 #undef __DEFAULT_FN_ATTRS 4752 #undef __DEFAULT_FN_ATTRS_MMX 4753 4754 #define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) 4755 4756 #define _MM_DENORMALS_ZERO_ON (0x0040U) 4757 #define _MM_DENORMALS_ZERO_OFF (0x0000U) 4758 4759 #define _MM_DENORMALS_ZERO_MASK (0x0040U) 4760 4761 #define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK) 4762 #define _MM_SET_DENORMALS_ZERO_MODE(x) \ 4763 (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x))) 4764 4765 #endif /* __EMMINTRIN_H */ 4766